1 /*
2 * Copyright (c) 2011-2021, 2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
/*
 * Page table operations used for pmaps with the native translation granule:
 * ASID allocate/free hooks, async TLB flush callbacks, and the WIMG-to-PTE
 * memory attribute conversion routine.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140
/*
 * Per-level translation table geometry for the 16KB translation granule.
 * Levels 0-2 use the TTE valid/type encodings; the leaf level (L3) uses the
 * PTE encodings, where the "block" type is an L3 page entry.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180
/*
 * Per-level translation table geometry for the 4KB translation granule.
 * Mirrors pmap_table_level_info_16k: levels 0-2 use the TTE valid/type
 * encodings, and the leaf level (L3) uses the PTE encodings.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220
/*
 * Page-table attributes describing pmaps that use a 4KB translation granule:
 * level geometry, access-permission/execute-never PTE bit encodings, and
 * page-size constants.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root level derived from the boot-time VA size (T0SZ); assumes each 4K
	 * level resolves 9 VA bits — TODO confirm against level shift constants. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
248
/*
 * Page-table attributes describing pmaps that use a 16KB translation granule.
 * Parallels pmap_pt_attr_4k; the 16K configuration always roots at L1 with
 * the commpage nested at L2.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
268
269 #if __ARM_16K_PG__
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
271 #else /* !__ARM_16K_PG__ */
272 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
273 #endif /* !__ARM_16K_PG__ */
274
275
276 #if MACH_ASSERT
277 int vm_footprint_suspend_allowed = 1;
278
279 extern int pmap_ledgers_panic;
280 extern int pmap_ledgers_panic_leeway;
281
282 #endif /* MACH_ASSERT */
283
284 #if DEVELOPMENT || DEBUG
285 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
286 (current_thread()->pmap_footprint_suspended)
287 #else /* DEVELOPMENT || DEBUG */
288 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
289 #endif /* DEVELOPMENT || DEBUG */
290
291
292 /*
293 * Represents a tlb range that will be flushed before exiting
294 * the ppl.
295 * Used by phys_attribute_clear_range to defer flushing pages in
296 * this range until the end of the operation.
297 */
298 typedef struct pmap_tlb_flush_range {
299 pmap_t ptfr_pmap;
300 vm_map_address_t ptfr_start;
301 vm_map_address_t ptfr_end;
302 bool ptfr_flush_needed;
303 } pmap_tlb_flush_range_t;
304
305 #if XNU_MONITOR
306 /*
307 * PPL External References.
308 */
309 extern vm_offset_t segPPLDATAB;
310 extern unsigned long segSizePPLDATA;
311 extern vm_offset_t segPPLTEXTB;
312 extern unsigned long segSizePPLTEXT;
313 extern vm_offset_t segPPLDATACONSTB;
314 extern unsigned long segSizePPLDATACONST;
315
316
317 /*
318 * PPL Global Variables
319 */
320
321 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
322 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
323 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
324 #else
325 const boolean_t pmap_ppl_disable = FALSE;
326 #endif
327
328 /*
329 * Indicates if the PPL has started applying APRR.
330 * This variable is accessed from various assembly trampolines, so be sure to change
331 * those if you change the size or layout of this variable.
332 */
333 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
334
335 extern void *pmap_stacks_start;
336 extern void *pmap_stacks_end;
337
#endif /* XNU_MONITOR */
339
340
341
342 /* Virtual memory region for early allocation */
343 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
344 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
345 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
346
347 extern uint8_t bootstrap_pagetables[];
348
349 extern unsigned int not_in_kdp;
350
351 extern vm_offset_t first_avail;
352
353 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
354 extern vm_offset_t virtual_space_end; /* End of kernel address space */
355 extern vm_offset_t static_memory_end;
356
357 extern const vm_map_address_t physmap_base;
358 extern const vm_map_address_t physmap_end;
359
360 extern int maxproc, hard_maxproc;
361
362 /* The number of address bits one TTBR can cover. */
363 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
364
365 /*
366 * The bounds on our TTBRs. These are for sanity checking that
367 * an address is accessible by a TTBR before we attempt to map it.
368 */
369
370 /* The level of the root of a page table. */
371 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
372
373 /* The number of entries in the root TT of a page table. */
374 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
375
376 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
377 const pmap_t kernel_pmap = &kernel_pmap_store;
378
379 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
380
381 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
382 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
383 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
384
385 typedef struct tt_free_entry {
386 struct tt_free_entry *next;
387 } tt_free_entry_t;
388
389 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
390
391 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
392 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
393 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
394 #define FREE_PAGE_SIZE_TT_MAX 4
395 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
396 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
397 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
398 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
399 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
400 unsigned int free_tt_count MARK_AS_PMAP_DATA;
401 unsigned int free_tt_max MARK_AS_PMAP_DATA;
402
403 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
404
405 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
406 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
407 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
408 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
409 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
410 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
411
412 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
413 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
414
415 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
416 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
417
418 /* Lock group used for all pmap object locks. */
419 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
420
421 #if DEVELOPMENT || DEBUG
422 int nx_enabled = 1; /* enable no-execute protection */
423 int allow_data_exec = 0; /* No apps may execute data */
424 int allow_stack_exec = 0; /* No apps may execute from the stack */
425 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
426 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
427 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
428 #else /* DEVELOPMENT || DEBUG */
429 const int nx_enabled = 1; /* enable no-execute protection */
430 const int allow_data_exec = 0; /* No apps may execute data */
431 const int allow_stack_exec = 0; /* No apps may execute from the stack */
432 #endif /* DEVELOPMENT || DEBUG */
433
434 /**
435 * This variable is set true during hibernation entry to protect pmap data structures
436 * during image copying, and reset false on hibernation exit.
437 */
438 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
439
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/*
 * On non-MACH_ASSERT builds, ledger checking compiles away to an empty
 * inline so call sites need no conditional compilation.
 */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
448
449 /**
450 * This helper function ensures that potentially-long-running batched PPL operations are
451 * called in preemptible context before entering the PPL, so that the PPL call may
452 * periodically exit to allow pending urgent ASTs to be taken.
453 */
454 static inline void
pmap_verify_preemptible(void)455 pmap_verify_preemptible(void)
456 {
457 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
458 }
459
460 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
461
462 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
463 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
464
465 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
466
467 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
468 #if defined(__arm64__)
469 /* end of shared region + 512MB for various purposes */
470 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
471 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
472 "Minimum address space size outside allowable range");
473
474 // Max offset is 15.375GB for devices with "large" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
476 // Max offset is 11.375GB for devices with "small" memory config
477 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
478
479
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
481 "Large device address space size outside allowable range");
482 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
483 "Small device address space size outside allowable range");
484
485 # ifdef XNU_TARGET_OS_OSX
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
487 # else
488 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
489 # endif
490 #endif /* __arm64__ */
491
492 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
493 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
494 #else
495 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
496 #endif
497
498 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
499 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
500 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
501 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
502 #if !HAS_16BIT_ASID
503 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
504 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
505 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
506 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
507 #else
508 static uint16_t last_allocated_asid = 0;
509 #endif /* !HAS_16BIT_ASID */
510
511
512 #if __ARM_MIXED_PAGE_SIZE__
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
514 #endif
515 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
516 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
517 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
518
519 /* PTE Define Macros */
520
521 #define ARM_PTE_IS_COMPRESSED(x, p) \
522 ((((x) & 0x3) == 0) && /* PTE is not valid... */ \
((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
524 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
525 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
526 (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
527
528 #define pte_is_wired(pte) \
529 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
530
531 #define pte_was_writeable(pte) \
532 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
533
534 #define pte_set_was_writeable(pte, was_writeable) \
535 do { \
536 if ((was_writeable)) { \
537 (pte) |= ARM_PTE_WRITEABLE; \
538 } else { \
539 (pte) &= ~ARM_PTE_WRITEABLE; \
540 } \
541 } while(0)
542
543 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)544 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
545 {
546 if (wired) {
547 *ptep |= ARM_PTE_WIRED;
548 } else {
549 *ptep &= ~ARM_PTE_WIRED;
550 }
551 /*
552 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
553 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
554 * never reclaimed.
555 */
556 if (pmap == kernel_pmap) {
557 return;
558 }
559 unsigned short *ptd_wiredcnt_ptr;
560 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
561 if (wired) {
562 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 } else {
564 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
565 if (__improbable(prev_wired == 0)) {
566 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
567 }
568 }
569 }
570
#if HAS_FEAT_XS

/*
 * Report whether a PTE selects one of the XS "posted" memory attribute
 * indices. Stage-2 translations never use these attributes, so any stage-2
 * pmap reports false.
 */
static inline bool
pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
{
	if (__improbable(pt_attr->stage2)) {
		return false;
	}

	return (ARM_PTE_EXTRACT_ATTRINDX(pte) == CACHE_ATTRINDX_POSTED_XS) ||
	    (ARM_PTE_EXTRACT_ATTRINDX(pte) == CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
}

#endif /* HAS_FEAT_XS */
589
590 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
591 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
592 arm64_sync_tlb(strong); \
593 }
594
595 /*
596 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
597 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
598 * will observe the updated PTE.
599 */
600 #define FLUSH_PTE() \
601 __builtin_arm_dmb(DMB_ISH);
602
603 /*
604 * Synchronize updates to PTEs that were previously valid and thus may be cached in
605 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
606 * TLBI. This should only require a store-store barrier, as subsequent accesses in
607 * program order will not issue until the DSB completes. Prior loads may be reordered
608 * after the barrier, but their behavior should not be materially affected by the
609 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
610 * matter for loads until the access is re-driven well after the TLB update is
611 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
612 * we should be in a position to handle access faults. For "voluntary" PTE access
613 * restriction due to unmapping or protection, the decision to restrict access should
614 * have a data dependency on prior loads in order to avoid a data race.
615 */
616 #define FLUSH_PTE_STRONG() \
617 __builtin_arm_dsb(DSB_ISHST);
618
619 /**
620 * Write enough page table entries to map a single VM page. On systems where the
621 * VM page size does not match the hardware page size, multiple page table
622 * entries will need to be written.
623 *
624 * @note This function does not emit a barrier to ensure these page table writes
625 * have completed before continuing. This is commonly needed. In the case
626 * where a DMB or DSB barrier is needed, then use the write_pte() and
627 * write_pte_strong() functions respectively instead of this one.
628 *
629 * @param ptep Pointer to the first page table entry to update.
630 * @param pte The value to write into each page table entry. In the case that
631 * multiple PTEs are updated to a non-empty value, then the address
632 * in this value will automatically be incremented for each PTE
633 * write.
634 */
635 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)636 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
637 {
638 /**
639 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
640 * systems, which is why it's checked at runtime instead of compile time.
641 * The "unreachable" warning needs to be suppressed because it still is a
642 * compile time constant on some systems.
643 */
644 __unreachable_ok_push
645 if (TEST_PAGE_RATIO_4) {
646 if (((uintptr_t)ptep) & 0x1f) {
647 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
648 __func__, ptep, (void*)pte);
649 }
650
651 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
652 /**
653 * If we're writing an empty/compressed PTE value, then don't
654 * auto-increment the address for each PTE write.
655 */
656 *ptep = pte;
657 *(ptep + 1) = pte;
658 *(ptep + 2) = pte;
659 *(ptep + 3) = pte;
660 } else {
661 *ptep = pte;
662 *(ptep + 1) = pte | 0x1000;
663 *(ptep + 2) = pte | 0x2000;
664 *(ptep + 3) = pte | 0x3000;
665 }
666 } else {
667 *ptep = pte;
668 }
669 __unreachable_ok_pop
670 }
671
672 /**
673 * Writes enough page table entries to map a single VM page and then ensures
674 * those writes complete by executing a Data Memory Barrier.
675 *
676 * @note The DMB issued by this function is not strong enough to protect against
677 * TLB invalidates from being reordered above the PTE writes. If a TLBI
678 * instruction is going to immediately be called after this write, it's
679 * recommended to call write_pte_strong() instead of this function.
680 *
681 * See the function header for write_pte_fast() for more details on the
682 * parameters.
683 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DMB: make the PTE store(s) visible before subsequent loads. */
	FLUSH_PTE();
}
690
691 /**
692 * Writes enough page table entries to map a single VM page and then ensures
693 * those writes complete by executing a Data Synchronization Barrier. This
694 * barrier provides stronger guarantees than the DMB executed by write_pte().
695 *
696 * @note This function is useful if you're going to immediately flush the TLB
697 * after making the PTE write. A DSB is required to protect against the
698 * TLB invalidate being reordered before the PTE write.
699 *
700 * See the function header for write_pte_fast() for more details on the
701 * parameters.
702 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DSB: order the PTE store(s) ahead of a following TLB invalidate. */
	FLUSH_PTE_STRONG();
}
709
710 /**
711 * Retrieve the pmap structure for the thread running on the current CPU.
712 */
713 pmap_t
current_pmap()714 current_pmap()
715 {
716 const pmap_t current = vm_map_pmap(current_thread()->map);
717
718 assert(current != NULL);
719
720 #if XNU_MONITOR
721 /**
722 * On PPL-enabled systems, it's important that PPL policy decisions aren't
723 * decided by kernel-writable memory. This function is used in various parts
724 * of the PPL, and besides validating that the pointer returned by this
725 * function is indeed a pmap structure, it's also important to ensure that
726 * it's actually the current thread's pmap. This is because different pmaps
727 * will have access to different entitlements based on the code signature of
728 * their loaded process. So if a different user pmap is set in the current
729 * thread structure (in an effort to bypass code signing restrictions), even
730 * though the structure would validate correctly as it is a real pmap
731 * structure, it should fail here.
732 *
733 * This only needs to occur for user pmaps because the kernel pmap's root
734 * page table is always the same as TTBR1 (it's set during bootstrap and not
735 * changed so it'd be redundant to check), and its code signing fields are
736 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
737 * it shouldn't be possible to set those fields. Due to that, an attacker
738 * setting the current thread's pmap to the kernel pmap as a way to bypass
739 * this check won't accomplish anything as it doesn't provide any extra code
740 * signing entitlements.
741 */
742 if ((current != kernel_pmap) &&
743 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
744 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
745 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
746 }
747 #endif /* XNU_MONITOR */
748
749 return current;
750 }
751
752 #if DEVELOPMENT || DEBUG
753
754 /*
755 * Trace levels are controlled by a bitmask in which each
756 * level can be enabled/disabled by the (1<<level) position
757 * in the boot arg
758 * Level 0: PPL extension functionality
759 * Level 1: pmap lifecycle (create/destroy/switch)
760 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
761 * Level 3: internal state management (attributes/fast-fault)
762 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
763 */
764
765 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
766
767 #define PMAP_TRACE(level, ...) \
768 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
769 KDBG_RELEASE(__VA_ARGS__); \
770 }
771 #else /* DEVELOPMENT || DEBUG */
772
773 #define PMAP_TRACE(level, ...)
774
775 #endif /* DEVELOPMENT || DEBUG */
776
777
778 /*
779 * Internal function prototypes (forward declarations).
780 */
781
782 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
783
784 static void pmap_set_reference(ppnum_t pn);
785
786 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
787
788 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
789
790 static kern_return_t pmap_expand(
791 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
792
793 static int pmap_remove_range(
794 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
795
796 static tt_entry_t *pmap_tt1_allocate(
797 pmap_t, vm_size_t, unsigned int);
798
799 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
800
801 static void pmap_tt1_deallocate(
802 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
803
804 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
805
806 static kern_return_t pmap_tt_allocate(
807 pmap_t, tt_entry_t **, unsigned int, unsigned int);
808
809 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
810
811 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
812 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
813 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
814
815 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
816
817
818 static void pmap_unmap_commpage(
819 pmap_t pmap);
820
821 static boolean_t
822 pmap_is_64bit(pmap_t);
823
824
825 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
826
827 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
828
829 static bool pmap_update_cache_attributes_locked(
830 ppnum_t, unsigned, bool);
831
832 static boolean_t arm_clear_fast_fault(
833 ppnum_t ppnum,
834 vm_prot_t fault_type,
835 pt_entry_t *pte_p);
836
837 static void pmap_trim_self(pmap_t pmap);
838 static void pmap_trim_subord(pmap_t subord);
839
840
841 /*
842 * Temporary prototypes, while we wait for pmap_enter to move to taking an
843 * address instead of a page number.
844 */
845 static kern_return_t
846 pmap_enter_addr(
847 pmap_t pmap,
848 vm_map_address_t v,
849 pmap_paddr_t pa,
850 vm_prot_t prot,
851 vm_prot_t fault_type,
852 unsigned int flags,
853 boolean_t wired);
854
855 kern_return_t
856 pmap_enter_options_addr(
857 pmap_t pmap,
858 vm_map_address_t v,
859 pmap_paddr_t pa,
860 vm_prot_t prot,
861 vm_prot_t fault_type,
862 unsigned int flags,
863 boolean_t wired,
864 unsigned int options,
865 __unused void *arg,
866 __unused pmap_mapping_type_t mapping_type);
867
868 #ifdef CONFIG_XNUPOST
869 kern_return_t pmap_test(void);
870 #endif /* CONFIG_XNUPOST */
871
872 PMAP_SUPPORT_PROTOTYPES(
873 kern_return_t,
874 arm_fast_fault, (pmap_t pmap,
875 vm_map_address_t va,
876 vm_prot_t fault_type,
877 bool was_af_fault,
878 bool from_user), ARM_FAST_FAULT_INDEX);
879
880 PMAP_SUPPORT_PROTOTYPES(
881 boolean_t,
882 arm_force_fast_fault, (ppnum_t ppnum,
883 vm_prot_t allow_mode,
884 int options), ARM_FORCE_FAST_FAULT_INDEX);
885
886 MARK_AS_PMAP_TEXT static boolean_t
887 arm_force_fast_fault_with_flush_range(
888 ppnum_t ppnum,
889 vm_prot_t allow_mode,
890 int options,
891 pmap_tlb_flush_range_t *flush_range);
892
893 /**
894 * Definition of the states driving the batch cache attributes update
895 * state machine.
896 */
897 typedef struct {
898 uint64_t page_index : 32, /* The page index to be operated on */
899 state : 8, /* The current state of the update machine */
900 tlb_flush_pass_needed : 1, /* Tracking whether the tlb flush pass is necessary */
901 rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
902 :0;
903 } batch_set_cache_attr_state_t;
904
905 /* Possible values of the "state" field. */
906 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS 1
907 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS 2
908 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
909 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE 4
910
911 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
912
913 PMAP_SUPPORT_PROTOTYPES(
914 batch_set_cache_attr_state_t,
915 pmap_batch_set_cache_attributes, (
916 #if XNU_MONITOR
917 volatile upl_page_info_t *user_page_list,
918 #else /* !XNU_MONITOR */
919 upl_page_info_array_t user_page_list,
920 #endif /* XNU_MONITOR */
921 batch_set_cache_attr_state_t state,
922 unsigned int page_cnt,
923 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
924
925 PMAP_SUPPORT_PROTOTYPES(
926 kern_return_t,
927 pmap_change_wiring, (pmap_t pmap,
928 vm_map_address_t v,
929 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
930
931 PMAP_SUPPORT_PROTOTYPES(
932 pmap_t,
933 pmap_create_options, (ledger_t ledger,
934 vm_map_size_t size,
935 unsigned int flags,
936 kern_return_t * kr), PMAP_CREATE_INDEX);
937
938 PMAP_SUPPORT_PROTOTYPES(
939 void,
940 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
941
942 PMAP_SUPPORT_PROTOTYPES(
943 kern_return_t,
944 pmap_enter_options, (pmap_t pmap,
945 vm_map_address_t v,
946 pmap_paddr_t pa,
947 vm_prot_t prot,
948 vm_prot_t fault_type,
949 unsigned int flags,
950 boolean_t wired,
951 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
952
953 PMAP_SUPPORT_PROTOTYPES(
954 pmap_paddr_t,
955 pmap_find_pa, (pmap_t pmap,
956 addr64_t va), PMAP_FIND_PA_INDEX);
957
958 PMAP_SUPPORT_PROTOTYPES(
959 kern_return_t,
960 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
961
962
963 PMAP_SUPPORT_PROTOTYPES(
964 boolean_t,
965 pmap_is_empty, (pmap_t pmap,
966 vm_map_offset_t va_start,
967 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
968
969
970 PMAP_SUPPORT_PROTOTYPES(
971 unsigned int,
972 pmap_map_cpu_windows_copy, (ppnum_t pn,
973 vm_prot_t prot,
974 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
975
976 PMAP_SUPPORT_PROTOTYPES(
977 void,
978 pmap_ro_zone_memcpy, (zone_id_t zid,
979 vm_offset_t va,
980 vm_offset_t offset,
981 const vm_offset_t new_data,
982 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
983
984 PMAP_SUPPORT_PROTOTYPES(
985 uint64_t,
986 pmap_ro_zone_atomic_op, (zone_id_t zid,
987 vm_offset_t va,
988 vm_offset_t offset,
989 zro_atomic_op_t op,
990 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
991
992 PMAP_SUPPORT_PROTOTYPES(
993 void,
994 pmap_ro_zone_bzero, (zone_id_t zid,
995 vm_offset_t va,
996 vm_offset_t offset,
997 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
998
999 PMAP_SUPPORT_PROTOTYPES(
1000 vm_map_offset_t,
1001 pmap_nest, (pmap_t grand,
1002 pmap_t subord,
1003 addr64_t vstart,
1004 uint64_t size,
1005 vm_map_offset_t vrestart,
1006 kern_return_t * krp), PMAP_NEST_INDEX);
1007
1008 PMAP_SUPPORT_PROTOTYPES(
1009 void,
1010 pmap_page_protect_options, (ppnum_t ppnum,
1011 vm_prot_t prot,
1012 unsigned int options,
1013 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1014
1015 PMAP_SUPPORT_PROTOTYPES(
1016 vm_map_address_t,
1017 pmap_protect_options, (pmap_t pmap,
1018 vm_map_address_t start,
1019 vm_map_address_t end,
1020 vm_prot_t prot,
1021 unsigned int options,
1022 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1023
1024 PMAP_SUPPORT_PROTOTYPES(
1025 kern_return_t,
1026 pmap_query_page_info, (pmap_t pmap,
1027 vm_map_offset_t va,
1028 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1029
1030 PMAP_SUPPORT_PROTOTYPES(
1031 mach_vm_size_t,
1032 pmap_query_resident, (pmap_t pmap,
1033 vm_map_address_t start,
1034 vm_map_address_t end,
1035 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1036
1037 PMAP_SUPPORT_PROTOTYPES(
1038 void,
1039 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1040
1041 PMAP_SUPPORT_PROTOTYPES(
1042 vm_map_address_t,
1043 pmap_remove_options, (pmap_t pmap,
1044 vm_map_address_t start,
1045 vm_map_address_t end,
1046 int options), PMAP_REMOVE_OPTIONS_INDEX);
1047
1048
1049 PMAP_SUPPORT_PROTOTYPES(
1050 void,
1051 pmap_set_cache_attributes, (ppnum_t pn,
1052 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1053
1054 PMAP_SUPPORT_PROTOTYPES(
1055 void,
1056 pmap_update_compressor_page, (ppnum_t pn,
1057 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1058
1059 PMAP_SUPPORT_PROTOTYPES(
1060 void,
1061 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1062
1063 #if MACH_ASSERT || XNU_MONITOR
1064 PMAP_SUPPORT_PROTOTYPES(
1065 void,
1066 pmap_set_process, (pmap_t pmap,
1067 int pid,
1068 char *procname), PMAP_SET_PROCESS_INDEX);
1069 #endif
1070
1071 PMAP_SUPPORT_PROTOTYPES(
1072 void,
1073 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1074
1075 PMAP_SUPPORT_PROTOTYPES(
1076 vm_map_offset_t,
1077 pmap_unnest_options, (pmap_t grand,
1078 addr64_t vaddr,
1079 uint64_t size,
1080 vm_map_offset_t vrestart,
1081 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1082
1083 PMAP_SUPPORT_PROTOTYPES(
1084 void,
1085 phys_attribute_set, (ppnum_t pn,
1086 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1087
1088 PMAP_SUPPORT_PROTOTYPES(
1089 void,
1090 phys_attribute_clear, (ppnum_t pn,
1091 unsigned int bits,
1092 int options,
1093 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1094
1095 #if __ARM_RANGE_TLBI__
1096 PMAP_SUPPORT_PROTOTYPES(
1097 vm_map_address_t,
1098 phys_attribute_clear_range, (pmap_t pmap,
1099 vm_map_address_t start,
1100 vm_map_address_t end,
1101 unsigned int bits,
1102 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1103 #endif /* __ARM_RANGE_TLBI__ */
1104
1105
1106 PMAP_SUPPORT_PROTOTYPES(
1107 void,
1108 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1109
1110 PMAP_SUPPORT_PROTOTYPES(
1111 void,
1112 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1113
1114 PMAP_SUPPORT_PROTOTYPES(
1115 void,
1116 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1117
1118 PMAP_SUPPORT_PROTOTYPES(
1119 void,
1120 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1121
1122 PMAP_SUPPORT_PROTOTYPES(
1123 void,
1124 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1125
1126 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1127 PMAP_SUPPORT_PROTOTYPES(
1128 void,
1129 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1130 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1131
/*
 * Definition of the states used by pmap_trim().
 *
 * pmap_trim() both accepts and returns one of these values (see its
 * PMAP_SUPPORT_PROTOTYPES declaration below), so the trim operation can be
 * resumed and driven forward one state at a time.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1152
1153 PMAP_SUPPORT_PROTOTYPES(
1154 pmap_trim_state_t,
1155 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1156
1157 #if HAS_APPLE_PAC
1158 PMAP_SUPPORT_PROTOTYPES(
1159 void *,
1160 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1161 PMAP_SUPPORT_PROTOTYPES(
1162 void *,
1163 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1164 #endif /* HAS_APPLE_PAC */
1165
1166
1167
1168
1169 PMAP_SUPPORT_PROTOTYPES(
1170 kern_return_t,
1171 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1172 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1173
1174 PMAP_SUPPORT_PROTOTYPES(
1175 kern_return_t,
1176 pmap_load_trust_cache_with_type, (TCType_t type,
1177 const vm_address_t pmap_img4_payload,
1178 const vm_size_t pmap_img4_payload_len,
1179 const vm_address_t img4_manifest,
1180 const vm_size_t img4_manifest_len,
1181 const vm_address_t img4_aux_manifest,
1182 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1183
1184 PMAP_SUPPORT_PROTOTYPES(
1185 void,
1186 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1187
1188 PMAP_SUPPORT_PROTOTYPES(
1189 kern_return_t,
1190 pmap_query_trust_cache, (TCQueryType_t query_type,
1191 const uint8_t cdhash[kTCEntryHashSize],
1192 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1193
1194 PMAP_SUPPORT_PROTOTYPES(
1195 errno_t,
1196 pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1197 const void *input_data,
1198 size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1199
1200 #if PMAP_CS_INCLUDE_CODE_SIGNING
1201
1202 PMAP_SUPPORT_PROTOTYPES(
1203 kern_return_t,
1204 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1205 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1206
1207 PMAP_SUPPORT_PROTOTYPES(
1208 kern_return_t,
1209 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1210 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1211
1212 PMAP_SUPPORT_PROTOTYPES(
1213 kern_return_t,
1214 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1215 pmap_cs_profile_t * profile_obj),
1216 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1217
1218 PMAP_SUPPORT_PROTOTYPES(
1219 kern_return_t,
1220 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1221 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1222
1223 PMAP_SUPPORT_PROTOTYPES(
1224 kern_return_t,
1225 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1226 const void *kernel_entitlements),
1227 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1228
1229 PMAP_SUPPORT_PROTOTYPES(
1230 kern_return_t,
1231 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1232 const void **kernel_entitlements),
1233 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1234
1235 PMAP_SUPPORT_PROTOTYPES(
1236 kern_return_t,
1237 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1238 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1239
1240 PMAP_SUPPORT_PROTOTYPES(
1241 kern_return_t,
1242 pmap_cs_allow_invalid, (pmap_t pmap),
1243 PMAP_CS_ALLOW_INVALID_INDEX);
1244
1245 PMAP_SUPPORT_PROTOTYPES(
1246 void,
1247 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1248 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1249
1250 PMAP_SUPPORT_PROTOTYPES(
1251 bool,
1252 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1253 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1254
1255 PMAP_SUPPORT_PROTOTYPES(
1256 void,
1257 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1258 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1259
1260 PMAP_SUPPORT_PROTOTYPES(
1261 void,
1262 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1263 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1264
1265 #endif
1266
1267 PMAP_SUPPORT_PROTOTYPES(
1268 uint32_t,
1269 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1270
1271 PMAP_SUPPORT_PROTOTYPES(
1272 bool,
1273 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1274
1275 PMAP_SUPPORT_PROTOTYPES(
1276 void,
1277 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1278
1279 void pmap_footprint_suspend(vm_map_t map,
1280 boolean_t suspend);
1281 PMAP_SUPPORT_PROTOTYPES(
1282 void,
1283 pmap_footprint_suspend, (vm_map_t map,
1284 boolean_t suspend),
1285 PMAP_FOOTPRINT_SUSPEND_INDEX);
1286
1287
1288
1289
1290 #if DEVELOPMENT || DEBUG
1291 PMAP_SUPPORT_PROTOTYPES(
1292 kern_return_t,
1293 pmap_test_text_corruption, (pmap_paddr_t),
1294 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1295 #endif /* DEVELOPMENT || DEBUG */
1296
1297 /*
1298 * The low global vector page is mapped at a fixed alias.
1299 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1300 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1301 * to check both addresses anyway for backward compatibility. So for now
1302 * we leave H6 and H7 where they were.
1303 */
1304 #if (ARM_PGSHIFT == 14)
1305 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1306 #else
1307 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1308 #endif
1309
1310
1311 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1312 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1313 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1314
1315 #if XNU_MONITOR
1316
1317 #if __has_feature(ptrauth_calls)
1318 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1319 #else
1320 #define __ptrauth_ppl_handler
1321 #endif
1322
1323 /*
1324 * Table of function pointers used for PPL dispatch.
1325 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	/*
	 * Designated initializers keyed by the PMAP_*_INDEX dispatch constants;
	 * each entry is the in-PPL implementation invoked for that index. On
	 * ptrauth targets each pointer is signed via __ptrauth_ppl_handler
	 * (defined above). Entries guarded by #if blocks exist only in the
	 * corresponding configurations; unlisted indices are NULL.
	 */
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	/* Trust cache / code-signing related handlers. */
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
1407 #endif
1408
1409 #if XNU_MONITOR
1410 /**
1411 * A convenience function for setting protections on a single physical
1412 * aperture or static region mapping without invalidating the TLB.
1413 *
1414 * @note This function does not perform any TLB invalidations. That must be done
1415 * separately to be able to safely use the updated mapping.
1416 *
1417 * @note This function understands the difference between the VM page size and
1418 * the kernel page size and will update multiple PTEs if the sizes differ.
1419 * In other words, enough PTEs will always get updated to change the
1420 * permissions on a PAGE_SIZE amount of memory.
1421 *
1422 * @note The PVH lock for the physical page represented by this mapping must
1423 * already be locked.
1424 *
1425 * @note This function assumes the caller has already verified that the PTE
1426 * pointer does indeed point to a physical aperture or static region page
1427 * table. Please validate your inputs before passing it along to this
1428 * function.
1429 *
1430 * @param ptep Pointer to the physical aperture or static region page table to
1431 * update with a new XPRR index.
1432 * @param expected_perm The XPRR index that is expected to already exist at the
1433 * current mapping. If the current index doesn't match this
1434 * then the system will panic.
1435 * @param new_perm The new XPRR index to update the mapping with.
1436 */
1437 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1438 pmap_set_pte_xprr_perm(
1439 pt_entry_t * const ptep,
1440 unsigned int expected_perm,
1441 unsigned int new_perm)
1442 {
1443 assert(ptep != NULL);
1444
1445 pt_entry_t spte = *ptep;
1446 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1447
1448 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1449 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1450 __func__, ptep, new_perm, expected_perm);
1451 }
1452
1453 /**
1454 * The PTE involved should be valid, should not have the hint bit set, and
1455 * should have the expected XPRR index.
1456 */
1457 if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1458 panic_plain("%s: physical aperture or static region PTE is invalid, "
1459 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1460 __func__, ptep, spte, new_perm, expected_perm);
1461 }
1462
1463 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1464 panic_plain("%s: physical aperture or static region PTE has hint bit "
1465 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1466 __func__, ptep, spte, new_perm, expected_perm);
1467 }
1468
1469 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1470 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1471 "ptep=%p, new_perm=%u, expected_perm=%u",
1472 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1473 }
1474
1475 pt_entry_t template = spte;
1476 template &= ~ARM_PTE_XPRR_MASK;
1477 template |= xprr_perm_to_pte(new_perm);
1478
1479 write_pte_strong(ptep, template);
1480 }
1481
1482 /**
1483 * Update the protections on a single physical aperture mapping and invalidate
1484 * the TLB so the mapping can be used.
1485 *
1486 * @note The PVH lock for the physical page must already be locked.
1487 *
1488 * @param pai The physical address index of the page whose physical aperture
1489 * mapping will be updated with new permissions.
1490 * @param expected_perm The XPRR index that is expected to already exist at the
1491 * current mapping. If the current index doesn't match this
1492 * then the system will panic.
1493 * @param new_perm The new XPRR index to update the mapping with.
1494 */
1495 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1496 pmap_set_xprr_perm(
1497 unsigned int pai,
1498 unsigned int expected_perm,
1499 unsigned int new_perm)
1500 {
1501 pvh_assert_locked(pai);
1502
1503 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1504 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1505
1506 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1507
1508 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1509 sync_tlb_flush();
1510 }
1511
1512 /**
1513 * Update the protections on a range of physical aperture or static region
1514 * mappings and invalidate the TLB so the mappings can be used.
1515 *
1516 * @note Static region mappings can only be updated before machine_lockdown().
1517 * Physical aperture mappings can be updated at any time.
1518 *
1519 * @param start The starting virtual address of the static region or physical
1520 * aperture range whose permissions will be updated.
1521 * @param end The final (inclusive) virtual address of the static region or
1522 * physical aperture range whose permissions will be updated.
1523 * @param expected_perm The XPRR index that is expected to already exist at the
1524 * current mappings. If the current indices don't match
1525 * this then the system will panic.
1526 * @param new_perm The new XPRR index to update the mappings with.
1527 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie wholly within the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * The result is rounded up to the next twig boundary, then clamped
		 * to "end" so we never walk past the requested range.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* These ranges are expected to be mapped with leaf tables, never blocks. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 *
		 * The PVH lock for each page is taken around the individual PTE
		 * update, as pmap_set_pte_xprr_perm() requires it to be held.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Invalidate the TLB for the whole range so the new permissions are usable. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1623
1624 #endif /* XNU_MONITOR */
1625
1626 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1627 PMAP_ZINFO_PALLOC(
1628 pmap_t pmap, int bytes)
1629 {
1630 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1631 }
1632
1633 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1634 PMAP_ZINFO_PFREE(
1635 pmap_t pmap,
1636 int bytes)
1637 {
1638 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1639 }
1640
1641 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1642 pmap_tt_ledger_credit(
1643 pmap_t pmap,
1644 vm_size_t size)
1645 {
1646 if (pmap != kernel_pmap) {
1647 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1648 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1649 }
1650 }
1651
1652 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1653 pmap_tt_ledger_debit(
1654 pmap_t pmap,
1655 vm_size_t size)
1656 {
1657 if (pmap != kernel_pmap) {
1658 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1659 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1660 }
1661 }
1662
1663 static inline void
pmap_update_plru(uint16_t asid_index __unused)1664 pmap_update_plru(uint16_t asid_index __unused)
1665 {
1666 #if !HAS_16BIT_ASID
1667 if (__probable(pmap_asid_plru)) {
1668 unsigned plru_index = asid_index >> 6;
1669 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1670 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1671 asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1672 }
1673 }
1674 #endif /* !HAS_16BIT_ASID */
1675 }
1676
/*
 * Allocate a virtual ASID for the given pmap and derive its hardware ASID
 * (stored in pmap->hw_asid) and software epoch (pmap->sw_asid).
 *
 * Returns false if no virtual ASID is available; true on success.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/*
		 * pLRU pass: find the 64-bit word of the pLRU bitmap with the lowest
		 * (oldest) generation, then scan the virtual-ASID bitmap words that
		 * alias that hardware-ASID word for a free entry.
		 */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		/* Fallback: take the first free virtual ASID, ignoring pLRU state. */
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No virtual ASID available at all. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID and a software epoch. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1761
/**
 * Release the ASID previously assigned to a pmap back to the allocator.
 *
 * Reverses the encoding performed at allocation time (kernel-reserved ASID 0
 * offset, optional EL0/EL1 pairing, and the hw/sw ASID chunking scheme) to
 * recover the virtual ASID, then marks it free in the global bitmap.
 *
 * @param pmap The pmap whose ASID is being released.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically claim and clear the ASID so a concurrent caller can't double-free it. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		/* No ASID was ever assigned (or it was already freed). */
		return;
	}

#if __ARM_KERNEL_PROTECT__
	/* Undo the EL0/EL1 ASID-pair doubling applied at allocation time. */
	hw_asid >>= 1;
#endif
	/* Undo the +1 offset that reserves hardware ASID 0 for the kernel. */
	hw_asid -= 1;

#if HAS_16BIT_ASID
	/* 16-bit ASID targets allocate 1:1 from the bitmap; hw ASID is the virtual ASID. */
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/*
		 * Reserved VASID: allocation placed it at the high end of the hardware
		 * range (see the allocation-side MAX_HW_ASIDS handling); invert that
		 * mapping to recover the original virtual ASID.
		 */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID as available again for the pseudo-LRU allocator. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	/* A set bit in asid_bitmap means "free"; the ASID must currently be in use. */
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1794
1795
1796 boolean_t
pmap_valid_address(pmap_paddr_t addr)1797 pmap_valid_address(
1798 pmap_paddr_t addr)
1799 {
1800 return pa_valid(addr);
1801 }
1802
1803
1804
1805
1806
1807
1808 /*
1809 * Map memory at initialization. The physical addresses being
1810 * mapped are not managed and are never unmapped.
1811 *
1812 * For now, VM is already on, we only need to map the
1813 * specified memory.
1814 */
1815 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1816 pmap_map(
1817 vm_map_address_t virt,
1818 vm_offset_t start,
1819 vm_offset_t end,
1820 vm_prot_t prot,
1821 unsigned int flags)
1822 {
1823 kern_return_t kr;
1824 vm_size_t ps;
1825
1826 ps = PAGE_SIZE;
1827 while (start < end) {
1828 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1829 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1830
1831 if (kr != KERN_SUCCESS) {
1832 panic("%s: failed pmap_enter, "
1833 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1834 __FUNCTION__,
1835 (void *) virt, (void *) start, (void *) end, prot, flags);
1836 }
1837
1838 virt += ps;
1839 start += ps;
1840 }
1841 return virt;
1842 }
1843
1844 #if XNU_MONITOR
1845 /**
1846 * Remove kernel writeablity from an IO PTE value if the page is owned by
1847 * guarded mode software.
1848 *
1849 * @param paddr The physical address of the page which has to be non-DRAM.
1850 * @param tmplate The PTE value to be evaluated.
1851 *
1852 * @return A new PTE value with permission bits modified.
1853 */
1854 static inline
1855 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1856 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1857 {
1858 assert(!pa_valid(paddr));
1859
1860 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1861
1862 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1863 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1864 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1865 switch (xprr_perm) {
1866 case XPRR_KERN_RO_PERM:
1867 break;
1868 case XPRR_KERN_RW_PERM:
1869 tmplate &= ~ARM_PTE_XPRR_MASK;
1870 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1871 break;
1872 default:
1873 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1874 }
1875 }
1876
1877 return tmplate;
1878 }
1879 #endif /* XNU_MONITOR */
1880
/**
 * Map a physically-contiguous range into the kernel pmap with device/IO
 * memory attributes selected by the PMAP_MAP_BD_* option flags.
 *
 * @param virt Kernel virtual address at which to begin the mapping.
 * @param start Physical address of the first byte to map (page aligned).
 * @param end Physical address at which to stop mapping (exclusive).
 * @param prot VM_PROT_WRITE selects kernel-RW; anything else maps kernel-RO.
 * @param options PMAP_MAP_BD_* flags selecting the memory attribute index.
 *
 * @return The kernel virtual address just past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t mem_attr;

	/* Translate the option flags into ARM memory-attribute index (and shareability) bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	/* Non-global: the EL0/EL1 split requires per-ASID kernel mappings. */
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		/* Non-DRAM pages may be PPL-owned; demote kernel-RW to PPL-RW if so. */
		if (!pa_valid(paddr)) {
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template to the next page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the range we just (re)wrote. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1959
1960 /*
1961 * Back-door routine for mapping kernel VM at initialization.
1962 * Useful for mapping memory outside the range
1963 * [vm_first_phys, vm_last_phys] (i.e., devices).
1964 * Otherwise like pmap_map.
1965 */
1966 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1967 pmap_map_bd(
1968 vm_map_address_t virt,
1969 vm_offset_t start,
1970 vm_offset_t end,
1971 vm_prot_t prot)
1972 {
1973 return pmap_map_bd_with_options(virt, start, end, prot, 0);
1974 }
1975
1976 /*
1977 * Back-door routine for mapping kernel VM at initialization.
1978 * Useful for mapping memory specific physical addresses in early
1979 * boot (i.e., before kernel_map is initialized).
1980 *
1981 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1982 */
1983
1984 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1985 pmap_map_high_window_bd(
1986 vm_offset_t pa_start,
1987 vm_size_t len,
1988 vm_prot_t prot)
1989 {
1990 pt_entry_t *ptep, pte;
1991 vm_map_address_t va_start = VREGION1_START;
1992 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1993 vm_map_address_t va_end;
1994 vm_map_address_t va;
1995 vm_size_t offset;
1996
1997 offset = pa_start & PAGE_MASK;
1998 pa_start -= offset;
1999 len += offset;
2000
2001 if (len > (va_max - va_start)) {
2002 panic("%s: area too large, "
2003 "pa_start=%p, len=%p, prot=0x%x",
2004 __FUNCTION__,
2005 (void*)pa_start, (void*)len, prot);
2006 }
2007
2008 scan:
2009 for (; va_start < va_max; va_start += PAGE_SIZE) {
2010 ptep = pmap_pte(kernel_pmap, va_start);
2011 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2012 if (*ptep == ARM_PTE_TYPE_FAULT) {
2013 break;
2014 }
2015 }
2016 if (va_start > va_max) {
2017 panic("%s: insufficient pages, "
2018 "pa_start=%p, len=%p, prot=0x%x",
2019 __FUNCTION__,
2020 (void*)pa_start, (void*)len, prot);
2021 }
2022
2023 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2024 ptep = pmap_pte(kernel_pmap, va_end);
2025 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2026 if (*ptep != ARM_PTE_TYPE_FAULT) {
2027 va_start = va_end + PAGE_SIZE;
2028 goto scan;
2029 }
2030 }
2031
2032 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2033 ptep = pmap_pte(kernel_pmap, va);
2034 pte = pa_to_pte(pa_start)
2035 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2036 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2037 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2038 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2039 #if __ARM_KERNEL_PROTECT__
2040 pte |= ARM_PTE_NG;
2041 #endif /* __ARM_KERNEL_PROTECT__ */
2042 write_pte_strong(ptep, pte);
2043 }
2044 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2045 #if KASAN
2046 kasan_notify_address(va_start, len);
2047 #endif
2048 return va_start;
2049 }
2050
2051 static uint32_t
pmap_compute_max_asids(void)2052 pmap_compute_max_asids(void)
2053 {
2054 DTEntry entry;
2055 void const *prop = NULL;
2056 uint32_t max_asids;
2057 int err;
2058 unsigned int prop_size;
2059
2060 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2061 assert(err == kSuccess);
2062
2063 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2064 /* TODO: consider allowing maxproc limits to be scaled earlier so that
2065 * we can choose a more flexible default value here. */
2066 return MAX_ASIDS;
2067 }
2068
2069 if (prop_size != sizeof(max_asids)) {
2070 panic("pmap-max-asids property is not a 32-bit integer");
2071 }
2072
2073 max_asids = *((uint32_t const *)prop);
2074 #if HAS_16BIT_ASID
2075 if (max_asids > MAX_HW_ASIDS) {
2076 panic("pmap-max-asids 0x%x too large", max_asids);
2077 }
2078 #else
2079 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2080 max_asids = (max_asids + 63) & ~63UL;
2081
2082 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2083 /* currently capped by size of pmap->sw_asid */
2084 panic("pmap-max-asids 0x%x too large", max_asids);
2085 }
2086 #endif /* HAS_16BIT_ASID */
2087 if (max_asids == 0) {
2088 panic("pmap-max-asids cannot be zero");
2089 }
2090 return max_asids;
2091 }
2092
2093 #if __arm64__
2094 /*
2095 * pmap_get_arm64_prot
2096 *
2097 * return effective armv8 VMSA block protections including
2098 * table AP/PXN/XN overrides of a pmap entry
2099 *
2100 */
2101
2102 uint64_t
pmap_get_arm64_prot(pmap_t pmap,vm_offset_t addr)2103 pmap_get_arm64_prot(
2104 pmap_t pmap,
2105 vm_offset_t addr)
2106 {
2107 tt_entry_t tte = 0;
2108 unsigned int level = 0;
2109 uint64_t tte_type = 0;
2110 uint64_t effective_prot_bits = 0;
2111 uint64_t aggregate_tte = 0;
2112 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
2113 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2114
2115 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
2116 tte = *pmap_ttne(pmap, level, addr);
2117
2118 if (!(tte & ARM_TTE_VALID)) {
2119 return 0;
2120 }
2121
2122 tte_type = tte & ARM_TTE_TYPE_MASK;
2123
2124 if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
2125 (level == pt_attr->pta_max_level)) {
2126 /* Block or page mapping; both have the same protection bit layout. */
2127 break;
2128 } else if (tte_type == ARM_TTE_TYPE_TABLE) {
2129 /* All of the table bits we care about are overrides, so just OR them together. */
2130 aggregate_tte |= tte;
2131 }
2132 }
2133
2134 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
2135 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
2136 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
2137
2138 /* Start with the PTE bits. */
2139 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
2140
2141 /* Table AP bits mask out block/page AP bits */
2142 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
2143
2144 /* XN/PXN bits can be OR'd in. */
2145 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
2146 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
2147
2148 return effective_prot_bits;
2149 }
2150 #endif /* __arm64__ */
2151
2152 /*
2153 * Bootstrap the system enough to run with virtual memory.
2154 *
2155 * The early VM initialization code has already allocated
2156 * the first CPU's translation table and made entries for
2157 * all the one-to-one mappings to be found there.
2158 *
2159 * We must set up the kernel pmap structures, the
2160 * physical-to-virtual translation lookup tables for the
2161 * physical memory to be managed (between avail_start and
2162 * avail_end).
2163 *
2164 * Map the kernel's code and data, and allocate the system page table.
2165 * Page_size must already be set.
2166 *
2167 * Parameters:
2168 * first_avail first available physical page -
2169 * after kernel page tables
2170 * avail_start PA of first managed physical page
2171 * avail_end PA of last managed physical page
2172 */
2173
2174 void
pmap_bootstrap(vm_offset_t vstart)2175 pmap_bootstrap(
2176 vm_offset_t vstart)
2177 {
2178 vm_map_offset_t maxoffset;
2179
2180 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2181
2182 #if XNU_MONITOR
2183
2184 #if DEVELOPMENT || DEBUG
2185 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2186 #endif
2187
2188 #if CONFIG_CSR_FROM_DT
2189 if (csr_unsafe_kernel_text) {
2190 pmap_ppl_disable = true;
2191 }
2192 #endif /* CONFIG_CSR_FROM_DT */
2193
2194 #endif /* XNU_MONITOR */
2195
2196 #if DEVELOPMENT || DEBUG
2197 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2198 kprintf("Kernel traces for pmap operations enabled\n");
2199 }
2200 #endif
2201
2202 /*
2203 * Initialize the kernel pmap.
2204 */
2205 #if ARM_PARAMETERIZED_PMAP
2206 kernel_pmap->pmap_pt_attr = native_pt_attr;
2207 #endif /* ARM_PARAMETERIZED_PMAP */
2208 #if HAS_APPLE_PAC
2209 kernel_pmap->disable_jop = 0;
2210 #endif /* HAS_APPLE_PAC */
2211 kernel_pmap->tte = cpu_tte;
2212 kernel_pmap->ttep = cpu_ttep;
2213 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2214 kernel_pmap->max = UINTPTR_MAX;
2215 os_atomic_init(&kernel_pmap->ref_count, 1);
2216 #if XNU_MONITOR
2217 os_atomic_init(&kernel_pmap->nested_count, 0);
2218 #endif
2219 kernel_pmap->nx_enabled = TRUE;
2220 #ifdef __arm64__
2221 kernel_pmap->is_64bit = TRUE;
2222 #else
2223 kernel_pmap->is_64bit = FALSE;
2224 #endif
2225 #if CONFIG_ROSETTA
2226 kernel_pmap->is_rosetta = FALSE;
2227 #endif
2228
2229 #if ARM_PARAMETERIZED_PMAP
2230 kernel_pmap->pmap_pt_attr = native_pt_attr;
2231 #endif /* ARM_PARAMETERIZED_PMAP */
2232
2233 kernel_pmap->nested_region_addr = 0x0ULL;
2234 kernel_pmap->nested_region_size = 0x0ULL;
2235 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2236 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2237 kernel_pmap->type = PMAP_TYPE_KERNEL;
2238
2239 kernel_pmap->hw_asid = 0;
2240 kernel_pmap->sw_asid = 0;
2241
2242 pmap_lock_init(kernel_pmap);
2243
2244 pmap_max_asids = pmap_compute_max_asids();
2245 #if HAS_16BIT_ASID
2246 asid_chunk_size = MAX_HW_ASIDS;
2247 #else
2248 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2249 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2250 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2251 * masking used by the PLRU scheme. This means we must handle the case in which
2252 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2253 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2254 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2255 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2256 #endif /* HAS_16BIT_ASIDS */
2257
2258 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2259
2260 /**
2261 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2262 * pp_attr_table, etc). This function will use `avail_start` to allocate
2263 * space for these data structures.
2264 */
2265 pmap_data_bootstrap();
2266
2267 /**
2268 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2269 */
2270 uat_bootstrap();
2271
2272
2273 /**
2274 * Bootstrap any necessary SART data structures and values needed from the device tree.
2275 */
2276 sart_bootstrap();
2277
2278 /**
2279 * Don't make any assumptions about the alignment of avail_start before this
2280 * point (i.e., pmap_data_bootstrap() performs allocations).
2281 */
2282 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2283
2284 const pmap_paddr_t pmap_struct_start = avail_start;
2285
2286 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2287 avail_start = round_page(avail_start + asid_table_size);
2288
2289 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2290
2291 vm_first_phys = gPhysBase;
2292 vm_last_phys = trunc_page(avail_end);
2293
2294 queue_init(&map_pmap_list);
2295 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2296 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2297 free_page_size_tt_count = 0;
2298 free_page_size_tt_max = 0;
2299 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2300 free_two_page_size_tt_count = 0;
2301 free_two_page_size_tt_max = 0;
2302 free_tt_list = TT_FREE_ENTRY_NULL;
2303 free_tt_count = 0;
2304 free_tt_max = 0;
2305
2306 virtual_space_start = vstart;
2307 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2308
2309 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2310 #if !HAS_16BIT_ASID
2311 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2312 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2313 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2314 #endif /* !HAS_16BIT_ASID */
2315
2316
2317
2318 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2319 maxoffset = trunc_page(maxoffset);
2320 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2321 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2322 arm_pmap_max_offset_default = maxoffset;
2323 }
2324 }
2325 #if defined(__arm64__)
2326 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2327 maxoffset = trunc_page(maxoffset);
2328 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2329 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2330 arm64_pmap_max_offset_default = maxoffset;
2331 }
2332 }
2333 #endif
2334
2335 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2336
2337
2338 #if PMAP_CS_PPL_MONITOR
2339 /* Initialize the PPL trust cache read-write lock */
2340 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2341 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2342 #endif
2343
2344 #if MACH_ASSERT
2345 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2346 &vm_footprint_suspend_allowed,
2347 sizeof(vm_footprint_suspend_allowed));
2348 #endif /* MACH_ASSERT */
2349
2350 #if KASAN
2351 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2352 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2353 #endif /* KASAN */
2354
2355 /**
2356 * Ensure that avail_start is always left on a page boundary. The calling
2357 * code might not perform any alignment before allocating page tables so
2358 * this is important.
2359 */
2360 avail_start = round_page(avail_start);
2361 }
2362
2363 #if XNU_MONITOR
2364
2365 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2366 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2367 {
2368 pmap_paddr_t cur_pa;
2369 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2370 assert(pa_valid(cur_pa));
2371 ppattr_pa_set_monitor(cur_pa);
2372 }
2373 }
2374
2375 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2376 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2377 pmap_paddr_t end_pa,
2378 unsigned int expected_perm,
2379 unsigned int new_perm)
2380 {
2381 vm_offset_t start_va = phystokv(start_pa);
2382 vm_offset_t end_va = start_va + (end_pa - start_pa);
2383
2384 pa_set_range_monitor(start_pa, end_pa);
2385 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2386 }
2387
/**
 * Mark every physical page backing the kernelcache as locked down, preventing
 * its mappings from being modified later (enforced via the PVH_FLAG_LOCKDOWN_KC
 * per-page flag).
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	/* Walk the kernelcache PA range, tracking the expected linear VA alongside. */
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable, so exempt them from lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2433
/**
 * Transfer ownership of boot-time static allocations to the PPL by retagging
 * their physical-aperture mappings with the appropriate xPRR permissions:
 * bootstrap page tables become kernel-RO, bootstrap data and PPL data become
 * PPL-RW, PPL text becomes PPL-RX, and the PPL stacks' aperture mappings
 * become kernel-RO. Finally locks down the kernelcache pages.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* No permission change needed; this marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2521
/**
 * Finalize PPL bring-up: lock down the commpage mappings so they can no
 * longer be remapped by the kernel.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): this #error is unconditional as written here; presumably it
	 * sits under an xPRR-configuration #if guard in the full build — confirm
	 * against the surrounding configuration macros before changing. */
#error "XPRR configuration error"
}
2538 #endif /* XNU_MONITOR */
2539
2540 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2541 pmap_virtual_space(
2542 vm_offset_t *startp,
2543 vm_offset_t *endp
2544 )
2545 {
2546 *startp = virtual_space_start;
2547 *endp = virtual_space_end;
2548 }
2549
2550
/**
 * Enumerate the kernel virtual address regions the VM layer may use,
 * one region per call. Which regions exist depends on the KTRR/CTRR and
 * ARM_LARGE_MEMORY configuration.
 *
 * @param region_select Index of the region being queried (0, 1, ...).
 * @param startp Filled with the region's start address on success.
 * @param size Filled with the region's size on success.
 *
 * @return TRUE if region_select names a region in this configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region when the kernel's own base mapping doesn't already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2644
2645 /*
2646 * Routines to track and allocate physical pages during early boot.
2647 * On most systems that memory runs from first_avail through to avail_end
2648 * with no gaps.
2649 *
2650 * If the system supports ECC and ecc_bad_pages_count > 0, we
2651 * need to skip those pages.
2652 */
2653
/* Count of physical pages still available for early-boot allocation. */
static unsigned int avail_page_count = 0;
/* Lazy-init flag: ranges are computed on first query (boot is single-threaded). */
static bool need_ram_ranges_init = true;
2656
2657
2658 /**
2659 * Checks to see if a given page is in
2660 * the array of known bad pages
2661 *
2662 * @param ppn page number to check
2663 */
2664 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2665 pmap_is_bad_ram(__unused ppnum_t ppn)
2666 {
2667 return false;
2668 }
2669
2670 /**
2671 * Prepare bad ram pages to be skipped.
2672 */
2673
2674 /*
2675 * Initialize the count of available pages. No lock needed here,
2676 * as this code is called while kernel boot up is single threaded.
2677 */
2678 static void
initialize_ram_ranges(void)2679 initialize_ram_ranges(void)
2680 {
2681 pmap_paddr_t first = first_avail;
2682 pmap_paddr_t end = avail_end;
2683
2684 assert(first <= end);
2685 assert(first == (first & ~PAGE_MASK));
2686 assert(end == (end & ~PAGE_MASK));
2687 avail_page_count = atop(end - first);
2688
2689 need_ram_ranges_init = false;
2690 }
2691
2692 unsigned int
pmap_free_pages(void)2693 pmap_free_pages(
2694 void)
2695 {
2696 if (need_ram_ranges_init) {
2697 initialize_ram_ranges();
2698 }
2699 return avail_page_count;
2700 }
2701
2702 unsigned int
pmap_free_pages_span(void)2703 pmap_free_pages_span(
2704 void)
2705 {
2706 if (need_ram_ranges_init) {
2707 initialize_ram_ranges();
2708 }
2709 return (unsigned int)atop(avail_end - first_avail);
2710 }
2711
2712
/*
 * Allocate the next early-boot physical page, preferring "high" memory.
 * This implementation makes no high/low distinction and simply defers
 * to pmap_next_page().
 *
 * @param pnum out parameter: receives the allocated page number.
 * @param might_free unused here.
 *
 * @return TRUE if a page was returned, FALSE when the pool is exhausted.
 */
boolean_t
pmap_next_page_hi(
	ppnum_t * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2720
2721
2722 boolean_t
pmap_next_page(ppnum_t * pnum)2723 pmap_next_page(
2724 ppnum_t *pnum)
2725 {
2726 if (need_ram_ranges_init) {
2727 initialize_ram_ranges();
2728 }
2729
2730
2731 if (first_avail != avail_end) {
2732 *pnum = (ppnum_t)atop(first_avail);
2733 first_avail += PAGE_SIZE;
2734 assert(avail_page_count > 0);
2735 --avail_page_count;
2736 return TRUE;
2737 }
2738 assert(avail_page_count == 0);
2739 return FALSE;
2740 }
2741
2742
2743 /*
2744 * Initialize the pmap module.
2745 * Called by vm_init, to initialize any structures that the pmap
2746 * system needs to map virtual memory.
2747 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent translation
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2788
2789 /**
2790 * Verify that a given physical page contains no mappings (outside of the
2791 * default physical aperture mapping).
2792 *
2793 * @param ppnum Physical page number to check there are no mappings to.
2794 *
2795 * @return True if there are no mappings, false otherwise or if the page is not
2796 * kernel-managed.
2797 */
2798 bool
pmap_verify_free(ppnum_t ppnum)2799 pmap_verify_free(ppnum_t ppnum)
2800 {
2801 const pmap_paddr_t pa = ptoa(ppnum);
2802
2803 assert(pa != vm_page_fictitious_addr);
2804
2805 /* Only mappings to kernel-managed physical memory are tracked. */
2806 if (!pa_valid(pa)) {
2807 return false;
2808 }
2809
2810 const unsigned int pai = pa_index(pa);
2811 pv_entry_t **pvh = pai_to_pvh(pai);
2812
2813 return pvh_test_type(pvh, PVH_TYPE_NULL);
2814 }
2815
2816 #if MACH_ASSERT
2817 /**
2818 * Verify that a given physical page contains no mappings (outside of the
2819 * default physical aperture mapping) and if it does, then panic.
2820 *
2821 * @note It's recommended to use pmap_verify_free() directly when operating in
2822 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2823 * normally being called from outside of the PPL, and the pv_head_table
2824 * can't be modified outside of the PPL).
2825 *
2826 * @param ppnum Physical page number to check there are no mappings to.
2827 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Non-kernel-managed pages are untracked; a verified-free page needs no panic. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* The page still has a mapping: find it so the panic string can identify it. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single mapping: the PV head itself points at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2900 #endif
2901
2902
2903 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2904 pmap_root_alloc_size(pmap_t pmap)
2905 {
2906 #pragma unused(pmap)
2907 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2908 unsigned int root_level = pt_attr_root_level(pt_attr);
2909 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2910 }
2911
2912
2913 /*
2914 * Create and return a physical map.
2915 *
2916 * If the size specified for the map
2917 * is zero, the map is an actual physical
2918 * map, and may be referenced by the
2919 * hardware.
2920 *
2921 * If the size specified is non-zero,
2922 * the map will be used in software only, and
2923 * is bounded by that size.
2924 */
/*
 * Internal implementation of pmap creation: allocates the pmap struct,
 * takes an ASID, allocates and zeroes the root translation table, and
 * links the new pmap onto the global pmap list with an initial ref_count
 * of 1.
 *
 * @param ledger the ledger that will account this pmap's memory usage.
 * @param size must be 0 here (non-zero sizes are reserved for stage-2 pmaps).
 * @param flags PMAP_CREATE_* options; must be a subset of PMAP_CREATE_KNOWN_FLAGS.
 * @param kr out parameter written with the failure reason on the goto-cleanup
 *        paths. NOTE(review): the two early returns below leave *kr untouched —
 *        confirm callers treat a PMAP_NULL result as authoritative regardless of *kr.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone. Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* The VA limit depends on the pt_attr chosen above, so set it afterwards. */
	p->max = pmap_user_va_size(p);

	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Make the zeroed root table visible before the pmap can be used. */
	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition, then report local_kr via *kr. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* *kr lives in kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3115
3116 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3117 pmap_create_options(
3118 ledger_t ledger,
3119 vm_map_size_t size,
3120 unsigned int flags)
3121 {
3122 pmap_t pmap;
3123 kern_return_t kr = KERN_SUCCESS;
3124
3125 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3126
3127 ledger_reference(ledger);
3128
3129 #if XNU_MONITOR
3130 for (;;) {
3131 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3132 if (kr != KERN_RESOURCE_SHORTAGE) {
3133 break;
3134 }
3135 assert(pmap == PMAP_NULL);
3136 pmap_alloc_page_for_ppl(0);
3137 kr = KERN_SUCCESS;
3138 }
3139 #else
3140 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3141 #endif
3142
3143 if (pmap == PMAP_NULL) {
3144 ledger_dereference(ledger);
3145 }
3146
3147 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3148
3149 return pmap;
3150 }
3151
3152 #if XNU_MONITOR
3153 /*
3154 * This symbol remains in place when the PPL is enabled so that the dispatch
3155 * table does not change from development to release configurations.
3156 */
3157 #endif
3158 #if MACH_ASSERT || XNU_MONITOR
3159 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3160 pmap_set_process_internal(
3161 __unused pmap_t pmap,
3162 __unused int pid,
3163 __unused char *procname)
3164 {
3165 #if MACH_ASSERT
3166 if (pmap == NULL || pmap->pmap_pid == -1) {
3167 return;
3168 }
3169
3170 validate_pmap_mutable(pmap);
3171
3172 pmap->pmap_pid = pid;
3173 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3174 #endif /* MACH_ASSERT */
3175 }
3176 #endif /* MACH_ASSERT || XNU_MONITOR */
3177
3178 #if MACH_ASSERT
/*
 * Tag a pmap with its owning process' pid and name (debug builds),
 * dispatching through the PPL when the pmap monitor is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3191 #endif /* MACH_ASSERT */
3192
3193 /*
3194 * pmap_deallocate_all_leaf_tts:
3195 *
3196 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3197 * removing and deallocating all TTEs.
3198 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Only non-leaf levels are walked here; leaf tables are freed via their parent TTE. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* VA ~0 selects the highest index, i.e. the last entry of a table at this level. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* A pmap being torn down is not expected to contain block mappings. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching pmap_unlock() here — pmap_tte_deallocate()
		 * presumably releases the exclusive lock itself; confirm its contract. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3237
3238 /*
3239 * We maintain stats and ledgers so that a task's physical footprint is:
3240 * phys_footprint = ((internal - alternate_accounting)
3241 * + (internal_compressed - alternate_accounting_compressed)
3242 * + iokit_mapped
3243 * + purgeable_nonvolatile
3244 * + purgeable_nonvolatile_compressed
3245 * + page_table)
3246 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3247 */
3248
3249 /*
3250 * Retire the given physical map from service.
3251 * Should only be called if the map contains
3252 * no valid mappings.
3253 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop our reference; only the holder of the final reference tears down. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Refuse to destroy a pmap that any CPU still has active or in flight. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unlink from the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	/* All sub-page table entries should have been reclaimed above. */
	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps share the nested region; flush it but keep the ASID machinery out of it. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3366
/*
 * Drop a reference on the given pmap; the pmap is destroyed when the
 * final reference goes away.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/*
	 * Stash the ledger before the destroy call: once the pmap is freed,
	 * pmap->ledger must not be read.
	 * NOTE(review): pmap is dereferenced unconditionally above and here, so
	 * callers must not pass PMAP_NULL (pmap_destroy_internal() alone would
	 * tolerate it).
	 */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3387
3388
3389 /*
3390 * Add a reference to the specified pmap.
3391 */
3392 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3393 pmap_reference_internal(
3394 pmap_t pmap)
3395 {
3396 if (pmap != PMAP_NULL) {
3397 validate_pmap_mutable(pmap);
3398 os_atomic_inc(&pmap->ref_count, relaxed);
3399 }
3400 }
3401
/*
 * Public entry point for taking a pmap reference; dispatches through
 * the PPL when the pmap monitor is enabled.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3412
/*
 * Allocate a root (TT1) translation table for a pmap.
 *
 * Whole-page and two-page requests are satisfied from dedicated free
 * lists when possible; sub-page requests carve a fresh page and push the
 * unused pieces onto the sub-page free list.
 *
 * @param pmap pmap the table is for (used for ledger/statistics accounting).
 * @param size requested table size in bytes.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block on pages.
 *
 * @return kernel virtual address of the table, or NULL on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Sub-page sizes other than the root-table size are rounded up to a full page. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Fast path: reuse a table from the free list matching this size class. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate (at least) a full zeroed page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * The caller gets the first 'size' bytes of the page; chain the
		 * remaining pieces into a local list, then splice it onto the
		 * global sub-page free list under the lock.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3489
/*
 * Return a root (TT1) translation table to the free list for its size
 * class, then trim the page-size and two-page-size free lists back to
 * their FREE_*_TT_MAX limits (skipped when PMAP_TT_DEALLOCATE_NOBLOCK
 * is set, since trimming may block while freeing pages).
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size normalization done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller can't block: skip trimming the free lists. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/* Trim surplus single-page tables; the lock is dropped around each page free. */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise trim surplus two-page tables. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3570
/*
 * Allocate a non-root translation table for the given pmap.
 *
 * First tries the pmap's tt_entry_free list; otherwise allocates a fresh
 * VM page plus a page table descriptor for it, and — when the kernel page
 * size exceeds the pmap's native page size — carves the page into
 * native-size tables, keeping the spares on tt_entry_free.
 *
 * @param pmap pmap the table is for.
 * @param ttp out parameter: receives the KVA of the new table.
 * @param level translation level the table will serve (statistics only here).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid blocking.
 *
 * @return KERN_SUCCESS, KERN_RESOURCE_SHORTAGE when NOWAIT allocations fail,
 *         or KERN_ABORTED when pmap_lock_preempt() could not take the lock.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		/*
		 * Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Track table vs. leaf page-table page statistics separately. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Mark the page's PV head as holding a page table descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Chain the unused native-size pieces of the page onto tt_entry_free. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3669
3670
/**
 * Return a page table to its pmap's translation-table free list, and release
 * the backing physical page once no sub-table within that page is in use.
 *
 * @note When PAGE_SIZE is larger than the pmap's native page-table size, one
 *       physical page holds several page tables; the page is only handed back
 *       to the VM after every sibling table in it is also free.
 *
 * @param pmap The pmap that owns the page table being deallocated.
 * @param ttp Kernel-virtual pointer to the page table being deallocated.
 * @param level The page-table level of the table being deallocated.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-native page tables that fit in one VM page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/*
	 * Non-leaf tables carry the PT_DESC_REFCOUNT sentinel rather than a real
	 * refcount; clear it so the zero check below passes.
	 */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every sibling table sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No sibling table holds references.  Walk the free list and count
		 * how many entries (besides ttp itself, hence starting at 1) already
		 * lie within this table's physical page.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every sub-table of this page is free: splice all of them out of
			 * the free list so the whole page can be returned to the VM below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry lives in the page being freed: unlink it. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Some siblings are not yet free; just push ttp onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Sibling tables still hold references; push ttp onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Whole physical page is unused: free its descriptor and the page itself. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3759
3760 /**
3761 * Safely clear out a translation table entry.
3762 *
3763 * @note If the TTE to clear out points to a leaf table, then that leaf table
3764 * must have a refcnt of zero before the TTE can be removed.
3765 * @note This function expects to be called with pmap locked exclusive, and will
3766 * return with pmap unlocked.
3767 *
3768 * @param pmap The pmap containing the page table whose TTE is being removed.
3769 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3770 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3771 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3772 * @param ttep Pointer to the TTE that should be cleared out.
3773 * @param level The level of the page table that contains the TTE to be removed.
3774 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the entry and force the update out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU not yet observed to be outside pmap_disconnect(). */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Wait (WFE) for the flag to change, then re-check this CPU. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* CPU i observed disconnect-free; stop polling it. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3949
3950 /**
3951 * Given a pointer to an entry within a `level` page table, delete the
3952 * page table at `level` + 1 that is represented by that entry. For instance,
3953 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3954 * contains the PA of the L3 table, and `level` would be "2".
3955 *
3956 * @note If the table getting deallocated is a leaf table, then that leaf table
3957 * must have a refcnt of zero before getting deallocated. All other levels
3958 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3959 * @note This function expects to be called with pmap locked exclusive and will
3960 * return with pmap unlocked.
3961 *
3962 * @param pmap The pmap that owns the page table to be deallocated.
3963 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3964 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3965 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3966 * @param ttep Pointer to the `level` TTE to remove.
3967 * @param level The level of the table that contains an entry pointing to the
3968 * table to be removed. The deallocated page table will be a
3969 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3970 * deleted).
3971 */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	tte = *ttep;

	/* Ownership sanity check: the table's descriptor must point back at this pmap. */
	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	/* Only table-type entries may be deallocated through this path. */
	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
	    __func__, ttep, (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* The TTE is now clear; free the `level` + 1 table it used to point to. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
4000
4001 /*
4002 * Remove a range of hardware page-table entries.
4003 * The entries given are the first (inclusive)
4004 * and last (exclusive) entries for the VM pages.
4005 * The virtual address is the va for the first pte.
4006 *
4007 * The pmap must be locked.
4008 * If the pmap is not the kernel pmap, the range must lie
4009 * entirely within one pte-page. This is NOT checked.
4010 * Assumes that the pte-page exists.
4011 *
 * Returns the number of PTEs changed.
4013 */
4014 MARK_AS_PMAP_TEXT static int
4015 pmap_remove_range(
4016 pmap_t pmap,
4017 vm_map_address_t va,
4018 pt_entry_t *bpte,
4019 pt_entry_t *epte)
4020 {
4021 bool need_strong_sync = false;
4022 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4023 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4024 if (num_changed > 0) {
4025 PMAP_UPDATE_TLBS(pmap, va,
4026 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4027 }
4028 return num_changed;
4029 }
4030
4031
4032 #ifdef PVH_FLAG_EXEC
4033
4034 /*
4035 * Update the access protection bits of the physical aperture mapping for a page.
 * This is useful, for example, in guaranteeing that a verified executable page
4037 * has no writable mappings anywhere in the system, including the physical
4038 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4039 * synchronization overhead in cases where the call to this function is
4040 * guaranteed to be followed by other TLB operations.
4041 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the kernel's physical-aperture mapping of this physical page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint PTE is never expected in the physical aperture here. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Issue the TLB invalidate; synchronize now unless the caller will do it later. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4066 #endif /* defined(PVH_FLAG_EXEC) */
4067
4068
4069
/**
 * Remove a contiguous range of PTEs within a single page table, updating the
 * PV lists, wired counts, pagetable refcount, and per-task ledgers as it goes.
 *
 * @note The pmap must be locked exclusive on entry and remains locked on return.
 *
 * @param pmap The pmap from which mappings are being removed.
 * @param va The VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE; the range must not cross a page-table boundary.
 * @param eva If non-NULL, preemption is polled on each iteration and, on early
 *            exit, *eva is set to the VA at which processing stopped.
 * @param need_strong_sync Output flag, set when a removed mapping requires a
 *            strong DSB for TLB synchronization (HAS_FEAT_XS configurations).
 * @param options PMAP_OPTIONS_REMOVE also clears VM-compressor markers.
 *
 * @return Number of PTEs modified; the caller performs TLB maintenance if > 0.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	int16_t refcnt = 0;     /* Accumulated (negative) delta for the leaf table's refcount. */

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The entire PTE range must reside within one page table. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we hold the PVH lock for a stable (PTE, PA) pair, or
		 * determine that the PTE doesn't reference managed memory.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			/* Mappings with the XS attribute need a stronger DSB on TLB flush. */
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE: it may have changed before we acquired the PVH lock. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger adjustments below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta to the leaf table's descriptor. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4315
4316
4317 /*
4318 * Remove the given range of addresses
4319 * from the specified map.
4320 *
4321 * It is assumed that the start and end are properly
4322 * rounded to the hardware page size.
4323 */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* PMAP_OPTIONS_REMOVE additionally clears VM-compressor markers in the range. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4332
/**
 * Remove mappings in [start, end) from the given pmap; the range must fall
 * within a single leaf page table (callers chunk larger ranges by twig size).
 *
 * @return The end of the VA range actually processed; may be less than `end`
 *         if removal stopped early due to pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's leaf page size. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry covering `start` means there is nothing mapped here. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table's refcount dropped to zero, free the table itself. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4399
/**
 * Remove all mappings in [start, end) from the given pmap, chunking the work
 * along twig (leaf page-table) boundaries so multi-page operations can check
 * for preemption between chunks.
 *
 * @param options PMAP_OPTIONS_REMOVE also clears VM-compressor markers.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the end of the current twig-level region. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL configurations perform the removal inside the PPL. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4452
4453
4454 /*
4455 * Remove phys addr if mapped in specified map
4456 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentionally empty stub. */
	/* Implement to support working set code */
}
4464
4465 /*
4466 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4467 * switch a thread onto a new vm_map.
4468 */
void
pmap_switch_user(thread_t thread, vm_map_t new_map)
{
	pmap_t new_pmap = new_map->pmap;


	/* Point the thread at its new map, then activate the map's pmap. */
	thread->map = new_map;
	pmap_set_pmap(new_pmap, thread);

}
4479
/**
 * Activate the given pmap on the current CPU and, on __ARM_USER_PROTECT__
 * configurations, cache its TTB/ASID values on the thread.
 *
 * @param pmap The pmap to activate.
 * @param thread The thread being switched to (only consulted when
 *               __ARM_USER_PROTECT__ is configured).
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Record the TTB and ASID on the thread's machine state for later restore. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4494
4495 static void
4496 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4497 {
4498 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4499 }
4500
4501 static inline bool
4502 pmap_user_ttb_is_clear(void)
4503 {
4504 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4505 }
4506
4507 MARK_AS_PMAP_TEXT void
4508 pmap_switch_internal(
4509 pmap_t pmap)
4510 {
4511 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4512 #if XNU_MONITOR
4513 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4514
4515 /**
4516 * Make sure a pmap is never active-and-nested. For more details,
4517 * see pmap_set_nested_internal().
4518 */
4519 os_atomic_thread_fence(seq_cst);
4520 if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
4521 panic("%s: attempt to activate nested pmap %p", __func__, pmap);
4522 }
4523 #endif
4524 validate_pmap_mutable(pmap);
4525 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4526 uint16_t asid_index = pmap->hw_asid;
4527 bool do_asid_flush = false;
4528 bool do_commpage_flush = false;
4529
4530 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4531 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4532 }
4533 #if __ARM_KERNEL_PROTECT__
4534 asid_index >>= 1;
4535 #endif
4536
4537 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4538 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4539 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4540 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4541 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4542 bool break_before_make = do_shared_region_flush;
4543
4544 #if !HAS_16BIT_ASID
4545 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4546 asid_index -= 1;
4547 pmap_update_plru(asid_index);
4548
4549 /* Paranoia. */
4550 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4551
4552 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4553 uint8_t new_sw_asid = pmap->sw_asid;
4554 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4555
4556 if (new_sw_asid != last_sw_asid) {
4557 /*
4558 * If the virtual ASID of the new pmap does not match the virtual ASID
4559 * last seen on this CPU for the physical ASID (that was a mouthful),
4560 * then this switch runs the risk of aliasing. We need to flush the
 * TLB for this physical ASID in this case.
4562 */
4563 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4564 do_asid_flush = true;
4565 break_before_make = true;
4566 }
4567 }
4568 #endif /* !HAS_16BIT_ASID */
4569
4570 #if __ARM_MIXED_PAGE_SIZE__
4571 if (pt_attr->pta_tcr_value != get_tcr()) {
4572 break_before_make = true;
4573 }
4574 #endif
4575 #if __ARM_MIXED_PAGE_SIZE__
4576 /*
4577 * For mixed page size configurations, we need to flush the global commpage mappings from
4578 * the TLB when transitioning between address spaces with different page sizes. Otherwise
 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4581 * conflict abort or other unpredictable behavior.
4582 */
4583 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4584 do_commpage_flush = true;
4585 }
4586 if (do_commpage_flush) {
4587 break_before_make = true;
4588 }
4589 #endif
4590 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4591 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4592 pmap_clear_user_ttb_internal();
4593 }
4594
4595 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4596 * to flush the userspace mappings for that region. Those mappings are global
4597 * and will not be protected by the ASID. It should also be cheaper to flush the
4598 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4599 if (__improbable(do_shared_region_flush)) {
4600 #if __ARM_RANGE_TLBI__
4601 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4602 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4603
4604 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4605 * There may still be non-global entries that overlap with the incoming pmap's
4606 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4607 * must necessarily belong to a different ASID than the incoming pmap, or they would
4608 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4609 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4610 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4611 * to consider additional invalidation here in the future. */
4612 if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
4613 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4614 } else {
4615 /*
4616 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
4617 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
4618 * have a single-page shared region anyway, not least because pmap_nest()
4619 * requires L2 block alignment of the address and size.
4620 */
4621 do_asid_flush = false;
4622 flush_core_tlb_async();
4623 }
4624 #else
4625 do_asid_flush = false;
4626 flush_core_tlb_async();
4627 #endif // __ARM_RANGE_TLBI__
4628 }
4629
4630 #if __ARM_MIXED_PAGE_SIZE__
4631 if (__improbable(do_commpage_flush)) {
4632 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4633 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4634 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4635 flush_core_tlb_allrange_async(rtlbi_param);
4636 }
4637 #endif
4638 if (__improbable(do_asid_flush)) {
4639 pmap_flush_core_tlb_asid_async(pmap);
4640 #if DEVELOPMENT || DEBUG
4641 os_atomic_inc(&pmap_asid_flushes, relaxed);
4642 #endif
4643 }
4644 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4645 sync_tlb_flush_local();
4646 }
4647
4648 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4649 }
4650
/*
 * Routine:	pmap_switch
 *
 * Activate the given user address space on the current CPU, wrapped in
 * PMAP_TRACE begin/end events.  On XNU_MONITOR configurations the switch
 * is performed inside the PPL; otherwise it is done directly in the kernel.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4663
/*
 * Routine:	pmap_page_protect
 *
 * Lower the permission of every mapping of the given physical page to at
 * most 'prot'.  Convenience wrapper around pmap_page_protect_options()
 * with no options and no flush argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4671
4672 /*
4673 * Routine: pmap_page_protect_options
4674 *
4675 * Function:
4676 * Lower the permission for all mappings to a given
4677 * page.
4678 */
4679 MARK_AS_PMAP_TEXT static void
4680 pmap_page_protect_options_with_flush_range(
4681 ppnum_t ppnum,
4682 vm_prot_t prot,
4683 unsigned int options,
4684 pmap_tlb_flush_range_t *flush_range)
4685 {
4686 pmap_paddr_t phys = ptoa(ppnum);
4687 pv_entry_t **pv_h;
4688 pv_entry_t *pve_p, *orig_pve_p;
4689 pv_entry_t *pveh_p;
4690 pv_entry_t *pvet_p;
4691 pt_entry_t *pte_p, *orig_pte_p;
4692 pv_entry_t *new_pve_p;
4693 pt_entry_t *new_pte_p;
4694 vm_offset_t pvh_flags;
4695 unsigned int pai;
4696 bool remove;
4697 bool set_NX;
4698 unsigned int pvh_cnt = 0;
4699 unsigned int pass1_updated = 0;
4700 unsigned int pass2_updated = 0;
4701
4702 assert(ppnum != vm_page_fictitious_addr);
4703
4704 /* Only work with managed pages. */
4705 if (!pa_valid(phys)) {
4706 return;
4707 }
4708
4709 /*
4710 * Determine the new protection.
4711 */
4712 switch (prot) {
4713 case VM_PROT_ALL:
4714 return; /* nothing to do */
4715 case VM_PROT_READ:
4716 case VM_PROT_READ | VM_PROT_EXECUTE:
4717 remove = false;
4718 break;
4719 default:
4720 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4721 options = options & ~PMAP_OPTIONS_NOFLUSH;
4722 remove = true;
4723 break;
4724 }
4725
4726 pmap_cpu_data_t *pmap_cpu_data = NULL;
4727 if (remove) {
4728 #if !XNU_MONITOR
4729 mp_disable_preemption();
4730 #endif
4731 pmap_cpu_data = pmap_get_cpu_data();
4732 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4733 /*
4734 * Ensure the store to inflight_disconnect will be observed before any of the
4735 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4736 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4737 * another CPU, in between this function's clearing a PTE and dropping the
4738 * corresponding pagetable refcount. That can lead to a panic if the
4739 * destroying thread observes a non-zero refcount. For this we need a store-
4740 * store barrier; a store-release operation would not be sufficient.
4741 */
4742 os_atomic_thread_fence(release);
4743 }
4744
4745 pai = pa_index(phys);
4746 pvh_lock(pai);
4747 pv_h = pai_to_pvh(pai);
4748 pvh_flags = pvh_get_flags(pv_h);
4749
4750 #if XNU_MONITOR
4751 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4752 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4753 }
4754 if (__improbable(ppattr_pa_test_monitor(phys))) {
4755 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4756 }
4757 #endif
4758
4759
4760 orig_pte_p = pte_p = PT_ENTRY_NULL;
4761 orig_pve_p = pve_p = PV_ENTRY_NULL;
4762 pveh_p = PV_ENTRY_NULL;
4763 pvet_p = PV_ENTRY_NULL;
4764 new_pve_p = PV_ENTRY_NULL;
4765 new_pte_p = PT_ENTRY_NULL;
4766
4767
4768 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4769 orig_pte_p = pte_p = pvh_ptep(pv_h);
4770 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4771 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4772 pveh_p = pve_p;
4773 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4774 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4775 }
4776
4777 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4778 int pve_ptep_idx = 0;
4779
4780 /*
4781 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4782 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4783 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4784 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4785 * operation, TLB invalidation may be handled by the caller so it's possible for
4786 * tlb_flush_needed to be true while issue_tlbi is false.
4787 */
4788 bool issue_tlbi = false;
4789 bool tlb_flush_needed = false;
4790 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4791 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4792 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4793 bool update = false;
4794
4795 if (pve_p != PV_ENTRY_NULL) {
4796 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4797 if (pte_p == PT_ENTRY_NULL) {
4798 goto protect_skip_pve_pass1;
4799 }
4800 }
4801
4802 #ifdef PVH_FLAG_IOMMU
4803 if (pvh_ptep_is_iommu(pte_p)) {
4804 #if XNU_MONITOR
4805 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4806 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4807 ppnum, ptep_get_iommu(pte_p), pve_p);
4808 }
4809 #endif
4810 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4811 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4812 ppnum, ptep_get_iommu(pte_p), pve_p);
4813 }
4814 goto protect_skip_pve_pass1;
4815 }
4816 #endif
4817 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4818 const pmap_t pmap = ptdp->pmap;
4819 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4820
4821 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4822 #if MACH_ASSERT
4823 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4824 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4825 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4826 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4827
4828 pv_entry_t *check_pvep = pve_p;
4829
4830 do {
4831 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4832 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4833 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4834 }
4835 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4836
4837 /* Restore previous PTEP value. */
4838 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4839 }
4840 #endif
4841 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4842 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4843 }
4844
4845 #if DEVELOPMENT || DEBUG
4846 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4847 #else
4848 if ((prot & VM_PROT_EXECUTE))
4849 #endif
4850 {
4851 set_NX = false;
4852 } else {
4853 set_NX = true;
4854 }
4855
4856 #if HAS_FEAT_XS
4857 /**
4858 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4859 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4860 */
4861 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4862 #endif /* HAS_FEAT_XS */
4863
4864 /* Remove the mapping if new protection is NONE */
4865 if (remove) {
4866 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4867 panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
4868 __func__, pmap, ppnum);
4869 }
4870
4871 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4872 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4873 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4874 pt_entry_t spte = *pte_p;
4875
4876 if (pte_is_wired(spte)) {
4877 pte_set_wired(pmap, pte_p, 0);
4878 spte = *pte_p;
4879 if (pmap != kernel_pmap) {
4880 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4881 }
4882 }
4883
4884 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4885 (uint64_t)spte, pte_p, ppnum);
4886
4887 if (compress && is_internal && (pmap != kernel_pmap)) {
4888 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4889 /* mark this PTE as having been "compressed" */
4890 tmplate = ARM_PTE_COMPRESSED;
4891 if (is_altacct) {
4892 tmplate |= ARM_PTE_COMPRESSED_ALT;
4893 }
4894 } else {
4895 tmplate = ARM_PTE_TYPE_FAULT;
4896 }
4897
4898 assert(spte != tmplate);
4899 write_pte_fast(pte_p, tmplate);
4900 update = true;
4901 ++pass1_updated;
4902
4903 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4904
4905 if (pmap != kernel_pmap) {
4906 if (ppattr_test_reusable(pai) &&
4907 is_internal &&
4908 !is_altacct) {
4909 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4910 } else if (!is_internal) {
4911 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4912 }
4913
4914 if (is_altacct) {
4915 assert(is_internal);
4916 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4917 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4918 if (options & PMAP_OPTIONS_COMPRESSOR) {
4919 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4920 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4921 }
4922 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4923 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4924 } else if (ppattr_test_reusable(pai)) {
4925 assert(is_internal);
4926 if (options & PMAP_OPTIONS_COMPRESSOR) {
4927 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4928 /* was not in footprint, but is now */
4929 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4930 }
4931 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4932 } else if (is_internal) {
4933 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4934
4935 /*
4936 * Update all stats related to physical footprint, which only
4937 * deals with internal pages.
4938 */
4939 if (options & PMAP_OPTIONS_COMPRESSOR) {
4940 /*
4941 * This removal is only being done so we can send this page to
4942 * the compressor; therefore it mustn't affect total task footprint.
4943 */
4944 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4945 } else {
4946 /*
4947 * This internal page isn't going to the compressor, so adjust stats to keep
4948 * phys_footprint up to date.
4949 */
4950 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4951 }
4952 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4953 } else {
4954 /* external page: no impact on ledgers */
4955 }
4956 }
4957 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4958 } else {
4959 pt_entry_t spte = *pte_p;
4960 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4961
4962 if (pmap == kernel_pmap) {
4963 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4964 } else {
4965 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4966 }
4967
4968 /*
4969 * While the naive implementation of this would serve to add execute
4970 * permission, this is not how the VM uses this interface, or how
4971 * x86_64 implements it. So ignore requests to add execute permissions.
4972 */
4973 if (set_NX) {
4974 tmplate |= pt_attr_leaf_xn(pt_attr);
4975 }
4976
4977
4978 assert(spte != ARM_PTE_TYPE_FAULT);
4979 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4980
4981 if (spte != tmplate) {
4982 /*
4983 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4984 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4985 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4986 * should always be cleared by this function.
4987 */
4988 pte_set_was_writeable(tmplate, true);
4989 write_pte_fast(pte_p, tmplate);
4990 update = true;
4991 ++pass1_updated;
4992 } else if (pte_was_writeable(tmplate)) {
4993 /*
4994 * We didn't change any of the relevant permission bits in the PTE, so we don't need
4995 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
4996 * write access to a page, this function should always at least clear that flag for
4997 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
4998 * these mappings go through vm_fault(). We therefore don't want those accesses to
4999 * be handled through arm_fast_fault().
5000 */
5001 pte_set_was_writeable(tmplate, false);
5002 write_pte_fast(pte_p, tmplate);
5003 }
5004 }
5005
5006 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5007 tlb_flush_needed = true;
5008 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5009 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5010 issue_tlbi = true;
5011 }
5012 }
5013 protect_skip_pve_pass1:
5014 pte_p = PT_ENTRY_NULL;
5015 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5016 pve_ptep_idx = 0;
5017 pve_p = pve_next(pve_p);
5018 }
5019 }
5020
5021 if (tlb_flush_needed) {
5022 FLUSH_PTE_STRONG();
5023 }
5024
5025 if (!remove && !issue_tlbi) {
5026 goto protect_finish;
5027 }
5028
5029 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5030 pv_entry_t **pve_pp = pv_h;
5031 pve_p = orig_pve_p;
5032 pte_p = orig_pte_p;
5033 pve_ptep_idx = 0;
5034
5035 /*
5036 * We need to keep track of whether a particular PVE list contains IOMMU
5037 * mappings when removing entries, because we should only remove CPU
5038 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5039 * it around.
5040 */
5041 bool iommu_mapping_in_pve = false;
5042 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5043 if (pve_p != PV_ENTRY_NULL) {
5044 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5045 if (pte_p == PT_ENTRY_NULL) {
5046 goto protect_skip_pve_pass2;
5047 }
5048 }
5049
5050 #ifdef PVH_FLAG_IOMMU
5051 if (pvh_ptep_is_iommu(pte_p)) {
5052 iommu_mapping_in_pve = true;
5053 if (remove && (pve_p == PV_ENTRY_NULL)) {
5054 /*
5055 * We've found an IOMMU entry and it's the only entry in the PV list.
5056 * We don't discard IOMMU entries, so simply set up the new PV list to
5057 * contain the single IOMMU PTE and exit the loop.
5058 */
5059 new_pte_p = pte_p;
5060 break;
5061 }
5062 goto protect_skip_pve_pass2;
5063 }
5064 #endif
5065 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5066 const pmap_t pmap = ptdp->pmap;
5067 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5068
5069 if (remove) {
5070 if (!compress && (pmap != kernel_pmap)) {
5071 /*
5072 * We must wait to decrement the refcount until we're completely finished using the PTE
5073 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5074 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5075 * under us.
5076 */
5077 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5078 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5079 }
5080 }
5081 /* Remove this CPU mapping from PVE list. */
5082 if (pve_p != PV_ENTRY_NULL) {
5083 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5084 }
5085 } else {
5086 pt_entry_t spte = *pte_p;
5087 if (pte_was_writeable(spte)) {
5088 pte_set_was_writeable(spte, false);
5089 write_pte_fast(pte_p, spte);
5090 } else {
5091 goto protect_skip_pve_pass2;
5092 }
5093 }
5094 ++pass2_updated;
5095 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5096 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5097 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5098 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5099 }
5100
5101 protect_skip_pve_pass2:
5102 pte_p = PT_ENTRY_NULL;
5103 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5104 pve_ptep_idx = 0;
5105
5106 if (remove) {
5107 /**
5108 * If there are any IOMMU mappings in the PVE list, preserve
5109 * those mappings in a new PVE list (new_pve_p) which will later
5110 * become the new PVH entry. Keep track of the CPU mappings in
5111 * pveh_p/pvet_p so they can be deallocated later.
5112 */
5113 if (iommu_mapping_in_pve) {
5114 iommu_mapping_in_pve = false;
5115 pv_entry_t *temp_pve_p = pve_next(pve_p);
5116 pve_remove(pv_h, pve_pp, pve_p);
5117 pveh_p = pvh_pve_list(pv_h);
5118 pve_p->pve_next = new_pve_p;
5119 new_pve_p = pve_p;
5120 pve_p = temp_pve_p;
5121 continue;
5122 } else {
5123 pvet_p = pve_p;
5124 pvh_cnt++;
5125 }
5126 }
5127
5128 pve_pp = pve_next_ptr(pve_p);
5129 pve_p = pve_next(pve_p);
5130 iommu_mapping_in_pve = false;
5131 }
5132 }
5133
5134 protect_finish:
5135
5136 #ifdef PVH_FLAG_EXEC
5137 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5138 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5139 }
5140 #endif
5141 if (__improbable(pass1_updated != pass2_updated)) {
5142 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5143 __func__, pass1_updated, pass2_updated);
5144 }
5145 /* if we removed a bunch of entries, take care of them now */
5146 if (remove) {
5147 if (new_pve_p != PV_ENTRY_NULL) {
5148 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5149 pvh_set_flags(pv_h, pvh_flags);
5150 } else if (new_pte_p != PT_ENTRY_NULL) {
5151 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5152 pvh_set_flags(pv_h, pvh_flags);
5153 } else {
5154 if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5155 pmap_flush_noncoherent_page(phys);
5156 }
5157 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5158 }
5159 }
5160
5161 if (flush_range && tlb_flush_needed) {
5162 if (!remove) {
5163 flush_range->ptfr_flush_needed = true;
5164 tlb_flush_needed = false;
5165 }
5166 }
5167
5168 /*
5169 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5170 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5171 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5172 * a page to be repurposed while it is still live in the TLBs.
5173 */
5174 if (remove && tlb_flush_needed) {
5175 sync_tlb_flush();
5176 }
5177
5178
5179 pvh_unlock(pai);
5180
5181 if (remove) {
5182 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5183 #if !XNU_MONITOR
5184 mp_enable_preemption();
5185 #endif
5186 }
5187
5188 if (!remove && tlb_flush_needed) {
5189 sync_tlb_flush();
5190 }
5191
5192 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5193 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5194 }
5195 }
5196
5197 MARK_AS_PMAP_TEXT void
5198 pmap_page_protect_options_internal(
5199 ppnum_t ppnum,
5200 vm_prot_t prot,
5201 unsigned int options,
5202 void *arg)
5203 {
5204 if (arg != NULL) {
5205 /*
5206 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5207 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5208 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5209 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5210 * In that case, force the flush to take place.
5211 */
5212 options &= ~PMAP_OPTIONS_NOFLUSH;
5213 }
5214 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5215 }
5216
5217 void
5218 pmap_page_protect_options(
5219 ppnum_t ppnum,
5220 vm_prot_t prot,
5221 unsigned int options,
5222 void *arg)
5223 {
5224 pmap_paddr_t phys = ptoa(ppnum);
5225
5226 assert(ppnum != vm_page_fictitious_addr);
5227
5228 /* Only work with managed pages. */
5229 if (!pa_valid(phys)) {
5230 return;
5231 }
5232
5233 /*
5234 * Determine the new protection.
5235 */
5236 if (prot == VM_PROT_ALL) {
5237 return; /* nothing to do */
5238 }
5239
5240 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5241
5242 #if XNU_MONITOR
5243 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5244 #else
5245 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5246 #endif
5247
5248 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5249 }
5250
5251
5252 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Worker for pmap_disable_user_jop(): permanently disable JOP (pointer
 * authentication) for the given user pmap.  Panics if called on the
 * kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate the (possibly caller-supplied) pmap pointer before mutating it. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5262
/*
 * Disable JOP (pointer authentication) for a user pmap, dispatching to the
 * PPL or in-kernel implementation as appropriate.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5272 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5273
5274 /*
5275 * Indicates if the pmap layer enforces some additional restrictions on the
5276 * given set of protections.
5277 */
5278 bool
5279 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5280 {
5281 return false;
5282 }
5283
5284 /*
5285 * Set the physical protection on the
5286 * specified range of this map as requested.
5287 * VERY IMPORTANT: Will not increase permissions.
5288 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5289 */
5290 void
5291 pmap_protect(
5292 pmap_t pmap,
5293 vm_map_address_t b,
5294 vm_map_address_t e,
5295 vm_prot_t prot)
5296 {
5297 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5298 }
5299
/*
 * Per-chunk worker for pmap_protect_options().
 *
 * Lowers the protection of the leaf mappings in [start, end), which must
 * not cross a twig (pagetable leaf-level table) boundary; the caller is
 * responsible for chunking larger requests.  Permissions are only ever
 * reduced; a request that would amount to a removal panics here because
 * the caller should have routed it to pmap_remove_options().
 *
 * Returns the VA just past the last PTE processed: 'end' when the whole
 * chunk was handled (or when no leaf table exists for the range), or an
 * earlier address if the loop stopped early due to pending preemption.
 * The caller restarts from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must fall within a single twig; larger ranges are chunked by the caller. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Periodically yield to pending preemption; the caller restarts at the returned VA. */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/* Take the PVH lock for a managed page, re-validating the PTE under it. */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/*
			 * Re-check after the (possible) lock acquisition.  On the managed
			 * path the re-validation above confirmed spte still maps pai, so
			 * this presumably only skips PTEs on the unmanaged path — the PVH
			 * lock taken above is released at the bottom of the loop.
			 */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Synchronize the PTE stores, then invalidate the TLB for the processed sub-range. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table for this range: nothing mapped, report the chunk as done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5547
/*
 * Routine:	pmap_protect_options
 *
 * Lower the protection on the page-aligned range [b, e) of the given pmap.
 * The work is split into twig-aligned chunks so that each call into the
 * PPL (or the internal worker) stays bounded and preemption can be taken
 * between chunks; the worker returns the VA it stopped at, which becomes
 * the start of the next chunk.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both bounds must be page-aligned for this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Revoking all access is a removal, not a protection change. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/* Process the range one twig-aligned chunk at a time. */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

		/* The callee returns where it stopped (possibly early, on pending preemption). */
#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5622
5623 /**
5624 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5625 *
5626 * @param pmap pmap to insert the pages into.
5627 * @param va virtual address to map the pages into.
5628 * @param pa page number of the first physical page to map.
5629 * @param size block size, in number of pages.
5630 * @param prot mapping protection attributes.
5631 * @param attr flags to pass to pmap_enter().
5632 *
5633 * @return KERN_SUCCESS.
5634 */
5635 kern_return_t
5636 pmap_map_block(
5637 pmap_t pmap,
5638 addr64_t va,
5639 ppnum_t pa,
5640 uint32_t size,
5641 vm_prot_t prot,
5642 int attr,
5643 unsigned int flags)
5644 {
5645 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5646 }
5647
5648 /**
5649 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5650 * As opposed to pmap_map_block(), this function takes
5651 * a physical address as an input and operates using the
5652 * page size associated with the input pmap.
5653 *
5654 * @param pmap pmap to insert the pages into.
5655 * @param va virtual address to map the pages into.
5656 * @param pa physical address of the first physical page to map.
5657 * @param size block size, in number of pages.
5658 * @param prot mapping protection attributes.
5659 * @param attr flags to pass to pmap_enter().
5660 *
5661 * @return KERN_SUCCESS.
5662 */
5663 kern_return_t
5664 pmap_map_block_addr(
5665 pmap_t pmap,
5666 addr64_t va,
5667 pmap_paddr_t pa,
5668 uint32_t size,
5669 vm_prot_t prot,
5670 int attr,
5671 unsigned int flags)
5672 {
5673 #if __ARM_MIXED_PAGE_SIZE__
5674 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5675 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5676 #else
5677 const uint64_t pmap_page_size = PAGE_SIZE;
5678 #endif
5679
5680 for (ppnum_t page = 0; page < size; page++) {
5681 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5682 panic("%s: failed pmap_enter_addr, "
5683 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5684 __FUNCTION__,
5685 pmap, va, (uint64_t)pa, size, prot, flags);
5686 }
5687
5688 va += pmap_page_size;
5689 pa += pmap_page_size;
5690 }
5691
5692 return KERN_SUCCESS;
5693 }
5694
5695 kern_return_t
5696 pmap_enter_addr(
5697 pmap_t pmap,
5698 vm_map_address_t v,
5699 pmap_paddr_t pa,
5700 vm_prot_t prot,
5701 vm_prot_t fault_type,
5702 unsigned int flags,
5703 boolean_t wired)
5704 {
5705 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5706 }
5707
5708 /*
5709 * Insert the given physical page (p) at
5710 * the specified virtual address (v) in the
5711 * target physical map with the protection requested.
5712 *
5713 * If specified, the page will be wired down, meaning
5714 * that the related pte can not be reclaimed.
5715 *
5716 * NB: This is the only routine which MAY NOT lazy-evaluate
5717 * or lose information. That is, this routine must actually
5718 * insert this page into the given map eventually (must make
 *	forward progress eventually).
5720 */
5721 kern_return_t
5722 pmap_enter(
5723 pmap_t pmap,
5724 vm_map_address_t v,
5725 ppnum_t pn,
5726 vm_prot_t prot,
5727 vm_prot_t fault_type,
5728 unsigned int flags,
5729 boolean_t wired,
5730 __unused pmap_mapping_type_t mapping_type)
5731 {
5732 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5733 }
5734
5735 /*
5736 * Attempt to commit the pte.
5737 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5738 * Performs no page table or accounting writes on failures.
5739 */
5740 static inline bool
5741 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5742 {
5743 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5744 bool success = false, changed_wiring = false;
5745
5746 __unreachable_ok_push
5747 if (TEST_PAGE_RATIO_4) {
5748 /*
5749 * 16K virtual pages w/ 4K hw pages.
5750 * We actually need to update 4 ptes here which can't easily be done atomically.
5751 * As a result we require the exclusive pmap lock.
5752 */
5753 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5754 *old_pte = *pte_p;
5755 if (*old_pte == new_pte) {
5756 /* Another thread completed this operation. Nothing to do here. */
5757 success = true;
5758 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5759 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5760 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5761 success = false;
5762 } else {
5763 write_pte_fast(pte_p, new_pte);
5764 success = true;
5765 }
5766 } else {
5767 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5768 }
5769 __unreachable_ok_pop
5770
5771 if (success && *old_pte != new_pte) {
5772 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5773 bool need_strong_sync = false;
5774 FLUSH_PTE_STRONG();
5775 #if HAS_FEAT_XS
5776 if (pte_is_xs(pt_attr, *old_pte)) {
5777 need_strong_sync = true;
5778 }
5779 #endif /* HAS_FEAT_XS */
5780 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5781 } else {
5782 FLUSH_PTE();
5783 __builtin_arm_isb(ISB_SY);
5784 }
5785 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5786 (new_pte & ARM_PTE_WIRED) != 0 :
5787 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5788
5789 if (pmap != kernel_pmap && changed_wiring) {
5790 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5791 if (new_pte & ARM_PTE_WIRED) {
5792 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5793 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5794 } else {
5795 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5796 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5797 }
5798 }
5799
5800 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5801 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5802 }
5803 return success;
5804 }
5805
5806 MARK_AS_PMAP_TEXT static pt_entry_t
5807 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5808 {
5809 pt_entry_t pte;
5810
5811 switch (wimg & (VM_WIMG_MASK)) {
5812 case VM_WIMG_IO:
5813 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5814 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5815 // AP, while preserving the security benefits of using device
5816 // mapping against side-channel attacks. On pre-H14 platforms,
5817 // the accesses will still be strongly ordered.
5818 if (is_dram_addr(pa)) {
5819 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5820 } else {
5821 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5822 }
5823 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5824 break;
5825 case VM_WIMG_RT:
5826 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
5827 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5828 break;
5829 case VM_WIMG_POSTED:
5830 if (is_dram_addr(pa)) {
5831 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5832 } else {
5833 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5834 }
5835 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5836 break;
5837 case VM_WIMG_POSTED_REORDERED:
5838 if (is_dram_addr(pa)) {
5839 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5840 } else {
5841 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5842 }
5843 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5844 break;
5845 case VM_WIMG_POSTED_COMBINED_REORDERED:
5846 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5847 #if HAS_FEAT_XS
5848 if (!is_dram_addr(pa)) {
5849 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5850 }
5851 #endif /* HAS_FEAT_XS */
5852 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5853 break;
5854 case VM_WIMG_WCOMB:
5855 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5856 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5857 break;
5858 case VM_WIMG_WTHRU:
5859 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5860 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5861 break;
5862 case VM_WIMG_COPYBACK:
5863 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5864 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5865 break;
5866 case VM_WIMG_INNERWBACK:
5867 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5868 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5869 break;
5870 default:
5871 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5872 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5873 }
5874
5875 return pte;
5876 }
5877
5878
5879 /*
5880 * Construct a PTE (and the physical page attributes) for the given virtual to
5881 * physical mapping.
5882 *
5883 * This function has no side effects and is safe to call so that it is safe to
5884 * call while attempting a pmap_enter transaction.
5885 */
5886 MARK_AS_PMAP_TEXT static pt_entry_t
5887 pmap_construct_pte(
5888 const pmap_t pmap,
5889 vm_map_address_t va,
5890 pmap_paddr_t pa,
5891 vm_prot_t prot,
5892 vm_prot_t fault_type,
5893 boolean_t wired,
5894 const pt_attr_t* const pt_attr,
5895 uint16_t *pp_attr_bits /* OUTPUT */
5896 )
5897 {
5898 bool set_NX = false, set_XO = false;
5899 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5900 assert(pp_attr_bits != NULL);
5901 *pp_attr_bits = 0;
5902
5903 if (wired) {
5904 pte |= ARM_PTE_WIRED;
5905 }
5906
5907 #if DEVELOPMENT || DEBUG
5908 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5909 #else
5910 if ((prot & VM_PROT_EXECUTE))
5911 #endif
5912 {
5913 set_NX = false;
5914 } else {
5915 set_NX = true;
5916 }
5917
5918 if (prot == VM_PROT_EXECUTE) {
5919 set_XO = true;
5920 }
5921
5922 if (set_NX) {
5923 pte |= pt_attr_leaf_xn(pt_attr);
5924 } else {
5925 if (pmap == kernel_pmap) {
5926 pte |= ARM_PTE_NX;
5927 } else {
5928 pte |= pt_attr_leaf_x(pt_attr);
5929 }
5930 }
5931
5932 if (pmap == kernel_pmap) {
5933 #if __ARM_KERNEL_PROTECT__
5934 pte |= ARM_PTE_NG;
5935 #endif /* __ARM_KERNEL_PROTECT__ */
5936 if (prot & VM_PROT_WRITE) {
5937 pte |= ARM_PTE_AP(AP_RWNA);
5938 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5939 } else {
5940 pte |= ARM_PTE_AP(AP_RONA);
5941 *pp_attr_bits |= PP_ATTR_REFERENCED;
5942 }
5943 } else {
5944 if (pmap->type != PMAP_TYPE_NESTED) {
5945 pte |= ARM_PTE_NG;
5946 } else if ((pmap->nested_region_unnested_table_bitmap)
5947 && (va >= pmap->nested_region_addr)
5948 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5949 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5950
5951 if ((pmap->nested_region_unnested_table_bitmap)
5952 && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
5953 pte |= ARM_PTE_NG;
5954 }
5955 }
5956 if (prot & VM_PROT_WRITE) {
5957 assert(pmap->type != PMAP_TYPE_NESTED);
5958 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5959 if (fault_type & VM_PROT_WRITE) {
5960 pte |= pt_attr_leaf_rw(pt_attr);
5961 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5962 } else {
5963 pte |= pt_attr_leaf_ro(pt_attr);
5964 /*
5965 * Mark the page as MODFAULT so that a subsequent write
5966 * may be handled through arm_fast_fault().
5967 */
5968 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5969 pte_set_was_writeable(pte, true);
5970 }
5971 } else {
5972 pte |= pt_attr_leaf_rw(pt_attr);
5973 *pp_attr_bits |= PP_ATTR_REFERENCED;
5974 }
5975 } else {
5976 if (set_XO) {
5977 pte |= pt_attr_leaf_rona(pt_attr);
5978 } else {
5979 pte |= pt_attr_leaf_ro(pt_attr);
5980 }
5981 *pp_attr_bits |= PP_ATTR_REFERENCED;
5982 }
5983 }
5984
5985 pte |= ARM_PTE_AF;
5986 return pte;
5987 }
5988
/*
 * Internal implementation of pmap_enter_options(): establish a mapping from
 * virtual address [v] to physical address [pa] in [pmap] with protection
 * [prot].  The PTE update is performed as a cmpxchg transaction (see
 * pmap_enter_pte()), since the pmap lock may only be held shared; on
 * contention the transaction loop is retried.
 *
 * Returns KERN_SUCCESS on success; KERN_ABORTED if a preemptible lock
 * acquisition was interrupted; KERN_RESOURCE_SHORTAGE if PV entries could
 * not be allocated; KERN_FAILURE for an attempted executable mapping of a
 * non-managed (pa_valid() == false) address; or an error from pmap_expand().
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t pte;
	pt_entry_t spte;
	pt_entry_t *pte_p;
	bool refcnt_updated;
	bool wiredcnt_updated;
	bool ro_va = false;
	unsigned int wimg_bits;
	bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
		panic("%s: pmap %p v 0x%llx not page-aligned",
		    __func__, pmap, (unsigned long long)v);
	}

	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		/* Remember to re-apply write protection once the PTE has been written. */
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	if (options & PMAP_OPTIONS_NOENTER) {
		/* Caller only wanted the page tables expanded; do not enter a mapping. */
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before committing
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_TYPE_FAULT;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			if (__improbable(pmap == kernel_pmap)) {
				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
				    __func__);
			}
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* Managed memory: PV-list maintenance and per-page attribute accounting apply. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Return the pre-allocated nodes to the per-CPU free list for later use. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				/* The PTE changed underneath us; spte now holds the fresh value. Retry. */
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-managed (not pa_valid) memory: no PV tracking; executable mappings are refused. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed. We must therefore prevent an attacker from downgrading a
			 * writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				/* The new mapping replaced a compressed-page marker PTE. */
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	/* Drop the temporary wire we took on the PTE page at the top of the loop. */
	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		/* Re-apply write protection for the RO-zone VA now that the PTE is in place. */
		pmap_phys_write_disable(v);
	}

	return kr;
}
6486
/*
 * Enter a mapping from [v] to physical address [pa] in [pmap], retrying the
 * underlying implementation as needed.
 *
 * The implementation may return KERN_RESOURCE_SHORTAGE when PV entries (or,
 * on PPL systems, PPL pages) must be allocated, or KERN_ABORTED when a
 * preemptible lock acquisition was interrupted; both are retried here.
 * With PMAP_OPTIONS_NOWAIT, a resource shortage is returned to the caller
 * instead of being retried.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed the PPL page free list before retrying (or returning). */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6533
6534 kern_return_t
6535 pmap_enter_options(
6536 pmap_t pmap,
6537 vm_map_address_t v,
6538 ppnum_t pn,
6539 vm_prot_t prot,
6540 vm_prot_t fault_type,
6541 unsigned int flags,
6542 boolean_t wired,
6543 unsigned int options,
6544 __unused void *arg,
6545 pmap_mapping_type_t mapping_type)
6546 {
6547 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6548 }
6549
6550 /*
6551 * Routine: pmap_change_wiring
6552 * Function: Change the wiring attribute for a map/virtual-address
6553 * pair.
6554 * In/out conditions:
6555 * The mapping must already exist in the pmap.
6556 */
6557 MARK_AS_PMAP_TEXT kern_return_t
6558 pmap_change_wiring_internal(
6559 pmap_t pmap,
6560 vm_map_address_t v,
6561 boolean_t wired)
6562 {
6563 pt_entry_t *pte_p;
6564 pmap_paddr_t pa;
6565
6566 validate_pmap_mutable(pmap);
6567
6568 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6569 return KERN_ABORTED;
6570 }
6571
6572 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6573
6574 pte_p = pmap_pte(pmap, v);
6575 if (pte_p == PT_ENTRY_NULL) {
6576 if (!wired) {
6577 /*
6578 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6579 * may have been freed by a remove operation.
6580 */
6581 goto pmap_change_wiring_return;
6582 } else {
6583 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6584 }
6585 }
6586 /*
6587 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6588 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6589 */
6590 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6591
6592 while (pa_valid(pa)) {
6593 pmap_paddr_t new_pa;
6594
6595 pvh_lock(pa_index(pa));
6596 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6597
6598 if (pa == new_pa) {
6599 break;
6600 }
6601
6602 pvh_unlock(pa_index(pa));
6603 pa = new_pa;
6604 }
6605
6606 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6607 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6608 if (!wired) {
6609 /* PTE cleared by prior remove/disconnect operation */
6610 goto pmap_change_wiring_cleanup;
6611 } else {
6612 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6613 __func__, pte_p, (uint64_t)*pte_p, pmap);
6614 }
6615 }
6616
6617 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6618 if (wired != pte_is_wired(*pte_p)) {
6619 pte_set_wired(pmap, pte_p, wired);
6620 if (pmap != kernel_pmap) {
6621 if (wired) {
6622 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6623 } else if (!wired) {
6624 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6625 }
6626 }
6627 }
6628
6629 pmap_change_wiring_cleanup:
6630 if (pa_valid(pa)) {
6631 pvh_unlock(pa_index(pa));
6632 }
6633
6634 pmap_change_wiring_return:
6635 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6636
6637 return KERN_SUCCESS;
6638 }
6639
/*
 * Public entry point for changing a mapping's wired attribute.
 * Retries the PPL call on KERN_ABORTED; any other failure is fatal.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	/* Cross-check the pmap's ledger after the PPL call. */
	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	/* Any failure at this point is unexpected. */
	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6667
6668 MARK_AS_PMAP_TEXT pmap_paddr_t
6669 pmap_find_pa_internal(
6670 pmap_t pmap,
6671 addr64_t va)
6672 {
6673 pmap_paddr_t pa = 0;
6674
6675 validate_pmap(pmap);
6676
6677 if (pmap != kernel_pmap) {
6678 pmap_lock(pmap, PMAP_LOCK_SHARED);
6679 }
6680
6681 pa = pmap_vtophys(pmap, va);
6682
6683 if (pmap != kernel_pmap) {
6684 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6685 }
6686
6687 return pa;
6688 }
6689
6690 pmap_paddr_t
6691 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6692 {
6693 pmap_paddr_t pa = 0;
6694
6695 if (pmap == kernel_pmap) {
6696 pa = mmu_kvtop(va);
6697 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6698 /*
6699 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6700 * translation even if PAN would prevent kernel access through the translation.
6701 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6702 */
6703 pa = mmu_uvtop(va);
6704 }
6705 return pa;
6706 }
6707
6708 pmap_paddr_t
6709 pmap_find_pa(
6710 pmap_t pmap,
6711 addr64_t va)
6712 {
6713 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6714
6715 if (pa != 0) {
6716 return pa;
6717 }
6718
6719 if (not_in_kdp) {
6720 #if XNU_MONITOR
6721 return pmap_find_pa_ppl(pmap, va);
6722 #else
6723 return pmap_find_pa_internal(pmap, va);
6724 #endif
6725 } else {
6726 return pmap_vtophys(pmap, va);
6727 }
6728 }
6729
6730 ppnum_t
6731 pmap_find_phys_nofault(
6732 pmap_t pmap,
6733 addr64_t va)
6734 {
6735 ppnum_t ppn;
6736 ppn = atop(pmap_find_pa_nofault(pmap, va));
6737 return ppn;
6738 }
6739
6740 ppnum_t
6741 pmap_find_phys(
6742 pmap_t pmap,
6743 addr64_t va)
6744 {
6745 ppnum_t ppn;
6746 ppn = atop(pmap_find_pa(pmap, va));
6747 return ppn;
6748 }
6749
6750 /**
6751 * Translate a kernel virtual address into a physical address.
6752 *
6753 * @param va The kernel virtual address to translate. Does not work on user
6754 * virtual addresses.
6755 *
6756 * @return The physical address if the translation was successful, or zero if
6757 * no valid mappings were found for the given virtual address.
6758 */
6759 pmap_paddr_t
6760 kvtophys(vm_offset_t va)
6761 {
6762 /**
6763 * Attempt to do the translation first in hardware using the AT (address
6764 * translation) instruction. This will attempt to use the MMU to do the
6765 * translation for us.
6766 */
6767 pmap_paddr_t pa = mmu_kvtop(va);
6768
6769 if (pa) {
6770 return pa;
6771 }
6772
6773 /* If the MMU can't find the mapping, then manually walk the page tables. */
6774 return pmap_vtophys(kernel_pmap, va);
6775 }
6776
6777 /**
6778 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6779 * points to a non-kernel-managed physical page, then this call will panic().
6780 *
6781 * @note The output of this function is guaranteed to be a kernel-managed
6782 * physical page, which means it's safe to pass the output directly to
6783 * pa_index() to create a physical address index for various pmap data
6784 * structures.
6785 *
6786 * @param va The kernel virtual address to translate. Does not work on user
6787 * virtual addresses.
6788 *
6789 * @return The translated physical address for the given virtual address.
6790 */
6791 pmap_paddr_t
6792 kvtophys_nofail(vm_offset_t va)
6793 {
6794 pmap_paddr_t pa = kvtophys(va);
6795
6796 if (!pa_valid(pa)) {
6797 panic("%s: Invalid or non-kernel-managed physical page returned, "
6798 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6799 }
6800
6801 return pa;
6802 }
6803
/*
 * Software page-table walk: translate 'va' in 'pmap' to a physical address.
 * Returns 0 if 'va' is outside the pmap's address range or no valid mapping
 * exists.  Callers are responsible for any locking needed to keep the tables
 * stable during the walk (see pmap_find_pa_internal()).
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend one translation-table level per iteration, root to leaf. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at any level terminates the walk with no translation. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the VA's offset bits. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: follow it to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6848
6849 /*
6850 * pmap_init_pte_page - Initialize a page table page.
6851 */
/*
 * Initialize the page-table page containing 'pte_p' for use at level
 * 'ttlevel' of 'pmap', attaching (or, when alloc_ptd is set, allocating)
 * the page's page table descriptor (PTD).
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t *ptdp = NULL;
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* Page already carries a descriptor (see bootstrap note above); reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6891
6892 /*
6893 * Routine: pmap_expand
6894 *
6895 * Expands a pmap to be able to map the specified virtual address.
6896 *
6897 * Allocates new memory for the default (COARSE) translation table
6898 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6899 * also allocates space for the corresponding pv entries.
6900 *
6901 * Nothing should be locked.
6902 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t pa;
	unsigned int ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t *tte_p;
	tt_entry_t *tt_p;

	pa = 0x0ULL;
	tt_p = (tt_entry_t *)NULL;

	/* Walk from the root level down, filling in each missing table level. */
	for (; ttlevel < level; ttlevel++) {
		/* Lock acquisitions here may abort when preemption is pending. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock while allocating; another thread may race us. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock; install our table only if still missing. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If another thread installed this level first, free our unused page. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6974
6975 /*
6976 * Routine: pmap_gc
6977 * Function:
6978 * Pmap garbage collection
6979 * Called by the pageout daemon when pages are scarce.
6980 *
6981 */
void
pmap_gc(void)
{
	/*
	 * Deliberately a no-op.  As far as we can tell this has never been
	 * implemented to do anything meaningful: we can't simply destroy an
	 * arbitrary pmap, since it may be active on a CPU or contain wired
	 * mappings.  Now that pmap_page_reclaim() is non-fatal when no
	 * eligible page is found, invoking it here may eventually make sense.
	 */
}
6993
6994 /*
6995 * By default, don't attempt pmap GC more frequently
6996 * than once / 1 minutes.
6997 */
6998
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: pmap GC throttling is not implemented here. */
}
7004
7005 /*
7006 * pmap_attribute_cache_sync(vm_offset_t pa)
7007 *
7008 * Invalidates all of the instruction cache on a physical page and
7009 * pushes any dirty data from the data cache for the same physical page
7010 */
7011
7012 kern_return_t
7013 pmap_attribute_cache_sync(
7014 ppnum_t pp,
7015 vm_size_t size,
7016 __unused vm_machine_attribute_t attribute,
7017 __unused vm_machine_attribute_val_t * value)
7018 {
7019 if (size > PAGE_SIZE) {
7020 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7021 } else {
7022 cache_sync_page(pp);
7023 }
7024
7025 return KERN_SUCCESS;
7026 }
7027
7028 /*
7029 * pmap_sync_page_data_phys(ppnum_t pp)
7030 *
7031 * Invalidates all of the instruction cache on a physical page and
7032 * pushes any dirty data from the data cache for the same physical page
7033 */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Delegate the cache maintenance described above to cache_sync_page(). */
	cache_sync_page(pp);
}
7040
7041 /*
7042 * pmap_sync_page_attributes_phys(ppnum_t pp)
7043 *
7044 * Write back and invalidate all cachelines on a physical page.
7045 */
7046 void
7047 pmap_sync_page_attributes_phys(
7048 ppnum_t pp)
7049 {
7050 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7051 }
7052
#if CONFIG_COREDUMP
/* temporary workaround */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	/* No PTE at all means there is nothing safe to dump. */
	pt_entry_t * const pte_p = pmap_pte(map->pmap, va);
	if (pte_p == NULL) {
		return FALSE;
	}

	/* Device-pager-backed ranges are excluded. */
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}

	/* Only mappings with the default cacheability attribute are dumpable. */
	const pt_entry_t entry = *pte_p;
	return (entry & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
}
#endif
7074
7075 void
7076 fillPage(
7077 ppnum_t pn,
7078 unsigned int fill)
7079 {
7080 unsigned int *addr;
7081 int count;
7082
7083 addr = (unsigned int *) phystokv(ptoa(pn));
7084 count = PAGE_SIZE / sizeof(unsigned int);
7085 while (count--) {
7086 *addr++ = fill;
7087 }
7088 }
7089
/* Forward declaration for the exported symbol defined below. */
extern void mapping_set_mod(ppnum_t pn);

void
mapping_set_mod(
	ppnum_t pn)
{
	/* Record the software-cached "modified" attribute for the page. */
	pmap_set_modify(pn);
}
7098
/* Forward declaration for the exported symbol defined below. */
extern void mapping_set_ref(ppnum_t pn);

void
mapping_set_ref(
	ppnum_t pn)
{
	/* Record the software-cached "referenced" attribute for the page. */
	pmap_set_reference(pn);
}
7107
7108 /*
7109 * Clear specified attribute bits.
7110 *
7111 * Try to force an arm_fast_fault() for all mappings of
7112 * the page - to force attributes to be set again at fault time.
7113 * If the forcing succeeds, clear the cached bits at the head.
7114 * Otherwise, something must have been wired, so leave the cached
7115 * attributes alone.
7116 */
/*
 * Core implementation for clearing cached page attribute bits; see the block
 * comment above.  When 'flush_range' is non-NULL, TLB invalidation is
 * accumulated into it instead of being issued per page.
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t pa = ptoa(pn);
	vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-supplied flush mechanism overrides any request to skip flushing. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Map the attribute bits being cleared to the accesses that must re-fault. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return. On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping was successfully demoted. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7190
/* Single-page attribute clear: no TLB flush-range coalescing. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7200
7201 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped in [start, end), which
 * must lie within a single twig-level (leaf table) region of 'pmap'.
 *
 * Returns the address after the last PTE processed; a value less than 'end'
 * means the walk stopped early because preemption was pending.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No leaf table for this region: nothing to clear. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield to pending preemption, but always process at least one page. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7245
/*
 * Clear attribute bits over [start, end) in twig-sized chunks, coalescing TLB
 * invalidation into one ranged flush issued after the walk.
 *
 * Returns the address at which the walk stopped (on preemption or a failed
 * preemptible lock attempt); the caller re-invokes until it reaches 'end'.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to the overall end. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue one ranged TLB flush covering everything the walk modified. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7298
/*
 * Driver for ranged attribute clearing: repeatedly calls the (PPL or
 * in-kernel) range helper until the entire [start, end) span is processed,
 * since the helper may return early on pending preemption.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7329 #endif /* __ARM_RANGE_TLBI__ */
7330
/* Single-page attribute clear, dispatched to the PPL when present. */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint? It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7352
7353 /*
7354 * Set specified attribute bits.
7355 *
7356 * Set cached value in the pv head because we have
7357 * no per-mapping hardware support for referenced and
7358 * modify bits.
7359 */
7360 MARK_AS_PMAP_TEXT void
7361 phys_attribute_set_internal(
7362 ppnum_t pn,
7363 unsigned int bits)
7364 {
7365 pmap_paddr_t pa = ptoa(pn);
7366 assert(pn != vm_page_fictitious_addr);
7367
7368 #if XNU_MONITOR
7369 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7370 panic("%s: illegal request, "
7371 "pn=%u, bits=%#x",
7372 __FUNCTION__,
7373 pn, bits);
7374 }
7375 #endif
7376
7377 ppattr_pa_set_bits(pa, (uint16_t)bits);
7378
7379 return;
7380 }
7381
/* Set cached attribute bits, routed through the PPL entry point when present. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7393
7394
7395 /*
7396 * Check specified attribute bits.
7397 *
7398 * use the software cached bits (since no hw support).
7399 */
7400 static boolean_t
7401 phys_attribute_test(
7402 ppnum_t pn,
7403 unsigned int bits)
7404 {
7405 pmap_paddr_t pa = ptoa(pn);
7406 assert(pn != vm_page_fictitious_addr);
7407 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7408 }
7409
7410
7411 /*
7412 * Set the modify/reference bits on the specified physical page.
7413 */
void
pmap_set_modify(ppnum_t pn)
{
	/* Record the software-cached "modified" attribute for the page. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7419
7420
7421 /*
7422 * Clear the modify bits on the specified physical page.
7423 */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clear the cached "modified" attribute; no options, no per-call argument. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7430
7431
7432 /*
7433 * pmap_is_modified:
7434 *
7435 * Return whether or not the specified physical page is modified
7436 * by any physical maps.
7437 */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Read the software-cached "modified" attribute. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7444
7445
7446 /*
7447 * Set the reference bit on the specified physical page.
7448 */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Record the software-cached "referenced" attribute for the page. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7455
7456 /*
7457 * Clear the reference bits on the specified physical page.
7458 */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clear the cached "referenced" attribute; no options, no per-call argument. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7465
7466
7467 /*
7468 * pmap_is_referenced:
7469 *
7470 * Return whether or not the specified physical page is referenced
7471 * by any physical maps.
7472 */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Read the software-cached "referenced" attribute. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7479
7480 /*
7481 * pmap_get_refmod(phys)
7482 * returns the referenced and modified bits of the specified
7483 * physical page.
7484 */
7485 unsigned int
7486 pmap_get_refmod(
7487 ppnum_t pn)
7488 {
7489 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7490 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7491 }
7492
7493 static inline unsigned int
7494 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7495 {
7496 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7497 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7498 }
7499
7500 /*
7501 * pmap_clear_refmod(phys, mask)
7502 * clears the referenced and modified bits as specified by the mask
7503 * of the specified physical page.
7504 */
7505 void
7506 pmap_clear_refmod_options(
7507 ppnum_t pn,
7508 unsigned int mask,
7509 unsigned int options,
7510 void *arg)
7511 {
7512 unsigned int bits;
7513
7514 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7515 phys_attribute_clear(pn, bits, options, arg);
7516 }
7517
7518 /*
7519 * Perform pmap_clear_refmod_options on a virtual address range.
7520 * The operation will be performed in bulk & tlb flushes will be coalesced
7521 * if possible.
7522 *
7523 * Returns true if the operation is supported on this platform.
7524 * If this function returns false, the operation is not supported and
7525 * nothing has been modified in the pmap.
7526 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int bits;
	/* Translate VM_MEM_* mask bits into PP_ATTR_* bits and clear in bulk. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7554
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper: no options, no per-call argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7562
7563 unsigned int
7564 pmap_disconnect_options(
7565 ppnum_t pn,
7566 unsigned int options,
7567 void *arg)
7568 {
7569 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7570 /*
7571 * On ARM, the "modified" bit is managed by software, so
7572 * we know up-front if the physical page is "modified",
7573 * without having to scan all the PTEs pointing to it.
7574 * The caller should have made the VM page "busy" so noone
7575 * should be able to establish any new mapping and "modify"
7576 * the page behind us.
7577 */
7578 if (pmap_is_modified(pn)) {
7579 /*
7580 * The page has been modified and will be sent to
7581 * the VM compressor.
7582 */
7583 options |= PMAP_OPTIONS_COMPRESSOR;
7584 } else {
7585 /*
7586 * The page hasn't been modified and will be freed
7587 * instead of compressed.
7588 */
7589 }
7590 }
7591
7592 /* disconnect the page */
7593 pmap_page_protect_options(pn, 0, options, arg);
7594
7595 /* return ref/chg status */
7596 return pmap_get_refmod(pn);
7597 }
7598
7599 /*
7600 * Routine:
7601 * pmap_disconnect
7602 *
7603 * Function:
7604 * Disconnect all mappings for this page and return reference and change status
7605 * in generic format.
7606 *
7607 */
7608 unsigned int
7609 pmap_disconnect(
7610 ppnum_t pn)
7611 {
7612 pmap_page_protect(pn, 0); /* disconnect the page */
7613 return pmap_get_refmod(pn); /* return ref/chg status */
7614 }
7615
7616 boolean_t
7617 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7618 {
7619 if (ptoa(first) >= vm_last_phys) {
7620 return FALSE;
7621 }
7622 if (ptoa(last) < vm_first_phys) {
7623 return FALSE;
7624 }
7625
7626 return TRUE;
7627 }
7628
7629 /*
7630 * The state maintained by the noencrypt functions is used as a
7631 * debugging aid on ARM. This incurs some overhead on the part
7632 * of the caller. A special case check in phys_attribute_clear
7633 * (the most expensive path) currently minimizes this overhead,
7634 * but stubbing these functions out on RELEASE kernels yields
7635 * further wins.
7636 */
7637 boolean_t
7638 pmap_is_noencrypt(
7639 ppnum_t pn)
7640 {
7641 #if DEVELOPMENT || DEBUG
7642 boolean_t result = FALSE;
7643
7644 if (!pa_valid(ptoa(pn))) {
7645 return FALSE;
7646 }
7647
7648 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7649
7650 return result;
7651 #else
7652 #pragma unused(pn)
7653 return FALSE;
7654 #endif
7655 }
7656
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* The attribute is only tracked for kernel-managed (pa_valid) pages. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
	/* Stubbed out on RELEASE kernels; see the block comment above. */
#pragma unused(pn)
#endif
}
7671
7672 void
7673 pmap_clear_noencrypt(
7674 ppnum_t pn)
7675 {
7676 #if DEVELOPMENT || DEBUG
7677 if (!pa_valid(ptoa(pn))) {
7678 return;
7679 }
7680
7681 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7682 #else
7683 #pragma unused(pn)
7684 #endif
7685 }
7686
7687 #if XNU_MONITOR
7688 boolean_t
7689 pmap_is_monitor(ppnum_t pn)
7690 {
7691 assert(pa_valid(ptoa(pn)));
7692 return phys_attribute_test(pn, PP_ATTR_MONITOR);
7693 }
7694 #endif
7695
/*
 * Acquire the per-page lock for physical page [pn].
 *
 * For a managed page (non-PPL builds) this takes the page's PV-head lock;
 * otherwise it falls back to the single global phys_backup_lock.  Note the
 * deliberate dangling "else" that binds to the fallback block below the
 * preprocessor conditional.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	/* Managed pages are serialized via their PV-head lock. */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback for non-managed pages (and all pages on PPL builds). */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7712
7713
/*
 * Release the per-page lock taken by pmap_lock_phys_page() for [pn].
 * Mirrors its structure: PV-head lock for managed pages, global
 * phys_backup_lock otherwise (see the dangling "else" note there).
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	/* Managed pages are serialized via their PV-head lock. */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback for non-managed pages (and all pages on PPL builds). */
	{ simple_unlock(&phys_backup_lock);}
}
7730
/*
 * Program the user translation-table base for the current CPU to [pmap].
 *
 * For a user pmap this caches the nested (shared-region) pmap state in the
 * per-CPU data, optionally reprograms TCR when page sizes differ
 * (__ARM_MIXED_PAGE_SIZE__), and finally writes TTBR0 with the pmap's table
 * base and hardware ASID.  Switching to kernel_pmap instead clears the user
 * TTB (if not already clear), since the kernel has no user half.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested-pmap info used by fault/translation paths on this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR if the incoming pmap uses a different translation config. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the new table base tagged with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7759
/*
 * Point the user TTB at the invalid (empty) translation table so that
 * no user address space is mapped on this CPU.
 */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7765
/*
 * External entry point to clear the user TTB on the current CPU.
 * On PPL builds the operation is forwarded into the PPL; tracepoints
 * bracket the call for debugging.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7777
7778
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* The marker must not collide with the other SW-managed PTE bits it may coexist with. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7792
7793
/*
 * Core of arm_force_fast_fault(): walk every mapping of [ppnum] and
 * downgrade PTE permissions so accesses not included in [allow_mode]
 * take a fault, letting SW ref/mod tracking observe them.
 *
 * options:     PMAP_OPTIONS_* flags (reusable accounting transitions,
 *              FF_LOCKED to skip PVH locking, FF_WIRED, NOFLUSH).
 * flush_range: if non-NULL, TLB invalidation for mappings inside the
 *              range is delegated to the caller (ptfr_flush_needed set).
 *
 * Returns FALSE for non-managed pages or when a wired mapping could not
 * be updated; TRUE otherwise.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	/* FF_LOCKED means the caller already holds the PVH lock. */
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings carry no CPU permissions to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left untouched unless FF_WIRED was passed. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Mappings outside the caller's flush range need our own TLBI in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 require invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8107
8108 MARK_AS_PMAP_TEXT boolean_t
8109 arm_force_fast_fault_internal(
8110 ppnum_t ppnum,
8111 vm_prot_t allow_mode,
8112 int options)
8113 {
8114 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8115 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8116 }
8117 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8118 }
8119
8120 /*
8121 * Routine: arm_force_fast_fault
8122 *
8123 * Function:
8124 * Force all mappings for this page to fault according
8125 * to the access modes allowed, so we can gather ref/modify
8126 * bits again.
8127 */
8128
8129 boolean_t
8130 arm_force_fast_fault(
8131 ppnum_t ppnum,
8132 vm_prot_t allow_mode,
8133 int options,
8134 __unused void *arg)
8135 {
8136 pmap_paddr_t phys = ptoa(ppnum);
8137
8138 assert(ppnum != vm_page_fictitious_addr);
8139
8140 if (!pa_valid(phys)) {
8141 return FALSE; /* Not a managed page. */
8142 }
8143
8144 #if XNU_MONITOR
8145 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8146 #else
8147 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8148 #endif
8149 }
8150
8151 /*
8152 * Routine: arm_clear_fast_fault
8153 *
8154 * Function:
8155 * Clear pending force fault for all mappings for this page based on
8156 * the observed fault type, update ref/modify bits.
8157 */
/*
 * Walk the mappings of [ppnum] (or just [pte_p] if supplied) and restore
 * permissions/AF that were previously downgraded for SW ref/mod tracking,
 * setting PP_ATTR_REFERENCED/MODIFIED as appropriate for [fault_type].
 * Caller must hold the page's PVH lock.  Returns TRUE if any PTE changed.
 * NOTE: the walk bails out after PMAP_MAX_PV_LIST_CHUNK_SIZE PVEs, so a
 * single call may not fix every mapping (see the targeted-fixup retry in
 * arm_fast_fault_internal).
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;           /* PVEs visited, for chunked early exit */
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* With no explicit PTE, walk the page's whole PV list. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not subject to CPU fast-fault tracking. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore write permission and record the page as modified. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Restore the access flag and record the page as referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the time spent under the PVH lock on huge PV lists. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 require invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must stop at the same chunk boundary as pass 1. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8359
8360 /*
8361 * Determine if the fault was induced by software tracking of
8362 * modify/reference bits. If so, re-enable the mapping (and set
8363 * the appropriate bits).
8364 *
8365 * Returns KERN_SUCCESS if the fault was induced and was
8366 * successfully handled.
8367 *
8368 * Returns KERN_FAILURE if the fault was not induced and
8369 * the function was unable to deal with it.
8370 *
8371 * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8372 * disallows this type of access.
8373 *
8374 * Returns KERN_ABORTED if the pmap lock is taken and a
8375 * preemption is pending.
8376 *
8377 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_TYPE_FAULT;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Bail out (caller redrives) rather than stalling a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Snapshot the PTE, take the PVH lock for its page, and retry
		 * until the snapshot is still current under the lock.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Accesses to PPL-owned I/O memory are explicit protection failures. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8517
/*
 * External entry point for software fast-fault handling (see the block
 * comment above arm_fast_fault_internal for return-value semantics).
 * Rejects VAs outside the pmap's range, then retries the internal handler
 * whenever it aborts to service a pending preemption.
 */
kern_return_t
arm_fast_fault(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;

	if (va < pmap->min || va >= pmap->max) {
		return result;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
	    from_user);

	/* KERN_ABORTED means the handler dropped its locks for preemption; redrive. */
	do {
#if XNU_MONITOR
		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
	} while (result == KERN_ABORTED);

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

	return result;
}
8548
8549 void
8550 pmap_copy_page(
8551 ppnum_t psrc,
8552 ppnum_t pdst)
8553 {
8554 bcopy_phys((addr64_t) (ptoa(psrc)),
8555 (addr64_t) (ptoa(pdst)),
8556 PAGE_SIZE);
8557 }
8558
8559
8560 /*
8561 * pmap_copy_page copies the specified (machine independent) pages.
8562 */
8563 void
8564 pmap_copy_part_page(
8565 ppnum_t psrc,
8566 vm_offset_t src_offset,
8567 ppnum_t pdst,
8568 vm_offset_t dst_offset,
8569 vm_size_t len)
8570 {
8571 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8572 (addr64_t) (ptoa(pdst) + dst_offset),
8573 len);
8574 }
8575
8576
8577 /*
8578 * pmap_zero_page zeros the specified (machine independent) page.
8579 */
8580 void
8581 pmap_zero_page(
8582 ppnum_t pn)
8583 {
8584 assert(pn != vm_page_fictitious_addr);
8585 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8586 }
8587
8588 /*
8589 * pmap_zero_part_page
8590 * zeros the specified (machine independent) part of a page.
8591 */
8592 void
8593 pmap_zero_part_page(
8594 ppnum_t pn,
8595 vm_offset_t offset,
8596 vm_size_t len)
8597 {
8598 assert(pn != vm_page_fictitious_addr);
8599 assert(offset + len <= PAGE_SIZE);
8600 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8601 }
8602
/*
 * Establish the LOWGLOBAL_ALIAS mapping of the lowGlo page: a fixed,
 * kernel-RO (AP_RONA), non-executable, writeback-cached alias used for
 * the low-globals structure.  The target PTE must be currently empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Valid, read-only, never-executable mapping of the lowGlo page. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Make the PTE visible, then invalidate any stale TLB entry for the alias. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8627
8628 vm_offset_t
8629 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8630 {
8631 if (__improbable(index >= CPUWINDOWS_MAX)) {
8632 panic("%s: invalid index %u", __func__, index);
8633 }
8634 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8635 }
8636
/*
 * Map physical page [pn] into a free per-CPU copy window on the current CPU
 * with the given protection and WIMG cache attributes, and return the window
 * index.  Panics if all CPUWINDOWS_MAX windows are in use.  On PPL builds,
 * managed pages and writable mappings of PPL-owned I/O are rejected.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	/* Non-managed (I/O) pages may require stronger TLB synchronization. */
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find an unused window: its PTE is ARM_PTE_TYPE_FAULT when free. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a valid, non-executable kernel mapping of the page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Note: the flush uses the PREVIOUS mapping's strong-sync requirement. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8711
8712 unsigned int
8713 pmap_map_cpu_windows_copy(
8714 ppnum_t pn,
8715 vm_prot_t prot,
8716 unsigned int wimg_bits)
8717 {
8718 #if XNU_MONITOR
8719 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8720 #else
8721 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8722 #endif
8723 }
8724
8725 MARK_AS_PMAP_TEXT void
8726 pmap_unmap_cpu_windows_copy_internal(
8727 unsigned int index)
8728 {
8729 pt_entry_t *ptep;
8730 unsigned int cpu_num;
8731 vm_offset_t cpu_copywindow_vaddr = 0;
8732 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8733
8734 cpu_num = pmap_cpu_data->cpu_number;
8735
8736 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8737 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8738 * (which are likely to have been on I/O memory) are complete before
8739 * tearing down the mapping. */
8740 __builtin_arm_dsb(DSB_SY);
8741 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8742 write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8743 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8744 }
8745
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
	/* Route through the PPL entry point when the monitor is present. */
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8756
8757 #if XNU_MONITOR
8758
8759 MARK_AS_PMAP_TEXT void
8760 pmap_invoke_with_page(
8761 ppnum_t page_number,
8762 void *ctx,
8763 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8764 {
8765 #pragma unused(page_number, ctx, callback)
8766 }
8767
8768 /*
8769 * Loop over every pmap_io_range (I/O ranges marked as owned by
8770 * the PPL in the device tree) and conditionally call callback() on each range
8771 * that needs to be included in the hibernation image.
8772 *
8773 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8774 * context is needed in the callback.
8775 * @param callback Callback function invoked on each range (gated by flag).
8776 */
8777 MARK_AS_PMAP_TEXT void
8778 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8779 {
8780 extern const pmap_io_range_t* io_attr_table;
8781 extern const unsigned int num_io_rgns;
8782 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8783 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8784 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8785 }
8786 }
8787 }
8788
8789 /**
8790 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8791 * PPL-owned page. Otherwise, do nothing.
8792 *
8793 * @param addr Physical address of the page to set the HASHED flag on.
8794 */
8795 MARK_AS_PMAP_TEXT void
8796 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8797 {
8798 /* Ignore non-managed kernel memory. */
8799 if (!pa_valid(addr)) {
8800 return;
8801 }
8802
8803 const unsigned int pai = pa_index(addr);
8804 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8805 pv_entry_t **pv_h = pai_to_pvh(pai);
8806
8807 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8808 pvh_lock(pai);
8809 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8810 pvh_unlock(pai);
8811 }
8812 }
8813
8814 /**
8815 * Loop through every physical page in the system and clear out the HASHED flag
8816 * on every PPL-owned page. That flag is used to keep track of which pages have
8817 * been hashed into the hibernation image during the hibernation entry process.
8818 *
8819 * The HASHED flag needs to be cleared out between hibernation cycles because the
8820 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8821 * image with the HASHED flag set on certain pages. It's important to clear the
8822 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8823 * into the hibernation image can't be compromised across hibernation cycles.
8824 */
8825 MARK_AS_PMAP_TEXT void
8826 pmap_clear_ppl_hashed_flag_all(void)
8827 {
8828 const unsigned int last_index = pa_index(vm_last_phys);
8829 pv_entry_t **pv_h = NULL;
8830
8831 for (int pai = 0; pai < last_index; ++pai) {
8832 pv_h = pai_to_pvh(pai);
8833
8834 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8835 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8836 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8837 pvh_lock(pai);
8838 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8839 pvh_unlock(pai);
8840 }
8841 }
8842 }
8843
8844 /**
8845 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8846 * ppl_hib driver will call this after all wired pages have been copied into the
8847 * hibernation image.
8848 */
8849 MARK_AS_PMAP_TEXT void
8850 pmap_check_ppl_hashed_flag_all(void)
8851 {
8852 const unsigned int last_index = pa_index(vm_last_phys);
8853 pv_entry_t **pv_h = NULL;
8854
8855 for (int pai = 0; pai < last_index; ++pai) {
8856 pv_h = pai_to_pvh(pai);
8857
8858 /**
8859 * The PMAP stacks are explicitly not saved into the image so skip checking
8860 * the pages that contain the PMAP stacks.
8861 */
8862 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8863 (pai < pa_index(pmap_stacks_end_pa));
8864
8865 if (!is_pmap_stack &&
8866 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8867 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8868 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8869 }
8870 }
8871 }
8872
8873 #endif /* XNU_MONITOR */
8874
/*
 * Indicate that a pmap is intended to be used as a nested pmap
 * within one or more larger address spaces. This must be set
 * before pmap_nest() is called with this pmap as the 'subordinate'.
 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/*
	 * Atomically flip the type from USER to NESTED. The CAS ensures the
	 * transition happens exactly once, and only from the USER state; any
	 * other starting type (or a concurrent second caller) panics.
	 */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested | set active
	 * store-load barrier| store-load barrier
	 * assert !active | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			/* CPU slot not populated; nothing to check. */
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/*
	 * NOTE(review): free_id presumably releases this pmap's per-address-space
	 * hardware ID now that it will be nested — confirm against the pt_ops
	 * definition.
	 */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8926
8927 void
8928 pmap_set_nested(
8929 pmap_t pmap)
8930 {
8931 #if XNU_MONITOR
8932 pmap_set_nested_ppl(pmap);
8933 #else
8934 pmap_set_nested_internal(pmap);
8935 #endif
8936 }
8937
8938 bool
8939 pmap_is_nested(
8940 pmap_t pmap)
8941 {
8942 return pmap->type == PMAP_TYPE_NESTED;
8943 }
8944
/*
 * pmap_trim_range(pmap, start, end)
 *
 * pmap = pmap to operate on
 * start = start of the range
 * end = end of the range
 *
 * Attempts to deallocate TTEs for the given range in the nested range.
 * The range is first contracted inward to leaf-table boundaries, so only
 * twig entries whose tables are fully covered by [start, end) are touched.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* Trimming is only legal within this pmap's established nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/*
	 * Iterate over the range, trying to remove TTEs. The (cur >= adjusted_start)
	 * clause guards against wrap-around of cur at the top of the address space.
	 */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to remove at this twig; release the lock ourselves. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table; the L1 entry may only go if every L2 entry is invalid. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9069
9070 /**
9071 * State machine for multi-step pmap trimming. Trimming is the action of
9072 * deallocating the TTEs of the shared region of pmaps down to a given range.
9073 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9074 * disabling preemption for too long. These steps include computing the bounds
9075 * of the shared region, trimming the head of the "grand", trimming the tail of
9076 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9077 * different conditions.
9078 *
9079 * @param grand the pmap in which the pages are nested
9080 * @param subord the pmap from which the pages are shared, or nested
9081 * @param vstart start of the used range in "grand"
9082 * @param size size of the used range
9083 * @param state the current state of the state machine
9084 *
9085 * @return the next state of the state machine, to be used in the next call
9086 * into this function.
9087 */
9088 MARK_AS_PMAP_TEXT pmap_trim_state_t
9089 pmap_trim_internal(
9090 pmap_t grand,
9091 pmap_t subord,
9092 addr64_t vstart,
9093 uint64_t size,
9094 pmap_trim_state_t state)
9095 {
9096 /* Validation needs to be done regardless of state. */
9097 addr64_t vend;
9098
9099 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9100 panic("%s: grand addr wraps around, "
9101 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9102 __func__, grand, subord, (void*)vstart, size, state);
9103 }
9104
9105 validate_pmap_mutable(grand);
9106 validate_pmap(subord);
9107
9108 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9109 panic("%s: subord is of non-nestable type 0x%hhx, "
9110 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9111 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9112 }
9113
9114 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9115 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9116 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9117 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9118 }
9119
9120 if (__improbable(grand->nested_pmap != subord)) {
9121 panic("%s: grand->nested != subord, "
9122 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9123 __func__, grand, subord, (void*)vstart, size, state);
9124 }
9125
9126 if (__improbable((size != 0) &&
9127 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9128 panic("%s: grand range not in nested region, "
9129 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9130 __func__, grand, subord, (void*)vstart, size, state);
9131 }
9132
9133 /* Trimming starts with figuring out the bounds for the grand. */
9134 if (state == PMAP_TRIM_STATE_START) {
9135 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9136
9137 /**
9138 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9139 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9140 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9141 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9142 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9143 * PMAP_TRIM_STATE_DONE.
9144 */
9145 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9146 assert(subord->nested_bounds_set);
9147
9148 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9149 if (!grand->nested_bounds_set) {
9150 /* Inherit the bounds from subord. */
9151 grand->nested_region_true_start = subord->nested_region_true_start;
9152 grand->nested_region_true_end = subord->nested_region_true_end;
9153 grand->nested_bounds_set = true;
9154 }
9155
9156 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9157
9158 /* Now that the grand has bounds, we are done. */
9159 return PMAP_TRIM_STATE_DONE;
9160 }
9161
9162 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9163 if ((!subord->nested_bounds_set) && size) {
9164 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9165 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9166
9167 subord->nested_region_true_start = vstart;
9168 subord->nested_region_true_end = vend;
9169 subord->nested_region_true_start &= ~adjust_offmask;
9170
9171 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9172 panic("%s: padded true end wraps around, "
9173 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9174 __func__, grand, subord, (void*)vstart, size, state);
9175 }
9176
9177 subord->nested_region_true_end &= ~adjust_offmask;
9178 subord->nested_bounds_set = true;
9179 }
9180
9181 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9182 if (subord->nested_bounds_set) {
9183 /* Inherit the bounds from subord. */
9184 grand->nested_region_true_start = subord->nested_region_true_start;
9185 grand->nested_region_true_end = subord->nested_region_true_end;
9186 grand->nested_bounds_set = true;
9187
9188 /* If we know the bounds, we can trim the pmap. */
9189 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9190
9191 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9192 } else {
9193 /* Don't trim if we don't know the bounds. */
9194 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9195
9196 return PMAP_TRIM_STATE_DONE;
9197 }
9198 }
9199
9200 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9201 if (!grand->nested_bounds_set) {
9202 panic("%s: !grand->nested_bounds_set, "
9203 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9204 __func__, grand, subord, (void*)vstart, size, state);
9205 }
9206
9207 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9208 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9209 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9210 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9211 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9212 (unsigned int)grand->nested_no_bounds_ref_state);
9213 }
9214
9215 #if XNU_MONITOR
9216 if (pmap_pending_preemption()) {
9217 return PMAP_TRIM_STATE_GRAND_AFTER;
9218 }
9219 #endif
9220
9221 state = PMAP_TRIM_STATE_GRAND_AFTER;
9222 }
9223
9224 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9225 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9226 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9227 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9228 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9229 (unsigned int)grand->nested_no_bounds_ref_state);
9230 }
9231
9232 #if XNU_MONITOR
9233 if (pmap_pending_preemption()) {
9234 return PMAP_TRIM_STATE_SUBORD;
9235 }
9236 #endif
9237
9238 state = PMAP_TRIM_STATE_SUBORD;
9239 }
9240
9241 /* START state is guaranteed to compute the bounds for the subord. */
9242 if (!subord->nested_bounds_set) {
9243 panic("%s: !subord->nested_bounds_set, "
9244 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9245 __func__, grand, subord, (void*)vstart, size, state);
9246 }
9247
9248 if (state == PMAP_TRIM_STATE_SUBORD) {
9249 /**
9250 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9251 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9252 * called once grand's nested tables have been fully trimmed, and can only be called once
9253 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9254 * the state update is visible only once the preceding trim operation is complete. An
9255 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9256 * but locking within pmap_trim_range() should make that harmless (and all but one will
9257 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9258 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9259 * of the state CAS.
9260 */
9261 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9262 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9263 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9264 (unsigned int)grand->nested_no_bounds_ref_state);
9265 }
9266 pmap_trim_subord(subord);
9267 }
9268
9269 return PMAP_TRIM_STATE_DONE;
9270 }
9271
/*
 * Drop this pmap's no-bounds reference on its nested pmap (if any), trim this
 * pmap's copy of the nested region down to the known true bounds, and then
 * give the nested pmap a chance to trim itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's bounds while holding its shared lock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the head and tail of the nested region outside the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9295
9296 /*
9297 * pmap_trim_subord(grand, subord)
9298 *
9299 * grand = pmap that we have nested subord in
9300 * subord = nested pmap we are attempting to trim
9301 *
9302 * Trims subord if possible
9303 */
9304 MARK_AS_PMAP_TEXT static void
9305 pmap_trim_subord(pmap_t subord)
9306 {
9307 bool contract_subord = false;
9308
9309 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9310
9311 subord->nested_no_bounds_refcnt--;
9312
9313 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9314 /* If this was the last no bounds reference, trim subord. */
9315 contract_subord = true;
9316 }
9317
9318 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9319
9320 if (contract_subord) {
9321 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9322 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9323 }
9324 }
9325
9326 /**
9327 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9328 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9329 * disabling preemption for too long.
9330 *
9331 * @note When we load the shared region we always create pages tables for the
9332 * entire region. In practice, the shared cache may use just a portion
9333 * of that. Before we know the bounds of the shared region, it can
9334 * already be mapped into processes. Therefore, once the bounds are
9335 * known, "trimming" comes in handy to remove the unnecessary page
9336 * tables in the processes the shared region is mapped in, and eventually
9337 * those in the shared region itself. Note that the shared region must
9338 * be trimmed after the user processes because it has the L3 entries
9339 * everyone else is pointing to.
9340 *
9341 * @param grand the pmap in which the pages are nested
9342 * @param subord the pmap from which the pages are shared, or nested
9343 * @param vstart start of the used range in "grand"
9344 * @param size size of the used range
9345 */
9346 void
9347 pmap_trim(
9348 pmap_t grand,
9349 pmap_t subord,
9350 addr64_t vstart,
9351 uint64_t size)
9352 {
9353 pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9354
9355 #if XNU_MONITOR
9356 /* On PPL systems, drives the state machine until its done. */
9357 while (state != PMAP_TRIM_STATE_DONE) {
9358 __assert_only pmap_trim_state_t old_state = state;
9359 state = pmap_trim_ppl(grand, subord, vstart, size, state);
9360
9361 /* Are we making progress? */
9362 assert(old_state != state);
9363 }
9364
9365 pmap_ledger_check_balance(grand);
9366 pmap_ledger_check_balance(subord);
9367 #else
9368 state = pmap_trim_internal(grand, subord, vstart, size, state);
9369
9370 /* On non-PPL systems, we expect the implementation to finish in one call. */
9371 assert(state == PMAP_TRIM_STATE_DONE);
9372 #endif
9373 }
9374
9375 #if HAS_APPLE_PAC
/*
 * Sign a user pointer with a process-independent PAC key (IA or DA only),
 * using the supplied user JOP key for the duration of the signing operation.
 * Returns the signed pointer.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent keys asia/asda are permitted here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * NOTE(review): interrupts are disabled across the user JOP key swap —
	 * presumably so nothing else can run on this CPU with the user key
	 * installed; confirm against ml_enable_user_jop_key() semantics.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Compiler barriers pin the sign operation inside the JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9407
9408 void *
9409 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9410 {
9411 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9412 }
9413
/*
 * Authenticate a user pointer signed with a process-independent PAC key
 * (IA or DA only), using the supplied user JOP key for the duration of the
 * auth operation. Returns the authenticated (stripped) pointer.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent keys asia/asda are permitted here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * NOTE(review): interrupts are disabled across the user JOP key swap —
	 * presumably so nothing else can run on this CPU with the user key
	 * installed; confirm against ml_enable_user_jop_key() semantics.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Compiler barriers pin the auth operation inside the JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9434
9435 void *
9436 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9437 {
9438 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9439 }
9440 #endif /* HAS_APPLE_PAC */
9441
9442 /*
9443 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9444 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9445 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9446 * return value, to indicate where a preempted [un]nest operation should resume.
9447 * When the return value contains the ending address of the nested region with
9448 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9449 */
9450 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9451
9452 /*
9453 * kern_return_t pmap_nest(grand, subord, vstart, size)
9454 *
9455 * grand = the pmap that we will nest subord into
9456 * subord = the pmap that goes into the grand
9457 * vstart = start of range in pmap to be inserted
9458 * size = Size of nest area (up to 16TB)
9459 *
9460 * Inserts a pmap into another. This is used to implement shared segments.
9461 *
9462 */
9463
9464 /**
9465 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9466 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9467 * This function operates in 3 main phases:
9468 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9469 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9470 * the mapping range are present in subord.
9471 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9472 * contains pointers to subord's leaf-level pagetable pages for the specified
9473 * VA range.
9474 *
9475 * This function may return early due to pending AST_URGENT preemption; if so
9476 * it will indicate the need to be re-entered.
9477 *
9478 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9479 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9480 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9481 * @param size twig-aligned size of the nesting range
9482 * @param vrestart the twig-aligned starting address of the current call. May contain
9483 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9484 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9485 * KERN_RESOURCE_SHORTAGE on allocation failure.
9486 *
9487 * @return the virtual address at which to restart the operation, possibly including
9488 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9489 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9490 */
9491 MARK_AS_PMAP_TEXT vm_map_offset_t
9492 pmap_nest_internal(
9493 pmap_t grand,
9494 pmap_t subord,
9495 addr64_t vstart,
9496 uint64_t size,
9497 vm_map_offset_t vrestart,
9498 kern_return_t *krp)
9499 {
9500 kern_return_t kr = KERN_FAILURE;
9501 vm_map_offset_t vaddr;
9502 tt_entry_t *stte_p;
9503 tt_entry_t *gtte_p;
9504 uint64_t nested_region_unnested_table_bitmap_size;
9505 unsigned int* nested_region_unnested_table_bitmap = NULL;
9506 uint64_t new_nested_region_unnested_table_bitmap_size;
9507 unsigned int* new_nested_region_unnested_table_bitmap = NULL;
9508 int expand_options = 0;
9509 bool deref_subord = true;
9510 bool grand_locked = false;
9511
9512 addr64_t vend;
9513 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9514 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9515 }
9516 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9517 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9518 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9519 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9520 }
9521
9522 assert(krp != NULL);
9523 validate_pmap_mutable(grand);
9524 validate_pmap(subord);
9525 #if XNU_MONITOR
9526 /*
9527 * Ordering is important here. validate_pmap() has already ensured subord is a
9528 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9529 * be in the process of being destroyed. If destruction is already committed,
9530 * then the check of ref_count below will cover us. If destruction is initiated
9531 * during or after this call, then pmap_destroy() will catch the non-zero
9532 * nested_count.
9533 */
9534 os_atomic_inc(&subord->nested_count, relaxed);
9535 os_atomic_thread_fence(seq_cst);
9536 #endif
9537 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9538 panic("%s: invalid subordinate pmap %p", __func__, subord);
9539 }
9540
9541 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9542 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9543 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9544 }
9545
9546 #if XNU_MONITOR
9547 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9548 #endif
9549
9550 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9551 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9552 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9553 grand, vstart, size, (unsigned long long)vrestart);
9554 }
9555
9556 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9557 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9558 }
9559
9560 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9561 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9562 }
9563
9564 if (subord->nested_region_unnested_table_bitmap == NULL) {
9565 nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9566
9567 if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9568 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9569 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9570 __func__, nested_region_unnested_table_bitmap_size,
9571 grand, subord, vstart, size);
9572 }
9573
9574 #if XNU_MONITOR
9575 pmap_paddr_t pa = 0;
9576
9577 if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9578 panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9579 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9580 __FUNCTION__, nested_region_unnested_table_bitmap_size,
9581 grand, subord, vstart, size);
9582 }
9583
9584 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9585
9586 if (kr != KERN_SUCCESS) {
9587 goto nest_cleanup;
9588 }
9589
9590 assert(pa);
9591
9592 nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9593 #else
9594 nested_region_unnested_table_bitmap = kalloc_data(
9595 nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9596 Z_WAITOK | Z_ZERO);
9597 #endif
9598
9599 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9600 kr = KERN_ABORTED;
9601 goto nest_cleanup;
9602 }
9603
9604 if (subord->nested_region_unnested_table_bitmap == NULL) {
9605 subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9606 subord->nested_region_addr = vstart;
9607 subord->nested_region_size = (mach_vm_offset_t) size;
9608
9609 /**
9610 * Ensure that the rest of the subord->nested_region_* fields are
9611 * initialized and visible before setting the nested_region_unnested_table_bitmap
9612 * field (which is used as the flag to say that the rest are initialized).
9613 */
9614 __builtin_arm_dmb(DMB_ISHST);
9615 subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
9616 nested_region_unnested_table_bitmap = NULL;
9617 }
9618 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9619 if (nested_region_unnested_table_bitmap != NULL) {
9620 #if XNU_MONITOR
9621 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9622 #else
9623 kfree_data(nested_region_unnested_table_bitmap,
9624 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9625 #endif
9626 nested_region_unnested_table_bitmap = NULL;
9627 }
9628 }
9629
9630 /**
9631 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9632 * speculated before their initialization.
9633 */
9634 __builtin_arm_dmb(DMB_ISHLD);
9635
9636 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9637 uint64_t new_size;
9638
9639 nested_region_unnested_table_bitmap = NULL;
9640 nested_region_unnested_table_bitmap_size = 0ULL;
9641 new_size = vend - subord->nested_region_addr;
9642
9643 new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9644
9645 if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9646 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9647 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9648 __func__, new_nested_region_unnested_table_bitmap_size,
9649 grand, subord, vstart, size);
9650 }
9651
9652 #if XNU_MONITOR
9653 pmap_paddr_t pa = 0;
9654
9655 if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9656 panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9657 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9658 __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
9659 grand, subord, vstart, new_size);
9660 }
9661
9662 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9663
9664 if (kr != KERN_SUCCESS) {
9665 goto nest_cleanup;
9666 }
9667
9668 assert(pa);
9669
9670 new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9671 #else
9672 new_nested_region_unnested_table_bitmap = kalloc_data(
9673 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9674 Z_WAITOK | Z_ZERO);
9675 #endif
9676 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9677 kr = KERN_ABORTED;
9678 goto nest_cleanup;
9679 }
9680
9681 if (subord->nested_region_size < new_size) {
9682 bcopy(subord->nested_region_unnested_table_bitmap,
9683 new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9684 nested_region_unnested_table_bitmap_size = subord->nested_region_unnested_table_bitmap_size;
9685 nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
9686 subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
9687 subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
9688 subord->nested_region_size = new_size;
9689 new_nested_region_unnested_table_bitmap = NULL;
9690 }
9691 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9692 if (nested_region_unnested_table_bitmap != NULL) {
9693 #if XNU_MONITOR
9694 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9695 #else
9696 kfree_data(nested_region_unnested_table_bitmap,
9697 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9698 #endif
9699 nested_region_unnested_table_bitmap = NULL;
9700 }
9701 if (new_nested_region_unnested_table_bitmap != NULL) {
9702 #if XNU_MONITOR
9703 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9704 #else
9705 kfree_data(new_nested_region_unnested_table_bitmap,
9706 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9707 #endif
9708 new_nested_region_unnested_table_bitmap = NULL;
9709 }
9710 }
9711
9712 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9713 kr = KERN_ABORTED;
9714 goto nest_cleanup;
9715 }
9716
9717 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
9718 /**
9719 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9720 * into a nested pmap, which would then produce multiple levels of nesting.
9721 */
9722 if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9723 panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9724 }
9725 /*
9726 * If this is grand's first nesting operation, keep the reference on subord.
9727 * It will be released by pmap_destroy_internal() when grand is destroyed.
9728 */
9729 deref_subord = false;
9730
9731 if (!subord->nested_bounds_set) {
9732 /*
9733 * We are nesting without the shared regions bounds
9734 * being known. We'll have to trim the pmap later.
9735 */
9736 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9737 NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9738 panic("%s: grand %p already nested", __func__, grand);
9739 }
9740 subord->nested_no_bounds_refcnt++;
9741 }
9742
9743 if (__improbable(vstart < subord->nested_region_addr ||
9744 vend > (subord->nested_region_addr + subord->nested_region_size))) {
9745 panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
9746 __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
9747 (void *) (subord->nested_region_addr + subord->nested_region_size));
9748 }
9749
9750 grand->nested_region_addr = vstart;
9751 grand->nested_region_size = (mach_vm_offset_t) size;
9752 } else {
9753 if (__improbable(grand->nested_pmap != subord)) {
9754 panic("pmap_nest() pmap %p has a nested pmap", grand);
9755 } else if (__improbable(grand->nested_region_addr > vstart)) {
9756 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9757 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9758 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9759 }
9760 }
9761
9762 vaddr = vrestart & ~PMAP_NEST_GRAND;
9763 if (vaddr < subord->nested_region_true_start) {
9764 vaddr = subord->nested_region_true_start;
9765 }
9766
9767 addr64_t true_end = vend;
9768 if (true_end > subord->nested_region_true_end) {
9769 true_end = subord->nested_region_true_end;
9770 }
9771 __unused unsigned int ttecount = 0;
9772
9773 if (vrestart & PMAP_NEST_GRAND) {
9774 goto nest_grand;
9775 }
9776
9777 while (vaddr < true_end) {
9778 stte_p = pmap_tte(subord, vaddr);
9779 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9780 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9781 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9782
9783 if (kr != KERN_SUCCESS) {
9784 goto done;
9785 }
9786
9787 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9788 }
9789 vaddr += pt_attr_twig_size(pt_attr);
9790 vrestart = vaddr;
9791 ++ttecount;
9792 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9793 pmap_pending_preemption())) {
9794 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9795 kr = KERN_SUCCESS;
9796 goto done;
9797 }
9798 }
9799 /*
9800 * copy TTEs from subord pmap into grand pmap
9801 */
9802
9803 vaddr = (vm_map_offset_t) vstart;
9804 if (vaddr < subord->nested_region_true_start) {
9805 vaddr = subord->nested_region_true_start;
9806 }
9807 vrestart = vaddr | PMAP_NEST_GRAND;
9808
9809 nest_grand:
9810 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9811
9812 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9813 kr = KERN_ABORTED;
9814 goto done;
9815 }
9816 while (vaddr < true_end) {
9817 gtte_p = pmap_tte(grand, vaddr);
9818 if (gtte_p == PT_ENTRY_NULL) {
9819 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9820 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9821 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9822 if (kr == KERN_SUCCESS) {
9823 kr = KERN_ABORTED;
9824 }
9825 }
9826
9827 if (kr != KERN_SUCCESS) {
9828 goto done;
9829 }
9830
9831 gtte_p = pmap_tt2e(grand, vaddr);
9832 }
9833 /* Don't leak a page table page. Don't violate break-before-make. */
9834 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9835 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9836 __func__, gtte_p, grand);
9837 }
9838 /**
9839 * It's possible that grand was trimmed by pmap_trim_internal() while the
9840 * lock was dropped, in which case the previously stored "true" start/end
9841 * will no longer be accurate. In that case, we need to avoid nesting
9842 * tables outside the trimmed range, as those tables may be immediately freed
9843 * which would lead to a dangling page table pointer in grand.
9844 * Note that pmap_trim() may concurrently update grand's bounds as we are
9845 * making these checks, but in that case pmap_trim_range() has not yet
9846 * been called on grand and will wait for us to drop grand's lock, so it
9847 * should see any TTEs we've nested here and clear them appropriately.
9848 */
9849 if (__probable((vaddr >= grand->nested_region_true_start) &&
9850 (vaddr < grand->nested_region_true_end))) {
9851 stte_p = pmap_tte(subord, vaddr);
9852 if (__improbable(stte_p == PT_ENTRY_NULL)) {
9853 panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
9854 }
9855 *gtte_p = *stte_p;
9856 }
9857
9858 vaddr += pt_attr_twig_size(pt_attr);
9859 vrestart = vaddr | PMAP_NEST_GRAND;
9860 ++ttecount;
9861 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9862 pmap_pending_preemption())) {
9863 break;
9864 }
9865 }
9866 if (vaddr >= true_end) {
9867 vrestart = vend | PMAP_NEST_GRAND;
9868 }
9869
9870 kr = KERN_SUCCESS;
9871 done:
9872
9873 FLUSH_PTE();
9874 __builtin_arm_isb(ISB_SY);
9875
9876 if (grand_locked) {
9877 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9878 }
9879
9880 nest_cleanup:
9881 #if XNU_MONITOR
9882 if (kr != KERN_SUCCESS) {
9883 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9884 *krp = kr;
9885 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9886 }
9887 #else
9888 if (kr != KERN_SUCCESS) {
9889 *krp = kr;
9890 }
9891 #endif
9892 if (nested_region_unnested_table_bitmap != NULL) {
9893 #if XNU_MONITOR
9894 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9895 #else
9896 kfree_data(nested_region_unnested_table_bitmap,
9897 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9898 #endif
9899 }
9900 if (new_nested_region_unnested_table_bitmap != NULL) {
9901 #if XNU_MONITOR
9902 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9903 #else
9904 kfree_data(new_nested_region_unnested_table_bitmap,
9905 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9906 #endif
9907 }
9908 if (deref_subord) {
9909 #if XNU_MONITOR
9910 os_atomic_dec(&subord->nested_count, relaxed);
9911 #endif
9912 pmap_destroy_internal(subord);
9913 }
9914 return vrestart;
9915 }
9916
/**
 * Nest a range of mappings from a "subordinate" (shared-region) pmap into a
 * top-level ("grand") pmap by repeatedly invoking the (possibly preemptible)
 * internal/PPL helper until its restart cursor reports completion.
 *
 * @param grand pmap that will receive the nested mappings
 * @param subord pmap supplying the mappings to be nested
 * @param vstart twig-aligned starting VA of the range to nest
 * @param size twig-aligned size of the range to nest
 *
 * @return KERN_SUCCESS, or the first unrecoverable error reported by the helper.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	/* (vend | PMAP_NEST_GRAND) is the sentinel cursor value meaning "fully done". */
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/*
	 * The PPL flavor may bail out early on allocation failure or lock
	 * contention; retry from the returned cursor, replenishing the PPL page
	 * free list when it reports KERN_RESOURCE_SHORTAGE.
	 */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes the passed-in kr is KERN_SUCCESS, in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore the KERN_ABORTED needs to be manually cleared here,
			 * as is done in the KERN_RESOURCE_SHORTAGE case above.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* Guard against an infinite retry loop if the cursor did not advance. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9975
9976 /*
9977 * kern_return_t pmap_unnest(grand, vaddr)
9978 *
9979 * grand = the pmap that will have the virtual range unnested
9980 * vaddr = start of range in pmap to be unnested
9981 * size = size of range in pmap to be unnested
9982 *
9983 */
9984
9985 kern_return_t
9986 pmap_unnest(
9987 pmap_t grand,
9988 addr64_t vaddr,
9989 uint64_t size)
9990 {
9991 return pmap_unnest_options(grand, vaddr, size, 0);
9992 }
9993
9994 /**
9995 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9996 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9997 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9998 * still have the region nested. The mappings in 'grand' will be left empty
9999 * with the assumption that they will be demand-filled by subsequent access faults.
10000 *
10001 * This function operates in 2 main phases:
10002 * 1. Iteration over the nested pmap's mappings for the specified range to mark
10003 * them non-global.
10004 * 2. Clearing of the twig-level TTEs for the address range in grand.
10005 *
10006 * This function may return early due to pending AST_URGENT preemption; if so
10007 * it will indicate the need to be re-entered.
10008 *
10009 * @param grand pmap from which to unnest mappings
10010 * @param vaddr twig-aligned virtual address for the beginning of the nested range
10011 * @param size twig-aligned size of the nested range
10012 * @param vrestart the page-aligned starting address of the current call. May contain
10013 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10014 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10015 * grand is being torn down and step 1) above is not needed.
10016 *
10017 * @return the virtual address at which to restart the operation, possibly including
10018 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
10019 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10020 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t *tte_p;
	unsigned int current_index;
	unsigned int start_index;
	unsigned int max_index;
	unsigned int entry_count = 0;

	addr64_t vend;
	addr64_t true_end;
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The restart cursor (with the phase flag in bit 0 stripped) must lie within [vaddr, vend). */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* Both base and size must be twig (L2 table granule) aligned. */
	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the end of the operation to the portion of the region actually in use. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/*
	 * Phase 1: mark the nested pmap's mappings in the range non-global.
	 * Skipped when grand is being torn down (PMAP_UNNEST_CLEAN), or when a
	 * previous call already finished this phase (PMAP_NEST_GRAND set in the
	 * restart cursor).
	 */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			/* Lock contention: bail out and let the caller re-enter at the same spot. */
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		/* One outer iteration per twig-level table region covering the range. */
		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t *bpte, *cpte;

			/* First VA beyond the twig region containing 'addr'. */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
			 * address.
			 */
			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    (addr & pt_attr_twig_offmask(pt_attr))) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t pa;
					unsigned int pai = 0;
					boolean_t managed = FALSE;
					pt_entry_t spte;

					if ((*cpte != ARM_PTE_TYPE_FAULT)
					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						/*
						 * Take the PV-head lock for the page so the PTE can be
						 * updated atomically with respect to other mapping
						 * operations.  Re-read the PTE after acquiring the lock
						 * and retry if it changed underneath us.
						 */
						spte = *((volatile pt_entry_t*)cpte);
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								/* Non-managed page: no PV-head entry to lock. */
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						/* Make the mapping non-global; remember that a TLB flush is now needed. */
						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					/* Periodically check for pending urgent preemption and bail out if needed. */
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		/* Only flush if at least one PTE actually lost its global bit. */
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		/* Phase 1 incomplete (preempted): return the cursor so the caller re-enters here. */
		if (current_index < max_index) {
			return vrestart;
		}
	}

	/*
	 * Phase 2: invalidate all pdes for segment at vaddr in pmap grand.
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		/*
		 * NOTE(review): the parenthesization below looks off -- the
		 * `!= 0x0ULL` comparison applies to the result of __improbable()
		 * rather than to the masked address.  Behavior is unchanged (both
		 * forms are non-zero in exactly the same cases), but the branch
		 * hint covers the wrong expression; confirm intent before changing.
		 */
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will have
	 * PMAP_NEST_GRAND set so that this function jumps straight to step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Completed the full (possibly clamped) range: report the completion sentinel. */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10231
10232 kern_return_t
10233 pmap_unnest_options(
10234 pmap_t grand,
10235 addr64_t vaddr,
10236 uint64_t size,
10237 unsigned int option)
10238 {
10239 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10240 vm_map_offset_t vend = vaddr + size;
10241
10242 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10243 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10244
10245 pmap_verify_preemptible();
10246 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10247 #if XNU_MONITOR
10248 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10249 #else
10250 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10251 #endif
10252 }
10253
10254 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10255
10256 return KERN_SUCCESS;
10257 }
10258
10259 boolean_t
10260 pmap_adjust_unnest_parameters(
10261 __unused pmap_t p,
10262 __unused vm_map_offset_t *s,
10263 __unused vm_map_offset_t *e)
10264 {
10265 return TRUE; /* to get to log_unnest_badness()... */
10266 }
10267
10268 #if PMAP_FORK_NEST
10269 /**
10270 * Perform any necessary pre-nesting of the parent's shared region at fork()
10271 * time.
10272 *
10273 * @note This should only be called from vm_map_fork().
10274 *
10275 * @param old_pmap The pmap of the parent task.
10276 * @param new_pmap The pmap of the child task.
10277 * @param nesting_start An output parameter that is updated with the start
10278 * address of the range that was pre-nested
10279 * @param nesting_end An output parameter that is updated with the end
10280 * address of the range that was pre-nested
10281 *
10282 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10283 * KERN_INVALID_ARGUMENT if the arguments were not valid.
10284 */
10285 kern_return_t
10286 pmap_fork_nest(
10287 pmap_t old_pmap,
10288 pmap_t new_pmap,
10289 vm_map_offset_t *nesting_start,
10290 vm_map_offset_t *nesting_end)
10291 {
10292 if (old_pmap == NULL || new_pmap == NULL) {
10293 return KERN_INVALID_ARGUMENT;
10294 }
10295 if (old_pmap->nested_pmap == NULL) {
10296 return KERN_SUCCESS;
10297 }
10298 pmap_nest(new_pmap,
10299 old_pmap->nested_pmap,
10300 old_pmap->nested_region_addr,
10301 old_pmap->nested_region_size);
10302 assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
10303 new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
10304 new_pmap->nested_region_size == old_pmap->nested_region_size,
10305 "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
10306 new_pmap->nested_pmap,
10307 new_pmap->nested_region_addr,
10308 new_pmap->nested_region_size,
10309 old_pmap->nested_pmap,
10310 old_pmap->nested_region_addr,
10311 old_pmap->nested_region_size);
10312 *nesting_start = old_pmap->nested_region_addr;
10313 *nesting_end = *nesting_start + old_pmap->nested_region_size;
10314 return KERN_SUCCESS;
10315 }
10316 #endif /* PMAP_FORK_NEST */
10317
10318 /*
10319 * disable no-execute capability on
10320 * the specified pmap
10321 */
10322 #if DEVELOPMENT || DEBUG
10323 void
10324 pmap_disable_NX(
10325 pmap_t pmap)
10326 {
10327 pmap->nx_enabled = FALSE;
10328 }
10329 #else
10330 void
10331 pmap_disable_NX(
10332 __unused pmap_t pmap)
10333 {
10334 }
10335 #endif
10336
10337 /*
10338 * flush a range of hardware TLB entries.
10339 * NOTE: assumes the smallest TLB entry in use will be for
10340 * an ARM small page (4K).
10341 */
10342
10343 #if __ARM_RANGE_TLBI__
10344 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10345 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10346 #else
10347 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10348 #endif // __ARM_RANGE_TLBI__
10349 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10350 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10351 "of npages to 32 bits below may truncate.");
10352
/**
 * Issue (without waiting for completion) TLB invalidations covering
 * [va, va + length) for the given pmap, picking between full/ASID-wide,
 * range-based, and per-entry invalidation based on the number of pages.
 *
 * @param va starting virtual address of the region to invalidate
 * @param length size in bytes of the region to invalidate
 * @param pmap pmap whose ASID tags the entries being invalidated
 * @param last_level_only if true, restrict invalidation to leaf-level entries
 *        where the variant supports that distinction
 * @param strong whether to use the "strong" flavor of the flush primitives
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Large region: cheaper to drop the whole ASID (or everything) than iterate. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * ASID 0 and nested (shared-region) pmaps can't be targeted by a
		 * single-ASID invalidation -- presumably their entries live under
		 * multiple/global ASIDs -- so fall back to a full TLB flush.
		 * NOTE(review): rationale inferred; confirm against pmap ASID design.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Medium region: use a single range-based TLBI instruction. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small region: invalidate entry by entry. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10405
10406 MARK_AS_PMAP_TEXT static void
10407 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10408 {
10409 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10410 }
10411
10412 void
10413 flush_mmu_tlb_region(
10414 vm_offset_t va,
10415 unsigned length)
10416 {
10417 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
10418 sync_tlb_flush();
10419 }
10420
10421 unsigned int
10422 pmap_cache_attributes(
10423 ppnum_t pn)
10424 {
10425 pmap_paddr_t paddr;
10426 unsigned int pai;
10427 unsigned int result;
10428 pp_attr_t pp_attr_current;
10429
10430 paddr = ptoa(pn);
10431
10432 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10433
10434 if (!pa_valid(paddr)) {
10435 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10436 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10437 }
10438
10439 result = VM_WIMG_DEFAULT;
10440
10441 pai = pa_index(paddr);
10442
10443 pp_attr_current = pp_attr_table[pai];
10444 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10445 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10446 }
10447 return result;
10448 }
10449
/**
 * Perform any cache maintenance required when a page's cacheability
 * attribute changes from 'wimg_bits_prev' to 'wimg_bits_new'.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Sync the page when moving away from a cacheable attribute.
	 * NOTE(review): the final disjunction
	 * ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))
	 * is tautologically true (no value equals both constants), so the WTHRU
	 * arm reduces to (wimg_bits_prev == VM_WIMG_WTHRU).  '&&' may have been
	 * intended; as written the code merely over-syncs, which is safe, so it
	 * is left unchanged here -- confirm intent before altering.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* When transitioning into VM_WIMG_RT, force-clean the page from the data cache. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10466
10467 MARK_AS_PMAP_TEXT __unused void
10468 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10469 {
10470 pmap_paddr_t paddr = ptoa(pn);
10471 const unsigned int pai = pa_index(paddr);
10472
10473 if (__improbable(!pa_valid(paddr))) {
10474 panic("%s called on non-managed page 0x%08x", __func__, pn);
10475 }
10476
10477 pvh_lock(pai);
10478
10479 #if XNU_MONITOR
10480 if (__improbable(ppattr_pa_test_monitor(paddr))) {
10481 panic("%s invoked on PPL page 0x%08x", __func__, pn);
10482 }
10483 #endif
10484
10485 pmap_update_cache_attributes_locked(pn, new_cacheattr, true);
10486
10487 pvh_unlock(pai);
10488
10489 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10490 }
10491
10492 void *
10493 pmap_map_compressor_page(ppnum_t pn)
10494 {
10495 #if __ARM_PTE_PHYSMAP__
10496 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10497 if (cacheattr != VM_WIMG_DEFAULT) {
10498 #if XNU_MONITOR
10499 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10500 #else
10501 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10502 #endif
10503 }
10504 #endif
10505 return (void*)phystokv(ptoa(pn));
10506 }
10507
10508 void
10509 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10510 {
10511 #if __ARM_PTE_PHYSMAP__
10512 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10513 if (cacheattr != VM_WIMG_DEFAULT) {
10514 #if XNU_MONITOR
10515 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10516 #else
10517 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10518 #endif
10519 }
10520 #endif
10521 }
10522
10523 /**
10524 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10525 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10526 *
10527 * @param user_page_list List of pages to be updated.
10528 * @param page_cnt Number of pages in total in user_page_list.
10529 * @param cacheattr The new cache attribute.
10530 *
10531 * @return Success if true is returned.
10532 */
10533 bool
10534 pmap_batch_set_cache_attributes(
10535 upl_page_info_array_t user_page_list,
10536 unsigned int page_cnt,
10537 unsigned int cacheattr)
10538 {
10539 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10540
10541 if (page_cnt == 0) {
10542 return true;
10543 }
10544
10545 batch_set_cache_attr_state_t states;
10546 states.page_index = 0;
10547 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10548 states.tlb_flush_pass_needed = false;
10549 states.rt_cache_flush_pass_needed = false;
10550
10551 /* Verify we are being called from a preemptible context. */
10552 pmap_verify_preemptible();
10553
10554 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10555 #if XNU_MONITOR
10556 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10557 #else /* !XNU_MONITOR */
10558 states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10559 #endif /* XNU_MONITOR */
10560 }
10561
10562 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10563 return true;
10564 }
10565
10566 /**
10567 * Flushes TLB entries associated with the page specified by paddr, but do not
10568 * issue barriers yet.
10569 *
10570 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10571 */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* The caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page; each PVE can hold PTE_PER_PVE PTE slots. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not covered by the CPU TLBs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		/* Queue an async flush of the one page this PTE maps, in its pmap. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

	flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10625
10626 /**
10627 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10628 *
10629 * @param pai The Physical Address Index of the entry.
10630 * @param cacheattr The new cache attribute.
10631 */
10632 MARK_AS_PMAP_TEXT static void
10633 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10634 {
10635 pvh_assert_locked(pai);
10636
10637 pp_attr_t pp_attr_current, pp_attr_template;
10638 do {
10639 pp_attr_current = pp_attr_table[pai];
10640 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10641
10642 /**
10643 * WIMG bits should only be updated under the PVH lock, but we should do
10644 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10645 */
10646 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10647 }
10648
10649 /**
10650 * Batch updates the cache attributes of a list of pages in three passes.
10651 *
10652 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10653 * In pass two, TLB entries are flushed for each page in the list if necessary.
10654 * In pass three, caches are cleaned for each page in the list if necessary.
10655 *
10656 * When running in PPL, this function may decide to return to the caller in response
10657 * to AST_URGENT.
10658 *
10659 * @param user_page_list List of pages to be updated.
10660 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10661 * @param page_cnt Number of pages in total in user_page_list.
10662 * @param cacheattr The new cache attributes.
10663 *
10664 * @return The new state of the state machine.
10665 */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the caller-supplied state machine fields into locals. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			/* A zero WIMG field in pp_attr means VM_WIMG_DEFAULT. */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Transitions into RT require a cache clean in pass 3. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Queue async flushes for every mapping of this page. */
			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Bail out on urgent AST, restoring DC-by-VA state first. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the locals into the state struct returned to the caller. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10902
/*
 * Common implementation for setting the cache attributes of one page.
 * 'external' is TRUE for kernel-originated requests and FALSE otherwise;
 * on XNU_MONITOR builds it selects which PPL-ownership check applies.
 * Silently returns for non-managed pages.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;             /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may not touch PPL pages, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		/* A zero WIMG field in pp_attr means VM_WIMG_DEFAULT. */
		pp_attr_current = pp_attr_table[pai];
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Rewrite the PTEs (with immediate TLB flush) only on a real change. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance implied by the WIMG transition, outside the lock. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10965
/*
 * Set the cache attributes of page 'pn' on behalf of an external (kernel)
 * caller; thin wrapper around pmap_set_cache_attributes_priv().
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10973
/*
 * Public entry point: set the WIMG cache attributes of page 'pn'.
 * Dispatches into the PPL on monitor-enabled builds, or calls the
 * internal helper directly otherwise.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10985
10986 /**
10987 * Updates the page numbered ppnum to have attribute specified by attributes.
10988 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10989 * The necessity of the TLB flush is returned in case this function is called
10990 * in a batched manner and the TLB flush is intended to be done at a different
10991 * timing.
10992 *
10993 * @param ppnum Page Number of the page to be updated.
10994 * @param attributes The new cache attributes.
10995 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10996 * immediately.
10997 *
10998 * @return Returns true if a TLB flush is needed for this update regardless of
10999 * whether a flush has occurred already.
11000 */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	pv_entry_t **pv_h;
	pt_entry_t tmplate;
	unsigned int pai;
	boolean_t tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally reject device-type WIMG attributes on managed (DRAM) pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO: // nGnRnE
		case VM_WIMG_POSTED: // nGnRE
			/* supported on DRAM, but slow, so we disallow */
			/* FALLTHROUGH -- all four cases panic below */

		case VM_WIMG_POSTED_REORDERED: // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First rewrite the physical-aperture (kernel static) mapping of the page. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the existing PPL permission (XPRR) bits of the aperture PTE. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Deferred-TLBI mode: plain PTE write; caller syncs later. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	/* Now walk the PV list and rewrite every tracked mapping of the page. */
	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next slot. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are owned by their driver; do not rewrite them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

	cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11142
11143 /**
11144 * Mark a pmap as being dedicated to use for a commpage mapping.
11145 * The pmap itself will never be activated on a CPU; its mappings will
11146 * only be embedded in userspace pmaps at a fixed virtual address.
11147 *
11148 * @param pmap the pmap to mark as belonging to a commpage.
11149 */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Must happen before PPL lockdown; type changes are forbidden after. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID. This pmap should not ever be directly
	 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap. Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11169
11170 static void
11171 pmap_update_tt3e(
11172 pmap_t pmap,
11173 vm_address_t address,
11174 tt_entry_t template)
11175 {
11176 tt_entry_t *ptep, pte;
11177
11178 ptep = pmap_tt3e(pmap, address);
11179 if (ptep == NULL) {
11180 panic("%s: no ptep?", __FUNCTION__);
11181 }
11182
11183 pte = *ptep;
11184 pte = tte_to_pa(pte) | template;
11185 write_pte_strong(ptep, pte);
11186 }
11187
/*
 * PTE template for commpage data mappings. Note absence of non-global bit:
 * the commpage is mapped global (shared across ASIDs), write-back cacheable,
 * read-only for EL0 and EL1 (AP_RORO), and non-executable at both ELs
 * (NX and PNX).
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	| ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping. Note absence of non-global bit
 * and of the EL0 no-execute bit: user code must be able to execute from this
 * page, while kernel execution remains denied (PNX).
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	| ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11199
/*
 * Allocate the commpage data/RO-data/text pages, build the dedicated
 * commpage pmaps that map them at the fixed user VAs, and return the
 * kernel-virtual addresses used to populate them.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0;         // data address
	pmap_paddr_t ro_data_pa = 0;      // kernel read-only data address
	pmap_paddr_t text_pa = 0;         // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* PPL build: allocate and zero each backing page individually. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the leaf entry with the global/read-only commpage template. */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf-level slot of the text commpage within its L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11350
11351
11352 /*
11353 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11354 * with user controlled TTEs for regions that aren't explicitly reserved by the
11355 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11356 */
11357 #if (ARM_PGSHIFT == 14)
11358 /**
11359 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11360 * commpage completely above the maximum 32-bit userspace VA.
11361 */
11362 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11363
11364 /**
11365 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11366 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
11368 * commpage lives within the userspace VA range, but is protected by the VM as
11369 * a reserved region (see vm_reserved_regions[] definition for more info).
11370 */
11371
11372 #elif (ARM_PGSHIFT == 12)
11373 /**
11374 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11375 * above the maximum userspace VA.
11376 */
11377 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11378 #else
11379 #error Nested shared page mapping is unsupported on this config
11380 #endif
11381
/**
 * Map the commpage into a user pmap by "nesting": pointing one of the pmap's
 * twig-level translation table entries at the pre-built, shared commpage page
 * tables instead of allocating per-process page tables for it.
 *
 * @note Callers are expected to retry on transient failures; see
 *       pmap_insert_commpage(), which loops on KERN_ABORTED (and, with
 *       XNU_MONITOR, KERN_RESOURCE_SHORTAGE).
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (or KERN_RESOURCE_SHORTAGE
 *         with XNU_MONITOR) if pmap_expand() could not complete.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* In the PPL we cannot block on allocation; fail back to the caller instead. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage VA depends on whether this is a 64-bit or 32-bit task. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() may block/allocate, so drop the lock around it. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		/* Re-take the lock and re-check: the entry may have raced into place. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the twig entry from the pre-built commpage pmap into this pmap. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11484
/**
 * Remove the nested commpage mapping from a user pmap.
 *
 * This is the inverse of pmap_insert_commpage_internal(): it clears the twig
 * entry that pointed at the shared commpage page tables and flushes the TLB
 * for the commpage VA.
 *
 * @param pmap The user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage VA depends on whether this is a 64-bit or 32-bit task. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never created. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Make sure no stale commpage translations survive in the TLB. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11541
/**
 * Map the commpage into a user pmap, retrying transient failures.
 *
 * On XNU_MONITOR systems this calls into the PPL, feeding it pages on
 * KERN_RESOURCE_SHORTAGE; otherwise it calls the internal routine directly.
 * Any non-retryable failure is fatal.
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL ran out of free pages; donate one and retry. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11570
11571 static boolean_t
11572 pmap_is_64bit(
11573 pmap_t pmap)
11574 {
11575 return pmap->is_64bit;
11576 }
11577
11578 bool
11579 pmap_is_exotic(
11580 pmap_t pmap __unused)
11581 {
11582 return false;
11583 }
11584
11585
11586 /* ARMTODO -- an implementation that accounts for
11587 * holes in the physical map, if any.
11588 */
11589 boolean_t
11590 pmap_valid_page(
11591 ppnum_t pn)
11592 {
11593 return pa_valid(ptoa(pn));
11594 }
11595
11596 boolean_t
11597 pmap_bootloader_page(
11598 ppnum_t pn)
11599 {
11600 pmap_paddr_t paddr = ptoa(pn);
11601
11602 if (pa_valid(paddr)) {
11603 return FALSE;
11604 }
11605 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11606 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11607 }
11608
/**
 * Check whether a VA range of a pmap contains no valid mappings.
 *
 * Walks the range one twig-table block at a time, scanning leaf PTEs of any
 * present table entries.
 *
 * @param pmap The pmap to inspect (NULL is treated as trivially empty).
 * @param va_start Start of the range to check.
 * @param va_end End (exclusive) of the range to check.
 *
 * @return TRUE if no leaf PTE in the range is valid, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/*
	 * Snapshot not_in_kdp once so the unlock decision always matches the
	 * lock decision, even if the debugger state changes mid-scan.
	 */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Only take the lock outside the debugger and for user pmaps. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		/* End of this twig block, clamped to the requested range. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11669
/**
 * Check whether a VA range of a pmap contains no valid mappings, dispatching
 * to the PPL on XNU_MONITOR systems.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11682
11683 vm_map_offset_t
11684 pmap_max_offset(
11685 boolean_t is64,
11686 unsigned int option)
11687 {
11688 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11689 }
11690
11691 vm_map_offset_t
11692 pmap_max_64bit_offset(
11693 __unused unsigned int option)
11694 {
11695 vm_map_offset_t max_offset_ret = 0;
11696
11697 #if defined(__arm64__)
11698 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11699 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11700 max_offset_ret = arm64_pmap_max_offset_default;
11701 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11702 max_offset_ret = min_max_offset;
11703 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11704 max_offset_ret = MACH_VM_MAX_ADDRESS;
11705 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11706 if (arm64_pmap_max_offset_default) {
11707 max_offset_ret = arm64_pmap_max_offset_default;
11708 } else if (max_mem > 0xC0000000) {
11709 // devices with > 3GB of memory
11710 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11711 } else if (max_mem > 0x40000000) {
11712 // devices with > 1GB and <= 3GB of memory
11713 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11714 } else {
11715 // devices with <= 1 GB of memory
11716 max_offset_ret = min_max_offset;
11717 }
11718 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11719 if (arm64_pmap_max_offset_default) {
11720 // Allow the boot-arg to override jumbo size
11721 max_offset_ret = arm64_pmap_max_offset_default;
11722 } else {
11723 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11724 }
11725 } else {
11726 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11727 }
11728
11729 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11730 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11731 assert(max_offset_ret >= min_max_offset);
11732 }
11733 #else
11734 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11735 #endif
11736
11737 return max_offset_ret;
11738 }
11739
11740 vm_map_offset_t
11741 pmap_max_32bit_offset(
11742 unsigned int option)
11743 {
11744 vm_map_offset_t max_offset_ret = 0;
11745
11746 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11747 max_offset_ret = arm_pmap_max_offset_default;
11748 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11749 max_offset_ret = VM_MAX_ADDRESS;
11750 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11751 max_offset_ret = VM_MAX_ADDRESS;
11752 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11753 if (arm_pmap_max_offset_default) {
11754 max_offset_ret = arm_pmap_max_offset_default;
11755 } else if (max_mem > 0x20000000) {
11756 max_offset_ret = VM_MAX_ADDRESS;
11757 } else {
11758 max_offset_ret = VM_MAX_ADDRESS;
11759 }
11760 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11761 max_offset_ret = VM_MAX_ADDRESS;
11762 } else {
11763 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11764 }
11765
11766 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11767 return max_offset_ret;
11768 }
11769
11770 #if CONFIG_DTRACE
11771 /*
11772 * Constrain DTrace copyin/copyout actions
11773 */
11774 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11775 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11776
11777 kern_return_t
11778 dtrace_copyio_preflight(
11779 __unused addr64_t va)
11780 {
11781 if (current_map() == kernel_map) {
11782 return KERN_FAILURE;
11783 } else {
11784 return KERN_SUCCESS;
11785 }
11786 }
11787
11788 kern_return_t
11789 dtrace_copyio_postflight(
11790 __unused addr64_t va)
11791 {
11792 return KERN_SUCCESS;
11793 }
11794 #endif /* CONFIG_DTRACE */
11795
11796
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
	/* No per-context state on this architecture; pmap_flush() is a no-op too. */
}
11801
11802
11803 void
11804 pmap_flush(
11805 __unused pmap_flush_context *cpus_to_flush)
11806 {
11807 /* not implemented yet */
11808 return;
11809 }
11810
11811 #if XNU_MONITOR
11812
11813 /*
11814 * Enforce that the address range described by kva and nbytes is not currently
11815 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11816 * unintentionally writing to PPL-owned memory.
11817 */
11818 void
11819 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11820 {
11821 vm_offset_t end;
11822 if (os_add_overflow(kva, nbytes, &end)) {
11823 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11824 }
11825 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11826 pmap_paddr_t pa = kvtophys_nofail(ckva);
11827 pp_attr_t attr;
11828 unsigned int pai = pa_index(pa);
11829 if (ckva == phystokv(pa)) {
11830 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11831 }
11832 do {
11833 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11834 if (attr & PP_ATTR_MONITOR) {
11835 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11836 }
11837 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11838 }
11839 }
11840
/*
 * Release the pin taken by pmap_pin_kernel_pages() on each physical page
 * backing the given range. Panics if a page in the range was not pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page must never have become PPL-owned in the meantime. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11858
11859 /**
11860 * Lock down a page, making all mappings read-only, and preventing further
11861 * mappings or removal of this particular kva's mapping. Effectively, it makes
11862 * the physical page at kva immutable (see the ppl_writable parameter for an
11863 * exception to this).
11864 *
11865 * @param kva Valid address to any mapping of the physical page to lockdown.
11866 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11867 * @param ppl_writable True if the PPL should still be able to write to the page
11868 * using the physical aperture mapping. False will make the
11869 * page read-only for both the kernel and PPL in the
11870 * physical aperture.
11871 */
11872
11873 MARK_AS_PMAP_TEXT static void
11874 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11875 {
11876 pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
11877 }
11878
11879 /**
11880 * Lock down a page, giving all mappings the specified maximum permissions, and
11881 * preventing further mappings or removal of this particular kva's mapping.
11882 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11883 * parameter for an exception to this).
11884 *
11885 * @param kva Valid address to any mapping of the physical page to lockdown.
11886 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11887 * @param ppl_writable True if the PPL should still be able to write to the page
11888 * using the physical aperture mapping. False will make the
11889 * page read-only for both the kernel and PPL in the
11890 * physical aperture.
11891 * @param prot Maximum permissions to allow in existing alias mappings
11892 */
11893 MARK_AS_PMAP_TEXT static void
11894 pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
11895 {
11896 const pmap_paddr_t pa = kvtophys_nofail(kva);
11897 const unsigned int pai = pa_index(pa);
11898
11899 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11900 pvh_lock(pai);
11901 pv_entry_t **pvh = pai_to_pvh(pai);
11902 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11903
11904 if (__improbable(ppattr_pa_test_monitor(pa))) {
11905 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11906 }
11907
11908 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11909 panic("%s: %#lx already locked down/executable (%#llx)",
11910 __func__, kva, (uint64_t)pvh_flags);
11911 }
11912
11913
11914 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11915
11916 /* Update the physical aperture mapping to prevent kernel write access. */
11917 const unsigned int new_xprr_perm =
11918 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11919 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11920
11921 pvh_unlock(pai);
11922
11923 pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);
11924
11925 /**
11926 * Double-check that the mapping didn't change physical addresses before the
11927 * LOCKDOWN flag was set (there is a brief window between the above
11928 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11929 *
11930 * This doesn't solve the ABA problem, but this doesn't have to since once
11931 * the pvh_lock() is grabbed no new mappings can be created on this physical
11932 * page without the LOCKDOWN flag already set (so any future mappings can
11933 * only be RO, and no existing mappings can be removed).
11934 */
11935 if (kvtophys_nofail(kva) != pa) {
11936 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11937 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11938 }
11939 }
11940
11941 /**
11942 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11943 * kernel once again.
11944 *
11945 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11946 * to unlockdown a page that was never locked down, will panic.
11947 *
11948 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11949 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11950 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11951 * passed to the paired pmap_ppl_lockdown_page() call. Any
11952 * deviation will result in a panic.
11953 */
11954 MARK_AS_PMAP_TEXT static void
11955 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11956 {
11957 pvh_assert_locked(pai);
11958 pv_entry_t **pvh = pai_to_pvh(pai);
11959 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11960
11961 if (__improbable(!(pvh_flags & lockdown_flag))) {
11962 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11963 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11964 }
11965
11966
11967 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11968
11969 /* Restore the pre-lockdown physical aperture mapping permissions. */
11970 const unsigned int old_xprr_perm =
11971 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11972 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11973 }
11974
11975 /**
11976 * Release a page from being locked down to the PPL, making it writable to the
11977 * kernel once again.
11978 *
11979 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11980 * to unlockdown a page that was never locked down, will panic.
11981 *
11982 * @param kva Valid address to any mapping of the physical page to unlockdown.
11983 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11984 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11985 * passed to the paired pmap_ppl_lockdown_page() call. Any
11986 * deviation will result in a panic.
11987 */
11988 MARK_AS_PMAP_TEXT static void
11989 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11990 {
11991 const pmap_paddr_t pa = kvtophys_nofail(kva);
11992 const unsigned int pai = pa_index(pa);
11993
11994 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11995 pvh_lock(pai);
11996 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11997 pvh_unlock(pai);
11998 }
11999
12000 #else /* XNU_MONITOR */
12001
/* Without XNU_MONITOR there is no PPL ownership to guard against; no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12006
/* Counterpart of the non-monitor pmap_pin_kernel_pages() stub; no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12011
12012 #endif /* !XNU_MONITOR */
12013
12014
/*
 * Lock down a range of pages for code-signing purposes. On XNU_MONITOR
 * systems the pages are tagged with the PVH_FLAG_LOCKDOWN_CS reason bit;
 * otherwise no reason flag is passed.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12024
/*
 * Release a code-signing lockdown taken by pmap_cs_lockdown_pages(); the
 * reason flag must match the one used at lockdown time.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12034
12035 /**
12036 * Perform basic validation checks on the destination only and
12037 * corresponding offset/sizes prior to writing to a read only allocation.
12038 *
12039 * @note Should be called before writing to an allocation from the read
12040 * only allocator.
12041 *
12042 * @param zid The ID of the zone the allocation belongs to.
12043 * @param va VA of element being modified (destination).
12044 * @param offset Offset being written to, in the element.
12045 * @param new_data_size Size of modification.
12046 *
12047 */
12048
12049 MARK_AS_PMAP_TEXT static void
12050 pmap_ro_zone_validate_element_dst(
12051 zone_id_t zid,
12052 vm_offset_t va,
12053 vm_offset_t offset,
12054 vm_size_t new_data_size)
12055 {
12056 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12057 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12058 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12059 }
12060
12061 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12062
12063 /* Check element is from correct zone and properly aligned */
12064 zone_require_ro(zid, elem_size, (void*)va);
12065
12066 if (__improbable(new_data_size > (elem_size - offset))) {
12067 panic("%s: New data size %lu too large for elem size %lu at addr %p",
12068 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12069 }
12070 if (__improbable(offset >= elem_size)) {
12071 panic("%s: Offset %lu too large for elem size %lu at addr %p",
12072 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12073 }
12074 }
12075
12076
12077 /**
12078 * Perform basic validation checks on the source, destination and
12079 * corresponding offset/sizes prior to writing to a read only allocation.
12080 *
12081 * @note Should be called before writing to an allocation from the read
12082 * only allocator.
12083 *
12084 * @param zid The ID of the zone the allocation belongs to.
12085 * @param va VA of element being modified (destination).
12086 * @param offset Offset being written to, in the element.
12087 * @param new_data Pointer to new data (source).
12088 * @param new_data_size Size of modification.
12089 *
12090 */
12091
12092 MARK_AS_PMAP_TEXT static void
12093 pmap_ro_zone_validate_element(
12094 zone_id_t zid,
12095 vm_offset_t va,
12096 vm_offset_t offset,
12097 const vm_offset_t new_data,
12098 vm_size_t new_data_size)
12099 {
12100 vm_offset_t sum = 0;
12101
12102 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12103 panic("%s: Integer addition overflow %p + %lu = %lu",
12104 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12105 }
12106
12107 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12108 }
12109
12110 /**
12111 * Ensure that physical page is locked down and pinned, before writing to it.
12112 *
12113 * @note Should be called before writing to an allocation from the read
12114 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12115 * ensure that it is called after the modification.
12116 *
12117 *
12118 * @param pa Physical address of the element being modified.
12119 * @param va Virtual address of element being modified.
12120 * @param size Size of the modification.
12121 *
12122 */
12123
12124 MARK_AS_PMAP_TEXT static void
12125 pmap_ro_zone_lock_phy_page(
12126 const pmap_paddr_t pa,
12127 vm_offset_t va,
12128 vm_size_t size)
12129 {
12130 const unsigned int pai = pa_index(pa);
12131 pvh_lock(pai);
12132
12133 /* Ensure that the physical page is locked down */
12134 #if XNU_MONITOR
12135 pv_entry_t **pvh = pai_to_pvh(pai);
12136 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
12137 panic("%s: Physical page not locked down %llx", __func__, pa);
12138 }
12139 #endif /* XNU_MONITOR */
12140
12141 /* Ensure page can't become PPL-owned memory before the memcpy occurs */
12142 pmap_pin_kernel_pages(va, size);
12143 }
12144
12145 /**
12146 * Unlock and unpin physical page after writing to it.
12147 *
12148 * @note Should be called after writing to an allocation from the read
12149 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12150 * ensure that it has been called prior to the modification.
12151 *
12152 * @param pa Physical address of the element that was modified.
12153 * @param va Virtual address of element that was modified.
12154 * @param size Size of the modification.
12155 *
12156 */
12157
12158 MARK_AS_PMAP_TEXT static void
12159 pmap_ro_zone_unlock_phy_page(
12160 const pmap_paddr_t pa,
12161 vm_offset_t va,
12162 vm_size_t size)
12163 {
12164 const unsigned int pai = pa_index(pa);
12165 pmap_unpin_kernel_pages(va, size);
12166 pvh_unlock(pai);
12167 }
12168
12169 /**
12170 * Function to copy kauth_cred from new_data to kv.
12171 * Function defined in "kern_prot.c"
12172 *
12173 * @note Will be removed upon completion of
12174 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12175 *
12176 * @param kv Address to copy new data to.
12177 * @param new_data Pointer to new data.
12178 *
12179 */
12180
12181 extern void
12182 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12183
12184 /**
12185 * Zalloc-specific memcpy that writes through the physical aperture
12186 * and ensures the element being modified is from a read-only zone.
12187 *
12188 * @note Designed to work only with the zone allocator's read-only submap.
12189 *
12190 * @param zid The ID of the zone to allocate from.
12191 * @param va VA of element to be modified.
12192 * @param offset Offset from element.
12193 * @param new_data Pointer to new data.
12194 * @param new_data_size Size of modification.
12195 *
12196 */
12197
12198 void
12199 pmap_ro_zone_memcpy(
12200 zone_id_t zid,
12201 vm_offset_t va,
12202 vm_offset_t offset,
12203 const vm_offset_t new_data,
12204 vm_size_t new_data_size)
12205 {
12206 #if XNU_MONITOR
12207 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
12208 #else /* XNU_MONITOR */
12209 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
12210 #endif /* XNU_MONITOR */
12211 }
12212
/*
 * Copy new_data into a read-only-zone element by writing through the physical
 * aperture. Validates the element and pins/locks the backing page around the
 * copy. A NULL source or zero size is a no-op.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_memcpy_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size)
{
	/*
	 * NOTE(review): pa is computed before the new_data/size early-return and
	 * before validation; presumably va + offset is always a translatable RO
	 * zone address here — confirm against kvtophys_nofail()'s contract.
	 */
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);

	if (!new_data || new_data_size == 0) {
		return;
	}

	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
	/* Write via the physical aperture; the element's own mapping stays RO. */
	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
}
12232
12233 /**
12234 * Zalloc-specific function to atomically mutate fields of an element that
12235 * belongs to a read-only zone, via the physcial aperture.
12236 *
12237 * @note Designed to work only with the zone allocator's read-only submap.
12238 *
12239 * @param zid The ID of the zone the element belongs to.
12240 * @param va VA of element to be modified.
12241 * @param offset Offset in element.
12242 * @param op Atomic operation to perform.
12243 * @param value Mutation value.
12244 *
12245 */
12246
12247 uint64_t
12248 pmap_ro_zone_atomic_op(
12249 zone_id_t zid,
12250 vm_offset_t va,
12251 vm_offset_t offset,
12252 zro_atomic_op_t op,
12253 uint64_t value)
12254 {
12255 #if XNU_MONITOR
12256 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
12257 #else /* XNU_MONITOR */
12258 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
12259 #endif /* XNU_MONITOR */
12260 }
12261
/*
 * Perform an atomic mutation on a read-only-zone element through the physical
 * aperture, validating the destination and pinning/locking the backing page
 * around the operation. Returns the operation's result.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of `op` carries the operand size in bytes (zro_atomic_op_t encoding). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12280
12281 /**
12282 * bzero for allocations from read only zones, that writes through the
12283 * physical aperture.
12284 *
12285 * @note This is called by the zfree path of all allocations from read
12286 * only zones.
12287 *
12288 * @param zid The ID of the zone the allocation belongs to.
12289 * @param va VA of element to be zeroed.
12290 * @param offset Offset in the element.
12291 * @param size Size of allocation.
12292 *
12293 */
12294
void
pmap_ro_zone_bzero(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	/* On PPL-enabled builds the zeroing must be done inside the PPL. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12308
12309 MARK_AS_PMAP_TEXT void
12310 pmap_ro_zone_bzero_internal(
12311 zone_id_t zid,
12312 vm_offset_t va,
12313 vm_offset_t offset,
12314 vm_size_t size)
12315 {
12316 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12317 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
12318 pmap_ro_zone_lock_phy_page(pa, va, size);
12319 bzero((void*)phystokv(pa), size);
12320 pmap_ro_zone_unlock_phy_page(pa, va, size);
12321 }
12322
12323 /**
12324 * Removes write access from the Physical Aperture.
12325 *
12326 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12327 * @note Designed to work only with the zone allocator's read-only submap.
12328 *
 * @param va VA of the page to remove write access from.
12330 *
12331 */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down as read-only under PPL control. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: demote every mapping of the physical page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12341
#define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)

/*
 * Count resident and compressed bytes in [start, end) for a range that is
 * covered by a single twig-level TTE. Resident bytes are returned;
 * compressed bytes are *accumulated* into *compressed_bytes_p.
 * Returns PMAP_RESIDENT_INVALID when the pmap is NULL or no page table is
 * mapped at the start address (the caller treats this as "stop scanning").
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		/* No twig table mapped here. */
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Walk the leaf PTEs, classifying each page. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Accumulate (not overwrite) into the caller's counter. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12405
12406 mach_vm_size_t
12407 pmap_query_resident(
12408 pmap_t pmap,
12409 vm_map_address_t start,
12410 vm_map_address_t end,
12411 mach_vm_size_t *compressed_bytes_p)
12412 {
12413 mach_vm_size_t total_resident_bytes;
12414 mach_vm_size_t compressed_bytes;
12415 vm_map_address_t va;
12416
12417
12418 if (pmap == PMAP_NULL) {
12419 if (compressed_bytes_p) {
12420 *compressed_bytes_p = 0;
12421 }
12422 return 0;
12423 }
12424
12425 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12426
12427 total_resident_bytes = 0;
12428 compressed_bytes = 0;
12429
12430 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12431 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12432 VM_KERNEL_ADDRHIDE(end));
12433
12434 va = start;
12435 while (va < end) {
12436 vm_map_address_t l;
12437 mach_vm_size_t resident_bytes;
12438
12439 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12440
12441 if (l > end) {
12442 l = end;
12443 }
12444 #if XNU_MONITOR
12445 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12446 #else
12447 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12448 #endif
12449 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12450 break;
12451 }
12452
12453 total_resident_bytes += resident_bytes;
12454
12455 va = l;
12456 }
12457
12458 if (compressed_bytes_p) {
12459 *compressed_bytes_p = compressed_bytes;
12460 }
12461
12462 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12463 total_resident_bytes);
12464
12465 return total_resident_bytes;
12466 }
12467
#if MACH_ASSERT
/*
 * Debug-only check that the pmap's ledgers are balanced, performed only
 * while the pmap is still fully associated with a task.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	/*
	 * A pid of 0 or -1 means this pmap is not (or no longer) fully
	 * associated with a task — e.g. the old pmap after a fork()/exec()
	 * or spawn(). Its "ledger" may still point at a task that is now
	 * using a different (and active) address space, so a balance check
	 * here would be meaningless.
	 *
	 * If the "pid" is set, we went through pmap_set_process() in
	 * task_terminate_internal(), the task's ledger should not have been
	 * re-used, and all the pmap ledgers should be back to 0.
	 */
	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		return;
	}

	vm_map_pmap_check_ledgers(pmap, pmap->ledger,
	    pmap->pmap_pid, pmap->pmap_procname);
}
#endif /* MACH_ASSERT */
12498
/* Intentionally a no-op in this pmap implementation. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12503
12504 /**
12505 * The minimum shared region nesting size is used by the VM to determine when to
12506 * break up large mappings to nested regions. The smallest size that these
12507 * mappings can be broken into is determined by what page table level those
12508 * regions are being nested in at and the size of the page tables.
12509 *
12510 * For instance, if a nested region is nesting at L2 for a process utilizing
12511 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12512 * block entry).
12513 *
12514 * @param pmap The target pmap to determine the block size based on whether it's
12515 * using 16KB or 4KB page tables.
12516 */
12517 uint64_t
12518 pmap_shared_region_size_min(__unused pmap_t pmap)
12519 {
12520 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12521
12522 /**
12523 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12524 * 4KB pages). This means that a target pmap will contain L2 entries that
12525 * point to shared L3 page tables in the shared region pmap.
12526 */
12527 return pt_attr_twig_size(pt_attr);
12528 }
12529
12530 boolean_t
12531 pmap_enforces_execute_only(
12532 pmap_t pmap)
12533 {
12534 return pmap != kernel_pmap;
12535 }
12536
/* Kernel/PPL-side setter for the pmap's code-signing-enforced flag. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12545
/* Record whether this pmap's VM map enforces code signing. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12557
12558 extern int cs_process_enforcement_enable;
12559 bool
12560 pmap_get_vm_map_cs_enforced(
12561 pmap_t pmap)
12562 {
12563 if (cs_process_enforcement_enable) {
12564 return true;
12565 }
12566 return pmap->pmap_vm_map_cs_enforced;
12567 }
12568
/* JIT entitlement tracking is a no-op in this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12575
/* Mark the pmap as belonging to a JIT-entitled process. */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12586
/* This configuration never reports a pmap as JIT-entitled. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12593
/* TPRO tracking is a no-op in this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12600
/* Mark the pmap as using TPRO. */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12611
/* This configuration never reports a pmap as using TPRO. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12618
/* Number of times a racing PTE update forced the query below to restart. */
uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;

/*
 * Compute the disposition of the page mapped at va in the given pmap and
 * store the resulting PMAP_QUERY_PAGE_* flag set into *disp_p.
 * Returns KERN_INVALID_ARGUMENT for a NULL or kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical page: check for a compressed-page marker PTE. */
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* Unmanaged memory: no per-page attributes to consult. */
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Locate this mapping's entry in the page's PV list, if any. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12699
/* Query the disposition flags for the page mapped at va; see the internal. */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12712
12713
12714
/*
 * Number of valid user VA bits for this pmap, derived from the T0SZ
 * field of the TCR value the pmap's page-table geometry uses.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12725
/* Number of valid kernel VA bits, derived from the boot-time T1SZ. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12731
12732 static vm_map_size_t
12733 pmap_user_va_size(pmap_t pmap)
12734 {
12735 return 1ULL << pmap_user_va_bits(pmap);
12736 }
12737
12738
12739
/* The PPL is unsupported in this configuration, so we are never inside it. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12746
/* Unsupported on this platform; unconditionally panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12753
/* No PPL in this configuration: there are no reserved PPL pages to claim. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12760
/* No PPL in this configuration: nothing to free. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12766
12767
12768 #if PMAP_CS_PPL_MONITOR
12769
12770 /* Immutable part of the trust cache runtime */
12771 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12772
12773 /* Mutable part of the trust cache runtime */
12774 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12775
12776 /* Lock for the trust cache runtime */
12777 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12778
12779 MARK_AS_PMAP_TEXT kern_return_t
12780 pmap_check_trust_cache_runtime_for_uuid_internal(
12781 const uint8_t check_uuid[kUUIDSize])
12782 {
12783 kern_return_t ret = KERN_DENIED;
12784
12785 if (amfi->TrustCache.version < 3) {
12786 /* AMFI change hasn't landed in the build */
12787 pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12788 return KERN_NOT_SUPPORTED;
12789 }
12790
12791 /* Lock the runtime as shared */
12792 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12793
12794 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12795 &ppl_trust_cache_rt,
12796 check_uuid,
12797 NULL);
12798
12799 /* Unlock the runtime */
12800 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12801
12802 if (tc_ret.error == kTCReturnSuccess) {
12803 ret = KERN_SUCCESS;
12804 } else if (tc_ret.error == kTCReturnNotFound) {
12805 ret = KERN_NOT_FOUND;
12806 } else {
12807 ret = KERN_FAILURE;
12808 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12809 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12810 }
12811
12812 return ret;
12813 }
12814
/* Kernel-side entry point; forwards the UUID check into the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12821
/*
 * PPL-side implementation of pmap_load_trust_cache_with_type(): validate
 * the caller-supplied image4 payload and manifest, lock them down so the
 * kernel can no longer modify them, and hand them to AMFI/libTrustCache
 * for authentication and insertion into the PPL trust cache runtime.
 *
 * On success the payload pages stay locked down (owned by the monitor);
 * on failure they are returned to the kernel. The manifest is always
 * returned once evaluation completes.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Load failed: return the payload pages to the kernel */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12941
12942 kern_return_t
12943 pmap_load_trust_cache_with_type(
12944 TCType_t type,
12945 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12946 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12947 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12948 {
12949 kern_return_t ret = KERN_DENIED;
12950
12951 ret = pmap_load_trust_cache_with_type_ppl(
12952 type,
12953 pmap_img4_payload, pmap_img4_payload_len,
12954 img4_manifest, img4_manifest_len,
12955 img4_aux_manifest, img4_aux_manifest_len);
12956
12957 while (ret == KERN_RESOURCE_SHORTAGE) {
12958 /* Allocate a page from the free list */
12959 pmap_alloc_page_for_ppl(0);
12960
12961 /* Attempt the call again */
12962 ret = pmap_load_trust_cache_with_type_ppl(
12963 type,
12964 pmap_img4_payload, pmap_img4_payload_len,
12965 img4_manifest, img4_manifest_len,
12966 img4_aux_manifest, img4_aux_manifest_len);
12967 }
12968
12969 return ret;
12970 }
12971
/*
 * Query the PPL trust cache runtime for a CDHash. All pointer arguments
 * must already be PPL-safe (see pmap_query_trust_cache_internal for the
 * copy-in wrapper). Returns KERN_SUCCESS on a hit, KERN_NOT_FOUND on a
 * miss, and KERN_FAILURE/KERN_INVALID_ARGUMENT on errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto a kern_return_t */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13010
13011 MARK_AS_PMAP_TEXT kern_return_t
13012 pmap_query_trust_cache_internal(
13013 TCQueryType_t query_type,
13014 const uint8_t cdhash[kTCEntryHashSize],
13015 TrustCacheQueryToken_t *query_token)
13016 {
13017 kern_return_t ret = KERN_NOT_FOUND;
13018 TrustCacheQueryToken_t query_token_safe = {0};
13019 uint8_t cdhash_safe[kTCEntryHashSize] = {0};
13020
13021 /* Copy in the CDHash into PPL storage */
13022 memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
13023
13024 /* Query through the safe API since we're in the PPL now */
13025 ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
13026
13027 if (query_token != NULL) {
13028 pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13029 memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
13030 pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13031 }
13032
13033 return ret;
13034 }
13035
13036 kern_return_t
13037 pmap_query_trust_cache(
13038 TCQueryType_t query_type,
13039 const uint8_t cdhash[kTCEntryHashSize],
13040 TrustCacheQueryToken_t *query_token)
13041 {
13042 kern_return_t ret = KERN_NOT_FOUND;
13043
13044 ret = pmap_query_trust_cache_ppl(
13045 query_type,
13046 cdhash,
13047 query_token);
13048
13049 return ret;
13050 }
13051
/* Whether the developer mode state has ever been set. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
/* The current developer mode state of the system. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;

/*
 * Update the PPL-protected developer mode state, panicking on any attempt
 * to turn developer mode back on after it has been explicitly disabled.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13083
/* Kernel-side entry point; forwards the developer mode toggle to the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13090
13091 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13092 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13093
13094 #pragma mark Image4 - New
13095
/* Pairs a code-signing trap selector with its resolved AppleImage4 handler. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;        /* trap being dispatched */
	image4_cs_trap_handler_t handler; /* handler resolved for the selector */
} pmap_image4_dispatch_t;
13100
/*
 * Handler for IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE: snapshot the fixed-size
 * input into monitor-owned storage, then dispatch it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13120
13121
13122
13123 MARK_AS_PMAP_TEXT static errno_t
13124 _pmap_image4_monitor_trap_nonce_set(
13125 const pmap_image4_dispatch_t *dispatch,
13126 const void *input_data)
13127 {
13128 /*
13129 * csmx_clear --> __cs_copy
13130 * csmx_cipher --> __cs_copy
13131 */
13132 image4_cs_trap_argv_nonce_set_t input = {0};
13133
13134 /* Copy the input data to prevent ToCToU */
13135 memcpy(&input, input_data, sizeof(input));
13136
13137 /* Dispatch to AppleImage4 */
13138 return dispatch->handler(
13139 dispatch->selector,
13140 &input, sizeof(input),
13141 NULL, NULL);
13142 }
13143
/*
 * Handler for IMAGE4_CS_TRAP_NONCE_ROLL: snapshot the fixed-size input into
 * monitor-owned storage, then dispatch it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13160
/*
 * Handler for IMAGE4_CS_TRAP_IMAGE_ACTIVATE: lock down the payload and
 * manifest regions described by the input, then let AppleImage4 evaluate
 * the image. On success the payload stays monitor-owned; the manifest is
 * always returned to the kernel.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13216
13217 MARK_AS_PMAP_TEXT static errno_t
13218 _pmap_image4_monitor_trap_passthrough(
13219 __unused const pmap_image4_dispatch_t *dispatch,
13220 __unused const void *input_data,
13221 __unused size_t input_size)
13222 {
13223 #if DEVELOPMENT || DEBUG || KASAN
13224 return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13225 #else
13226 pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13227 return ENOSYS;
13228 #endif
13229 }
13230
/*
 * PPL-side dispatcher for AppleImage4 code-signing traps: resolve the
 * selector to a handler, verify the input size, reserve a scratch page
 * for CoreCrypto, and route to the per-selector handler.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			/* ENOMEM tells the kernel-side caller to donate a page and retry */
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13296
13297 errno_t
13298 pmap_image4_monitor_trap(
13299 image4_cs_trap_t selector,
13300 const void *input_data,
13301 size_t input_size)
13302 {
13303 errno_t err = EPERM;
13304
13305 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13306 while (err == ENOMEM) {
13307 /* Allocate a page from the free list */
13308 pmap_alloc_page_for_ppl(0);
13309
13310 /* Call the monitor dispatch again */
13311 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13312 }
13313
13314 return err;
13315 }
13316
13317 #endif /* PMAP_CS_PPL_MONITOR */
13318
13319 #if PMAP_CS_INCLUDE_CODE_SIGNING
13320
/*
 * Ordering function for the registered-profiles red-black tree. Profiles are
 * keyed purely by the address of their profile object, so the tree acts as a
 * membership set.
 *
 * Returns -1, 0, or 1 as profile0 orders before, equal to, or after profile1.
 *
 * The comparison is performed on uintptr_t values rather than raw pointers:
 * relational comparison of pointers into distinct objects is undefined
 * behavior in ISO C, while integer comparison of the converted addresses is
 * well-defined and yields the same total order on a flat address space.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
13333
/*
 * Red-black tree for managing provisioning profiles. Entries are keyed by
 * the address of the profile object itself (see
 * pmap_cs_profiles_rbtree_compare), so the tree is effectively the set of
 * currently registered profiles.
 */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/*
 * Lock for the profile red-black tree. Taken exclusive for insert/remove and
 * shared for lookups; initialized as non-sleepable in
 * pmap_initialize_provisioning_profiles.
 */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13343
13344 void
13345 pmap_initialize_provisioning_profiles(void)
13346 {
13347 /* Initialize the profiles red-black tree lock */
13348 lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13349 pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13350
13351 /* Initialize the red-black tree itself */
13352 RB_INIT(&pmap_cs_registered_profiles);
13353
13354 printf("initialized PPL provisioning profile data\n");
13355 }
13356
13357 static bool
13358 pmap_is_testflight_profile(
13359 pmap_cs_profile_t *profile_obj)
13360 {
13361 const char *entitlement_name = "beta-reports-active";
13362 const size_t entitlement_length = strlen(entitlement_name);
13363 CEQueryOperation_t query[2] = {0};
13364
13365 /* If the profile provisions no entitlements, then it isn't a test flight one */
13366 if (profile_obj->entitlements_ctx == NULL) {
13367 return false;
13368 }
13369
13370 /* Build our CoreEntitlements query */
13371 query[0].opcode = kCEOpSelectKey;
13372 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13373 query[0].parameters.stringParameter.length = entitlement_length;
13374 query[1] = CEMatchBool(true);
13375
13376 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13377 profile_obj->entitlements_ctx,
13378 query, 2);
13379
13380 if (ce_err == amfi->CoreEntitlements.kNoError) {
13381 return true;
13382 }
13383
13384 return false;
13385 }
13386
13387 static bool
13388 pmap_is_development_profile(
13389 pmap_cs_profile_t *profile_obj)
13390 {
13391 /* Check for UPP */
13392 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13393 *profile_obj->profile_ctx,
13394 CESelectDictValue("ProvisionsAllDevices"));
13395 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13396 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13397 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13398 return false;
13399 }
13400 }
13401
13402 /* Check for TestFlight profile */
13403 if (pmap_is_testflight_profile(profile_obj) == true) {
13404 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13405 return false;
13406 }
13407
13408 pmap_cs_log_info("%p: development profile", profile_obj);
13409 return true;
13410 }
13411
/*
 * Extract, validate, and cache the "Entitlements" dictionary of a registered
 * provisioning profile so it can later be queried (e.g. by
 * pmap_is_testflight_profile).
 *
 * Returns KERN_SUCCESS when an entitlements context was set up,
 * KERN_NOT_FOUND when the profile provisions no entitlements, and
 * KERN_ABORTED when CoreEntitlements rejects the entitlements DER.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Locate the "Entitlements" value within the profile's DER context */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	/* No "Entitlements" key -- clear the cached context and report not-found */
	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Bounds of the raw entitlements DER blob within the profile */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Have CoreEntitlements validate the DER encoding */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Turn the validation result into a queryable context */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13462
/*
 * PPL entry point for registering a provisioning profile with the monitor.
 *
 * The payload is a pmap_profile_payload_t: a PPL-managed profile object
 * followed by the raw profile blob. The entire payload is locked down (taken
 * away from the kernel), validated through CoreTrust, wrapped in a
 * CoreEntitlements context, and inserted into the registered-profiles tree.
 *
 * Most validation failures panic, since they indicate corrupted or forged
 * input reaching the monitor. Returns KERN_RESOURCE_SHORTAGE when the caller
 * must donate a page and retry (see pmap_register_provisioning_profile).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature through CoreTrust; extracts the content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Wrap the CoreTrust-extracted content in a CoreEntitlements DER context */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13566
13567 kern_return_t
13568 pmap_register_provisioning_profile(
13569 const vm_address_t payload_addr,
13570 const vm_size_t payload_size)
13571 {
13572 kern_return_t ret = KERN_DENIED;
13573
13574 ret = pmap_register_provisioning_profile_ppl(
13575 payload_addr,
13576 payload_size);
13577
13578 while (ret == KERN_RESOURCE_SHORTAGE) {
13579 /* Allocate a page from the free list */
13580 pmap_alloc_page_for_ppl(0);
13581
13582 /* Attempt the call again */
13583 ret = pmap_register_provisioning_profile_ppl(
13584 payload_addr,
13585 payload_size);
13586 }
13587
13588 return ret;
13589 }
13590
/*
 * PPL entry point for unregistering a provisioning profile.
 *
 * The profile must currently be registered (panics otherwise) and must not
 * be referenced by any code signature (returns KERN_FAILURE if it is). On
 * success the profile is removed from the tree and its original payload
 * pages are unlocked and handed back to the kernel.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A non-zero count means some signature still references this profile */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/*
		 * Get the original payload size.
		 *
		 * NOTE(review): this reconstructs the size as
		 * round_page(header + blob) -- assumes that matches the size
		 * originally passed to pmap_cs_lockdown_pages at registration;
		 * confirm against the registration caller.
		 */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		profile_obj = NULL;
	}
	return ret;
}
13638
13639 kern_return_t
13640 pmap_unregister_provisioning_profile(
13641 pmap_cs_profile_t *profile_obj)
13642 {
13643 return pmap_unregister_provisioning_profile_ppl(profile_obj);
13644 }
13645
/*
 * PPL entry point for associating a registered provisioning profile with a
 * code signature object.
 *
 * Association is only permitted while the signature is still untrusted and
 * has no profile attached; the profile itself must be registered and
 * validated (panics otherwise). On success the signature holds a reference
 * on the profile, preventing its unregistration.
 *
 * Lock ordering: code directory lock (exclusive) first, then the profiles
 * tree lock (shared).
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13694
13695 kern_return_t
13696 pmap_associate_provisioning_profile(
13697 pmap_cs_code_directory_t *cd_entry,
13698 pmap_cs_profile_t *profile_obj)
13699 {
13700 return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13701 }
13702
/*
 * PPL entry point for removing the provisioning profile association from a
 * code signature object.
 *
 * Returns KERN_NOT_FOUND when no profile is associated. On success the
 * profile's reference count is dropped (after the code directory lock is
 * released), which may make the profile eligible for unregistration.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13738
13739 kern_return_t
13740 pmap_disassociate_provisioning_profile(
13741 pmap_cs_code_directory_t *cd_entry)
13742 {
13743 return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13744 }
13745
13746 kern_return_t
13747 pmap_associate_kernel_entitlements_internal(
13748 pmap_cs_code_directory_t *cd_entry,
13749 const void *kernel_entitlements)
13750 {
13751 kern_return_t ret = KERN_DENIED;
13752
13753 if (kernel_entitlements == NULL) {
13754 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13755 }
13756
13757 /* Acquire the lock on the code directory */
13758 pmap_cs_lock_code_directory(cd_entry);
13759
13760 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13761 ret = KERN_DENIED;
13762 goto out;
13763 } else if (cd_entry->kernel_entitlements != NULL) {
13764 ret = KERN_DENIED;
13765 goto out;
13766 }
13767 cd_entry->kernel_entitlements = kernel_entitlements;
13768
13769 /* Association was a success */
13770 ret = KERN_SUCCESS;
13771
13772 out:
13773 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13774 return ret;
13775 }
13776
13777 kern_return_t
13778 pmap_associate_kernel_entitlements(
13779 pmap_cs_code_directory_t *cd_entry,
13780 const void *kernel_entitlements)
13781 {
13782 return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13783 }
13784
/*
 * PPL entry point for resolving the kernel entitlements object associated
 * with a pmap's main code-signing region.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap, for a pmap without a main
 * region signature, or when the signature has no kernel entitlements;
 * KERN_ABORTED when the shared pmap lock could not be taken without waiting
 * (the kernel-side wrapper retries in that case); KERN_SUCCESS otherwise.
 *
 * On success, when kernel_entitlements is non-NULL, the entitlements pointer
 * is written through it, with the target page pinned around the write.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13847
13848 kern_return_t
13849 pmap_resolve_kernel_entitlements(
13850 pmap_t pmap,
13851 const void **kernel_entitlements)
13852 {
13853 kern_return_t ret = KERN_DENIED;
13854
13855 do {
13856 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13857 } while (ret == KERN_ABORTED);
13858
13859 return ret;
13860 }
13861
/*
 * PPL entry point for building a CoreEntitlements acceleration index for a
 * code signature's entitlements context, speeding up subsequent queries.
 *
 * The index buffer is placed, in order of preference: in the slack space at
 * the end of the locked-down code signature, in a PPL blob-allocator bucket,
 * or in a freshly allocated PPL page. Returns KERN_ABORTED when the index
 * cannot fit within a page, KERN_SUCCESS when the context is accelerated (or
 * was already accelerated / ineligible), and an allocation failure code
 * otherwise (the kernel-side wrapper retries on KERN_RESOURCE_SHORTAGE).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without an entitlements context, or if already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14008
14009 kern_return_t
14010 pmap_accelerate_entitlements(
14011 pmap_cs_code_directory_t *cd_entry)
14012 {
14013 kern_return_t ret = KERN_DENIED;
14014
14015 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14016 while (ret == KERN_RESOURCE_SHORTAGE) {
14017 /* Allocate a page for the PPL */
14018 pmap_alloc_page_for_ppl(0);
14019
14020 /* Try again */
14021 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14022 }
14023
14024 return ret;
14025 }
14026
14027 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14028
/*
 * Query the loadable trust caches for a CDHash; returns true on a hit.
 *
 * With the PPL monitor, the CDHash is first copied into PPL-local storage
 * (presumably to guard against the caller mutating it mid-query -- TOCTOU)
 * and the PPL-safe query routine is used.
 */
MARK_AS_PMAP_TEXT bool
pmap_lookup_in_loaded_trust_caches_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	kern_return_t kr = KERN_NOT_FOUND;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeLoadable,
		cdhash_safe,
		NULL);
#else
	kr = query_trust_cache(
		kTCQueryTypeLoadable,
		cdhash,
		NULL);
#endif

	if (kr == KERN_SUCCESS) {
		return true;
	}
	return false;
}
14060
/*
 * Query the loadable trust caches for a CDHash, dispatching into the PPL
 * when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14071
/*
 * Query the static trust cache for a CDHash.
 *
 * Returns 0 when the CDHash is not found. On a hit, packs TC_LOOKUP_FOUND,
 * the entry's hash type, and the low byte of its flags into a uint32_t using
 * the TC_LOOKUP_*_SHIFT layout (flag bits above the low byte are dropped by
 * the (uint8_t) cast).
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Pull the entry's flags and hash type out of the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14112
/*
 * Query the static trust cache for a CDHash, dispatching into the PPL when
 * the monitor is enabled. See the _internal variant for the return encoding.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14122
14123 #if PMAP_CS_INCLUDE_CODE_SIGNING
14124
/* Protects pmap_compilation_service_cdhash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash recognized as the compilation service (all-zero until set) */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14127
14128 MARK_AS_PMAP_TEXT void
14129 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14130 {
14131
14132 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14133 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14134 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14135
14136 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14137 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14138 }
14139
/*
 * Check whether a CDHash matches the recorded compilation service CDHash.
 * Always returns false in lockdown mode, which disallows the compilation
 * service entirely.
 */
MARK_AS_PMAP_TEXT bool
pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
{
	bool match = false;

	/* Lockdown mode disallows compilation service */
	if (ppl_lockdown_mode_enabled == true) {
		return false;
	}

	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
		match = true;
	}
	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);

	if (match) {
		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
	}

	return match;
}
14162
/*
 * Record the compilation service CDHash, dispatching into the PPL when the
 * monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14172
/*
 * Match a CDHash against the compilation service CDHash, dispatching into
 * the PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14182
14183 /*
14184 * As part of supporting local signing on the device, we need the PMAP layer
14185 * to store the local signing key so that PMAP_CS can validate with it. We
14186 * store it at the PMAP layer such that it is accessible to both AMFI and
14187 * PMAP_CS should they need it.
14188 */
/* Set exactly once (via os_atomic_cmpxchg); gates reads of the key below */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* The device's local signing public key; valid only once the flag is true */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14191
14192 MARK_AS_PMAP_TEXT void
14193 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14194 {
14195 bool key_set = false;
14196
14197 /*
14198 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14199 * a successful exchange means that the local signing public key has _not_ been
14200 * set. In case the key has been set, we panic as we would never expect the
14201 * kernel to attempt to set the key more than once.
14202 */
14203 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14204
14205 if (key_set) {
14206 panic("attempted to set the local signing public key multiple times");
14207 }
14208
14209 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14210 pmap_cs_log_info("set local signing public key");
14211 }
14212
/*
 * Store the local signing public key, dispatching into the PPL when the
 * monitor is enabled. May only be called once (the internal variant panics
 * on a second attempt).
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14222
14223 uint8_t*
14224 pmap_get_local_signing_public_key(void)
14225 {
14226 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14227
14228 if (key_set) {
14229 return pmap_local_signing_public_key;
14230 }
14231
14232 return NULL;
14233 }
14234
14235 /*
14236 * Locally signed applications need to be explicitly authorized by an entitled application
14237 * before we allow them to run.
14238 */
/* CDHash of the single locally-signed binary currently authorized to run (all-zero when none). */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Guards all reads and writes of pmap_local_signing_cdhash. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14241
14242 MARK_AS_PMAP_TEXT void
14243 pmap_unrestrict_local_signing_internal(
14244 const uint8_t cdhash[CS_CDHASH_LEN])
14245 {
14246
14247 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14248 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14249 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14250
14251 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
14252 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
14253 }
14254
14255 void
14256 pmap_unrestrict_local_signing(
14257 const uint8_t cdhash[CS_CDHASH_LEN])
14258 {
14259 #if XNU_MONITOR
14260 return pmap_unrestrict_local_signing_ppl(cdhash);
14261 #else
14262 return pmap_unrestrict_local_signing_internal(cdhash);
14263 #endif
14264 }
14265
14266 #if PMAP_CS
/*
 * Revoke the current local-signing authorization by clearing the stored
 * CDHash; after this, pmap_local_signing_restricted() reports every hash as
 * restricted (except an all-zero hash, which the zeroed buffer would match).
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14274
14275 MARK_AS_PMAP_TEXT static bool
14276 pmap_local_signing_restricted(
14277 const uint8_t cdhash[CS_CDHASH_LEN])
14278 {
14279 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14280 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14281 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14282
14283 return ret != 0;
14284 }
14285
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14288
14289 MARK_AS_PMAP_TEXT void
14290 pmap_footprint_suspend_internal(
14291 vm_map_t map,
14292 boolean_t suspend)
14293 {
14294 #if DEVELOPMENT || DEBUG
14295 if (suspend) {
14296 current_thread()->pmap_footprint_suspended = TRUE;
14297 map->pmap->footprint_was_suspended = TRUE;
14298 } else {
14299 current_thread()->pmap_footprint_suspended = FALSE;
14300 }
14301 #else /* DEVELOPMENT || DEBUG */
14302 (void) map;
14303 (void) suspend;
14304 #endif /* DEVELOPMENT || DEBUG */
14305 }
14306
/*
 * Suspend or resume footprint accounting for the current thread against the
 * given map (DEVELOPMENT/DEBUG only); forwards to the PPL when present.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14318
/*
 * No-op entry point: does nothing beyond validating that the supplied pmap
 * handle is a legitimate, mutable pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14324
/*
 * No-op pmap call; useful for exercising the PPL entry/exit path (or the
 * plain internal call path) without side effects.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14334
14335 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14336
/* Header written before each copied translation table in a page-table dump. */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA just past the range covered by this table */
};
14343
/*
 * Recursively copy a pmap's translation tables (for the levels selected in
 * level_mask) into a caller-supplied buffer, each table preceded by a
 * struct page_table_dump_header describing it.
 *
 * @param pmap         pmap whose tables are being dumped.
 * @param ttp          kernel-VA pointer to the table at cur_level.
 * @param cur_level    translation level of ttp.
 * @param level_mask   bitmask of levels whose tables should be copied out.
 * @param start_va     VA translated by the first entry of this table.
 * @param buf_start    start of the output buffer.
 * @param buf_end      end of the output buffer (exclusive).
 * @param bytes_copied in/out running count of bytes written so far.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer
 *         cannot hold the next header + table.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Next free position in the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be smaller than a full page; size it precisely. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/*
	 * Conservatively require room for a header + table even when this
	 * level is not selected in level_mask.
	 */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Walk every entry, descending into table-type entries. */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mapping: no next-level table to descend into. */
			continue;
		} else {
			/* A table-type entry at the leaf level indicates corruption. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14418
/*
 * Dump the pmap's translation tables (levels selected by level_mask) into
 * [bufp, buf_end), writing the total size to *bytes_copied. Only legal from
 * kernel debugger context.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer is
 *         too small.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	    level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14428
14429 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14430
/* Page-table dumping is only implemented on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14437 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14438
14439
14440 #ifdef CONFIG_XNUPOST
14441 #ifdef __arm64__
/* Set by pmap_test_fault_handler when an expected test fault is taken. */
static volatile bool pmap_test_took_fault = false;
14443
14444 static bool
14445 pmap_test_fault_handler(arm_saved_state_t * state)
14446 {
14447 bool retval = false;
14448 uint32_t esr = get_saved_state_esr(state);
14449 esr_exception_class_t class = ESR_EC(esr);
14450 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14451
14452 if ((class == ESR_EC_DABORT_EL1) &&
14453 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14454 pmap_test_took_fault = true;
14455 /* return to the instruction immediately after the call to NX page */
14456 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14457 retval = true;
14458 }
14459
14460 return retval;
14461 }
14462
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Probe va with a single load or store — optionally after switching to the
 * given pmap — and report whether the probe's fault behavior matched the
 * expectation.
 *
 * @param pmap         pmap to switch to for the access, or NULL to probe in
 *                     the current address space.
 * @param va           virtual address to access.
 * @param should_fault whether the access is expected to fault.
 * @param is_write     true to probe with a store, false with a load.
 *
 * @return true when (took_fault == should_fault).
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler before touching the address. */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14515
14516 static bool
14517 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14518 {
14519 bool retval = pmap_test_access(pmap, va, should_fault, false);
14520
14521 if (!retval) {
14522 T_FAIL("%s: %s, "
14523 "pmap=%p, va=%p, should_fault=%u",
14524 __func__, should_fault ? "did not fault" : "faulted",
14525 pmap, (void*)va, (unsigned)should_fault);
14526 }
14527
14528 return retval;
14529 }
14530
14531 static bool
14532 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14533 {
14534 bool retval = pmap_test_access(pmap, va, should_fault, true);
14535
14536 if (!retval) {
14537 T_FAIL("%s: %s, "
14538 "pmap=%p, va=%p, should_fault=%u",
14539 __func__, should_fault ? "did not fault" : "faulted",
14540 pmap, (void*)va, (unsigned)should_fault);
14541 }
14542
14543 return retval;
14544 }
14545
14546 static bool
14547 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14548 {
14549 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14550 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14551
14552 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14553
14554 if (!retval) {
14555 T_FAIL("%s: bits=%u, "
14556 "pa=%p, should_be_set=%u",
14557 __func__, bits,
14558 (void*)pa, should_be_set);
14559 }
14560
14561 return retval;
14562 }
14563
14564 static __attribute__((noinline)) bool
14565 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14566 {
14567 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14568 return retval;
14569 }
14570
/*
 * Exercise one pmap configuration end-to-end: create a pmap with the given
 * creation flags, then validate PTE contents, fault behavior, the ref/mod
 * state machine, pmap_protect()/pmap_page_protect() semantics, and
 * pmap_disconnect(), before tearing everything down again.
 *
 * @param flags pmap_create_options() flags (e.g. PMAP_CREATE_FORCE_4K_PAGES).
 *
 * @return 0 on completion; individual check failures are reported via T_FAIL,
 *         and unrecoverable setup failures panic.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		/* NOTE(review): format string lacks a separator before "flags=" — output runs together; confirm intended. */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
14835 #endif /* __arm64__ */
14836
/*
 * XNUPOST entry point for the pmap tests: runs pmap_test_test_config() for
 * each supported page-size configuration on arm64 (no-op elsewhere).
 *
 * @return KERN_SUCCESS.
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
14858 #endif /* CONFIG_XNUPOST */
14859
14860 /*
14861 * The following function should never make it to RELEASE code, since
14862 * it provides a way to get the PPL to modify text pages.
14863 */
14864 #if DEVELOPMENT || DEBUG
14865
/* A32 permanently-undefined (UDF) instruction encoding. */
#define ARM_UNDEFINED_INSN 0xe7f000f0
/* T16 (Thumb) permanently-undefined (UDF) instruction encoding. */
#define ARM_UNDEFINED_INSN_THUMB 0xde00
14868
14869 /**
14870 * Forcibly overwrite executable text with an illegal instruction.
14871 *
14872 * @note Only used for xnu unit testing.
14873 *
14874 * @param pa The physical address to corrupt.
14875 *
14876 * @return KERN_SUCCESS on success.
14877 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* PPL builds must enter the PPL to write otherwise-protected text pages. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14887
/*
 * Overwrite the instruction at physical address pa with a permanently
 * undefined instruction (A32 or Thumb, chosen by the address's low bit),
 * temporarily relaxing the physical-aperture mapping permissions if the page
 * is executable. Test-only; see the guard above.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages need their physical-aperture mapping made writable
	 * before we can plant the undefined instruction.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the original read-only aperture permission. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure the I-cache observes the newly written instruction. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14930
14931 #endif /* DEVELOPMENT || DEBUG */
14932