1 /*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
/**
 * Page-table operations for the native (ASID-tagged) translation table
 * format.  These callbacks allocate/free address space IDs, perform
 * ASID-qualified TLB maintenance, and encode WIMG cacheability attributes
 * into PTE bits for pmaps using this format.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                                     /* assign an ASID to a pmap */
	.free_id = free_asid,                                       /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,  /* ranged, ASID-tagged TLB invalidate (async) */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,           /* full, ASID-tagged TLB invalidate (async) */
	.wimg_to_pte = wimg_to_pte,                                 /* translate WIMG bits into PTE attribute bits */
};
140
/**
 * Per-level translation table geometry for the 16KB translation granule,
 * indexed by lookup level (L0-L3).  Each entry records the VA span covered by
 * one entry at that level, the mask/shift used to extract the table index
 * from a VA, and the bit patterns identifying valid and block/page entries.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	/* L3 is the leaf level, so PTE (rather than TTE) valid/type masks apply. */
	[3] = {
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180
/**
 * Per-level translation table geometry for the 4KB translation granule,
 * indexed by lookup level (L0-L3).  Mirrors pmap_table_level_info_16k but
 * with 4K-granule sizes, offsets, and index masks.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	/* L3 is the leaf level, so PTE (rather than TTE) valid/type masks apply. */
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220
/**
 * Attributes of 4KB-granule page tables: per-level geometry, the root /
 * commpage / deepest lookup levels, the table ops to use, and the PTE
 * access-permission and execute-never encodings for this format.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root lookup level follows from T0SZ: each 4K table level resolves 9 VA bits. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* PTE access-permission encodings (kernel/user read/write combinations). */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	/* Execute-never encodings: ap_xn forbids both EL1 and EL0 execution. */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_page_shift = 12,
};
247
/**
 * Attributes of 16KB-granule page tables.  Mirrors pmap_pt_attr_4k but with
 * 16K geometry; the root level is fixed at L1 for this granule.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* PTE access-permission encodings (kernel/user read/write combinations). */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	/* Execute-never encodings: ap_xn forbids both EL1 and EL0 execution. */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_page_shift = 14,
};
266
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272
273
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279
280 #endif /* MACH_ASSERT */
281
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 (current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288
289
290 /*
291 * Represents a tlb range that will be flushed before exiting
292 * the ppl.
293 * Used by phys_attribute_clear_range to defer flushing pages in
294 * this range until the end of the operation.
295 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;             /* pmap whose mappings lie within the range */
	vm_map_address_t ptfr_start;  /* start VA of the deferred-flush range */
	vm_map_address_t ptfr_end;    /* end VA of the deferred-flush range */
	bool ptfr_flush_needed;       /* whether any mapping in the range actually requires a TLB flush */
} pmap_tlb_flush_range_t;
302
#if XNU_MONITOR
/*
 * PPL External References.
 * Base addresses and sizes of the PPL's text/data/const segments, defined by
 * the link-time segment layout.
 */
extern vm_offset_t segPPLDATAB;
extern unsigned long segSizePPLDATA;
extern vm_offset_t segPPLTEXTB;
extern unsigned long segSizePPLTEXT;
extern vm_offset_t segPPLDATACONSTB;
extern unsigned long segSizePPLDATACONST;


/*
 * PPL Global Variables
 */

#if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
/* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
#else
const boolean_t pmap_ppl_disable = FALSE;
#endif

/*
 * Indicates if the PPL has started applying APRR.
 * This variable is accessed from various assembly trampolines, so be sure to change
 * those if you change the size or layout of this variable.
 */
boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;

/* Bounds of the PPL stack region. */
extern void *pmap_stacks_start;
extern void *pmap_stacks_end;

#endif /* XNU_MONITOR */
337
338
339
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
342 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344
345 extern uint8_t bootstrap_pagetables[];
346
347 extern unsigned int not_in_kdp;
348
349 extern vm_offset_t first_avail;
350
351 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
352 extern vm_offset_t virtual_space_end; /* End of kernel address space */
353 extern vm_offset_t static_memory_end;
354
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357
358 extern int maxproc, hard_maxproc;
359
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362
363 /*
364 * The bounds on our TTBRs. These are for sanity checking that
365 * an address is accessible by a TTBR before we attempt to map it.
366 */
367
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373
374 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t kernel_pmap = &kernel_pmap_store;
376
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
378
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
382
/* Singly-linked free-list node used to track reclaimed translation table allocations. */
typedef struct tt_free_entry {
	struct tt_free_entry *next; /* next free entry, or TT_FREE_ENTRY_NULL at end of list */
} tt_free_entry_t;
386
387 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
388
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX 4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int free_tt_max MARK_AS_PMAP_DATA;
400
401 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
402
403 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
415
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1; /* enable no-execute protection */
421 int allow_data_exec = 0; /* No apps may execute data */
422 int allow_stack_exec = 0; /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1; /* enable no-execute protection */
428 const int allow_data_exec = 0; /* No apps may execute data */
429 const int allow_stack_exec = 0; /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431
432 /**
433 * This variable is set true during hibernation entry to protect pmap data structures
434 * during image copying, and reset false on hibernation exit.
435 */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437
438 #if MACH_ASSERT
439 static void pmap_check_ledgers(pmap_t pmap);
440 #else
/* Ledger validation is only performed on MACH_ASSERT kernels; no-op otherwise. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
445 #endif /* MACH_ASSERT */
446
447 /**
448 * This helper function ensures that potentially-long-running batched PPL operations are
449 * called in preemptible context before entering the PPL, so that the PPL call may
450 * periodically exit to allow pending urgent ASTs to be taken.
451 */
static inline void
pmap_verify_preemptible(void)
{
	/*
	 * Preemption may legitimately still be disabled very early in boot
	 * (before STARTUP_SUB_EARLY_BOOT), so only enforce the check after
	 * that point.
	 */
	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
}
457
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
462
463 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
464
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470 "Minimum address space size outside allowable range");
471
472 // Max offset is 13.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 9.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476
477
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479 "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481 "Small device address space size outside allowable range");
482
483 # ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 # else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 # endif
488 #endif /* __arm64__ */
489
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
499 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
500 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
501 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
502 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
503 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
504
505
506 #if __ARM_MIXED_PAGE_SIZE__
507 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
508 #endif
509 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
510 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
511 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
512
/* PTE Define Macros */

/*
 * Evaluates to TRUE when the (invalid) PTE carries the "compressed" marker.
 * Panics if any bits outside ARM_PTE_COMPRESSED_MASK accompany the marker,
 * since that indicates a corrupted PTE.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) &&          /* PTE is not valid... */ \
	((x) & ARM_PTE_COMPRESSED) &&   /* ...has "compressed" marker */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* TRUE when the PTE's software wired bit is set. */
#define pte_is_wired(pte) \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* TRUE when the PTE carries the software "was writeable" marker. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" marker in a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
536
537 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)538 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
539 {
540 if (wired) {
541 *ptep |= ARM_PTE_WIRED;
542 } else {
543 *ptep &= ~ARM_PTE_WIRED;
544 }
545 /*
546 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
547 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
548 * never reclaimed.
549 */
550 if (pmap == kernel_pmap) {
551 return;
552 }
553 unsigned short *ptd_wiredcnt_ptr;
554 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
555 if (wired) {
556 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
557 } else {
558 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
559 if (__improbable(prev_wired == 0)) {
560 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
561 }
562 }
563 }
564
565 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
566 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
567 arm64_sync_tlb(strong); \
568 }
569
570 /*
571 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
572 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
573 * will observe the updated PTE.
574 */
575 #define FLUSH_PTE() \
576 __builtin_arm_dmb(DMB_ISH);
577
578 /*
579 * Synchronize updates to PTEs that were previously valid and thus may be cached in
580 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
581 * TLBI. This should only require a store-store barrier, as subsequent accesses in
582 * program order will not issue until the DSB completes. Prior loads may be reordered
583 * after the barrier, but their behavior should not be materially affected by the
584 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
585 * matter for loads until the access is re-driven well after the TLB update is
586 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
587 * we should be in a position to handle access faults. For "voluntary" PTE access
588 * restriction due to unmapping or protection, the decision to restrict access should
589 * have a data dependency on prior loads in order to avoid a data race.
590 */
591 #define FLUSH_PTE_STRONG() \
592 __builtin_arm_dsb(DSB_ISHST);
593
594 /**
595 * Write enough page table entries to map a single VM page. On systems where the
596 * VM page size does not match the hardware page size, multiple page table
597 * entries will need to be written.
598 *
599 * @note This function does not emit a barrier to ensure these page table writes
600 * have completed before continuing. This is commonly needed. In the case
601 * where a DMB or DSB barrier is needed, then use the write_pte() and
602 * write_pte_strong() functions respectively instead of this one.
603 *
604 * @param ptep Pointer to the first page table entry to update.
605 * @param pte The value to write into each page table entry. In the case that
606 * multiple PTEs are updated to a non-empty value, then the address
607 * in this value will automatically be incremented for each PTE
608 * write.
609 */
610 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)611 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
612 {
613 /**
614 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
615 * systems, which is why it's checked at runtime instead of compile time.
616 * The "unreachable" warning needs to be suppressed because it still is a
617 * compile time constant on some systems.
618 */
619 __unreachable_ok_push
620 if (TEST_PAGE_RATIO_4) {
621 if (((uintptr_t)ptep) & 0x1f) {
622 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
623 __func__, ptep, (void*)pte);
624 }
625
626 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
627 /**
628 * If we're writing an empty/compressed PTE value, then don't
629 * auto-increment the address for each PTE write.
630 */
631 *ptep = pte;
632 *(ptep + 1) = pte;
633 *(ptep + 2) = pte;
634 *(ptep + 3) = pte;
635 } else {
636 *ptep = pte;
637 *(ptep + 1) = pte | 0x1000;
638 *(ptep + 2) = pte | 0x2000;
639 *(ptep + 3) = pte | 0x3000;
640 }
641 } else {
642 *ptep = pte;
643 }
644 __unreachable_ok_pop
645 }
646
647 /**
648 * Writes enough page table entries to map a single VM page and then ensures
649 * those writes complete by executing a Data Memory Barrier.
650 *
651 * @note The DMB issued by this function is not strong enough to protect against
652 * TLB invalidates from being reordered above the PTE writes. If a TLBI
653 * instruction is going to immediately be called after this write, it's
654 * recommended to call write_pte_strong() instead of this function.
655 *
656 * See the function header for write_pte_fast() for more details on the
657 * parameters.
658 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: make the PTE store(s) visible before subsequent accesses */
}
665
666 /**
667 * Writes enough page table entries to map a single VM page and then ensures
668 * those writes complete by executing a Data Synchronization Barrier. This
669 * barrier provides stronger guarantees than the DMB executed by write_pte().
670 *
671 * @note This function is useful if you're going to immediately flush the TLB
672 * after making the PTE write. A DSB is required to protect against the
673 * TLB invalidate being reordered before the PTE write.
674 *
675 * See the function header for write_pte_fast() for more details on the
676 * parameters.
677 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: order the PTE store(s) before any subsequent TLBI */
}
684
685 /**
686 * Retrieve the pmap structure for the thread running on the current CPU.
687 */
688 pmap_t
current_pmap()689 current_pmap()
690 {
691 const pmap_t current = vm_map_pmap(current_thread()->map);
692
693 assert(current != NULL);
694
695 #if XNU_MONITOR
696 /**
697 * On PPL-enabled systems, it's important that PPL policy decisions aren't
698 * decided by kernel-writable memory. This function is used in various parts
699 * of the PPL, and besides validating that the pointer returned by this
700 * function is indeed a pmap structure, it's also important to ensure that
701 * it's actually the current thread's pmap. This is because different pmaps
702 * will have access to different entitlements based on the code signature of
703 * their loaded process. So if a different user pmap is set in the current
704 * thread structure (in an effort to bypass code signing restrictions), even
705 * though the structure would validate correctly as it is a real pmap
706 * structure, it should fail here.
707 *
708 * This only needs to occur for user pmaps because the kernel pmap's root
709 * page table is always the same as TTBR1 (it's set during bootstrap and not
710 * changed so it'd be redundant to check), and its code signing fields are
711 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
712 * it shouldn't be possible to set those fields. Due to that, an attacker
713 * setting the current thread's pmap to the kernel pmap as a way to bypass
714 * this check won't accomplish anything as it doesn't provide any extra code
715 * signing entitlements.
716 */
717 if ((current != kernel_pmap) &&
718 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
719 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
720 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
721 }
722 #endif /* XNU_MONITOR */
723
724 return current;
725 }
726
727 #if DEVELOPMENT || DEBUG
728
729 /*
730 * Trace levels are controlled by a bitmask in which each
731 * level can be enabled/disabled by the (1<<level) position
732 * in the boot arg
733 * Level 0: PPL extension functionality
734 * Level 1: pmap lifecycle (create/destroy/switch)
735 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
736 * Level 3: internal state management (attributes/fast-fault)
737 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
738 */
739
740 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
741
742 #define PMAP_TRACE(level, ...) \
743 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
744 KDBG_RELEASE(__VA_ARGS__); \
745 }
746 #else /* DEVELOPMENT || DEBUG */
747
748 #define PMAP_TRACE(level, ...)
749
750 #endif /* DEVELOPMENT || DEBUG */
751
752
753 /*
754 * Internal function prototypes (forward declarations).
755 */
756
757 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
758
759 static void pmap_set_reference(ppnum_t pn);
760
761 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
762
763 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
764
765 static kern_return_t pmap_expand(
766 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
767
768 static int pmap_remove_range(
769 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
770
771 static tt_entry_t *pmap_tt1_allocate(
772 pmap_t, vm_size_t, unsigned int);
773
774 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
775
776 static void pmap_tt1_deallocate(
777 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
778
779 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
780
781 static kern_return_t pmap_tt_allocate(
782 pmap_t, tt_entry_t **, unsigned int, unsigned int);
783
784 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
785
786 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
787 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
788 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
789
790 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
791
792
793 static void pmap_unmap_commpage(
794 pmap_t pmap);
795
796 static boolean_t
797 pmap_is_64bit(pmap_t);
798
799
800 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
801
802 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
803
804 static bool pmap_update_cache_attributes_locked(
805 ppnum_t, unsigned, bool);
806
807 static boolean_t arm_clear_fast_fault(
808 ppnum_t ppnum,
809 vm_prot_t fault_type,
810 pt_entry_t *pte_p);
811
812 static void pmap_trim_self(pmap_t pmap);
813 static void pmap_trim_subord(pmap_t subord);
814
815
816 /*
817 * Temporary prototypes, while we wait for pmap_enter to move to taking an
818 * address instead of a page number.
819 */
820 static kern_return_t
821 pmap_enter_addr(
822 pmap_t pmap,
823 vm_map_address_t v,
824 pmap_paddr_t pa,
825 vm_prot_t prot,
826 vm_prot_t fault_type,
827 unsigned int flags,
828 boolean_t wired);
829
830 kern_return_t
831 pmap_enter_options_addr(
832 pmap_t pmap,
833 vm_map_address_t v,
834 pmap_paddr_t pa,
835 vm_prot_t prot,
836 vm_prot_t fault_type,
837 unsigned int flags,
838 boolean_t wired,
839 unsigned int options,
840 __unused void *arg);
841
842 #ifdef CONFIG_XNUPOST
843 kern_return_t pmap_test(void);
844 #endif /* CONFIG_XNUPOST */
845
846 PMAP_SUPPORT_PROTOTYPES(
847 kern_return_t,
848 arm_fast_fault, (pmap_t pmap,
849 vm_map_address_t va,
850 vm_prot_t fault_type,
851 bool was_af_fault,
852 bool from_user), ARM_FAST_FAULT_INDEX);
853
854 PMAP_SUPPORT_PROTOTYPES(
855 boolean_t,
856 arm_force_fast_fault, (ppnum_t ppnum,
857 vm_prot_t allow_mode,
858 int options), ARM_FORCE_FAST_FAULT_INDEX);
859
860 MARK_AS_PMAP_TEXT static boolean_t
861 arm_force_fast_fault_with_flush_range(
862 ppnum_t ppnum,
863 vm_prot_t allow_mode,
864 int options,
865 pmap_tlb_flush_range_t *flush_range);
866
867 /**
868 * Definition of the states driving the batch cache attributes update
869 * state machine.
870 */
/* State word for the batched cache-attribute update machine; must pack into
 * exactly 64 bits (see the static_assert below). */
typedef struct {
	uint64_t page_index : 32,          /* The page index to be operated on */
	    state : 8,                     /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,     /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	    :0;                            /* pad the remaining bits of the 64-bit word */
} batch_set_cache_attr_state_t;
878
879 /* Possible values of the "state" field. */
880 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS 1
881 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS 2
882 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
883 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE 4
884
885 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
886
887 PMAP_SUPPORT_PROTOTYPES(
888 batch_set_cache_attr_state_t,
889 pmap_batch_set_cache_attributes, (
890 #if XNU_MONITOR
891 volatile upl_page_info_t *user_page_list,
892 #else /* !XNU_MONITOR */
893 upl_page_info_array_t user_page_list,
894 #endif /* XNU_MONITOR */
895 batch_set_cache_attr_state_t state,
896 unsigned int page_cnt,
897 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
898
899 PMAP_SUPPORT_PROTOTYPES(
900 kern_return_t,
901 pmap_change_wiring, (pmap_t pmap,
902 vm_map_address_t v,
903 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
904
905 PMAP_SUPPORT_PROTOTYPES(
906 pmap_t,
907 pmap_create_options, (ledger_t ledger,
908 vm_map_size_t size,
909 unsigned int flags,
910 kern_return_t * kr), PMAP_CREATE_INDEX);
911
912 PMAP_SUPPORT_PROTOTYPES(
913 void,
914 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
915
916 PMAP_SUPPORT_PROTOTYPES(
917 kern_return_t,
918 pmap_enter_options, (pmap_t pmap,
919 vm_map_address_t v,
920 pmap_paddr_t pa,
921 vm_prot_t prot,
922 vm_prot_t fault_type,
923 unsigned int flags,
924 boolean_t wired,
925 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
926
927 PMAP_SUPPORT_PROTOTYPES(
928 pmap_paddr_t,
929 pmap_find_pa, (pmap_t pmap,
930 addr64_t va), PMAP_FIND_PA_INDEX);
931
932 PMAP_SUPPORT_PROTOTYPES(
933 kern_return_t,
934 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
935
936
937 PMAP_SUPPORT_PROTOTYPES(
938 boolean_t,
939 pmap_is_empty, (pmap_t pmap,
940 vm_map_offset_t va_start,
941 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
942
943
944 PMAP_SUPPORT_PROTOTYPES(
945 unsigned int,
946 pmap_map_cpu_windows_copy, (ppnum_t pn,
947 vm_prot_t prot,
948 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
949
950 PMAP_SUPPORT_PROTOTYPES(
951 void,
952 pmap_ro_zone_memcpy, (zone_id_t zid,
953 vm_offset_t va,
954 vm_offset_t offset,
955 const vm_offset_t new_data,
956 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
957
958 PMAP_SUPPORT_PROTOTYPES(
959 uint64_t,
960 pmap_ro_zone_atomic_op, (zone_id_t zid,
961 vm_offset_t va,
962 vm_offset_t offset,
963 zro_atomic_op_t op,
964 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
965
966 PMAP_SUPPORT_PROTOTYPES(
967 void,
968 pmap_ro_zone_bzero, (zone_id_t zid,
969 vm_offset_t va,
970 vm_offset_t offset,
971 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
972
973 PMAP_SUPPORT_PROTOTYPES(
974 vm_map_offset_t,
975 pmap_nest, (pmap_t grand,
976 pmap_t subord,
977 addr64_t vstart,
978 uint64_t size,
979 vm_map_offset_t vrestart,
980 kern_return_t * krp), PMAP_NEST_INDEX);
981
982 PMAP_SUPPORT_PROTOTYPES(
983 void,
984 pmap_page_protect_options, (ppnum_t ppnum,
985 vm_prot_t prot,
986 unsigned int options,
987 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
988
989 PMAP_SUPPORT_PROTOTYPES(
990 vm_map_address_t,
991 pmap_protect_options, (pmap_t pmap,
992 vm_map_address_t start,
993 vm_map_address_t end,
994 vm_prot_t prot,
995 unsigned int options,
996 void *args), PMAP_PROTECT_OPTIONS_INDEX);
997
998 PMAP_SUPPORT_PROTOTYPES(
999 kern_return_t,
1000 pmap_query_page_info, (pmap_t pmap,
1001 vm_map_offset_t va,
1002 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1003
1004 PMAP_SUPPORT_PROTOTYPES(
1005 mach_vm_size_t,
1006 pmap_query_resident, (pmap_t pmap,
1007 vm_map_address_t start,
1008 vm_map_address_t end,
1009 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1010
1011 PMAP_SUPPORT_PROTOTYPES(
1012 void,
1013 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1014
1015 PMAP_SUPPORT_PROTOTYPES(
1016 vm_map_address_t,
1017 pmap_remove_options, (pmap_t pmap,
1018 vm_map_address_t start,
1019 vm_map_address_t end,
1020 int options), PMAP_REMOVE_OPTIONS_INDEX);
1021
1022
1023 PMAP_SUPPORT_PROTOTYPES(
1024 void,
1025 pmap_set_cache_attributes, (ppnum_t pn,
1026 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1027
1028 PMAP_SUPPORT_PROTOTYPES(
1029 void,
1030 pmap_update_compressor_page, (ppnum_t pn,
1031 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1032
1033 PMAP_SUPPORT_PROTOTYPES(
1034 void,
1035 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1036
1037 #if MACH_ASSERT || XNU_MONITOR
1038 PMAP_SUPPORT_PROTOTYPES(
1039 void,
1040 pmap_set_process, (pmap_t pmap,
1041 int pid,
1042 char *procname), PMAP_SET_PROCESS_INDEX);
1043 #endif
1044
1045 PMAP_SUPPORT_PROTOTYPES(
1046 void,
1047 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1048
1049 PMAP_SUPPORT_PROTOTYPES(
1050 vm_map_offset_t,
1051 pmap_unnest_options, (pmap_t grand,
1052 addr64_t vaddr,
1053 uint64_t size,
1054 vm_map_offset_t vrestart,
1055 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1056
1057 PMAP_SUPPORT_PROTOTYPES(
1058 void,
1059 phys_attribute_set, (ppnum_t pn,
1060 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1061
1062 PMAP_SUPPORT_PROTOTYPES(
1063 void,
1064 phys_attribute_clear, (ppnum_t pn,
1065 unsigned int bits,
1066 int options,
1067 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1068
1069 #if __ARM_RANGE_TLBI__
1070 PMAP_SUPPORT_PROTOTYPES(
1071 vm_map_address_t,
1072 phys_attribute_clear_range, (pmap_t pmap,
1073 vm_map_address_t start,
1074 vm_map_address_t end,
1075 unsigned int bits,
1076 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1077 #endif /* __ARM_RANGE_TLBI__ */
1078
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 void,
1082 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1083
1084 PMAP_SUPPORT_PROTOTYPES(
1085 void,
1086 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1087
1088 PMAP_SUPPORT_PROTOTYPES(
1089 void,
1090 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1091
1092 PMAP_SUPPORT_PROTOTYPES(
1093 void,
1094 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1095
1096 PMAP_SUPPORT_PROTOTYPES(
1097 void,
1098 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1099
1100 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1101 PMAP_SUPPORT_PROTOTYPES(
1102 void,
1103 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1104 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1105
/*
 * Definition of the states used by pmap_trim().
 *
 * pmap_trim() is structured as a re-entrant state machine: the current
 * state is passed in and the next state is returned (see the
 * PMAP_SUPPORT_PROTOTYPES declaration of pmap_trim below), allowing the
 * operation to be resumed across multiple calls.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1126
1127 PMAP_SUPPORT_PROTOTYPES(
1128 pmap_trim_state_t,
1129 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1130
1131 #if HAS_APPLE_PAC
1132 PMAP_SUPPORT_PROTOTYPES(
1133 void *,
1134 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1135 PMAP_SUPPORT_PROTOTYPES(
1136 void *,
1137 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1138 #endif /* HAS_APPLE_PAC */
1139
1140
1141
1142
1143 PMAP_SUPPORT_PROTOTYPES(
1144 kern_return_t,
1145 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1146 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1147
1148 PMAP_SUPPORT_PROTOTYPES(
1149 kern_return_t,
1150 pmap_load_trust_cache_with_type, (TCType_t type,
1151 const vm_address_t pmap_img4_payload,
1152 const vm_size_t pmap_img4_payload_len,
1153 const vm_address_t img4_manifest,
1154 const vm_size_t img4_manifest_len,
1155 const vm_address_t img4_aux_manifest,
1156 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1157
1158 PMAP_SUPPORT_PROTOTYPES(
1159 void,
1160 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1161
1162 PMAP_SUPPORT_PROTOTYPES(
1163 kern_return_t,
1164 pmap_query_trust_cache, (TCQueryType_t query_type,
1165 const uint8_t cdhash[kTCEntryHashSize],
1166 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1167
1168 #if PMAP_CS_INCLUDE_CODE_SIGNING
1169
1170 PMAP_SUPPORT_PROTOTYPES(
1171 kern_return_t,
1172 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1173 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1174
1175 PMAP_SUPPORT_PROTOTYPES(
1176 kern_return_t,
1177 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1178 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1179
1180 PMAP_SUPPORT_PROTOTYPES(
1181 kern_return_t,
1182 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1183 pmap_cs_profile_t * profile_obj),
1184 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1185
1186 PMAP_SUPPORT_PROTOTYPES(
1187 kern_return_t,
1188 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1189 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1190
1191 PMAP_SUPPORT_PROTOTYPES(
1192 kern_return_t,
1193 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1194 const void *kernel_entitlements),
1195 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1196
1197 PMAP_SUPPORT_PROTOTYPES(
1198 kern_return_t,
1199 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1200 const void **kernel_entitlements),
1201 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1202
1203 PMAP_SUPPORT_PROTOTYPES(
1204 kern_return_t,
1205 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1206 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1207
1208 PMAP_SUPPORT_PROTOTYPES(
1209 kern_return_t,
1210 pmap_cs_allow_invalid, (pmap_t pmap),
1211 PMAP_CS_ALLOW_INVALID_INDEX);
1212
1213 PMAP_SUPPORT_PROTOTYPES(
1214 void,
1215 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1216 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1217
1218 PMAP_SUPPORT_PROTOTYPES(
1219 bool,
1220 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1221 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1222
1223 PMAP_SUPPORT_PROTOTYPES(
1224 void,
1225 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1226 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1227
1228 PMAP_SUPPORT_PROTOTYPES(
1229 void,
1230 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1231 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1232
1233 #endif
1234
1235 PMAP_SUPPORT_PROTOTYPES(
1236 uint32_t,
1237 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1238
1239 PMAP_SUPPORT_PROTOTYPES(
1240 bool,
1241 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1242
1243 PMAP_SUPPORT_PROTOTYPES(
1244 void,
1245 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1246
1247 void pmap_footprint_suspend(vm_map_t map,
1248 boolean_t suspend);
1249 PMAP_SUPPORT_PROTOTYPES(
1250 void,
1251 pmap_footprint_suspend, (vm_map_t map,
1252 boolean_t suspend),
1253 PMAP_FOOTPRINT_SUSPEND_INDEX);
1254
1255
1256
1257
1258 #if DEVELOPMENT || DEBUG
1259 PMAP_SUPPORT_PROTOTYPES(
1260 kern_return_t,
1261 pmap_test_text_corruption, (pmap_paddr_t),
1262 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1263 #endif /* DEVELOPMENT || DEBUG */
1264
1265 /*
1266 * The low global vector page is mapped at a fixed alias.
1267 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1268 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1269 * to check both addresses anyway for backward compatibility. So for now
1270 * we leave H6 and H7 where they were.
1271 */
1272 #if (ARM_PGSHIFT == 14)
1273 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1274 #else
1275 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1276 #endif
1277
1278
1279 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1280 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1281 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1282
1283 #if XNU_MONITOR
1284
1285 #if __has_feature(ptrauth_calls)
1286 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1287 #else
1288 #define __ptrauth_ppl_handler
1289 #endif
1290
1291 /*
1292 * Table of function pointers used for PPL dispatch.
1293 */
1294 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1295 [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1296 [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1297 [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1298 [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1299 [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1300 [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1301 [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1302 [PMAP_CREATE_INDEX] = pmap_create_options_internal,
1303 [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1304 [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1305 [PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1306 [PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
1307 [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1308 [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1309 [PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1310 [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1311 [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1312 [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1313 [PMAP_NEST_INDEX] = pmap_nest_internal,
1314 [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1315 [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1316 [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1317 [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1318 [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1319 [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1320 [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1321 [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1322 [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1323 [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1324 [PMAP_SWITCH_INDEX] = pmap_switch_internal,
1325 [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1326 [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1327 [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1328 [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1329 [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1330 [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1331 [PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1332 [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1333 [PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
1334 [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1335 [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1336 [PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
1337 [PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
1338 [PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
1339 [PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
1340 #if PMAP_CS_INCLUDE_CODE_SIGNING
1341 [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1342 [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1343 [PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1344 [PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1345 [PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
1346 [PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
1347 [PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
1348 [PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
1349 [PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
1350 [PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
1351 [PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
1352 #endif
1353 [PMAP_TRIM_INDEX] = pmap_trim_internal,
1354 [PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1355 [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1356 [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1357 #if HAS_APPLE_PAC
1358 [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1359 [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1360 #endif /* HAS_APPLE_PAC */
1361 #if __ARM_RANGE_TLBI__
1362 [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1363 #endif /* __ARM_RANGE_TLBI__ */
1364 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1365 [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1366 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1367 [PMAP_NOP_INDEX] = pmap_nop_internal,
1368
1369 #if DEVELOPMENT || DEBUG
1370 [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1371 #endif /* DEVELOPMENT || DEBUG */
1372
1373 };
1374 #endif
1375
1376 #if XNU_MONITOR
1377 /**
1378 * A convenience function for setting protections on a single physical
1379 * aperture or static region mapping without invalidating the TLB.
1380 *
1381 * @note This function does not perform any TLB invalidations. That must be done
1382 * separately to be able to safely use the updated mapping.
1383 *
1384 * @note This function understands the difference between the VM page size and
1385 * the kernel page size and will update multiple PTEs if the sizes differ.
1386 * In other words, enough PTEs will always get updated to change the
1387 * permissions on a PAGE_SIZE amount of memory.
1388 *
1389 * @note The PVH lock for the physical page represented by this mapping must
1390 * already be locked.
1391 *
1392 * @note This function assumes the caller has already verified that the PTE
1393 * pointer does indeed point to a physical aperture or static region page
1394 * table. Please validate your inputs before passing it along to this
1395 * function.
1396 *
1397 * @param ptep Pointer to the physical aperture or static region page table to
1398 * update with a new XPRR index.
1399 * @param expected_perm The XPRR index that is expected to already exist at the
1400 * current mapping. If the current index doesn't match this
1401 * then the system will panic.
1402 * @param new_perm The new XPRR index to update the mapping with.
1403 */
MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(
	pt_entry_t * const ptep,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	assert(ptep != NULL);

	/* Snapshot the PTE once so every check below sees the same value. */
	pt_entry_t spte = *ptep;
	pvh_assert_locked(pa_index(pte_to_pa(spte)));

	/* Both the old and new XPRR indices must be in range. */
	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, ptep, new_perm, expected_perm);
	}

	/**
	 * The PTE involved should be valid, should not have the hint bit set, and
	 * should have the expected XPRR index.
	 */
	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
		panic_plain("%s: physical aperture or static region PTE is invalid, "
		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	/* A set hint (contiguous) bit would make a single-PTE rewrite unsafe. */
	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
		panic_plain("%s: physical aperture or static region PTE has hint bit "
		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
		    "ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
	}

	/* Replace only the XPRR bits; every other attribute is preserved. */
	pt_entry_t template = spte;
	template &= ~ARM_PTE_XPRR_MASK;
	template |= xprr_perm_to_pte(new_perm);

	/* Note: no TLB invalidation here; the caller is responsible (see header doc). */
	write_pte_strong(ptep, template);
}
1448
1449 /**
1450 * Update the protections on a single physical aperture mapping and invalidate
1451 * the TLB so the mapping can be used.
1452 *
1453 * @note The PVH lock for the physical page must already be locked.
1454 *
1455 * @param pai The physical address index of the page whose physical aperture
1456 * mapping will be updated with new permissions.
1457 * @param expected_perm The XPRR index that is expected to already exist at the
1458 * current mapping. If the current index doesn't match this
1459 * then the system will panic.
1460 * @param new_perm The new XPRR index to update the mapping with.
1461 */
1462 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1463 pmap_set_xprr_perm(
1464 unsigned int pai,
1465 unsigned int expected_perm,
1466 unsigned int new_perm)
1467 {
1468 pvh_assert_locked(pai);
1469
1470 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1471 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1472
1473 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1474
1475 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1476 sync_tlb_flush();
1477 }
1478
1479 /**
1480 * Update the protections on a range of physical aperture or static region
1481 * mappings and invalidate the TLB so the mappings can be used.
1482 *
1483 * @note Static region mappings can only be updated before machine_lockdown().
1484 * Physical aperture mappings can be updated at any time.
1485 *
1486 * @param start The starting virtual address of the static region or physical
1487 * aperture range whose permissions will be updated.
1488 * @param end The final (inclusive) virtual address of the static region or
1489 * physical aperture range whose permissions will be updated.
1490 * @param expected_perm The XPRR index that is expected to already exist at the
1491 * current mappings. If the current indices don't match
1492 * this then the system will panic.
1493 * @param new_perm The new XPRR index to update the mappings with.
1494 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The whole range must lie within exactly one of the two updatable regions. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 *
		 * (Adding the twig size and masking off the twig offset rounds "va"
		 * up to the next twig boundary.)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (partial) twig to the end of the requested range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type twig entries can be walked; block mappings would be a bug here. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the PVH lock for the mapped page. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Single flush for the whole range, after all PTEs have been rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1590
1591 #endif /* XNU_MONITOR */
1592
1593 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1594 PMAP_ZINFO_PALLOC(
1595 pmap_t pmap, int bytes)
1596 {
1597 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1598 }
1599
1600 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1601 PMAP_ZINFO_PFREE(
1602 pmap_t pmap,
1603 int bytes)
1604 {
1605 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1606 }
1607
1608 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1609 pmap_tt_ledger_credit(
1610 pmap_t pmap,
1611 vm_size_t size)
1612 {
1613 if (pmap != kernel_pmap) {
1614 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1615 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1616 }
1617 }
1618
1619 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1620 pmap_tt_ledger_debit(
1621 pmap_t pmap,
1622 vm_size_t size)
1623 {
1624 if (pmap != kernel_pmap) {
1625 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1626 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1627 }
1628 }
1629
/*
 * Record that "asid_index" was just handed out, in the pseudo-LRU state
 * used by alloc_asid(): clear its bit in the 64-ASID-wide PLRU bitmap
 * word that covers it.  When the whole word drains to zero (the andnot
 * result is 0), refill the word and bump its generation counter so the
 * allocator will prefer other, less recently refilled words.
 */
static inline void
pmap_update_plru(uint16_t asid_index)
{
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/*
			 * The word containing index MAX_HW_ASIDS is refilled with that
			 * bit left clear: that index is not a usable hardware ASID
			 * (alloc_asid() reassigns it when it comes up).
			 */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
}
1641
/*
 * Allocate a virtual ASID (vasid) for "pmap" and derive from it the
 * hardware ASID (pmap->hw_asid) and software epoch (pmap->sw_asid).
 *
 * When PLRU allocation is enabled, first search the least-recently-refilled
 * 64-bit window of the PLRU state for a free vasid; otherwise (or on a PLRU
 * miss) fall back to a linear lowest-bit-first scan of the whole bitmap.
 *
 * Returns true on success; false if every vasid is in use.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

	if (__probable(pmap_asid_plru)) {
		/* Find the PLRU word with the lowest (oldest) refill generation. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Scan the free-vasid bitmap at a stride of (MAX_HW_ASIDS + 1) / 64
		 * words, i.e. one probe per vasid chunk, masked by the chosen PLRU
		 * word so that only not-recently-used hardware ASIDs are considered.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every vasid is taken; the caller must handle allocation failure. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
	pmap_simple_unlock(&asid_lock);
	/*
	 * Split the vasid into (sw_asid, hw_asid).  asid_chunk_size is defined
	 * elsewhere; presumably MAX_HW_ASIDS + 1, so hw_asid == MAX_HW_ASIDS is
	 * reachable — TODO confirm against its definition.
	 */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	/* Mark the hardware ASID as recently used before publishing it. */
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1709
/*
 * Release the ASID held by "pmap": reconstruct the virtual ASID from the
 * (hw_asid, sw_asid) pair by inverting the encoding performed in
 * alloc_asid(), return the hardware ASID to the PLRU state, and set the
 * vasid's bit back in the free bitmap.  A pmap with no hardware ASID
 * (hw_asid == 0) is a no-op.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically take ownership of the ASID so a double free is harmless. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

	/* Undo the encoding applied at the end of alloc_asid(). */
#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;
#endif
	hw_asid -= 1;

	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-vasid case: invert alloc_asid()'s high-end reassignment. */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID as available again in the PLRU bitmap. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1738
1739
1740 boolean_t
pmap_valid_address(pmap_paddr_t addr)1741 pmap_valid_address(
1742 pmap_paddr_t addr)
1743 {
1744 return pa_valid(addr);
1745 }
1746
1747
1748
1749
1750
1751
1752 /*
1753 * Map memory at initialization. The physical addresses being
1754 * mapped are not managed and are never unmapped.
1755 *
1756 * For now, VM is already on, we only need to map the
1757 * specified memory.
1758 */
1759 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1760 pmap_map(
1761 vm_map_address_t virt,
1762 vm_offset_t start,
1763 vm_offset_t end,
1764 vm_prot_t prot,
1765 unsigned int flags)
1766 {
1767 kern_return_t kr;
1768 vm_size_t ps;
1769
1770 ps = PAGE_SIZE;
1771 while (start < end) {
1772 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1773 prot, VM_PROT_NONE, flags, FALSE);
1774
1775 if (kr != KERN_SUCCESS) {
1776 panic("%s: failed pmap_enter, "
1777 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1778 __FUNCTION__,
1779 (void *) virt, (void *) start, (void *) end, prot, flags);
1780 }
1781
1782 virt += ps;
1783 start += ps;
1784 }
1785 return virt;
1786 }
1787
/*
 * Back-door routine for mapping kernel VM at initialization, with the
 * memory-attribute policy selected by the PMAP_MAP_BD_* bits of `options`
 * (write-combined, posted, posted-reordered, etc.; default is uncached).
 *
 * PTEs are written directly into the kernel page tables (physical pages are
 * not tracked in the PV lists), mappings are kernel-only and non-executable,
 * and they are never unmapped.
 *
 * Returns the virtual address one past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t tmplate;
	pt_entry_t *ptep;
	vm_map_address_t vaddr;
	vm_offset_t paddr;
	pt_entry_t mem_attr;

	/* Translate the mapping option into cacheability/shareability PTE bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* Kernel-only access (RW or RO per `prot`), never executable at EL0/EL1. */
	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("%s: no PTE for vaddr=%p, "
			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
			    __FUNCTION__, (void*)vaddr,
			    (void*)virt, (void*)start, (void*)end, prot, options);
		}

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		/* Advance the template's output address along with vaddr/paddr. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1852
1853 /*
1854 * Back-door routine for mapping kernel VM at initialization.
1855 * Useful for mapping memory outside the range
1856 * [vm_first_phys, vm_last_phys] (i.e., devices).
1857 * Otherwise like pmap_map.
1858 */
1859 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1860 pmap_map_bd(
1861 vm_map_address_t virt,
1862 vm_offset_t start,
1863 vm_offset_t end,
1864 vm_prot_t prot)
1865 {
1866 pt_entry_t tmplate;
1867 pt_entry_t *ptep;
1868 vm_map_address_t vaddr;
1869 vm_offset_t paddr;
1870
1871 /* not cacheable and not buffered */
1872 tmplate = pa_to_pte(start)
1873 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1874 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1875 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1876 #if __ARM_KERNEL_PROTECT__
1877 tmplate |= ARM_PTE_NG;
1878 #endif /* __ARM_KERNEL_PROTECT__ */
1879
1880 vaddr = virt;
1881 paddr = start;
1882 while (paddr < end) {
1883 ptep = pmap_pte(kernel_pmap, vaddr);
1884 if (ptep == PT_ENTRY_NULL) {
1885 panic("pmap_map_bd");
1886 }
1887 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1888 write_pte_strong(ptep, tmplate);
1889
1890 pte_increment_pa(tmplate);
1891 vaddr += PAGE_SIZE;
1892 paddr += PAGE_SIZE;
1893 }
1894
1895 if (end >= start) {
1896 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1897 }
1898
1899 return vaddr;
1900 }
1901
1902 /*
1903 * Back-door routine for mapping kernel VM at initialization.
1904 * Useful for mapping memory specific physical addresses in early
1905 * boot (i.e., before kernel_map is initialized).
1906 *
1907 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1908 */
1909
1910 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1911 pmap_map_high_window_bd(
1912 vm_offset_t pa_start,
1913 vm_size_t len,
1914 vm_prot_t prot)
1915 {
1916 pt_entry_t *ptep, pte;
1917 vm_map_address_t va_start = VREGION1_START;
1918 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1919 vm_map_address_t va_end;
1920 vm_map_address_t va;
1921 vm_size_t offset;
1922
1923 offset = pa_start & PAGE_MASK;
1924 pa_start -= offset;
1925 len += offset;
1926
1927 if (len > (va_max - va_start)) {
1928 panic("%s: area too large, "
1929 "pa_start=%p, len=%p, prot=0x%x",
1930 __FUNCTION__,
1931 (void*)pa_start, (void*)len, prot);
1932 }
1933
1934 scan:
1935 for (; va_start < va_max; va_start += PAGE_SIZE) {
1936 ptep = pmap_pte(kernel_pmap, va_start);
1937 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1938 if (*ptep == ARM_PTE_TYPE_FAULT) {
1939 break;
1940 }
1941 }
1942 if (va_start > va_max) {
1943 panic("%s: insufficient pages, "
1944 "pa_start=%p, len=%p, prot=0x%x",
1945 __FUNCTION__,
1946 (void*)pa_start, (void*)len, prot);
1947 }
1948
1949 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1950 ptep = pmap_pte(kernel_pmap, va_end);
1951 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1952 if (*ptep != ARM_PTE_TYPE_FAULT) {
1953 va_start = va_end + PAGE_SIZE;
1954 goto scan;
1955 }
1956 }
1957
1958 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1959 ptep = pmap_pte(kernel_pmap, va);
1960 pte = pa_to_pte(pa_start)
1961 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1962 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1963 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1964 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1965 #if __ARM_KERNEL_PROTECT__
1966 pte |= ARM_PTE_NG;
1967 #endif /* __ARM_KERNEL_PROTECT__ */
1968 write_pte_strong(ptep, pte);
1969 }
1970 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1971 #if KASAN
1972 kasan_notify_address(va_start, len);
1973 #endif
1974 return va_start;
1975 }
1976
/*
 * Determine the number of virtual ASIDs (VASIDs) to support, from the
 * optional "pmap-max-asids" property under /defaults in the device tree.
 * Falls back to MAX_ASIDS when the property is absent.
 *
 * The value is rounded up to a multiple of 64 for the Pseudo-LRU
 * allocator's bitmap masking, and sanity-checked against the capacity of
 * pmap->sw_asid (a uint8_t) and against zero.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids too large");
	}
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2012
2013 #if __arm64__
2014 /*
2015 * pmap_get_arm64_prot
2016 *
2017 * return effective armv8 VMSA block protections including
2018 * table AP/PXN/XN overrides of a pmap entry
2019 *
2020 */
2021
/*
 * Walk the translation table hierarchy for `addr` and return the effective
 * protection bits of the final block/page mapping, with table-level
 * AP/XN/PXN overrides from intermediate levels folded in.
 *
 * Returns 0 if any level of the walk hits an invalid entry.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Descend from the root level, accumulating table-descriptor overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Unmapped at this level: no effective protection. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated hierarchical override bits. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2070 #endif /* __arm64__ */
2071
2072 /*
2073 * Bootstrap the system enough to run with virtual memory.
2074 *
2075 * The early VM initialization code has already allocated
2076 * the first CPU's translation table and made entries for
2077 * all the one-to-one mappings to be found there.
2078 *
2079 * We must set up the kernel pmap structures, the
2080 * physical-to-virtual translation lookup tables for the
2081 * physical memory to be managed (between avail_start and
2082 * avail_end).
2083 *
2084 * Map the kernel's code and data, and allocate the system page table.
2085 * Page_size must already be set.
2086 *
2087 * Parameters:
2088 * first_avail first available physical page -
2089 * after kernel page tables
2090 * avail_start PA of first managed physical page
2091 * avail_end PA of last managed physical page
2092 */
2093
2094 void
pmap_bootstrap(vm_offset_t vstart)2095 pmap_bootstrap(
2096 vm_offset_t vstart)
2097 {
2098 vm_map_offset_t maxoffset;
2099
2100 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2101
2102 #if XNU_MONITOR
2103
2104 #if DEVELOPMENT || DEBUG
2105 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2106 #endif
2107
2108 #if CONFIG_CSR_FROM_DT
2109 if (csr_unsafe_kernel_text) {
2110 pmap_ppl_disable = true;
2111 }
2112 #endif /* CONFIG_CSR_FROM_DT */
2113
2114 #endif /* XNU_MONITOR */
2115
2116 #if DEVELOPMENT || DEBUG
2117 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2118 kprintf("Kernel traces for pmap operations enabled\n");
2119 }
2120 #endif
2121
2122 /*
2123 * Initialize the kernel pmap.
2124 */
2125 #if ARM_PARAMETERIZED_PMAP
2126 kernel_pmap->pmap_pt_attr = native_pt_attr;
2127 #endif /* ARM_PARAMETERIZED_PMAP */
2128 #if HAS_APPLE_PAC
2129 kernel_pmap->disable_jop = 0;
2130 #endif /* HAS_APPLE_PAC */
2131 kernel_pmap->tte = cpu_tte;
2132 kernel_pmap->ttep = cpu_ttep;
2133 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2134 kernel_pmap->max = UINTPTR_MAX;
2135 os_atomic_init(&kernel_pmap->ref_count, 1);
2136 #if XNU_MONITOR
2137 os_atomic_init(&kernel_pmap->nested_count, 0);
2138 #endif
2139 kernel_pmap->nx_enabled = TRUE;
2140 #ifdef __arm64__
2141 kernel_pmap->is_64bit = TRUE;
2142 #else
2143 kernel_pmap->is_64bit = FALSE;
2144 #endif
2145 #if CONFIG_ROSETTA
2146 kernel_pmap->is_rosetta = FALSE;
2147 #endif
2148
2149 #if ARM_PARAMETERIZED_PMAP
2150 kernel_pmap->pmap_pt_attr = native_pt_attr;
2151 #endif /* ARM_PARAMETERIZED_PMAP */
2152
2153 kernel_pmap->nested_region_addr = 0x0ULL;
2154 kernel_pmap->nested_region_size = 0x0ULL;
2155 kernel_pmap->nested_region_asid_bitmap = NULL;
2156 kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2157 kernel_pmap->type = PMAP_TYPE_KERNEL;
2158
2159 kernel_pmap->hw_asid = 0;
2160 kernel_pmap->sw_asid = 0;
2161
2162 pmap_lock_init(kernel_pmap);
2163
2164 pmap_max_asids = pmap_compute_max_asids();
2165 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2166 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2167 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2168 * masking used by the PLRU scheme. This means we must handle the case in which
2169 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2170 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2171 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2172 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2173
2174 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2175
2176 /**
2177 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2178 * pp_attr_table, etc). This function will use `avail_start` to allocate
2179 * space for these data structures.
2180 */
2181 pmap_data_bootstrap();
2182
2183 /**
2184 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2185 */
2186 uat_bootstrap();
2187
2188
2189 /**
2190 * Bootstrap any necessary SART data structures and values needed from the device tree.
2191 */
2192 sart_bootstrap();
2193
2194 /**
2195 * Don't make any assumptions about the alignment of avail_start before this
2196 * point (i.e., pmap_data_bootstrap() performs allocations).
2197 */
2198 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2199
2200 const pmap_paddr_t pmap_struct_start = avail_start;
2201
2202 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2203 avail_start = round_page(avail_start + asid_table_size);
2204
2205 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2206
2207 vm_first_phys = gPhysBase;
2208 vm_last_phys = trunc_page(avail_end);
2209
2210 queue_init(&map_pmap_list);
2211 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2212 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2213 free_page_size_tt_count = 0;
2214 free_page_size_tt_max = 0;
2215 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2216 free_two_page_size_tt_count = 0;
2217 free_two_page_size_tt_max = 0;
2218 free_tt_list = TT_FREE_ENTRY_NULL;
2219 free_tt_count = 0;
2220 free_tt_max = 0;
2221
2222 virtual_space_start = vstart;
2223 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2224
2225 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2226 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2227 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2228 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2229
2230
2231
2232 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2233 maxoffset = trunc_page(maxoffset);
2234 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2235 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2236 arm_pmap_max_offset_default = maxoffset;
2237 }
2238 }
2239 #if defined(__arm64__)
2240 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2241 maxoffset = trunc_page(maxoffset);
2242 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2243 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2244 arm64_pmap_max_offset_default = maxoffset;
2245 }
2246 }
2247 #endif
2248
2249 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2250
2251
2252 #if PMAP_CS_PPL_MONITOR
2253 /* Initialize the PPL trust cache read-write lock */
2254 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2255 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2256 #endif
2257
2258 #if MACH_ASSERT
2259 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2260 &vm_footprint_suspend_allowed,
2261 sizeof(vm_footprint_suspend_allowed));
2262 #endif /* MACH_ASSERT */
2263
2264 #if KASAN
2265 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2266 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2267 #endif /* KASAN */
2268
2269 /**
2270 * Ensure that avail_start is always left on a page boundary. The calling
2271 * code might not perform any alignment before allocating page tables so
2272 * this is important.
2273 */
2274 avail_start = round_page(avail_start);
2275 }
2276
2277 #if XNU_MONITOR
2278
2279 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2280 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2281 {
2282 pmap_paddr_t cur_pa;
2283 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2284 assert(pa_valid(cur_pa));
2285 ppattr_pa_set_monitor(cur_pa);
2286 }
2287 }
2288
2289 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2290 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2291 pmap_paddr_t end_pa,
2292 unsigned int expected_perm,
2293 unsigned int new_perm)
2294 {
2295 vm_offset_t start_va = phystokv(start_pa);
2296 vm_offset_t end_va = start_va + (end_pa - start_pa);
2297
2298 pa_set_range_monitor(start_pa, end_pa);
2299 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2300 }
2301
/*
 * Apply the PVH_FLAG_LOCKDOWN_KC flag to every physical page backing the
 * kernelcache, preventing those pages from being remapped.
 *
 * Pages whose physical address does not map back linearly to the expected
 * kernelcache virtual address are skipped: they belong to memory the OS
 * will reclaim. On CTRR test configurations, the test pages are explicitly
 * exempted again at the end.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() may shrink range_size to the contiguous sub-range it translated. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable; strip the lockdown flag. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2347
/*
 * Transition ownership and xPRR protections of all statically-allocated
 * regions (bootstrap page tables, boot-time allocations, PPL text/data,
 * PPL stacks) now that static allocation is complete, then lock down the
 * kernelcache against remapping.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = BootArgs->topOfKernelData;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permission stays RO; this call serves to mark the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2435
/*
 * Finalize PPL lockdown: lock down the commpage mappings so they can no
 * longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): the bare #error below appears to be the fallback arm of a
	 * stripped preprocessor conditional (an XPRR/APRR configuration check).
	 * As written it unconditionally fails compilation — confirm against the
	 * unabridged source before building this configuration. */
#error "XPRR configuration error"
}
2452 #endif /* XNU_MONITOR */
2453
2454 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2455 pmap_virtual_space(
2456 vm_offset_t *startp,
2457 vm_offset_t *endp
2458 )
2459 {
2460 *startp = virtual_space_start;
2461 *endp = virtual_space_end;
2462 }
2463
2464
/*
 * Enumerate the kernel virtual regions the VM layer may manage.
 *
 * @param region_select index of the region to query (0, 1, ...); the set of
 *        valid indices depends on the KTRR/CTRR and ARM_LARGE_MEMORY
 *        configuration.
 * @param startp/size out-parameters for the region's base and length.
 *
 * @return TRUE if region_select names a valid region for this configuration,
 *         FALSE otherwise (out-parameters untouched in that case).
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 32 MB (16K-page L2 block) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 8 MB (4K-page L2 block) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2558
2559 /*
2560 * Routines to track and allocate physical pages during early boot.
2561 * On most systems that memory runs from first_avail through to avail_end
2562 * with no gaps.
2563 *
2564 * If the system supports ECC and ecc_bad_pages_count > 0, we
2565 * need to skip those pages.
2566 */
2567
static unsigned int avail_page_count = 0;   /* pages remaining to hand out via pmap_next_page() */
static bool need_ram_ranges_init = true;    /* lazily initialize avail_page_count on first use */
2570
2571
2572 /**
2573 * Checks to see if a given page is in
2574 * the array of known bad pages
2575 *
2576 * @param ppn page number to check
2577 */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/* No bad-page tracking in this configuration; every page is usable. */
	return false;
}
2583
2584 /**
2585 * Prepare bad ram pages to be skipped.
2586 */
2587
2588 /*
2589 * Initialize the count of available pages. No lock needed here,
2590 * as this code is called while kernel boot up is single threaded.
2591 */
static void
initialize_ram_ranges(void)
{
	pmap_paddr_t first = first_avail;
	pmap_paddr_t end = avail_end;

	/* Both bounds must already be page-aligned. */
	assert(first <= end);
	assert(first == (first & ~PAGE_MASK));
	assert(end == (end & ~PAGE_MASK));
	avail_page_count = atop(end - first);

	/* One-shot: subsequent callers use the cached count. */
	need_ram_ranges_init = false;
}
2605
/*
 * Return the number of boot-time physical pages still available,
 * lazily initializing the count on first use.
 */
unsigned int
pmap_free_pages(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return avail_page_count;
}
2615
/*
 * Return the page count of the span from first_avail to avail_end
 * (the full remaining boot-memory range, gaps included).
 */
unsigned int
pmap_free_pages_span(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return (unsigned int)atop(avail_end - first_avail);
}
2625
2626
/*
 * Hand out the next available boot-time physical page. The `might_free`
 * hint is unused here; this simply delegates to pmap_next_page().
 */
boolean_t
pmap_next_page_hi(
	ppnum_t * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2634
2635
2636 boolean_t
pmap_next_page(ppnum_t * pnum)2637 pmap_next_page(
2638 ppnum_t *pnum)
2639 {
2640 if (need_ram_ranges_init) {
2641 initialize_ram_ranges();
2642 }
2643
2644
2645 if (first_avail != avail_end) {
2646 *pnum = (ppnum_t)atop(first_avail);
2647 first_avail += PAGE_SIZE;
2648 assert(avail_page_count > 0);
2649 --avail_page_count;
2650 return TRUE;
2651 }
2652 assert(avail_page_count == 0);
2653 return FALSE;
2654 }
2655
2656
2657 /*
2658 * Initialize the pmap module.
2659 * Called by vm_init, to initialize any structures that the pmap
2660 * system needs to map virtual memory.
2661 */
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent transltion
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2702
2703 /**
2704 * Verify that a given physical page contains no mappings (outside of the
2705 * default physical aperture mapping).
2706 *
2707 * @param ppnum Physical page number to check there are no mappings to.
2708 *
2709 * @return True if there are no mappings, false otherwise or if the page is not
2710 * kernel-managed.
2711 */
2712 bool
pmap_verify_free(ppnum_t ppnum)2713 pmap_verify_free(ppnum_t ppnum)
2714 {
2715 const pmap_paddr_t pa = ptoa(ppnum);
2716
2717 assert(pa != vm_page_fictitious_addr);
2718
2719 /* Only mappings to kernel-managed physical memory are tracked. */
2720 if (!pa_valid(pa)) {
2721 return false;
2722 }
2723
2724 const unsigned int pai = pa_index(pa);
2725 pv_entry_t **pvh = pai_to_pvh(pai);
2726
2727 return pvh_test_type(pvh, PVH_TYPE_NULL);
2728 }
2729
2730 #if MACH_ASSERT
2731 /**
2732 * Verify that a given physical page contains no mappings (outside of the
2733 * default physical aperture mapping) and if it does, then panic.
2734 *
2735 * @note It's recommended to use pmap_verify_free() directly when operating in
2736 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2737 * normally being called from outside of the PPL, and the pv_head_table
2738 * can't be modified outside of the PPL).
2739 *
2740 * @param ppnum Physical page number to check there are no mappings to.
2741 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* The page has at least one mapping; locate it for the panic string. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2814 #endif
2815
2816
2817 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2818 pmap_root_alloc_size(pmap_t pmap)
2819 {
2820 #pragma unused(pmap)
2821 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2822 unsigned int root_level = pt_attr_root_level(pt_attr);
2823 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2824 }
2825
2826
2827 /*
2828 * Create and return a physical map.
2829 *
2830 * If the size specified for the map
2831 * is zero, the map is an actual physical
2832 * map, and may be referenced by the
2833 * hardware.
2834 *
2835 * If the size specified is non-zero,
2836 * the map will be used in software only, and
2837 * is bounded by that size.
2838 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* PPL builds validate and take their own reference on the ledger. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* Widest possible nesting bounds until pmap_nest() narrows them. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* Depends on pmap_pt_attr, so must come after it is settled above. */
	p->max = pmap_user_va_size(p);

	/* Reserve an ASID (or equivalent ID) for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Make the cleared root table visible before the pmap can be used. */
	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_asid_bitmap = NULL;
	p->nested_region_asid_bitmap_size = 0x0UL;

	p->nested_has_no_bounds_ref = false;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Error unwind: release resources in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* *kr lives in kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3029
3030 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3031 pmap_create_options(
3032 ledger_t ledger,
3033 vm_map_size_t size,
3034 unsigned int flags)
3035 {
3036 pmap_t pmap;
3037 kern_return_t kr = KERN_SUCCESS;
3038
3039 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3040
3041 ledger_reference(ledger);
3042
3043 #if XNU_MONITOR
3044 for (;;) {
3045 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3046 if (kr != KERN_RESOURCE_SHORTAGE) {
3047 break;
3048 }
3049 assert(pmap == PMAP_NULL);
3050 pmap_alloc_page_for_ppl(0);
3051 kr = KERN_SUCCESS;
3052 }
3053 #else
3054 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3055 #endif
3056
3057 if (pmap == PMAP_NULL) {
3058 ledger_dereference(ledger);
3059 }
3060
3061 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3062
3063 return pmap;
3064 }
3065
3066 #if XNU_MONITOR
3067 /*
3068 * This symbol remains in place when the PPL is enabled so that the dispatch
3069 * table does not change from development to release configurations.
3070 */
3071 #endif
3072 #if MACH_ASSERT || XNU_MONITOR
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	/*
	 * Record the owning pid/procname on the pmap for debugging.
	 * A pmap_pid of -1 opts the pmap out of this bookkeeping.
	 */
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3090 #endif /* MACH_ASSERT || XNU_MONITOR */
3091
3092 #if MACH_ASSERT
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
	/* Dispatch to the PPL entry point when the monitor is enabled. */
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3105 #endif /* MACH_ASSERT */
3106
3107 /*
3108 * pmap_deallocate_all_leaf_tts:
3109 *
3110 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3111 * removing and deallocating all TTEs.
3112 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* This walk only makes sense above the leaf level. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* A VA of ~0 selects the highest index, i.e. the table's last entry. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are not expected in tables torn down here. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): the lock is re-taken every iteration, so
		 * pmap_tte_deallocate() presumably returns with the pmap
		 * unlocked (cf. pmap_tte_remove()'s documented contract
		 * below) — confirm.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3151
3152 /*
3153 * We maintain stats and ledgers so that a task's physical footprint is:
3154 * phys_footprint = ((internal - alternate_accounting)
3155 * + (internal_compressed - alternate_accounting_compressed)
3156 * + iokit_mapped
3157 * + purgeable_nonvolatile
3158 * + purgeable_nonvolatile_compressed
3159 * + page_table)
3160 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3161 */
3162
3163 /*
3164 * Retire the given physical map from service.
3165 * Should only be called if the map contains
3166 * no valid mappings.
3167 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop one reference; only the last reference performs the teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* No CPU may have this pmap active or in-flight while it is destroyed. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unlink from the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	/* All sub-page table fragments must have been returned by now. */
	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps flush by region; they keep no own ASID. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_asid_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_asid_bitmap,
		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3280
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/*
	 * Capture the ledger before the destroy call: if this drops the last
	 * reference, the pmap structure is freed and must not be touched after.
	 */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Drop the reference taken on the ledger in pmap_create_options(). */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3301
3302
3303 /*
3304 * Add a reference to the specified pmap.
3305 */
3306 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3307 pmap_reference_internal(
3308 pmap_t pmap)
3309 {
3310 if (pmap != PMAP_NULL) {
3311 validate_pmap_mutable(pmap);
3312 os_atomic_inc(&pmap->ref_count, relaxed);
3313 }
3314 }
3315
void
pmap_reference(
	pmap_t pmap)
{
	/* Dispatch to the PPL entry point when the monitor is enabled. */
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3326
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Sub-page requests other than the root-table size are rounded up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/*
	 * Try the size-matched free list first: single-page, two-page, or
	 * sub-page fragments.
	 */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists were empty: allocate fresh zeroed pages. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Carve the unused remainder of the page into sub-page
		 * fragments and splice them onto the sub-page free list.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3403
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the rounding done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	/* NOBLOCK callers skip the trimming below, which can free pages. */
	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list back to its cap, dropping the lock
	 * around each pmap_pages_free() call since it may block.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise trim the two-page free list back to its cap. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3484
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* First try the pmap's private free list of sub-page table fragments. */
	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			/*
			 * NOTE(review): the alloc call tests PMAP_TT_ALLOCATE_NOWAIT but
			 * the bail-out below tests PMAP_OPTIONS_NOWAIT — confirm the two
			 * flags are intended to be aliases here.
			 */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Pair the page with a pagetable descriptor; undo the page alloc on failure. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Account the page as either a twig-or-above TTE page or a leaf PTE page. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		/*
		 * When the pmap's page-table page size is smaller than the VM page
		 * size, keep the first fragment and seed the pmap's free list with
		 * the rest of the page.
		 */
		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t va;
			vm_address_t va_end;

			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3567
3568
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized table fragments that fit in one VM page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this fragment's VA slot as unused in the descriptor. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; reset it before the check. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum refcounts across every fragment sharing this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No fragment of the page is in use. Count how many of the page's
		 * other fragments already sit on the pmap's free list; if all of
		 * them do, the whole VM page can be reclaimed.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/* Unlink every fragment of this page from the free list. */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Not all siblings free yet; just push this fragment. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other fragments of the page remain in use; push this one. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the fully-unused page: drop its PTD, clear the PVH, free it. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3657
/**
 * Safely clear out a translation table entry.
 *
 * @note If the TTE to clear out points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 * @note This function expects to be called with pmap locked exclusive, and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the TTE that should be cleared out.
 * @param level The level of the page table that contains the TTE to be removed.
 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the entry and make the store visible before any TLB maintenance below. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	/* The remaining work operates on the (now unreachable) table and doesn't need the pmap lock. */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			/* Scan only the CPUs not yet observed idle; cleared bits are never re-checked. */
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/* Disconnect still in flight on CPU i: wait (WFE) and retry the loop. */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;
		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}

			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				comp++;
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3832
3833 /**
3834 * Given a pointer to an entry within a `level` page table, delete the
3835 * page table at `level` + 1 that is represented by that entry. For instance,
3836 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3837 * contains the PA of the L3 table, and `level` would be "2".
3838 *
3839 * @note If the table getting deallocated is a leaf table, then that leaf table
3840 * must have a refcnt of zero before getting deallocated. All other levels
3841 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3842 * @note This function expects to be called with pmap locked exclusive and will
3843 * return with pmap unlocked.
3844 *
3845 * @param pmap The pmap that owns the page table to be deallocated.
3846 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3847 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3848 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3849 * @param ttep Pointer to the `level` TTE to remove.
3850 * @param level The level of the table that contains an entry pointing to the
3851 * table to be removed. The deallocated page table will be a
3852 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3853 * deleted).
3854 */
3855 void
3856 pmap_tte_deallocate(
3857 pmap_t pmap,
3858 vm_offset_t va_start,
3859 vm_offset_t va_end,
3860 bool need_strong_sync,
3861 tt_entry_t *ttep,
3862 unsigned int level)
3863 {
3864 tt_entry_t tte;
3865
3866 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3867
3868 tte = *ttep;
3869
3870 if (tte_get_ptd(tte)->pmap != pmap) {
3871 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3872 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3873 }
3874
3875 assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3876 __func__, ttep, (unsigned long long)tte);
3877
3878 /* pmap_tte_remove() will drop the pmap lock */
3879 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3880
3881 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3882 }
3883
3884 /*
3885 * Remove a range of hardware page-table entries.
3886 * The entries given are the first (inclusive)
3887 * and last (exclusive) entries for the VM pages.
3888 * The virtual address is the va for the first pte.
3889 *
3890 * The pmap must be locked.
3891 * If the pmap is not the kernel pmap, the range must lie
3892 * entirely within one pte-page. This is NOT checked.
3893 * Assumes that the pte-page exists.
3894 *
3895 * Returns the number of PTE changed
3896 */
3897 MARK_AS_PMAP_TEXT static int
3898 pmap_remove_range(
3899 pmap_t pmap,
3900 vm_map_address_t va,
3901 pt_entry_t *bpte,
3902 pt_entry_t *epte)
3903 {
3904 bool need_strong_sync = false;
3905 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
3906 &need_strong_sync, PMAP_OPTIONS_REMOVE);
3907 if (num_changed > 0) {
3908 PMAP_UPDATE_TLBS(pmap, va,
3909 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
3910 }
3911 return num_changed;
3912 }
3913
3914
3915 #ifdef PVH_FLAG_EXEC
3916
3917 /*
3918 * Update the access protection bits of the physical aperture mapping for a page.
3919 * This is useful, for example, in guranteeing that a verified executable page
3920 * has no writable mappings anywhere in the system, including the physical
3921 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
3922 * synchronization overhead in cases where the call to this function is
3923 * guaranteed to be followed by other TLB operations.
3924 */
3925 void
3926 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
3927 {
3928 #if __ARM_PTE_PHYSMAP__
3929 pvh_assert_locked(pai);
3930 vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
3931 pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
3932
3933 pt_entry_t tmplate = *pte_p;
3934 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
3935 return;
3936 }
3937 tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
3938 if (tmplate & ARM_PTE_HINT_MASK) {
3939 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
3940 __func__, pte_p, (void *)kva, tmplate);
3941 }
3942 write_pte_strong(pte_p, tmplate);
3943 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
3944 if (!flush_tlb_async) {
3945 sync_tlb_flush();
3946 }
3947 #endif
3948 }
3949 #endif /* defined(PVH_FLAG_EXEC) */
3950
3951
3952
/**
 * Core worker for removing the hardware PTEs in [bpte, epte), where `va` is
 * the VA mapped by the first entry.  Clears PTEs, drops PV list entries,
 * updates pagetable refcounts and task ledgers, but does NOT flush the TLB
 * (callers do that based on the return value).
 *
 * @note Must be called with the pmap locked exclusive.
 *
 * @param eva If non-NULL, preemption is polled each iteration and the loop may
 *            stop early; the VA at which processing stopped is stored here.
 * @param need_strong_sync Unused here; reserved for callers' TLB maintenance.
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_REMOVE enables reclaiming
 *                "compressed" markers.
 *
 * @return The number of PTEs changed.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	int16_t refcnt = 0;  /* Accumulated (negative) delta to apply to the pagetable refcount. */

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The entire range must live within a single page table page. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 * (npages++ also guarantees at least one entry is processed before bailing out.)
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we either determine the PTE maps unmanaged memory, or we have
		 * taken the PVH lock for the mapped page and re-confirmed the PTE still
		 * points at it.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock; retry if it changed to a different page. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* A previously-valid user PTE contributes one pagetable refcount to drop. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for ledger accounting below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta; going to (or below) zero here is a bug. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4189
4190
/*
 * Remove the given range of addresses
 * from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the hardware page size.
 */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/*
	 * Delegate with PMAP_OPTIONS_REMOVE so that "compressed" markers are
	 * reclaimed as well (see pmap_remove_range_options()).
	 */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4206
/**
 * Remove the mappings in [start, end) from the given pmap, where the range is
 * contained within a single twig-level (leaf table) region.
 *
 * If the leaf page table becomes empty (user pmaps only), the table itself is
 * deallocated via pmap_tte_deallocate(), which also drops the pmap lock and
 * performs the TLB maintenance.
 *
 * @return The VA up to which removal actually progressed; may be less than
 *         `end` if pmap_remove_range_options() stopped early for preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;  /* Cleared if pmap_tte_deallocate() drops the lock for us. */

	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig-level entry means there is nothing mapped in this region. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* Leaf table now empty in a user pmap: reclaim the table page itself. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4273
4274 void
4275 pmap_remove_options(
4276 pmap_t pmap,
4277 vm_map_address_t start,
4278 vm_map_address_t end,
4279 int options)
4280 {
4281 vm_map_address_t va;
4282
4283 if (pmap == PMAP_NULL) {
4284 return;
4285 }
4286
4287 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4288
4289 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4290 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4291 VM_KERNEL_ADDRHIDE(end));
4292
4293 #if MACH_ASSERT
4294 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4295 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4296 pmap, (uint64_t)start, (uint64_t)end);
4297 }
4298 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4299 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4300 pmap, (uint64_t)start, (uint64_t)end);
4301 }
4302 #endif
4303
4304 /*
4305 * We allow single-page requests to execute non-preemptibly,
4306 * as it doesn't make sense to sample AST_URGENT for a single-page
4307 * operation, and there are a couple of special use cases that
4308 * require a non-preemptible single-page operation.
4309 */
4310 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4311 pmap_verify_preemptible();
4312 }
4313
4314 /*
4315 * Invalidate the translation buffer first
4316 */
4317 va = start;
4318 while (va < end) {
4319 vm_map_address_t l;
4320
4321 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4322 if (l > end) {
4323 l = end;
4324 }
4325
4326 #if XNU_MONITOR
4327 va = pmap_remove_options_ppl(pmap, va, l, options);
4328
4329 pmap_ledger_check_balance(pmap);
4330 #else
4331 va = pmap_remove_options_internal(pmap, va, l, options);
4332 #endif
4333 }
4334
4335 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4336 }
4337
4338
/*
 * Remove phys addr if mapped in specified map
 *
 * Currently an empty stub: both parameters are intentionally unused.
 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
}
4349
4350 /*
4351 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4352 * switch a thread onto a new vm_map.
4353 */
4354 void
4355 pmap_switch_user(thread_t thread, vm_map_t new_map)
4356 {
4357 pmap_t new_pmap = new_map->pmap;
4358
4359
4360 thread->map = new_map;
4361 pmap_set_pmap(new_pmap, thread);
4362
4363 }
4364
/*
 * Activate the given pmap on the current CPU.  On __ARM_USER_PROTECT__
 * configurations, also record the user TTB and ASID in the thread's machine
 * state (the thread parameter is unused otherwise).
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Cache the translation table base (with setup bits) and ASID for this thread. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4379
4380 static void
4381 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4382 {
4383 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4384 }
4385
4386 static inline bool
4387 pmap_user_ttb_is_clear(void)
4388 {
4389 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4390 }
4391
/**
 * Switch the current CPU onto the given pmap's address space.  Decides which
 * TLB invalidations are required (ASID reuse, nested/shared-region change,
 * and — on mixed-page-size systems — commpage page-size change), performs
 * them, and finally installs the new user TTB.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* Only the kernel pmap may legitimately carry ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* Switching to a pmap with a different nested (shared region) pmap requires a flush. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page-size configuration) also forces break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes. Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Break-before-make: park the user TTB on the invalid table before flushing. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region. Those mappings are global
	 * and will not be protected by the ASID. It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
		 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/* Region too large for a ranged TLBI: a full local TLB flush supersedes the per-ASID flush. */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range TLBI support: fall back to a full local TLB flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Synchronize all async invalidations queued above before installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4518
/*
 * Switch the current CPU to the given pmap, dispatching to the PPL variant on
 * XNU_MONITOR configurations.  Emits begin/end tracepoints around the switch.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4531
/*
 * Lower the permission for all mappings to the given physical page.
 * Convenience wrapper around pmap_page_protect_options() with no options and
 * no caller-supplied argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4539
4540 /*
4541 * Routine: pmap_page_protect_options
4542 *
4543 * Function:
4544 * Lower the permission for all mappings to a given
4545 * page.
4546 */
4547 MARK_AS_PMAP_TEXT static void
4548 pmap_page_protect_options_with_flush_range(
4549 ppnum_t ppnum,
4550 vm_prot_t prot,
4551 unsigned int options,
4552 pmap_tlb_flush_range_t *flush_range)
4553 {
4554 pmap_paddr_t phys = ptoa(ppnum);
4555 pv_entry_t **pv_h;
4556 pv_entry_t *pve_p, *orig_pve_p;
4557 pv_entry_t *pveh_p;
4558 pv_entry_t *pvet_p;
4559 pt_entry_t *pte_p, *orig_pte_p;
4560 pv_entry_t *new_pve_p;
4561 pt_entry_t *new_pte_p;
4562 vm_offset_t pvh_flags;
4563 unsigned int pai;
4564 bool remove;
4565 bool set_NX;
4566 unsigned int pvh_cnt = 0;
4567 unsigned int pass1_updated = 0;
4568 unsigned int pass2_updated = 0;
4569
4570 assert(ppnum != vm_page_fictitious_addr);
4571
4572 /* Only work with managed pages. */
4573 if (!pa_valid(phys)) {
4574 return;
4575 }
4576
4577 /*
4578 * Determine the new protection.
4579 */
4580 switch (prot) {
4581 case VM_PROT_ALL:
4582 return; /* nothing to do */
4583 case VM_PROT_READ:
4584 case VM_PROT_READ | VM_PROT_EXECUTE:
4585 remove = false;
4586 break;
4587 default:
4588 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4589 options = options & ~PMAP_OPTIONS_NOFLUSH;
4590 remove = true;
4591 break;
4592 }
4593
4594 pmap_cpu_data_t *pmap_cpu_data = NULL;
4595 if (remove) {
4596 #if !XNU_MONITOR
4597 mp_disable_preemption();
4598 #endif
4599 pmap_cpu_data = pmap_get_cpu_data();
4600 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4601 /*
4602 * Ensure the store to inflight_disconnect will be observed before any of the
4603 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4604 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4605 * another CPU, in between this function's clearing a PTE and dropping the
4606 * corresponding pagetable refcount. That can lead to a panic if the
4607 * destroying thread observes a non-zero refcount. For this we need a store-
4608 * store barrier; a store-release operation would not be sufficient.
4609 */
4610 os_atomic_thread_fence(release);
4611 }
4612
4613 pai = pa_index(phys);
4614 pvh_lock(pai);
4615 pv_h = pai_to_pvh(pai);
4616 pvh_flags = pvh_get_flags(pv_h);
4617
4618 #if XNU_MONITOR
4619 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4620 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4621 }
4622 if (__improbable(ppattr_pa_test_monitor(phys))) {
4623 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4624 }
4625 #endif
4626
4627
4628 orig_pte_p = pte_p = PT_ENTRY_NULL;
4629 orig_pve_p = pve_p = PV_ENTRY_NULL;
4630 pveh_p = PV_ENTRY_NULL;
4631 pvet_p = PV_ENTRY_NULL;
4632 new_pve_p = PV_ENTRY_NULL;
4633 new_pte_p = PT_ENTRY_NULL;
4634
4635
4636 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4637 orig_pte_p = pte_p = pvh_ptep(pv_h);
4638 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4639 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4640 pveh_p = pve_p;
4641 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4642 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4643 }
4644
4645 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4646 int pve_ptep_idx = 0;
4647
4648 /*
4649 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4650 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4651 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4652 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4653 * operation, TLB invalidation may be handled by the caller so it's possible for
4654 * tlb_flush_needed to be true while issue_tlbi is false.
4655 */
4656 bool issue_tlbi = false;
4657 bool tlb_flush_needed = false;
4658 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4659 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4660 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4661 bool update = false;
4662
4663 if (pve_p != PV_ENTRY_NULL) {
4664 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4665 if (pte_p == PT_ENTRY_NULL) {
4666 goto protect_skip_pve_pass1;
4667 }
4668 }
4669
4670 #ifdef PVH_FLAG_IOMMU
4671 if (pvh_ptep_is_iommu(pte_p)) {
4672 #if XNU_MONITOR
4673 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4674 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4675 ppnum, ptep_get_iommu(pte_p), pve_p);
4676 }
4677 #endif
4678 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4679 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4680 ppnum, ptep_get_iommu(pte_p), pve_p);
4681 }
4682 goto protect_skip_pve_pass1;
4683 }
4684 #endif
4685 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4686 const pmap_t pmap = ptdp->pmap;
4687 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4688
4689 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4690 #if MACH_ASSERT
4691 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4692 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4693 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4694 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4695
4696 pv_entry_t *check_pvep = pve_p;
4697
4698 do {
4699 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4700 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4701 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4702 }
4703 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4704
4705 /* Restore previous PTEP value. */
4706 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4707 }
4708 #endif
4709 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4710 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4711 }
4712
4713 #if DEVELOPMENT || DEBUG
4714 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4715 #else
4716 if ((prot & VM_PROT_EXECUTE))
4717 #endif
4718 {
4719 set_NX = false;
4720 } else {
4721 set_NX = true;
4722 }
4723
4724 /* Remove the mapping if new protection is NONE */
4725 if (remove) {
4726 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4727 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4728 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4729 pt_entry_t spte = *pte_p;
4730
4731 if (pte_is_wired(spte)) {
4732 pte_set_wired(pmap, pte_p, 0);
4733 spte = *pte_p;
4734 if (pmap != kernel_pmap) {
4735 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4736 }
4737 }
4738
4739 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4740 (uint64_t)spte, pte_p, ppnum);
4741
4742 if (compress && is_internal && (pmap != kernel_pmap)) {
4743 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4744 /* mark this PTE as having been "compressed" */
4745 tmplate = ARM_PTE_COMPRESSED;
4746 if (is_altacct) {
4747 tmplate |= ARM_PTE_COMPRESSED_ALT;
4748 }
4749 } else {
4750 tmplate = ARM_PTE_TYPE_FAULT;
4751 }
4752
4753 assert(spte != tmplate);
4754 write_pte_fast(pte_p, tmplate);
4755 update = true;
4756 ++pass1_updated;
4757
4758 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4759
4760 if (pmap != kernel_pmap) {
4761 if (ppattr_test_reusable(pai) &&
4762 is_internal &&
4763 !is_altacct) {
4764 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4765 } else if (!is_internal) {
4766 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4767 }
4768
4769 if (is_altacct) {
4770 assert(is_internal);
4771 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4772 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4773 if (options & PMAP_OPTIONS_COMPRESSOR) {
4774 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4775 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4776 }
4777 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4778 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4779 } else if (ppattr_test_reusable(pai)) {
4780 assert(is_internal);
4781 if (options & PMAP_OPTIONS_COMPRESSOR) {
4782 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4783 /* was not in footprint, but is now */
4784 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4785 }
4786 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4787 } else if (is_internal) {
4788 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4789
4790 /*
4791 * Update all stats related to physical footprint, which only
4792 * deals with internal pages.
4793 */
4794 if (options & PMAP_OPTIONS_COMPRESSOR) {
4795 /*
4796 * This removal is only being done so we can send this page to
4797 * the compressor; therefore it mustn't affect total task footprint.
4798 */
4799 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4800 } else {
4801 /*
4802 * This internal page isn't going to the compressor, so adjust stats to keep
4803 * phys_footprint up to date.
4804 */
4805 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4806 }
4807 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4808 } else {
4809 /* external page: no impact on ledgers */
4810 }
4811 }
4812 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4813 } else {
4814 pt_entry_t spte = *pte_p;
4815 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4816
4817 if (pmap == kernel_pmap) {
4818 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4819 } else {
4820 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4821 }
4822
4823 /*
4824 * While the naive implementation of this would serve to add execute
4825 * permission, this is not how the VM uses this interface, or how
4826 * x86_64 implements it. So ignore requests to add execute permissions.
4827 */
4828 if (set_NX) {
4829 tmplate |= pt_attr_leaf_xn(pt_attr);
4830 }
4831
4832
4833 assert(spte != ARM_PTE_TYPE_FAULT);
4834 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4835
4836 if (spte != tmplate) {
4837 /*
4838 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4839 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4840 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4841 * should always be cleared by this function.
4842 */
4843 pte_set_was_writeable(tmplate, true);
4844 write_pte_fast(pte_p, tmplate);
4845 update = true;
4846 ++pass1_updated;
4847 } else if (pte_was_writeable(tmplate)) {
4848 /*
4849 * We didn't change any of the relevant permission bits in the PTE, so we don't need
4850 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
4851 * write access to a page, this function should always at least clear that flag for
4852 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
4853 * these mappings go through vm_fault(). We therefore don't want those accesses to
4854 * be handled through arm_fast_fault().
4855 */
4856 pte_set_was_writeable(tmplate, false);
4857 write_pte_fast(pte_p, tmplate);
4858 }
4859 }
4860
4861 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
4862 tlb_flush_needed = true;
4863 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
4864 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4865 issue_tlbi = true;
4866 }
4867 }
4868 protect_skip_pve_pass1:
4869 pte_p = PT_ENTRY_NULL;
4870 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4871 pve_ptep_idx = 0;
4872 pve_p = pve_next(pve_p);
4873 }
4874 }
4875
4876 if (tlb_flush_needed) {
4877 FLUSH_PTE_STRONG();
4878 }
4879
4880 if (!remove && !issue_tlbi) {
4881 goto protect_finish;
4882 }
4883
4884 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
4885 pv_entry_t **pve_pp = pv_h;
4886 pve_p = orig_pve_p;
4887 pte_p = orig_pte_p;
4888 pve_ptep_idx = 0;
4889
4890 /*
4891 * We need to keep track of whether a particular PVE list contains IOMMU
4892 * mappings when removing entries, because we should only remove CPU
4893 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4894 * it around.
4895 */
4896 bool iommu_mapping_in_pve = false;
4897 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4898 if (pve_p != PV_ENTRY_NULL) {
4899 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4900 if (pte_p == PT_ENTRY_NULL) {
4901 goto protect_skip_pve_pass2;
4902 }
4903 }
4904
4905 #ifdef PVH_FLAG_IOMMU
4906 if (pvh_ptep_is_iommu(pte_p)) {
4907 iommu_mapping_in_pve = true;
4908 if (remove && (pve_p == PV_ENTRY_NULL)) {
4909 /*
4910 * We've found an IOMMU entry and it's the only entry in the PV list.
4911 * We don't discard IOMMU entries, so simply set up the new PV list to
4912 * contain the single IOMMU PTE and exit the loop.
4913 */
4914 new_pte_p = pte_p;
4915 break;
4916 }
4917 goto protect_skip_pve_pass2;
4918 }
4919 #endif
4920 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4921 const pmap_t pmap = ptdp->pmap;
4922 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4923
4924 if (remove) {
4925 if (!compress && (pmap != kernel_pmap)) {
4926 /*
4927 * We must wait to decrement the refcount until we're completely finished using the PTE
4928 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
4929 * pmap_remove() call might observe the zero refcount and free the pagetable out from
4930 * under us.
4931 */
4932 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
4933 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
4934 }
4935 }
4936 /* Remove this CPU mapping from PVE list. */
4937 if (pve_p != PV_ENTRY_NULL) {
4938 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4939 }
4940 } else {
4941 pt_entry_t spte = *pte_p;
4942 if (pte_was_writeable(spte)) {
4943 pte_set_was_writeable(spte, false);
4944 write_pte_fast(pte_p, spte);
4945 } else {
4946 goto protect_skip_pve_pass2;
4947 }
4948 }
4949 ++pass2_updated;
4950 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
4951 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4952 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
4953 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
4954 }
4955
4956 protect_skip_pve_pass2:
4957 pte_p = PT_ENTRY_NULL;
4958 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4959 pve_ptep_idx = 0;
4960
4961 if (remove) {
4962 /**
4963 * If there are any IOMMU mappings in the PVE list, preserve
4964 * those mappings in a new PVE list (new_pve_p) which will later
4965 * become the new PVH entry. Keep track of the CPU mappings in
4966 * pveh_p/pvet_p so they can be deallocated later.
4967 */
4968 if (iommu_mapping_in_pve) {
4969 iommu_mapping_in_pve = false;
4970 pv_entry_t *temp_pve_p = pve_next(pve_p);
4971 pve_remove(pv_h, pve_pp, pve_p);
4972 pveh_p = pvh_pve_list(pv_h);
4973 pve_p->pve_next = new_pve_p;
4974 new_pve_p = pve_p;
4975 pve_p = temp_pve_p;
4976 continue;
4977 } else {
4978 pvet_p = pve_p;
4979 pvh_cnt++;
4980 }
4981 }
4982
4983 pve_pp = pve_next_ptr(pve_p);
4984 pve_p = pve_next(pve_p);
4985 iommu_mapping_in_pve = false;
4986 }
4987 }
4988
4989 protect_finish:
4990
4991 #ifdef PVH_FLAG_EXEC
4992 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
4993 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
4994 }
4995 #endif
4996 if (__improbable(pass1_updated != pass2_updated)) {
4997 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
4998 __func__, pass1_updated, pass2_updated);
4999 }
5000 /* if we removed a bunch of entries, take care of them now */
5001 if (remove) {
5002 if (new_pve_p != PV_ENTRY_NULL) {
5003 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5004 pvh_set_flags(pv_h, pvh_flags);
5005 } else if (new_pte_p != PT_ENTRY_NULL) {
5006 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5007 pvh_set_flags(pv_h, pvh_flags);
5008 } else {
5009 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5010 }
5011 }
5012
5013 if (flush_range && tlb_flush_needed) {
5014 if (!remove) {
5015 flush_range->ptfr_flush_needed = true;
5016 tlb_flush_needed = false;
5017 }
5018 }
5019
5020 /*
5021 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5022 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5023 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5024 * a page to be repurposed while it is still live in the TLBs.
5025 */
5026 if (remove && tlb_flush_needed) {
5027 sync_tlb_flush();
5028 }
5029
5030 pvh_unlock(pai);
5031
5032 if (remove) {
5033 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5034 #if !XNU_MONITOR
5035 mp_enable_preemption();
5036 #endif
5037 }
5038
5039 if (!remove && tlb_flush_needed) {
5040 sync_tlb_flush();
5041 }
5042
5043 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5044 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5045 }
5046 }
5047
5048 MARK_AS_PMAP_TEXT void
5049 pmap_page_protect_options_internal(
5050 ppnum_t ppnum,
5051 vm_prot_t prot,
5052 unsigned int options,
5053 void *arg)
5054 {
5055 if (arg != NULL) {
5056 /*
5057 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5058 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5059 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5060 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5061 * In that case, force the flush to take place.
5062 */
5063 options &= ~PMAP_OPTIONS_NOFLUSH;
5064 }
5065 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5066 }
5067
5068 void
5069 pmap_page_protect_options(
5070 ppnum_t ppnum,
5071 vm_prot_t prot,
5072 unsigned int options,
5073 void *arg)
5074 {
5075 pmap_paddr_t phys = ptoa(ppnum);
5076
5077 assert(ppnum != vm_page_fictitious_addr);
5078
5079 /* Only work with managed pages. */
5080 if (!pa_valid(phys)) {
5081 return;
5082 }
5083
5084 /*
5085 * Determine the new protection.
5086 */
5087 if (prot == VM_PROT_ALL) {
5088 return; /* nothing to do */
5089 }
5090
5091 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5092
5093 #if XNU_MONITOR
5094 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5095 #else
5096 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5097 #endif
5098
5099 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5100 }
5101
5102
5103 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
/*
 * Permanently disable user-space JOP (pointer authentication of user
 * pointers) for the given pmap.  Must never be called on kernel_pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate the (possibly caller-supplied) pmap pointer before mutating it. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5113
/*
 * Public entry point: disable user JOP for a pmap, routing through the
 * PPL on XNU_MONITOR configurations.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5123 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5124
5125 /*
5126 * Indicates if the pmap layer enforces some additional restrictions on the
5127 * given set of protections.
5128 */
5129 bool
5130 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5131 {
5132 return false;
5133 }
5134
5135 /*
5136 * Set the physical protection on the
5137 * specified range of this map as requested.
5138 * VERY IMPORTANT: Will not increase permissions.
5139 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5140 */
5141 void
5142 pmap_protect(
5143 pmap_t pmap,
5144 vm_map_address_t b,
5145 vm_map_address_t e,
5146 vm_prot_t prot)
5147 {
5148 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5149 }
5150
/*
 * Internal worker for pmap_protect_options().  Operates on at most one
 * twig-level (single last-level page table) worth of VA space, and may stop
 * early if preemption is pending.
 *
 * Returns the VA just past the last entry actually processed; equal to
 * 'end' when the entire requested range was handled.  The caller is
 * expected to re-invoke with the returned address to make forward progress.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must not cross a twig (last-level table) boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Revoking all access is pmap_remove()'s job; reaching here is a caller bug. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/* Periodically check for pending preemption and bail out early if found. */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/*
			 * Take the PVH lock for the page this PTE maps, retrying if the
			 * PTE is concurrently retargeted to a different page.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have been cleared/compressed while we took the lock. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table mapped here: nothing to demote in this range. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5393
/*
 * Public entry point for lowering protections on [b, e).  Splits the
 * request at twig (last-level table) boundaries and dispatches each chunk
 * to pmap_protect_options_internal() (via the PPL on XNU_MONITOR), which
 * may itself return early on pending preemption; the loop resumes from
 * wherever the worker stopped.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		/* Immediate revocation of all access is a removal, not a protect. */
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* Clamp this chunk to the next twig boundary (or the end of the range). */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

		/* The worker returns how far it actually got; resume from there. */
#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5468
5469 /**
5470 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5471 *
5472 * @param pmap pmap to insert the pages into.
5473 * @param va virtual address to map the pages into.
5474 * @param pa page number of the first physical page to map.
5475 * @param size block size, in number of pages.
5476 * @param prot mapping protection attributes.
5477 * @param attr flags to pass to pmap_enter().
5478 *
5479 * @return KERN_SUCCESS.
5480 */
5481 kern_return_t
5482 pmap_map_block(
5483 pmap_t pmap,
5484 addr64_t va,
5485 ppnum_t pa,
5486 uint32_t size,
5487 vm_prot_t prot,
5488 int attr,
5489 unsigned int flags)
5490 {
5491 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5492 }
5493
5494 /**
5495 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5496 * As opposed to pmap_map_block(), this function takes
5497 * a physical address as an input and operates using the
5498 * page size associated with the input pmap.
5499 *
5500 * @param pmap pmap to insert the pages into.
5501 * @param va virtual address to map the pages into.
5502 * @param pa physical address of the first physical page to map.
5503 * @param size block size, in number of pages.
5504 * @param prot mapping protection attributes.
5505 * @param attr flags to pass to pmap_enter().
5506 *
5507 * @return KERN_SUCCESS.
5508 */
5509 kern_return_t
5510 pmap_map_block_addr(
5511 pmap_t pmap,
5512 addr64_t va,
5513 pmap_paddr_t pa,
5514 uint32_t size,
5515 vm_prot_t prot,
5516 int attr,
5517 unsigned int flags)
5518 {
5519 #if __ARM_MIXED_PAGE_SIZE__
5520 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5521 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5522 #else
5523 const uint64_t pmap_page_size = PAGE_SIZE;
5524 #endif
5525
5526 for (ppnum_t page = 0; page < size; page++) {
5527 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5528 panic("%s: failed pmap_enter_addr, "
5529 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5530 __FUNCTION__,
5531 pmap, va, (uint64_t)pa, size, prot, flags);
5532 }
5533
5534 va += pmap_page_size;
5535 pa += pmap_page_size;
5536 }
5537
5538 return KERN_SUCCESS;
5539 }
5540
/*
 * Enter a single mapping for physical address 'pa' at virtual address 'v'.
 * Thin wrapper around pmap_enter_options_addr() with no options/arg.
 */
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
}
5553
5554 /*
5555 * Insert the given physical page (p) at
5556 * the specified virtual address (v) in the
5557 * target physical map with the protection requested.
5558 *
5559 * If specified, the page will be wired down, meaning
5560 * that the related pte can not be reclaimed.
5561 *
5562 * NB: This is the only routine which MAY NOT lazy-evaluate
5563 * or lose information. That is, this routine must actually
5564 * insert this page into the given map eventually (must make
5565 * forward progress eventually.
5566 */
5567 kern_return_t
5568 pmap_enter(
5569 pmap_t pmap,
5570 vm_map_address_t v,
5571 ppnum_t pn,
5572 vm_prot_t prot,
5573 vm_prot_t fault_type,
5574 unsigned int flags,
5575 boolean_t wired)
5576 {
5577 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5578 }
5579
5580 /*
5581 * Attempt to commit the pte.
5582 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5583 * Performs no page table or accounting writes on failures.
5584 */
5585 static inline bool
5586 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5587 {
5588 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5589 bool success = false, changed_wiring = false;
5590
5591 __unreachable_ok_push
5592 if (TEST_PAGE_RATIO_4) {
5593 /*
5594 * 16K virtual pages w/ 4K hw pages.
5595 * We actually need to update 4 ptes here which can't easily be done atomically.
5596 * As a result we require the exclusive pmap lock.
5597 */
5598 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5599 *old_pte = *pte_p;
5600 if (*old_pte == new_pte) {
5601 /* Another thread completed this operation. Nothing to do here. */
5602 success = true;
5603 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5604 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5605 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5606 success = false;
5607 } else {
5608 write_pte_fast(pte_p, new_pte);
5609 success = true;
5610 }
5611 } else {
5612 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5613 }
5614 __unreachable_ok_pop
5615
5616 if (success && *old_pte != new_pte) {
5617 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5618 FLUSH_PTE_STRONG();
5619 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5620 } else {
5621 FLUSH_PTE();
5622 __builtin_arm_isb(ISB_SY);
5623 }
5624 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5625 (new_pte & ARM_PTE_WIRED) != 0 :
5626 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5627
5628 if (pmap != kernel_pmap && changed_wiring) {
5629 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5630 if (new_pte & ARM_PTE_WIRED) {
5631 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5632 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5633 } else {
5634 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5635 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5636 }
5637 }
5638
5639 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5640 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5641 }
5642 return success;
5643 }
5644
5645 MARK_AS_PMAP_TEXT static pt_entry_t
5646 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5647 {
5648 pt_entry_t pte;
5649
5650 switch (wimg & (VM_WIMG_MASK)) {
5651 case VM_WIMG_IO:
5652 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5653 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5654 // AP, while preserving the security benefits of using device
5655 // mapping against side-channel attacks. On pre-H14 platforms,
5656 // the accesses will still be strongly ordered.
5657 if (is_dram_addr(pa)) {
5658 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5659 } else {
5660 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5661 }
5662 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5663 break;
5664 case VM_WIMG_RT:
5665 #if HAS_UCNORMAL_MEM
5666 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5667 #else
5668 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5669 #endif
5670 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5671 break;
5672 case VM_WIMG_POSTED:
5673 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5674 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5675 break;
5676 case VM_WIMG_POSTED_REORDERED:
5677 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5678 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5679 break;
5680 case VM_WIMG_POSTED_COMBINED_REORDERED:
5681 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5682 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5683 break;
5684 case VM_WIMG_WCOMB:
5685 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5686 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5687 break;
5688 case VM_WIMG_WTHRU:
5689 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5690 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5691 break;
5692 case VM_WIMG_COPYBACK:
5693 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5694 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5695 break;
5696 case VM_WIMG_INNERWBACK:
5697 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5698 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5699 break;
5700 default:
5701 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5702 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5703 }
5704
5705 return pte;
5706 }
5707
5708
5709 /*
5710 * Construct a PTE (and the physical page attributes) for the given virtual to
5711 * physical mapping.
5712 *
5713 * This function has no side effects and is safe to call so that it is safe to
5714 * call while attempting a pmap_enter transaction.
5715 */
5716 MARK_AS_PMAP_TEXT static pt_entry_t
5717 pmap_construct_pte(
5718 const pmap_t pmap,
5719 vm_map_address_t va,
5720 pmap_paddr_t pa,
5721 vm_prot_t prot,
5722 vm_prot_t fault_type,
5723 boolean_t wired,
5724 const pt_attr_t* const pt_attr,
5725 uint16_t *pp_attr_bits /* OUTPUT */
5726 )
5727 {
5728 bool set_NX = false, set_XO = false;
5729 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5730 assert(pp_attr_bits != NULL);
5731 *pp_attr_bits = 0;
5732
5733 if (wired) {
5734 pte |= ARM_PTE_WIRED;
5735 }
5736
5737 #if DEVELOPMENT || DEBUG
5738 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5739 #else
5740 if ((prot & VM_PROT_EXECUTE))
5741 #endif
5742 {
5743 set_NX = false;
5744 } else {
5745 set_NX = true;
5746 }
5747
5748 if (prot == VM_PROT_EXECUTE) {
5749 set_XO = true;
5750 }
5751
5752 if (set_NX) {
5753 pte |= pt_attr_leaf_xn(pt_attr);
5754 } else {
5755 if (pmap == kernel_pmap) {
5756 pte |= ARM_PTE_NX;
5757 } else {
5758 pte |= pt_attr_leaf_x(pt_attr);
5759 }
5760 }
5761
5762 if (pmap == kernel_pmap) {
5763 #if __ARM_KERNEL_PROTECT__
5764 pte |= ARM_PTE_NG;
5765 #endif /* __ARM_KERNEL_PROTECT__ */
5766 if (prot & VM_PROT_WRITE) {
5767 pte |= ARM_PTE_AP(AP_RWNA);
5768 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5769 } else {
5770 pte |= ARM_PTE_AP(AP_RONA);
5771 *pp_attr_bits |= PP_ATTR_REFERENCED;
5772 }
5773 } else {
5774 if (pmap->type != PMAP_TYPE_NESTED) {
5775 pte |= ARM_PTE_NG;
5776 } else if ((pmap->nested_region_asid_bitmap)
5777 && (va >= pmap->nested_region_addr)
5778 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5779 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5780
5781 if ((pmap->nested_region_asid_bitmap)
5782 && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5783 pte |= ARM_PTE_NG;
5784 }
5785 }
5786 if (prot & VM_PROT_WRITE) {
5787 assert(pmap->type != PMAP_TYPE_NESTED);
5788 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5789 if (fault_type & VM_PROT_WRITE) {
5790 pte |= pt_attr_leaf_rw(pt_attr);
5791 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5792 } else {
5793 pte |= pt_attr_leaf_ro(pt_attr);
5794 /*
5795 * Mark the page as MODFAULT so that a subsequent write
5796 * may be handled through arm_fast_fault().
5797 */
5798 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5799 pte_set_was_writeable(pte, true);
5800 }
5801 } else {
5802 pte |= pt_attr_leaf_rw(pt_attr);
5803 *pp_attr_bits |= PP_ATTR_REFERENCED;
5804 }
5805 } else {
5806 if (set_XO) {
5807 pte |= pt_attr_leaf_rona(pt_attr);
5808 } else {
5809 pte |= pt_attr_leaf_ro(pt_attr);
5810 }
5811 *pp_attr_bits |= PP_ATTR_REFERENCED;
5812 }
5813 }
5814
5815 pte |= ARM_PTE_AF;
5816 return pte;
5817 }
5818
/*
 * Internal implementation of pmap_enter_options(): enter a mapping from
 * virtual address 'v' to physical address 'pa' in 'pmap' with the requested
 * protection.
 *
 * The PTE update is structured as a retryable transaction: non-local state
 * is only modified once the new PTE has been committed by pmap_enter_pte(),
 * so the loop can restart safely when another thread races us.
 *
 * Returns KERN_SUCCESS on success, KERN_ABORTED if a preemptible lock
 * acquisition was aborted, KERN_RESOURCE_SHORTAGE if a PV entry allocation
 * failed, KERN_FAILURE on an attempt to create an executable mapping of a
 * non-managed page, or the return value of pmap_expand() if page table
 * expansion fails.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t pte;
	pt_entry_t spte;
	pt_entry_t *pte_p;
	bool refcnt_updated;
	bool wiredcnt_updated;
	bool ro_va = false;
	unsigned int wimg_bits;
	bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Sanity-check alignment and address ranges before touching any state. */
	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p v 0x%llx",
		    pmap, (uint64_t)v);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 * Expand pmap to include this pte. Assume that
	 * pmap is always expanded to include enough hardware
	 * pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	if (options & PMAP_OPTIONS_NOENTER) {
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before commiting
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

#if XNU_MONITOR
		/**
		 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
		 * but in support of hibernation we allow temporary read-only mappings of these pages to be
		 * created and later removed. We must therefore prevent an attacker from downgrading a
		 * a writable mapping in order to allow it to be removed and remapped to something else.
		 */
		if (__improbable(had_valid_mapping && !pa_valid(pte_to_pa(spte)) &&
		    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !(prot & VM_PROT_WRITE) &&
		    (pmap_cache_attributes((ppnum_t)atop(pte_to_pa(spte))) & PP_ATTR_MONITOR))) {
			panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
			    __func__, (uint64_t)pte_to_pa(spte));
		}
#endif

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_TYPE_FAULT;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* Managed memory: PV list tracking, cache attributes, and ledger accounting apply. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Release the pre-allocated nodes back to the PV free list. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-managed (e.g. device/I/O) memory: no PV list tracking applies. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
				uint64_t xprr_perm = pte_to_xprr_perm(pte);
				switch (xprr_perm) {
				case XPRR_KERN_RO_PERM:
					break;
				case XPRR_KERN_RW_PERM:
					pte &= ~ARM_PTE_XPRR_MASK;
					pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
					break;
				default:
					panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
				}
			}
#endif
			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		/* v maps an RO zone page (prot was verified VM_PROT_READ above); disable writes. */
		pmap_phys_write_disable(v);
	}

	return kr;
}
6320
/*
 * Enter a mapping from virtual address 'v' to physical address 'pa' in
 * 'pmap', retrying the internal (or PPL) implementation as needed:
 * KERN_ABORTED (preempted lock attempt) always retries, and
 * KERN_RESOURCE_SHORTAGE retries unless the caller passed
 * PMAP_OPTIONS_NOWAIT.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed the PPL page free list before retrying (or before giving up under NOWAIT). */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6366
6367 kern_return_t
6368 pmap_enter_options(
6369 pmap_t pmap,
6370 vm_map_address_t v,
6371 ppnum_t pn,
6372 vm_prot_t prot,
6373 vm_prot_t fault_type,
6374 unsigned int flags,
6375 boolean_t wired,
6376 unsigned int options,
6377 __unused void *arg)
6378 {
6379 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6380 }
6381
6382 /*
6383 * Routine: pmap_change_wiring
6384 * Function: Change the wiring attribute for a map/virtual-address
6385 * pair.
6386 * In/out conditions:
6387 * The mapping must already exist in the pmap.
6388 */
6389 MARK_AS_PMAP_TEXT kern_return_t
6390 pmap_change_wiring_internal(
6391 pmap_t pmap,
6392 vm_map_address_t v,
6393 boolean_t wired)
6394 {
6395 pt_entry_t *pte_p;
6396 pmap_paddr_t pa;
6397
6398 validate_pmap_mutable(pmap);
6399
6400 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6401 return KERN_ABORTED;
6402 }
6403
6404 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6405
6406 pte_p = pmap_pte(pmap, v);
6407 if (pte_p == PT_ENTRY_NULL) {
6408 if (!wired) {
6409 /*
6410 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6411 * may have been freed by a remove operation.
6412 */
6413 goto pmap_change_wiring_return;
6414 } else {
6415 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6416 }
6417 }
6418 /*
6419 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6420 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6421 */
6422 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6423
6424 while (pa_valid(pa)) {
6425 pmap_paddr_t new_pa;
6426
6427 pvh_lock(pa_index(pa));
6428 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6429
6430 if (pa == new_pa) {
6431 break;
6432 }
6433
6434 pvh_unlock(pa_index(pa));
6435 pa = new_pa;
6436 }
6437
6438 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6439 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6440 if (!wired) {
6441 /* PTE cleared by prior remove/disconnect operation */
6442 goto pmap_change_wiring_cleanup;
6443 } else {
6444 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6445 __func__, pte_p, (uint64_t)*pte_p, pmap);
6446 }
6447 }
6448
6449 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6450 if (wired != pte_is_wired(*pte_p)) {
6451 pte_set_wired(pmap, pte_p, wired);
6452 if (pmap != kernel_pmap) {
6453 if (wired) {
6454 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6455 } else if (!wired) {
6456 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6457 }
6458 }
6459 }
6460
6461 pmap_change_wiring_cleanup:
6462 if (pa_valid(pa)) {
6463 pvh_unlock(pa_index(pa));
6464 }
6465
6466 pmap_change_wiring_return:
6467 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6468
6469 return KERN_SUCCESS;
6470 }
6471
6472 void
6473 pmap_change_wiring(
6474 pmap_t pmap,
6475 vm_map_address_t v,
6476 boolean_t wired)
6477 {
6478 /* This function is going to lock the pmap lock, so it'd better be preemptible. */
6479 pmap_verify_preemptible();
6480
6481 kern_return_t kr = KERN_FAILURE;
6482 #if XNU_MONITOR
6483 /* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6484 do {
6485 kr = pmap_change_wiring_ppl(pmap, v, wired);
6486 } while (kr == KERN_ABORTED);
6487
6488 pmap_ledger_check_balance(pmap);
6489 #else
6490 /* Since we verified preemptibility, call the helper only once. */
6491 kr = pmap_change_wiring_internal(pmap, v, wired);
6492 #endif
6493
6494 if (kr != KERN_SUCCESS) {
6495 panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6496 __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6497 }
6498 }
6499
6500 MARK_AS_PMAP_TEXT pmap_paddr_t
6501 pmap_find_pa_internal(
6502 pmap_t pmap,
6503 addr64_t va)
6504 {
6505 pmap_paddr_t pa = 0;
6506
6507 validate_pmap(pmap);
6508
6509 if (pmap != kernel_pmap) {
6510 pmap_lock(pmap, PMAP_LOCK_SHARED);
6511 }
6512
6513 pa = pmap_vtophys(pmap, va);
6514
6515 if (pmap != kernel_pmap) {
6516 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6517 }
6518
6519 return pa;
6520 }
6521
6522 pmap_paddr_t
6523 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6524 {
6525 pmap_paddr_t pa = 0;
6526
6527 if (pmap == kernel_pmap) {
6528 pa = mmu_kvtop(va);
6529 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6530 /*
6531 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6532 * translation even if PAN would prevent kernel access through the translation.
6533 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6534 */
6535 pa = mmu_uvtop(va);
6536 }
6537 return pa;
6538 }
6539
6540 pmap_paddr_t
6541 pmap_find_pa(
6542 pmap_t pmap,
6543 addr64_t va)
6544 {
6545 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6546
6547 if (pa != 0) {
6548 return pa;
6549 }
6550
6551 if (not_in_kdp) {
6552 #if XNU_MONITOR
6553 return pmap_find_pa_ppl(pmap, va);
6554 #else
6555 return pmap_find_pa_internal(pmap, va);
6556 #endif
6557 } else {
6558 return pmap_vtophys(pmap, va);
6559 }
6560 }
6561
6562 ppnum_t
6563 pmap_find_phys_nofault(
6564 pmap_t pmap,
6565 addr64_t va)
6566 {
6567 ppnum_t ppn;
6568 ppn = atop(pmap_find_pa_nofault(pmap, va));
6569 return ppn;
6570 }
6571
6572 ppnum_t
6573 pmap_find_phys(
6574 pmap_t pmap,
6575 addr64_t va)
6576 {
6577 ppnum_t ppn;
6578 ppn = atop(pmap_find_pa(pmap, va));
6579 return ppn;
6580 }
6581
6582 /**
6583 * Translate a kernel virtual address into a physical address.
6584 *
6585 * @param va The kernel virtual address to translate. Does not work on user
6586 * virtual addresses.
6587 *
6588 * @return The physical address if the translation was successful, or zero if
6589 * no valid mappings were found for the given virtual address.
6590 */
6591 pmap_paddr_t
6592 kvtophys(vm_offset_t va)
6593 {
6594 /**
6595 * Attempt to do the translation first in hardware using the AT (address
6596 * translation) instruction. This will attempt to use the MMU to do the
6597 * translation for us.
6598 */
6599 pmap_paddr_t pa = mmu_kvtop(va);
6600
6601 if (pa) {
6602 return pa;
6603 }
6604
6605 /* If the MMU can't find the mapping, then manually walk the page tables. */
6606 return pmap_vtophys(kernel_pmap, va);
6607 }
6608
6609 /**
6610 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6611 * points to a non-kernel-managed physical page, then this call will panic().
6612 *
6613 * @note The output of this function is guaranteed to be a kernel-managed
6614 * physical page, which means it's safe to pass the output directly to
6615 * pa_index() to create a physical address index for various pmap data
6616 * structures.
6617 *
6618 * @param va The kernel virtual address to translate. Does not work on user
6619 * virtual addresses.
6620 *
6621 * @return The translated physical address for the given virtual address.
6622 */
6623 pmap_paddr_t
6624 kvtophys_nofail(vm_offset_t va)
6625 {
6626 pmap_paddr_t pa = kvtophys(va);
6627
6628 if (!pa_valid(pa)) {
6629 panic("%s: Invalid or non-kernel-managed physical page returned, "
6630 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6631 }
6632
6633 return pa;
6634 }
6635
/**
 * Translate a virtual address to a physical address by manually walking the
 * given pmap's translation tables in software.
 *
 * @param pmap The pmap whose translation tables should be walked.
 * @param va The virtual address to translate.
 *
 * @return The translated physical address, or 0 if no valid mapping covers va.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Addresses outside the pmap's VA range cannot be mapped. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend from the root table level toward the leaf level. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* An invalid entry at any level terminates the walk. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the offset within the block/page. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table-type entry: follow the pointer to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6680
/*
 * pmap_init_pte_page - Initialize a page table page.
 *
 * Ensures a page table descriptor (PTD) is associated with the physical page
 * backing pte_p, then records this table's (pmap, va, level) info in it.
 *
 * @param pmap The pmap that will own the page table page.
 * @param pte_p Kernel virtual address of the page table page.
 * @param va Base virtual address that the new table will map.
 * @param ttlevel Translation table level of the new table.
 * @param alloc_ptd Whether a PTD may be allocated here (bootstrap path only).
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t *ptdp = NULL;
	/* Locate the PV head for the physical page backing this table. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* A PTD already exists for this page; reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6723
6724 /*
6725 * Routine: pmap_expand
6726 *
6727 * Expands a pmap to be able to map the specified virtual address.
6728 *
6729 * Allocates new memory for the default (COARSE) translation table
6730 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6731 * also allocates space for the corresponding pv entries.
6732 *
6733 * Nothing should be locked.
6734 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t pa;
	unsigned int ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t *tte_p;
	tt_entry_t *tt_p;

	pa = 0x0ULL;
	tt_p = (tt_entry_t *)NULL;

	/* Walk from the root toward 'level', allocating any missing table. */
	for (; ttlevel < level; ttlevel++) {
		/* Preemptible lock acquisition: bail out rather than stall. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock while allocating; allocation may block. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					return KERN_RESOURCE_SHORTAGE;
				}
#if XNU_MONITOR
				/* The PPL cannot wait for pages; allocation failure is fatal. */
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check: another thread may have installed a table while we were unlocked. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Link the new table into the parent-level entry. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				/* Ownership transferred to the page tables; don't free below. */
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* Lost the install race (or table already present): free the allocation. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6804
6805 /*
6806 * Routine: pmap_gc
6807 * Function:
6808 * Pmap garbage collection
6809 * Called by the pageout daemon when pages are scarce.
6810 *
6811 */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings. However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
6823
6824 /*
6825 * By default, don't attempt pmap GC more frequently
6826 * than once / 1 minutes.
6827 */
6828
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally a no-op: pmap GC is not implemented on this platform. */
}
6834
6835 /*
6836 * pmap_attribute_cache_sync(vm_offset_t pa)
6837 *
6838 * Invalidates all of the instruction cache on a physical page and
6839 * pushes any dirty data from the data cache for the same physical page
6840 */
6841
6842 kern_return_t
6843 pmap_attribute_cache_sync(
6844 ppnum_t pp,
6845 vm_size_t size,
6846 __unused vm_machine_attribute_t attribute,
6847 __unused vm_machine_attribute_val_t * value)
6848 {
6849 if (size > PAGE_SIZE) {
6850 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6851 } else {
6852 cache_sync_page(pp);
6853 }
6854
6855 return KERN_SUCCESS;
6856 }
6857
6858 /*
6859 * pmap_sync_page_data_phys(ppnum_t pp)
6860 *
6861 * Invalidates all of the instruction cache on a physical page and
6862 * pushes any dirty data from the data cache for the same physical page
6863 */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Thin wrapper: clean/invalidate caches for the single page. */
	cache_sync_page(pp);
}
6870
6871 /*
6872 * pmap_sync_page_attributes_phys(ppnum_t pp)
6873 *
6874 * Write back and invalidate all cachelines on a physical page.
6875 */
6876 void
6877 pmap_sync_page_attributes_phys(
6878 ppnum_t pp)
6879 {
6880 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6881 }
6882
6883 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Return TRUE if the page mapped at va in the given map is safe to include
 * in a core dump: a PTE must exist, the mapping must not be backed by a
 * device pager, and the PTE must use the default (normal-memory) cache
 * attributes.
 */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	pt_entry_t *pte_p;
	pt_entry_t spte;

	pte_p = pmap_pte(map->pmap, va);
	if (0 == pte_p) {
		return FALSE;
	}
	/* Device-pager-backed memory must not be read by the dumper. */
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}
	spte = *pte_p;
	/* Only default-cacheable mappings are safe to touch. */
	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
}
6903 #endif
6904
6905 void
6906 fillPage(
6907 ppnum_t pn,
6908 unsigned int fill)
6909 {
6910 unsigned int *addr;
6911 int count;
6912
6913 addr = (unsigned int *) phystokv(ptoa(pn));
6914 count = PAGE_SIZE / sizeof(unsigned int);
6915 while (count--) {
6916 *addr++ = fill;
6917 }
6918 }
6919
6920 extern void mapping_set_mod(ppnum_t pn);
6921
void
mapping_set_mod(
	ppnum_t pn)
{
	/* Thin wrapper: mark the physical page as modified. */
	pmap_set_modify(pn);
}
6928
6929 extern void mapping_set_ref(ppnum_t pn);
6930
void
mapping_set_ref(
	ppnum_t pn)
{
	/* Thin wrapper: mark the physical page as referenced. */
	pmap_set_reference(pn);
}
6937
6938 /*
6939 * Clear specified attribute bits.
6940 *
6941 * Try to force an arm_fast_fault() for all mappings of
6942 * the page - to force attributes to be set again at fault time.
6943 * If the forcing succeeds, clear the cached bits at the head.
6944 * Otherwise, something must have been wired, so leave the cached
6945 * attributes alone.
6946 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t pa = ptoa(pn);
	vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may only be modified by the PPL itself. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* The caller takes responsibility for TLB flushing; don't skip it locally. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs\n",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		/* Downgrade all mappings to read-only instead of forcing faults. */
		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Clearing "referenced" requires faulting on the next read/execute. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	/* Clearing "modified" requires faulting on the next write. */
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return. On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping was successfully downgraded. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7014
/* Single-page attribute-clear entry point (PPL or kernel implementation). */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/* No flush_range: TLB maintenance is handled per-page as needed. */
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7024
7025 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits on every managed page mapped within one "twig"
 * (single leaf-table) subrange [start, end) of the given pmap.
 *
 * @return The VA processed up to; may be short of 'end' if preemption is
 *         pending, in which case the caller should resume from there.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	/* The range must not span more than one twig (leaf table). */
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig table: nothing is mapped in this range. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield to pending preemption after at least one page. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7069
/*
 * Clear attribute bits on all managed pages mapped in [start, end) of the
 * given pmap, coalescing TLB maintenance into one deferred ranged flush.
 *
 * @return The VA processed up to; callers loop until it reaches 'end'.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates the region whose TLB entries must eventually be flushed. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Process one twig (leaf-table span) at a time. */
	while (va < end) {
		vm_map_address_t curr_end;

		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig was cut short or preemption is pending. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the deferred ranged TLB flush if any PTE was changed. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true);
		sync_tlb_flush();
	}
	return va;
}
7118
/* Preemptible wrapper: clear attribute bits over a VA range, retrying until done. */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The internal call may return early for preemption; resume where it stopped. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7149 #endif /* __ARM_RANGE_TLBI__ */
7150
/* Clear attribute bits for one physical page (dispatches to PPL when present). */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint? It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7172
7173 /*
7174 * Set specified attribute bits.
7175 *
7176 * Set cached value in the pv head because we have
7177 * no per-mapping hardware support for referenced and
7178 * modify bits.
7179 */
7180 MARK_AS_PMAP_TEXT void
7181 phys_attribute_set_internal(
7182 ppnum_t pn,
7183 unsigned int bits)
7184 {
7185 pmap_paddr_t pa = ptoa(pn);
7186 assert(pn != vm_page_fictitious_addr);
7187
7188 #if XNU_MONITOR
7189 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7190 panic("%s: illegal request, "
7191 "pn=%u, bits=%#x",
7192 __FUNCTION__,
7193 pn, bits);
7194 }
7195 #endif
7196
7197 ppattr_pa_set_bits(pa, (uint16_t)bits);
7198
7199 return;
7200 }
7201
/* Dispatch attribute-set to the PPL or the local implementation. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7213
7214
7215 /*
7216 * Check specified attribute bits.
7217 *
7218 * use the software cached bits (since no hw support).
7219 */
7220 static boolean_t
7221 phys_attribute_test(
7222 ppnum_t pn,
7223 unsigned int bits)
7224 {
7225 pmap_paddr_t pa = ptoa(pn);
7226 assert(pn != vm_page_fictitious_addr);
7227 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7228 }
7229
7230
7231 /*
7232 * Set the modify/reference bits on the specified physical page.
7233 */
void
pmap_set_modify(ppnum_t pn)
{
	/* The "modified" bit is software-managed; just set the cached attribute. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7239
7240
7241 /*
7242 * Clear the modify bits on the specified physical page.
7243 */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clearing "modified" also arms a fast fault on the next write. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7250
7251
7252 /*
7253 * pmap_is_modified:
7254 *
7255 * Return whether or not the specified physical page is modified
7256 * by any physical maps.
7257 */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Read the software-cached attribute; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7264
7265
7266 /*
7267 * Set the reference bit on the specified physical page.
7268 */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* The "referenced" bit is software-managed; set the cached attribute. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7275
7276 /*
7277 * Clear the reference bits on the specified physical page.
7278 */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clearing "referenced" arms a fast fault on the next access. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7285
7286
7287 /*
7288 * pmap_is_referenced:
7289 *
7290 * Return whether or not the specified physical page is referenced
7291 * by any physical maps.
7292 */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Read the software-cached attribute; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7299
7300 /*
7301 * pmap_get_refmod(phys)
7302 * returns the referenced and modified bits of the specified
7303 * physical page.
7304 */
7305 unsigned int
7306 pmap_get_refmod(
7307 ppnum_t pn)
7308 {
7309 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7310 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7311 }
7312
7313 static inline unsigned int
7314 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7315 {
7316 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7317 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7318 }
7319
7320 /*
7321 * pmap_clear_refmod(phys, mask)
7322 * clears the referenced and modified bits as specified by the mask
7323 * of the specified physical page.
7324 */
7325 void
7326 pmap_clear_refmod_options(
7327 ppnum_t pn,
7328 unsigned int mask,
7329 unsigned int options,
7330 void *arg)
7331 {
7332 unsigned int bits;
7333
7334 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7335 phys_attribute_clear(pn, bits, options, arg);
7336 }
7337
7338 /*
7339 * Perform pmap_clear_refmod_options on a virtual address range.
7340 * The operation will be performed in bulk & tlb flushes will be coalesced
7341 * if possible.
7342 *
7343 * Returns true if the operation is supported on this platform.
7344 * If this function returns false, the operation is not supported and
7345 * nothing has been modified in the pmap.
7346 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int bits;
	/* Convert VM_MEM_* mask bits to PP_ATTR_* bits, then clear in bulk. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7374
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper with default options and no per-call argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7382
7383 unsigned int
7384 pmap_disconnect_options(
7385 ppnum_t pn,
7386 unsigned int options,
7387 void *arg)
7388 {
7389 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7390 /*
7391 * On ARM, the "modified" bit is managed by software, so
7392 * we know up-front if the physical page is "modified",
7393 * without having to scan all the PTEs pointing to it.
7394 * The caller should have made the VM page "busy" so noone
7395 * should be able to establish any new mapping and "modify"
7396 * the page behind us.
7397 */
7398 if (pmap_is_modified(pn)) {
7399 /*
7400 * The page has been modified and will be sent to
7401 * the VM compressor.
7402 */
7403 options |= PMAP_OPTIONS_COMPRESSOR;
7404 } else {
7405 /*
7406 * The page hasn't been modified and will be freed
7407 * instead of compressed.
7408 */
7409 }
7410 }
7411
7412 /* disconnect the page */
7413 pmap_page_protect_options(pn, 0, options, arg);
7414
7415 /* return ref/chg status */
7416 return pmap_get_refmod(pn);
7417 }
7418
7419 /*
7420 * Routine:
7421 * pmap_disconnect
7422 *
7423 * Function:
7424 * Disconnect all mappings for this page and return reference and change status
7425 * in generic format.
7426 *
7427 */
7428 unsigned int
7429 pmap_disconnect(
7430 ppnum_t pn)
7431 {
7432 pmap_page_protect(pn, 0); /* disconnect the page */
7433 return pmap_get_refmod(pn); /* return ref/chg status */
7434 }
7435
7436 boolean_t
7437 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7438 {
7439 if (ptoa(first) >= vm_last_phys) {
7440 return FALSE;
7441 }
7442 if (ptoa(last) < vm_first_phys) {
7443 return FALSE;
7444 }
7445
7446 return TRUE;
7447 }
7448
7449 /*
7450 * The state maintained by the noencrypt functions is used as a
7451 * debugging aid on ARM. This incurs some overhead on the part
7452 * of the caller. A special case check in phys_attribute_clear
7453 * (the most expensive path) currently minimizes this overhead,
7454 * but stubbing these functions out on RELEASE kernels yields
7455 * further wins.
7456 */
7457 boolean_t
7458 pmap_is_noencrypt(
7459 ppnum_t pn)
7460 {
7461 #if DEVELOPMENT || DEBUG
7462 boolean_t result = FALSE;
7463
7464 if (!pa_valid(ptoa(pn))) {
7465 return FALSE;
7466 }
7467
7468 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7469
7470 return result;
7471 #else
7472 #pragma unused(pn)
7473 return FALSE;
7474 #endif
7475 }
7476
7477 void
7478 pmap_set_noencrypt(
7479 ppnum_t pn)
7480 {
7481 #if DEVELOPMENT || DEBUG
7482 if (!pa_valid(ptoa(pn))) {
7483 return;
7484 }
7485
7486 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7487 #else
7488 #pragma unused(pn)
7489 #endif
7490 }
7491
7492 void
7493 pmap_clear_noencrypt(
7494 ppnum_t pn)
7495 {
7496 #if DEVELOPMENT || DEBUG
7497 if (!pa_valid(ptoa(pn))) {
7498 return;
7499 }
7500
7501 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7502 #else
7503 #pragma unused(pn)
7504 #endif
7505 }
7506
7507 #if XNU_MONITOR
/* Return whether the given managed physical page is owned by the PPL. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7514 #endif
7515
/*
 * Lock the per-page PV-head lock for a managed page; unmanaged pages (and
 * all pages when the PPL owns the locks) share the global backup lock.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path (note: body of the 'else' above when !XNU_MONITOR). */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7532
7533
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV-head
 * lock for managed pages, or the global backup lock otherwise.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path (note: body of the 'else' above when !XNU_MONITOR). */
	{ simple_unlock(&phys_backup_lock);}
}
7550
/*
 * Switch the user translation table base to the given pmap, refreshing the
 * per-CPU nested (shared-region) pmap state along the way.  For the kernel
 * pmap, the user TTB is pointed at the invalid translation table instead.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache the nested (shared-region) pmap info for this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR if this pmap uses a different page-size configuration. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the pmap's table base plus its hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7579
/* Point the user TTB at the canonical invalid translation table. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7585
/* Public entry: clear the user TTB (dispatches to the PPL when present). */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7597
7598
7599 #if defined(__arm64__)
7600 /*
7601 * Marker for use in multi-pass fast-fault PV list processing.
7602 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7603 * these functions, as compressed PTEs should never be present in PV lists.
7604 * Note that this only holds true for arm64; for arm32 we don't have enough
7605 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7606 * and WRITEABLE marker depending on whether the PTE is valid.
7607 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Guard against PTE software-bit layout changes silently aliasing the marker. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7611 #endif
7612
7613
7614 MARK_AS_PMAP_TEXT static boolean_t
7615 arm_force_fast_fault_with_flush_range(
7616 ppnum_t ppnum,
7617 vm_prot_t allow_mode,
7618 int options,
7619 pmap_tlb_flush_range_t *flush_range)
7620 {
7621 pmap_paddr_t phys = ptoa(ppnum);
7622 pv_entry_t *pve_p;
7623 pt_entry_t *pte_p;
7624 unsigned int pai;
7625 unsigned int pass1_updated = 0;
7626 unsigned int pass2_updated = 0;
7627 boolean_t result;
7628 pv_entry_t **pv_h;
7629 bool is_reusable;
7630 bool ref_fault;
7631 bool mod_fault;
7632 bool clear_write_fault = false;
7633 bool ref_aliases_mod = false;
7634 bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
7635
7636 assert(ppnum != vm_page_fictitious_addr);
7637
7638 if (!pa_valid(phys)) {
7639 return FALSE; /* Not a managed page. */
7640 }
7641
7642 result = TRUE;
7643 ref_fault = false;
7644 mod_fault = false;
7645 pai = pa_index(phys);
7646 if (__probable(mustsynch)) {
7647 pvh_lock(pai);
7648 }
7649 pv_h = pai_to_pvh(pai);
7650
7651 #if XNU_MONITOR
7652 if (__improbable(ppattr_pa_test_monitor(phys))) {
7653 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
7654 }
7655 #endif
7656 pte_p = PT_ENTRY_NULL;
7657 pve_p = PV_ENTRY_NULL;
7658 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
7659 pte_p = pvh_ptep(pv_h);
7660 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
7661 pve_p = pvh_pve_list(pv_h);
7662 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
7663 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
7664 }
7665
7666 is_reusable = ppattr_test_reusable(pai);
7667
7668 /*
7669 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
7670 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
7671 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
7672 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
7673 * operation, TLB invalidation may be handled by the caller so it's possible for
7674 * tlb_flush_needed to be true while issue_tlbi is false.
7675 */
7676 bool issue_tlbi = false;
7677 bool tlb_flush_needed = false;
7678
7679 pv_entry_t *orig_pve_p = pve_p;
7680 pt_entry_t *orig_pte_p = pte_p;
7681 int pve_ptep_idx = 0;
7682
7683 /*
7684 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
7685 * TLB invalidation in pass 2.
7686 */
7687 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7688 pt_entry_t spte;
7689 pt_entry_t tmplate;
7690
7691 if (pve_p != PV_ENTRY_NULL) {
7692 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7693 if (pte_p == PT_ENTRY_NULL) {
7694 goto fff_skip_pve_pass1;
7695 }
7696 }
7697
7698 #ifdef PVH_FLAG_IOMMU
7699 if (pvh_ptep_is_iommu(pte_p)) {
7700 goto fff_skip_pve_pass1;
7701 }
7702 #endif
7703 if (*pte_p == ARM_PTE_EMPTY) {
7704 panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
7705 }
7706 if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
7707 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
7708 }
7709
7710 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
7711 const pmap_t pmap = ptdp->pmap;
7712 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
7713 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7714
7715 assert(va >= pmap->min && va < pmap->max);
7716
7717 /* update pmap stats and ledgers */
7718 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
7719 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
7720 if (is_altacct) {
7721 /*
7722 * We do not track "reusable" status for
7723 * "alternate accounting" mappings.
7724 */
7725 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
7726 is_reusable &&
7727 is_internal &&
7728 pmap != kernel_pmap) {
7729 /* one less "reusable" */
7730 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7731 /* one more "internal" */
7732 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7733 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7734
7735 /*
7736 * Since the page is being marked non-reusable, we assume that it will be
7737 * modified soon. Avoid the cost of another trap to handle the fast
7738 * fault when we next write to this page.
7739 */
7740 clear_write_fault = true;
7741 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
7742 !is_reusable &&
7743 is_internal &&
7744 pmap != kernel_pmap) {
7745 /* one more "reusable" */
7746 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7747 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7748 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7749 }
7750
7751 bool wiredskip = pte_is_wired(*pte_p) &&
7752 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
7753
7754 if (wiredskip) {
7755 result = FALSE;
7756 goto fff_skip_pve_pass1;
7757 }
7758
7759 spte = *pte_p;
7760 tmplate = spte;
7761
7762 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
7763 /* read protection sets the pte to fault */
7764 tmplate = tmplate & ~ARM_PTE_AF;
7765 ref_fault = true;
7766 }
7767 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
7768 /* take away write permission if set */
7769 if (pmap == kernel_pmap) {
7770 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
7771 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
7772 pte_set_was_writeable(tmplate, true);
7773 mod_fault = true;
7774 }
7775 } else {
7776 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
7777 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
7778 pte_set_was_writeable(tmplate, true);
7779 mod_fault = true;
7780 }
7781 }
7782 }
7783
7784 #if MACH_ASSERT && XNU_MONITOR
7785 if (is_pte_xprr_protected(pmap, spte)) {
7786 if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
7787 panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
7788 "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
7789 __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
7790 ppnum, options, allow_mode);
7791 }
7792 }
7793 #endif /* MACH_ASSERT && XNU_MONITOR */
7794
7795 if (result && (tmplate != spte)) {
7796 if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
7797 !(options & PMAP_OPTIONS_NOFLUSH)) {
7798 tlb_flush_needed = true;
7799 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
7800 va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
7801 #ifdef ARM_PTE_FF_MARKER
7802 assert(!(spte & ARM_PTE_FF_MARKER));
7803 tmplate |= ARM_PTE_FF_MARKER;
7804 ++pass1_updated;
7805 #endif
7806 issue_tlbi = true;
7807 }
7808 }
7809 write_pte_fast(pte_p, tmplate);
7810 }
7811
7812 fff_skip_pve_pass1:
7813 pte_p = PT_ENTRY_NULL;
7814 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7815 pve_ptep_idx = 0;
7816 pve_p = pve_next(pve_p);
7817 }
7818 }
7819
7820 if (tlb_flush_needed) {
7821 FLUSH_PTE_STRONG();
7822 }
7823
7824 if (!issue_tlbi) {
7825 goto fff_finish;
7826 }
7827
7828 /* Pass 2: Issue any required TLB invalidations */
7829 pve_p = orig_pve_p;
7830 pte_p = orig_pte_p;
7831 pve_ptep_idx = 0;
7832
7833 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7834 if (pve_p != PV_ENTRY_NULL) {
7835 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7836 if (pte_p == PT_ENTRY_NULL) {
7837 goto fff_skip_pve_pass2;
7838 }
7839 }
7840
7841 #ifdef PVH_FLAG_IOMMU
7842 if (pvh_ptep_is_iommu(pte_p)) {
7843 goto fff_skip_pve_pass2;
7844 }
7845 #endif
7846
7847 #ifdef ARM_PTE_FF_MARKER
7848 pt_entry_t spte = *pte_p;
7849
7850 if (!(spte & ARM_PTE_FF_MARKER)) {
7851 goto fff_skip_pve_pass2;
7852 } else {
7853 spte &= (~ARM_PTE_FF_MARKER);
7854 /* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
7855 write_pte_fast(pte_p, spte);
7856 ++pass2_updated;
7857 }
7858 #endif
7859 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
7860 const pmap_t pmap = ptdp->pmap;
7861 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
7862
7863 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
7864 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
7865 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
7866 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
7867 }
7868
7869 fff_skip_pve_pass2:
7870 pte_p = PT_ENTRY_NULL;
7871 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7872 pve_ptep_idx = 0;
7873 pve_p = pve_next(pve_p);
7874 }
7875 }
7876
7877 fff_finish:
7878 if (__improbable(pass1_updated != pass2_updated)) {
7879 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
7880 __func__, pass1_updated, pass2_updated);
7881 }
7882
7883 /*
7884 * If we are using the same approach for ref and mod
7885 * faults on this PTE, do not clear the write fault;
7886 * this would cause both ref and mod to be set on the
7887 * page again, and prevent us from taking ANY read/write
7888 * fault on the mapping.
7889 */
7890 if (clear_write_fault && !ref_aliases_mod) {
7891 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
7892 }
7893 if (tlb_flush_needed) {
7894 if (flush_range) {
7895 /* Delayed flush. Signal to the caller that the flush is needed. */
7896 flush_range->ptfr_flush_needed = true;
7897 } else {
7898 sync_tlb_flush();
7899 }
7900 }
7901
7902 /* update global "reusable" status for this page */
7903 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
7904 ppattr_clear_reusable(pai);
7905 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
7906 ppattr_set_reusable(pai);
7907 }
7908
7909 if (mod_fault) {
7910 ppattr_set_modfault(pai);
7911 }
7912 if (ref_fault) {
7913 ppattr_set_reffault(pai);
7914 }
7915 if (__probable(mustsynch)) {
7916 pvh_unlock(pai);
7917 }
7918 return result;
7919 }
7920
7921 MARK_AS_PMAP_TEXT boolean_t
7922 arm_force_fast_fault_internal(
7923 ppnum_t ppnum,
7924 vm_prot_t allow_mode,
7925 int options)
7926 {
7927 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7928 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7929 }
7930 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
7931 }
7932
7933 /*
7934 * Routine: arm_force_fast_fault
7935 *
7936 * Function:
7937 * Force all mappings for this page to fault according
7938 * to the access modes allowed, so we can gather ref/modify
7939 * bits again.
7940 */
7941
7942 boolean_t
7943 arm_force_fast_fault(
7944 ppnum_t ppnum,
7945 vm_prot_t allow_mode,
7946 int options,
7947 __unused void *arg)
7948 {
7949 pmap_paddr_t phys = ptoa(ppnum);
7950
7951 assert(ppnum != vm_page_fictitious_addr);
7952
7953 if (!pa_valid(phys)) {
7954 return FALSE; /* Not a managed page. */
7955 }
7956
7957 #if XNU_MONITOR
7958 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
7959 #else
7960 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
7961 #endif
7962 }
7963
7964 /*
7965 * Routine: arm_clear_fast_fault
7966 *
7967 * Function:
7968 * Clear pending force fault for all mappings for this page based on
7969 * the observed fault type, update ref/modify bits.
7970 */
/*
 * Walk every mapping of ppnum (or only the single PTE 'pte_p' when it is
 * non-NULL) and restore the permissions that arm_force_fast_fault()
 * previously removed, as justified by the observed fault_type. The ref/mod
 * state implied by the fault is recorded in the pp_attr table.
 *
 * ppnum:      physical page whose mappings are repaired.
 * fault_type: the access that faulted (VM_PROT_READ and/or VM_PROT_WRITE).
 * pte_p:      optional single PTE to fix up instead of walking the PV list.
 *
 * Returns TRUE if at least one PTE was updated.
 * The caller must hold the PVH lock for the page.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;          /* PVEs visited; bounds the list walk */
	unsigned int pass1_updated = 0; /* mappings marked for invalidation in pass 1 */
	unsigned int pass2_updated = 0; /* mappings invalidated in pass 2; must match */

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* No explicit PTE supplied: start the walk from the page's PV head. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting point so pass 2 can re-walk the same mappings. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; nothing to fix up here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore the write permission the fast-fault path removed. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			/* A write fault implies both reference and modification. */
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault: set the access flag and note the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		/* xPRR-protected mappings must never have their permissions changed here. */
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				/* Tag the PTE so pass 2 knows it needs TLB invalidation. */
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the work done with the PVH lock held on very long PV lists. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Synchronize the pass-1 PTE stores before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged during pass 1 require invalidation. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Pass 2 must stop at exactly the same point pass 1 did. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have updated exactly the same set of mappings. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		/* Wait for the asynchronous invalidations issued above to complete. */
		sync_tlb_flush();
	}
	return result;
}
8171
8172 /*
8173 * Determine if the fault was induced by software tracking of
8174 * modify/reference bits. If so, re-enable the mapping (and set
8175 * the appropriate bits).
8176 *
8177 * Returns KERN_SUCCESS if the fault was induced and was
8178 * successfully handled.
8179 *
8180 * Returns KERN_FAILURE if the fault was not induced and
8181 * the function was unable to deal with it.
8182 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8184 * disallows this type of access.
8185 *
8186 * Returns KERN_ABORTED if the pmap lock is taken and a
8187 * preemption is pending.
8188 *
8189 */
/*
 * (See the block comment above for the return-value contract.) Attempt to
 * handle a fault on 'va' in 'pmap' that may have been induced by the
 * software ref/mod tracking done by arm_force_fast_fault(); if so, repair
 * the affected PTE(s) so the access can be redriven.
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_TYPE_FAULT;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Bail out (caller retries) rather than delay a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/* Re-read the PTE until we observe a stable value under the PVH lock. */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Accesses to PPL-owned I/O memory are a protection failure, not a retry. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* The PTE changed underneath us; drop the lock and re-read. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type. arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access. If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE? The
			 * cost of not doing so is another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page.
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8329
8330 kern_return_t
8331 arm_fast_fault(
8332 pmap_t pmap,
8333 vm_map_address_t va,
8334 vm_prot_t fault_type,
8335 bool was_af_fault,
8336 __unused bool from_user)
8337 {
8338 kern_return_t result = KERN_FAILURE;
8339
8340 if (va < pmap->min || va >= pmap->max) {
8341 return result;
8342 }
8343
8344 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8345 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8346 from_user);
8347
8348 do {
8349 #if XNU_MONITOR
8350 result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8351 #else
8352 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8353 #endif
8354 } while (result == KERN_ABORTED);
8355
8356 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8357
8358 return result;
8359 }
8360
8361 void
8362 pmap_copy_page(
8363 ppnum_t psrc,
8364 ppnum_t pdst)
8365 {
8366 bcopy_phys((addr64_t) (ptoa(psrc)),
8367 (addr64_t) (ptoa(pdst)),
8368 PAGE_SIZE);
8369 }
8370
8371
8372 /*
8373 * pmap_copy_page copies the specified (machine independent) pages.
8374 */
8375 void
8376 pmap_copy_part_page(
8377 ppnum_t psrc,
8378 vm_offset_t src_offset,
8379 ppnum_t pdst,
8380 vm_offset_t dst_offset,
8381 vm_size_t len)
8382 {
8383 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8384 (addr64_t) (ptoa(pdst) + dst_offset),
8385 len);
8386 }
8387
8388
8389 /*
8390 * pmap_zero_page zeros the specified (machine independent) page.
8391 */
8392 void
8393 pmap_zero_page(
8394 ppnum_t pn)
8395 {
8396 assert(pn != vm_page_fictitious_addr);
8397 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8398 }
8399
8400 /*
8401 * pmap_zero_part_page
8402 * zeros the specified (machine independent) part of a page.
8403 */
8404 void
8405 pmap_zero_part_page(
8406 ppnum_t pn,
8407 vm_offset_t offset,
8408 vm_size_t len)
8409 {
8410 assert(pn != vm_page_fictitious_addr);
8411 assert(offset + len <= PAGE_SIZE);
8412 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8413 }
8414
/*
 * Map the lowGlo page at its fixed kernel alias (LOWGLOBAL_ALIAS) as a
 * read-only, never-executable, write-back cacheable kernel mapping.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	/* The alias must not already be mapped. */
	assert(*ptep == ARM_PTE_EMPTY);

	/* Kernel read-only (AP_RONA), PXN/UXN, with the access flag preset. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE and invalidate any stale translation for the alias. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Tell KASAN the alias address range is now backed. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8439
/*
 * Return the VA of per-CPU copy window 'index' for CPU 'cpu_num'. Each CPU
 * owns CPUWINDOWS_MAX page-sized windows laid out contiguously starting at
 * CPUWINDOWS_BASE. Panics on an out-of-range index.
 */
vm_offset_t
pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
{
	if (__improbable(index >= CPUWINDOWS_MAX)) {
		panic("%s: invalid index %u", __func__, index);
	}
	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
}
8448
/*
 * Map physical page 'pn' into a free per-CPU copy window on the current CPU
 * with the requested protection and WIMG (cache) attributes.
 *
 * Returns the index of the window used; panics if no window is free. The
 * caller releases the window via pmap_unmap_cpu_windows_copy().
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	/* For non-managed (I/O) pages, check whether stronger TLB sync is required. */
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	/* The PPL only permits copy windows onto non-managed memory. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	/* PPL-protected I/O may only be mapped read-only. */
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first unused window on this CPU (its PTE is still a fault entry). */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Kernel-only, never-executable mapping with the access flag preset. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}

	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	/* Record whether this window's eventual teardown needs strong sync. */
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8521
/*
 * PPL/non-PPL dispatch wrapper for pmap_map_cpu_windows_copy_internal().
 * Returns the index of the per-CPU copy window used for the mapping.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8534
/*
 * Tear down the mapping for copy window 'index' on the current CPU and
 * invalidate the TLB entry for the window's VA.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t *ptep;
	unsigned int cpu_num;
	vm_offset_t cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE (marking the window free), then invalidate the TLB. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8555
/*
 * PPL/non-PPL dispatch wrapper for pmap_unmap_cpu_windows_copy_internal().
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8566
8567 #if XNU_MONITOR
8568
/*
 * Stub: invoking the callback with the page's contents is not implemented
 * in this configuration; all arguments are ignored.
 */
MARK_AS_PMAP_TEXT void
pmap_invoke_with_page(
	ppnum_t page_number,
	void *ctx,
	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
{
#pragma unused(page_number, ctx, callback)
}
8577
8578 /*
8579 * Loop over every pmap_io_range (I/O ranges marked as owned by
8580 * the PPL in the device tree) and conditionally call callback() on each range
8581 * that needs to be included in the hibernation image.
8582 *
8583 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8584 * context is needed in the callback.
8585 * @param callback Callback function invoked on each range (gated by flag).
8586 */
8587 MARK_AS_PMAP_TEXT void
8588 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8589 {
8590 extern const pmap_io_range_t* io_attr_table;
8591 extern const unsigned int num_io_rgns;
8592 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8593 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8594 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8595 }
8596 }
8597 }
8598
8599 /**
8600 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8601 * PPL-owned page. Otherwise, do nothing.
8602 *
8603 * @param addr Physical address of the page to set the HASHED flag on.
8604 */
8605 MARK_AS_PMAP_TEXT void
8606 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8607 {
8608 /* Ignore non-managed kernel memory. */
8609 if (!pa_valid(addr)) {
8610 return;
8611 }
8612
8613 const unsigned int pai = pa_index(addr);
8614 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8615 pv_entry_t **pv_h = pai_to_pvh(pai);
8616
8617 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8618 pvh_lock(pai);
8619 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8620 pvh_unlock(pai);
8621 }
8622 }
8623
8624 /**
8625 * Loop through every physical page in the system and clear out the HASHED flag
8626 * on every PPL-owned page. That flag is used to keep track of which pages have
8627 * been hashed into the hibernation image during the hibernation entry process.
8628 *
8629 * The HASHED flag needs to be cleared out between hibernation cycles because the
8630 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8631 * image with the HASHED flag set on certain pages. It's important to clear the
8632 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8633 * into the hibernation image can't be compromised across hibernation cycles.
8634 */
8635 MARK_AS_PMAP_TEXT void
8636 pmap_clear_ppl_hashed_flag_all(void)
8637 {
8638 const unsigned int last_index = pa_index(vm_last_phys);
8639 pv_entry_t **pv_h = NULL;
8640
8641 for (int pai = 0; pai < last_index; ++pai) {
8642 pv_h = pai_to_pvh(pai);
8643
8644 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8645 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8646 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8647 pvh_lock(pai);
8648 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8649 pvh_unlock(pai);
8650 }
8651 }
8652 }
8653
8654 /**
8655 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8656 * ppl_hib driver will call this after all wired pages have been copied into the
8657 * hibernation image.
8658 */
8659 MARK_AS_PMAP_TEXT void
8660 pmap_check_ppl_hashed_flag_all(void)
8661 {
8662 const unsigned int last_index = pa_index(vm_last_phys);
8663 pv_entry_t **pv_h = NULL;
8664
8665 for (int pai = 0; pai < last_index; ++pai) {
8666 pv_h = pai_to_pvh(pai);
8667
8668 /**
8669 * The PMAP stacks are explicitly not saved into the image so skip checking
8670 * the pages that contain the PMAP stacks.
8671 */
8672 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8673 (pai < pa_index(pmap_stacks_end_pa));
8674
8675 if (!is_pmap_stack &&
8676 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8677 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8678 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8679 }
8680 }
8681 }
8682
8683 #endif /* XNU_MONITOR */
8684
8685 /*
8686 * Indicate that a pmap is intended to be used as a nested pmap
8687 * within one or more larger address spaces. This must be set
8688 * before pmap_nest() is called with this pmap as the 'subordinate'.
8689 */
8690 MARK_AS_PMAP_TEXT void
8691 pmap_set_nested_internal(
8692 pmap_t pmap)
8693 {
8694 validate_pmap_mutable(pmap);
8695 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8696 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8697 __func__, pmap, pmap->type);
8698 }
8699 pmap->type = PMAP_TYPE_NESTED;
8700 pmap_get_pt_ops(pmap)->free_id(pmap);
8701 }
8702
/*
 * PPL/non-PPL dispatch wrapper for pmap_set_nested_internal().
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8713
8714 /*
8715 * pmap_trim_range(pmap, start, end)
8716 *
8717 * pmap = pmap to operate on
8718 * start = start of the range
8719 * end = end of the range
8720 *
8721 * Attempts to deallocate TTEs for the given range in the nested range.
8722 */
8723 MARK_AS_PMAP_TEXT static void
8724 pmap_trim_range(
8725 pmap_t pmap,
8726 addr64_t start,
8727 addr64_t end)
8728 {
8729 addr64_t cur;
8730 addr64_t nested_region_start;
8731 addr64_t nested_region_end;
8732 addr64_t adjusted_start;
8733 addr64_t adjusted_end;
8734 addr64_t adjust_offmask;
8735 tt_entry_t * tte_p;
8736 pt_entry_t * pte_p;
8737 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8738
8739 if (__improbable(end < start)) {
8740 panic("%s: invalid address range, "
8741 "pmap=%p, start=%p, end=%p",
8742 __func__,
8743 pmap, (void*)start, (void*)end);
8744 }
8745
8746 nested_region_start = pmap->nested_region_addr;
8747 nested_region_end = nested_region_start + pmap->nested_region_size;
8748
8749 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8750 panic("%s: range outside nested region %p-%p, "
8751 "pmap=%p, start=%p, end=%p",
8752 __func__, (void *)nested_region_start, (void *)nested_region_end,
8753 pmap, (void*)start, (void*)end);
8754 }
8755
8756 /* Contract the range to TT page boundaries. */
8757 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8758 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8759 adjusted_end = end & ~adjust_offmask;
8760
8761 /* Iterate over the range, trying to remove TTEs. */
8762 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
8763 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8764
8765 tte_p = pmap_tte(pmap, cur);
8766
8767 if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
8768 pte_p = (pt_entry_t *) ttetokv(*tte_p);
8769
8770 /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
8771 if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
8772 /* Deallocate for the nested map. */
8773 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8774 } else if (pmap->type == PMAP_TYPE_USER) {
8775 /**
8776 * Just remove for the parent map. If the leaf table pointed
8777 * to by the TTE being removed (owned by the nested pmap)
8778 * has any mappings, then this call will panic. This
8779 * enforces the policy that tables being trimmed must be
8780 * empty to prevent possible use-after-free attacks.
8781 */
8782 pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8783 } else {
8784 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8785 }
8786 } else {
8787 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8788 }
8789 }
8790
8791 /* Remove empty L2 TTs. */
8792 adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
8793 adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
8794
8795 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
8796 /* For each L1 entry in our range... */
8797 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8798
8799 bool remove_tt1e = true;
8800 tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
8801 tt_entry_t * tt2e_start;
8802 tt_entry_t * tt2e_end;
8803 tt_entry_t * tt2e_p;
8804 tt_entry_t tt1e;
8805
8806 if (tt1e_p == NULL) {
8807 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8808 continue;
8809 }
8810
8811 tt1e = *tt1e_p;
8812
8813 if (tt1e == ARM_TTE_TYPE_FAULT) {
8814 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8815 continue;
8816 }
8817
8818 tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
8819 tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
8820
8821 for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
8822 if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
8823 /*
8824 * If any TTEs are populated, don't remove the
8825 * L1 TT.
8826 */
8827 remove_tt1e = false;
8828 }
8829 }
8830
8831 if (remove_tt1e) {
8832 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
8833 } else {
8834 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8835 }
8836 }
8837 }
8838
8839 /**
8840 * State machine for multi-step pmap trimming. Trimming is the action of
8841 * deallocating the TTEs of the shared region of pmaps down to a given range.
8842 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
8843 * disabling preemption for too long. These steps include computing the bounds
8844 * of the shared region, trimming the head of the "grand", trimming the tail of
8845 * the "grand", and trimming the "subord". Some of the steps can be skipped under
8846 * different conditions.
8847 *
8848 * @param grand the pmap in which the pages are nested
8849 * @param subord the pmap from which the pages are shared, or nested
8850 * @param vstart start of the used range in "grand"
8851 * @param size size of the used range
8852 * @param state the current state of the state machine
8853 *
8854 * @return the next state of the state machine, to be used in the next call
8855 * into this function.
8856 */
8857 MARK_AS_PMAP_TEXT pmap_trim_state_t
8858 pmap_trim_internal(
8859 pmap_t grand,
8860 pmap_t subord,
8861 addr64_t vstart,
8862 uint64_t size,
8863 pmap_trim_state_t state)
8864 {
8865 /* Validation needs to be done regardless of state. */
8866 addr64_t vend;
8867
8868 if (__improbable(os_add_overflow(vstart, size, &vend))) {
8869 panic("%s: grand addr wraps around, "
8870 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8871 __func__, grand, subord, (void*)vstart, size, state);
8872 }
8873
8874 validate_pmap_mutable(grand);
8875 validate_pmap(subord);
8876
8877 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8878 panic("%s: subord is of non-nestable type 0x%hhx, "
8879 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8880 __func__, subord->type, grand, subord, (void*)vstart, size, state);
8881 }
8882
8883 if (__improbable(grand->type != PMAP_TYPE_USER)) {
8884 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
8885 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8886 __func__, grand->type, grand, subord, (void*)vstart, size, state);
8887 }
8888
8889 if (__improbable(grand->nested_pmap != subord)) {
8890 panic("%s: grand->nested != subord, "
8891 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8892 __func__, grand, subord, (void*)vstart, size, state);
8893 }
8894
8895 if (__improbable((size != 0) &&
8896 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
8897 panic("%s: grand range not in nested region, "
8898 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8899 __func__, grand, subord, (void*)vstart, size, state);
8900 }
8901
8902 /* Trimming starts with figuring out the bounds for the grand. */
8903 if (state == PMAP_TRIM_STATE_START) {
8904 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8905
8906 /**
8907 * The "nested_has_no_bounds_ref" flag is set by `pmap_nest()` if the subord is nested into
8908 * the grand when the bounds are not known yet. Therefore, if it is not set, either any nesting
8909 * has not happened, or trimming has been done, or nesting has been done with bounds known so
8910 * the "extra" region was not nested in the first place. Anyway, trimming is not needed so
8911 * we exit early with PMAP_TRIM_STATE_DONE.
8912 */
8913 if (!grand->nested_has_no_bounds_ref) {
8914 assert(subord->nested_bounds_set);
8915
8916 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
8917 if (!grand->nested_bounds_set) {
8918 /* Inherit the bounds from subord. */
8919 grand->nested_region_true_start = subord->nested_region_true_start;
8920 grand->nested_region_true_end = subord->nested_region_true_end;
8921 grand->nested_bounds_set = true;
8922 }
8923
8924 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8925
8926 /* Now that the grand has bounds, we are done. */
8927 return PMAP_TRIM_STATE_DONE;
8928 }
8929
8930 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
8931 if ((!subord->nested_bounds_set) && size) {
8932 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8933 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8934
8935 subord->nested_region_true_start = vstart;
8936 subord->nested_region_true_end = vend;
8937 subord->nested_region_true_start &= ~adjust_offmask;
8938
8939 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
8940 panic("%s: padded true end wraps around, "
8941 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8942 __func__, grand, subord, (void*)vstart, size, state);
8943 }
8944
8945 subord->nested_region_true_end &= ~adjust_offmask;
8946 subord->nested_bounds_set = true;
8947 }
8948
8949 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
8950 if (subord->nested_bounds_set) {
8951 /* Inherit the bounds from subord. */
8952 grand->nested_region_true_start = subord->nested_region_true_start;
8953 grand->nested_region_true_end = subord->nested_region_true_end;
8954 grand->nested_bounds_set = true;
8955
8956 /* If we know the bounds, we can trim the pmap. */
8957 grand->nested_has_no_bounds_ref = false;
8958 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8959
8960 state = PMAP_TRIM_STATE_GRAND_BEFORE;
8961 } else {
8962 /* Don't trim if we don't know the bounds. */
8963 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8964
8965 return PMAP_TRIM_STATE_DONE;
8966 }
8967 }
8968
8969 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
8970 if (!grand->nested_bounds_set) {
8971 panic("%s: !grand->nested_bounds_set, "
8972 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8973 __func__, grand, subord, (void*)vstart, size, state);
8974 }
8975
8976 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
8977 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
8978
8979 #if XNU_MONITOR
8980 if (pmap_pending_preemption()) {
8981 return PMAP_TRIM_STATE_GRAND_AFTER;
8982 }
8983 #endif
8984
8985 state = PMAP_TRIM_STATE_GRAND_AFTER;
8986 }
8987
8988 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
8989 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
8990
8991 #if XNU_MONITOR
8992 if (pmap_pending_preemption()) {
8993 return PMAP_TRIM_STATE_SUBORD;
8994 }
8995 #endif
8996
8997 state = PMAP_TRIM_STATE_SUBORD;
8998 }
8999
9000 /* START state is guaranteed to compute the bounds for the subord. */
9001 if (!subord->nested_bounds_set) {
9002 panic("%s: !subord->nested_bounds_set, "
9003 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9004 __func__, grand, subord, (void*)vstart, size, state);
9005 }
9006
9007 if (state == PMAP_TRIM_STATE_SUBORD) {
9008 pmap_trim_subord(subord);
9009 }
9010
9011 return PMAP_TRIM_STATE_DONE;
9012 }
9013
/*
 * Drops this pmap's "no bounds" reference on its nested pmap (if it holds one)
 * and trims this pmap's page tables down to the nested pmap's true bounds.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_has_no_bounds_ref = false;
		/* Snapshot the nested pmap's bounds under its lock, then drop the lock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the head and tail of this pmap's nested region outside the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9037
/*
 * pmap_trim_subord(subord)
 *
 * subord = nested pmap we are attempting to trim
 *
 * Drops one "no bounds" reference on subord; if that was the last such
 * reference and subord's true bounds are known, trims subord's page
 * tables outside those bounds. (The comment previously documented a
 * "grand" parameter that this function does not take.)
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_subord(pmap_t subord)
{
	bool contract_subord = false;

	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

	/* Drop the reference taken when a grand nested subord without known bounds. */
	subord->nested_no_bounds_refcnt--;

	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
		/* If this was the last no bounds reference, trim subord. */
		contract_subord = true;
	}

	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (contract_subord) {
		/* Trim outside [true_start, true_end) with the lock dropped. */
		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
	}
}
9067
/**
 * Deallocates the TTEs of the shared region of pmaps down to a given range.
 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
 * disabling preemption for too long.
 *
 * @note When we load the shared region we always create pages tables for the
 *       entire region. In practice, the shared cache may use just a portion
 *       of that. Before we know the bounds of the shared region, it can
 *       already be mapped into processes. Therefore, once the bounds are
 *       known, "trimming" comes in handy to remove the unnecessary page
 *       tables in the processes the shared region is mapped in, and eventually
 *       those in the shared region itself. Note that the shared region must
 *       be trimmed after the user processes because it has the L3 entries
 *       everyone else is pointing to.
 *
 * @param grand the pmap in which the pages are nested
 * @param subord the pmap from which the pages are shared, or nested
 * @param vstart start of the used range in "grand"
 * @param size size of the used range
 */
void
pmap_trim(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
	/* On PPL systems, drives the state machine until its done. */
	while (state != PMAP_TRIM_STATE_DONE) {
		__assert_only pmap_trim_state_t old_state = state;
		state = pmap_trim_ppl(grand, subord, vstart, size, state);

		/* Are we making progress? (Each PPL call must advance the state.) */
		assert(old_state != state);
	}

	/* Trimming releases pages; verify both pmaps' ledgers still balance. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	state = pmap_trim_internal(grand, subord, vstart, size, state);

	/* On non-PPL systems, we expect the implementation to finish in one call. */
	assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9116
9117 #if HAS_APPLE_PAC
/*
 * Signs a user-space pointer with the given process-independent PAC key and
 * discriminator, using the supplied user JOP key. Interrupts are disabled
 * while the user JOP key is swapped in so the key state cannot be observed
 * or preempted mid-operation.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent IA/DA keys are allowed here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Keep the sign operation pinned between the key swap-in and swap-out. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9149
/* Externally-callable wrapper for pmap_sign_user_ptr_internal(). */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9155
/*
 * Authenticates a signed user-space pointer with the given process-independent
 * PAC key and discriminator, using the supplied user JOP key. Mirrors
 * pmap_sign_user_ptr_internal(): interrupts stay disabled while the user JOP
 * key is active.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent IA/DA keys are allowed here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the auth operation pinned between the key swap-in and swap-out. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9176
/* Externally-callable wrapper for pmap_auth_user_ptr_internal(). */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9182 #endif /* HAS_APPLE_PAC */
9183
9184 /*
9185 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9186 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9187 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9188 * return value, to indicate where a preempted [un]nest operation should resume.
9189 * When the return value contains the ending address of the nested region with
9190 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9191 */
9192 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9193
9194 /*
9195 * kern_return_t pmap_nest(grand, subord, vstart, size)
9196 *
9197 * grand = the pmap that we will nest subord into
9198 * subord = the pmap that goes into the grand
9199 * vstart = start of range in pmap to be inserted
9200 * size = Size of nest area (up to 16TB)
9201 *
9202 * Inserts a pmap into another. This is used to implement shared segments.
9203 *
9204 */
9205
9206 /**
9207 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9208 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9209 * This function operates in 3 main phases:
9210 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9211 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9212 * the mapping range are present in subord.
9213 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9214 * contains pointers to subord's leaf-level pagetable pages for the specified
9215 * VA range.
9216 *
9217 * This function may return early due to pending AST_URGENT preemption; if so
9218 * it will indicate the need to be re-entered.
9219 *
9220 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9221 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9222 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9223 * @param size twig-aligned size of the nesting range
9224 * @param vrestart the twig-aligned starting address of the current call. May contain
9225 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9226 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9227 * KERN_RESOURCE_SHORTAGE on allocation failure.
9228 *
9229 * @return the virtual address at which to restart the operation, possibly including
9230 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9231 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9232 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t *stte_p;
	tt_entry_t *gtte_p;
	unsigned int nested_region_asid_bitmap_size;
	unsigned int* nested_region_asid_bitmap;
	int expand_options = 0;
	bool deref_subord = true;

	/* Reject a nesting range whose end wraps the address space. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (sans the phase flag) must fall within [vstart, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here. validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed. If destruction is already committed,
	 * then the check of ref_count below will cover us. If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord for the duration of this call (dropped via deref_subord). */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	/* Both pmaps must share the same page-table geometry for TTE sharing to work. */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	/* Inside the PPL we can't block on allocation; bail out and let the caller refill. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* vstart, size, and the restart cursor must all be leaf-table (twig) aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1a: first-time initialization of subord's nested-region metadata.
	 * The ASID bitmap is allocated outside the lock, then installed under the
	 * lock only if we won the race; a losing allocation is freed afterwards.
	 */
	if (subord->nested_region_asid_bitmap == NULL) {
		nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_asid_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_asid_bitmap = kalloc_data(
			nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		if (subord->nested_region_asid_bitmap == NULL) {
			subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_asid_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
			nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the installation race; free our unused allocation. */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * Phase 1b: if the requested range extends past subord's current nested
	 * region, grow the region and its ASID bitmap. Same allocate-outside-lock,
	 * install-under-lock, free-loser pattern as above.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t new_size;
		unsigned int new_nested_region_asid_bitmap_size;
		unsigned int* new_nested_region_asid_bitmap;

		nested_region_asid_bitmap = NULL;
		nested_region_asid_bitmap_size = 0;
		new_size = vend - subord->nested_region_addr;

		/* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
		new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_asid_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_asid_bitmap = kalloc_data(
			new_nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		if (subord->nested_region_size < new_size) {
			/* Copy the old bitmap contents into the larger replacement. */
			bcopy(subord->nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
			nested_region_asid_bitmap_size = subord->nested_region_asid_bitmap_size;
			nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free the displaced old bitmap (if we grew) ... */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
		/* ... or the unused new bitmap (if another thread grew it first). */
		if (new_nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
	}

	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

	/* Phase 1c: link grand to subord (first nesting only) and record the region in grand. */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known. We'll have to trim the pmap later.
			 */
			grand->nested_has_no_bounds_ref = true;
			subord->nested_no_bounds_refcnt++;
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			/* Extend grand's recorded nested region to cover the new range. */
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor and end to subord's true bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* PMAP_NEST_GRAND set means phase 2 already completed on a prior call. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 2: ensure subord has leaf-level page tables for the whole range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* pmap_expand() may block/allocate, so drop the subord lock around it. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				/* done: expects the grand lock to be held. */
				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/* Periodically check for pending urgent preemption and bail out to resume later. */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	/* Reset the cursor to the start of the range for phase 3. */
	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	/* Phase 3 operates on grand; swap which pmap lock is held. */
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			/* NOTE(review): re-lookup uses pmap_tt2e() here rather than pmap_tte() — presumably equivalent at twig level; confirm. */
			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page. Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/* Share subord's leaf table by copying its twig TTE into grand. */
		*gtte_p = *stte_p;

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Completed the clamped range: report completion as vend | PMAP_NEST_GRAND. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible to the page-table walker before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
#if XNU_MONITOR
nest_cleanup:
	if (kr != KERN_SUCCESS) {
		/* krp lives in kernel (non-PPL) memory; pin it while writing from the PPL. */
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9571
/*
 * Externally-callable entry point for nesting subord into grand over
 * [vstart, vstart + size). Drives pmap_nest_internal() (or its PPL
 * counterpart) until the returned restart cursor indicates completion,
 * i.e. equals (vend | PMAP_NEST_GRAND).
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/* Reset kr to KERN_SUCCESS and try again. */
			kr = KERN_SUCCESS;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor; otherwise we'd spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9624
9625 /*
9626 * kern_return_t pmap_unnest(grand, vaddr)
9627 *
9628 * grand = the pmap that will have the virtual range unnested
9629 * vaddr = start of range in pmap to be unnested
9630 * size = size of range in pmap to be unnested
9631 *
9632 */
9633
9634 kern_return_t
9635 pmap_unnest(
9636 pmap_t grand,
9637 addr64_t vaddr,
9638 uint64_t size)
9639 {
9640 return pmap_unnest_options(grand, vaddr, size, 0);
9641 }
9642
9643 /**
9644 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9645 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9646 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9647 * still have the region nested. The mappings in 'grand' will be left empty
9648 * with the assumption that they will be demand-filled by subsequent access faults.
9649 *
9650 * This function operates in 2 main phases:
9651 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9652 * them non-global.
9653 * 2. Clearing of the twig-level TTEs for the address range in grand.
9654 *
9655 * This function may return early due to pending AST_URGENT preemption; if so
9656 * it will indicate the need to be re-entered.
9657 *
9658 * @param grand pmap from which to unnest mappings
9659 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9660 * @param size twig-aligned size of the nested range
9661 * @param vrestart the page-aligned starting address of the current call. May contain
9662 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9663 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9664 * grand is being torn down and step 1) above is not needed.
9665 *
9666 * @return the virtual address at which to restart the operation, possibly including
9667 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9668 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9669 */
9670 MARK_AS_PMAP_TEXT vm_map_offset_t
9671 pmap_unnest_options_internal(
9672 pmap_t grand,
9673 addr64_t vaddr,
9674 uint64_t size,
9675 vm_map_offset_t vrestart,
9676 unsigned int option)
9677 {
9678 vm_map_offset_t start;
9679 vm_map_offset_t addr;
9680 tt_entry_t *tte_p;
9681 unsigned int current_index;
9682 unsigned int start_index;
9683 unsigned int max_index;
9684 unsigned int entry_count = 0;
9685
9686 addr64_t vend;
9687 addr64_t true_end;
9688 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9689 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9690 }
9691 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9692 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9693 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9694 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9695 }
9696
9697 validate_pmap_mutable(grand);
9698
9699 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9700
9701 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9702 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9703 (unsigned long long)vaddr, (unsigned long long)size);
9704 }
9705
9706 if (__improbable(grand->nested_pmap == NULL)) {
9707 panic("%s: %p has no nested pmap", __func__, grand);
9708 }
9709
9710 true_end = vend;
9711 if (true_end > grand->nested_pmap->nested_region_true_end) {
9712 true_end = grand->nested_pmap->nested_region_true_end;
9713 }
9714
9715 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
9716 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9717 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9718 }
9719
9720 pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9721
9722 start = vrestart;
9723 if (start < grand->nested_pmap->nested_region_true_start) {
9724 start = grand->nested_pmap->nested_region_true_start;
9725 }
9726 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9727 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9728 bool flush_tlb = false;
9729
9730 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9731 pt_entry_t *bpte, *cpte;
9732
9733 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9734
9735 bpte = pmap_pte(grand->nested_pmap, addr);
9736
9737 /*
9738 * If we've re-entered this function partway through unnesting a leaf region, the
9739 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
9740 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
9741 * address.
9742 */
9743 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
9744 (addr & pt_attr_twig_offmask(pt_attr))) {
9745 /*
9746 * Mark the 'twig' region as being unnested. Every mapping entered within
9747 * the nested pmap in this region will now be marked non-global. Do this
9748 * before marking any of the PTEs within the region as non-global to avoid
9749 * the possibility of pmap_enter() subsequently inserting a global mapping
9750 * in the region, which could lead to a TLB conflict if a non-global entry
9751 * is later inserted for the same VA in a pmap which has fully unnested this
9752 * region.
9753 */
9754 setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
9755 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
9756 pmap_paddr_t pa;
9757 unsigned int pai = 0;
9758 boolean_t managed = FALSE;
9759 pt_entry_t spte;
9760
9761 if ((*cpte != ARM_PTE_TYPE_FAULT)
9762 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
9763 spte = *((volatile pt_entry_t*)cpte);
9764 while (!managed) {
9765 pa = pte_to_pa(spte);
9766 if (!pa_valid(pa)) {
9767 break;
9768 }
9769 pai = pa_index(pa);
9770 pvh_lock(pai);
9771 spte = *((volatile pt_entry_t*)cpte);
9772 pa = pte_to_pa(spte);
9773 if (pai == pa_index(pa)) {
9774 managed = TRUE;
9775 break; // Leave the PVH locked as we'll unlock it after we update the PTE
9776 }
9777 pvh_unlock(pai);
9778 }
9779
9780 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
9781 write_pte_fast(cpte, (spte | ARM_PTE_NG));
9782 flush_tlb = true;
9783 }
9784
9785 if (managed) {
9786 pvh_assert_locked(pai);
9787 pvh_unlock(pai);
9788 }
9789 }
9790
9791 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9792 vrestart = addr;
9793 ++entry_count;
9794 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9795 pmap_pending_preemption())) {
9796 goto unnest_subord_done;
9797 }
9798 }
9799 }
9800 addr = vlim;
9801 vrestart = addr;
9802 ++entry_count;
9803 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9804 pmap_pending_preemption())) {
9805 break;
9806 }
9807 }
9808
9809 unnest_subord_done:
9810 if (flush_tlb) {
9811 FLUSH_PTE_STRONG();
9812 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
9813 }
9814
9815 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9816 if (current_index < max_index) {
9817 return vrestart;
9818 }
9819 }
9820
9821 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9822
9823 /*
9824 * invalidate all pdes for segment at vaddr in pmap grand
9825 */
9826 if (vrestart & PMAP_NEST_GRAND) {
9827 addr = vrestart & ~PMAP_NEST_GRAND;
9828 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
9829 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
9830 }
9831 } else {
9832 addr = vaddr;
9833 vrestart = vaddr | PMAP_NEST_GRAND;
9834 }
9835
9836 if (addr < grand->nested_pmap->nested_region_true_start) {
9837 addr = grand->nested_pmap->nested_region_true_start;
9838 }
9839
9840 while (addr < true_end) {
9841 tte_p = pmap_tte(grand, addr);
9842 /*
9843 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
9844 * so it's possible that a region we're trying to unnest may not have been
9845 * nested in the first place.
9846 */
9847 if (tte_p != NULL) {
9848 *tte_p = ARM_TTE_TYPE_FAULT;
9849 }
9850 addr += pt_attr_twig_size(pt_attr);
9851 vrestart = addr | PMAP_NEST_GRAND;
9852 ++entry_count;
9853 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9854 pmap_pending_preemption())) {
9855 break;
9856 }
9857 }
9858 if (addr >= true_end) {
9859 vrestart = vend | PMAP_NEST_GRAND;
9860 }
9861
9862 FLUSH_PTE_STRONG();
9863 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
9864
9865 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9866
9867 return vrestart;
9868 }
9869
9870 kern_return_t
9871 pmap_unnest_options(
9872 pmap_t grand,
9873 addr64_t vaddr,
9874 uint64_t size,
9875 unsigned int option)
9876 {
9877 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
9878 vm_map_offset_t vend = vaddr + size;
9879 __unused vm_map_offset_t vlast = vrestart;
9880
9881 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9882 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9883
9884 pmap_verify_preemptible();
9885 while (vrestart != (vend | PMAP_NEST_GRAND)) {
9886 #if XNU_MONITOR
9887 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
9888 if (vrestart == vlast) {
9889 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9890 __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
9891 }
9892 vlast = vrestart;
9893 #else
9894 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
9895 #endif
9896 }
9897
9898 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9899
9900 return KERN_SUCCESS;
9901 }
9902
9903 boolean_t
9904 pmap_adjust_unnest_parameters(
9905 __unused pmap_t p,
9906 __unused vm_map_offset_t *s,
9907 __unused vm_map_offset_t *e)
9908 {
9909 return TRUE; /* to get to log_unnest_badness()... */
9910 }
9911
9912 #if PMAP_FORK_NEST
9913 /**
9914 * Perform any necessary pre-nesting of the parent's shared region at fork()
9915 * time.
9916 *
9917 * @note This should only be called from vm_map_fork().
9918 *
9919 * @param old_pmap The pmap of the parent task.
9920 * @param new_pmap The pmap of the child task.
9921 * @param nesting_start An output parameter that is updated with the start
9922 * address of the range that was pre-nested
9923 * @param nesting_end An output parameter that is updated with the end
9924 * address of the range that was pre-nested
9925 *
9926 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
9927 * KERN_INVALID_ARGUMENT if the arguments were not valid.
9928 */
9929 kern_return_t
9930 pmap_fork_nest(
9931 pmap_t old_pmap,
9932 pmap_t new_pmap,
9933 vm_map_offset_t *nesting_start,
9934 vm_map_offset_t *nesting_end)
9935 {
9936 if (old_pmap == NULL || new_pmap == NULL) {
9937 return KERN_INVALID_ARGUMENT;
9938 }
9939 if (old_pmap->nested_pmap == NULL) {
9940 return KERN_SUCCESS;
9941 }
9942 pmap_nest(new_pmap,
9943 old_pmap->nested_pmap,
9944 old_pmap->nested_region_addr,
9945 old_pmap->nested_region_size);
9946 assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
9947 new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
9948 new_pmap->nested_region_size == old_pmap->nested_region_size,
9949 "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
9950 new_pmap->nested_pmap,
9951 new_pmap->nested_region_addr,
9952 new_pmap->nested_region_size,
9953 old_pmap->nested_pmap,
9954 old_pmap->nested_region_addr,
9955 old_pmap->nested_region_size);
9956 *nesting_start = old_pmap->nested_region_addr;
9957 *nesting_end = *nesting_start + old_pmap->nested_region_size;
9958 return KERN_SUCCESS;
9959 }
9960 #endif /* PMAP_FORK_NEST */
9961
9962 /*
9963 * disable no-execute capability on
9964 * the specified pmap
9965 */
9966 #if DEVELOPMENT || DEBUG
9967 void
9968 pmap_disable_NX(
9969 pmap_t pmap)
9970 {
9971 pmap->nx_enabled = FALSE;
9972 }
9973 #else
9974 void
9975 pmap_disable_NX(
9976 __unused pmap_t pmap)
9977 {
9978 }
9979 #endif
9980
9981 /*
9982 * flush a range of hardware TLB entries.
9983 * NOTE: assumes the smallest TLB entry in use will be for
9984 * an ARM small page (4K).
9985 */
9986
9987 #define ARM_FULL_TLB_FLUSH_THRESHOLD 64
9988
9989 #if __ARM_RANGE_TLBI__
9990 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9991 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_PAGES
9992 #else
9993 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
9994 #endif // __ARM_RANGE_TLBI__
9995
/**
 * Issue (without waiting for completion) TLB invalidates for the virtual
 * address range [va, va + length) tagged with 'pmap's hardware ASID.
 *
 * The strategy is chosen from the range's page count:
 *  - above ARM64_FULL_TLB_FLUSH_THRESHOLD pages: flush the entire TLB, or
 *    just the pmap's ASID when a single per-ASID flush can target it;
 *  - with __ARM_RANGE_TLBI__, above ARM64_RANGE_TLB_FLUSH_THRESHOLD pages:
 *    a single ranged-TLBI operation;
 *  - otherwise: one invalidate per page in the range.
 *
 * @param va starting virtual address of the range to invalidate
 * @param length length of the range, in bytes
 * @param pmap the pmap whose mappings are to be invalidated
 * @param last_level_only when true, restrict the invalidate to last-level
 *        (leaf) entries where the selected primitive supports it
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * Fall back to a full flush when the entries cannot be targeted by a
		 * single ASID tag: ASID 0, or a nested pmap (whose mappings may be
		 * tagged with the ASIDs of the pmaps nesting it).
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/* Pack base/page-count/ASID into a single ranged-TLBI operand. */
		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only);
		}
		return;
	}
#endif
	/* Per-entry flush: encode ASID + address for the start and end of the range. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
	}
}
10043
10044 MARK_AS_PMAP_TEXT static void
10045 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10046 {
10047 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
10048 }
10049
10050 void
10051 flush_mmu_tlb_region(
10052 vm_offset_t va,
10053 unsigned length)
10054 {
10055 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
10056 sync_tlb_flush();
10057 }
10058
10059 unsigned int
10060 pmap_cache_attributes(
10061 ppnum_t pn)
10062 {
10063 pmap_paddr_t paddr;
10064 unsigned int pai;
10065 unsigned int result;
10066 pp_attr_t pp_attr_current;
10067
10068 paddr = ptoa(pn);
10069
10070 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10071
10072 if (!pa_valid(paddr)) {
10073 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10074 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10075 }
10076
10077 result = VM_WIMG_DEFAULT;
10078
10079 pai = pa_index(paddr);
10080
10081 pp_attr_current = pp_attr_table[pai];
10082 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10083 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10084 }
10085 return result;
10086 }
10087
10088 MARK_AS_PMAP_TEXT static void
10089 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10090 {
10091 if ((wimg_bits_prev != wimg_bits_new)
10092 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10093 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10094 && (wimg_bits_new != VM_WIMG_COPYBACK))
10095 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10096 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10097 pmap_sync_page_attributes_phys(pn);
10098 }
10099
10100 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10101 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10102 }
10103 }
10104
10105 MARK_AS_PMAP_TEXT __unused void
10106 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10107 {
10108 pmap_paddr_t paddr = ptoa(pn);
10109 const unsigned int pai = pa_index(paddr);
10110
10111 if (__improbable(!pa_valid(paddr))) {
10112 panic("%s called on non-managed page 0x%08x", __func__, pn);
10113 }
10114
10115 pvh_lock(pai);
10116
10117 #if XNU_MONITOR
10118 if (__improbable(ppattr_pa_test_monitor(paddr))) {
10119 panic("%s invoked on PPL page 0x%08x", __func__, pn);
10120 }
10121 #endif
10122
10123 pmap_update_cache_attributes_locked(pn, new_cacheattr, true);
10124
10125 pvh_unlock(pai);
10126
10127 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10128 }
10129
10130 void *
10131 pmap_map_compressor_page(ppnum_t pn)
10132 {
10133 #if __ARM_PTE_PHYSMAP__
10134 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10135 if (cacheattr != VM_WIMG_DEFAULT) {
10136 #if XNU_MONITOR
10137 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10138 #else
10139 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10140 #endif
10141 }
10142 #endif
10143 return (void*)phystokv(ptoa(pn));
10144 }
10145
10146 void
10147 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10148 {
10149 #if __ARM_PTE_PHYSMAP__
10150 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10151 if (cacheattr != VM_WIMG_DEFAULT) {
10152 #if XNU_MONITOR
10153 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10154 #else
10155 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10156 #endif
10157 }
10158 #endif
10159 }
10160
10161 /**
10162 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10163 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10164 *
10165 * @param user_page_list List of pages to be updated.
10166 * @param page_cnt Number of pages in total in user_page_list.
10167 * @param cacheattr The new cache attribute.
10168 *
10169 * @return Success if true is returned.
10170 */
10171 bool
10172 pmap_batch_set_cache_attributes(
10173 upl_page_info_array_t user_page_list,
10174 unsigned int page_cnt,
10175 unsigned int cacheattr)
10176 {
10177 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10178
10179 if (page_cnt == 0) {
10180 return true;
10181 }
10182
10183 batch_set_cache_attr_state_t states;
10184 states.page_index = 0;
10185 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10186 states.tlb_flush_pass_needed = false;
10187 states.rt_cache_flush_pass_needed = false;
10188
10189 /* Verify we are being called from a preemptible context. */
10190 pmap_verify_preemptible();
10191
10192 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10193 #if XNU_MONITOR
10194 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10195 #else /* !XNU_MONITOR */
10196 states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10197 #endif /* XNU_MONITOR */
10198 }
10199
10200 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10201 return true;
10202 }
10203
10204 /**
10205 * Flushes TLB entries associated with the page specified by paddr, but do not
10206 * issue barriers yet.
10207 *
10208 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10209 */
10210 MARK_AS_PMAP_TEXT static void
10211 pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
10212 {
10213 #if __ARM_PTE_PHYSMAP__
10214 /* Flush the physical aperture mappings. */
10215 const vm_offset_t kva = phystokv(paddr);
10216 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10217 #endif /* __ARM_PTE_PHYSMAP__ */
10218
10219 /* Flush the mappings tracked in the ptes. */
10220 const unsigned int pai = pa_index(paddr);
10221 pv_entry_t **pv_h = pai_to_pvh(pai);
10222
10223 pt_entry_t *pte_p = PT_ENTRY_NULL;
10224 pv_entry_t *pve_p = PV_ENTRY_NULL;
10225
10226 pvh_assert_locked(pai);
10227
10228 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10229 pte_p = pvh_ptep(pv_h);
10230 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10231 pve_p = pvh_pve_list(pv_h);
10232 pte_p = PT_ENTRY_NULL;
10233 }
10234
10235 int pve_ptep_idx = 0;
10236 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10237 if (pve_p != PV_ENTRY_NULL) {
10238 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10239 if (pte_p == PT_ENTRY_NULL) {
10240 goto flush_tlb_skip_pte;
10241 }
10242 }
10243
10244 #ifdef PVH_FLAG_IOMMU
10245 if (pvh_ptep_is_iommu(pte_p)) {
10246 goto flush_tlb_skip_pte;
10247 }
10248 #endif /* PVH_FLAG_IOMMU */
10249 pmap_t pmap = ptep_get_pmap(pte_p);
10250 vm_map_address_t va = ptep_get_va(pte_p);
10251
10252 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10253
10254 flush_tlb_skip_pte:
10255 pte_p = PT_ENTRY_NULL;
10256 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10257 pve_ptep_idx = 0;
10258 pve_p = pve_next(pve_p);
10259 }
10260 }
10261 }
10262
10263 /**
10264 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10265 *
10266 * @param pai The Physical Address Index of the entry.
10267 * @param cacheattr The new cache attribute.
10268 */
10269 MARK_AS_PMAP_TEXT static void
10270 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10271 {
10272 pvh_assert_locked(pai);
10273
10274 pp_attr_t pp_attr_current, pp_attr_template;
10275 do {
10276 pp_attr_current = pp_attr_table[pai];
10277 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10278
10279 /**
10280 * WIMG bits should only be updated under the PVH lock, but we should do
10281 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10282 */
10283 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10284 }
10285
10286 /**
10287 * Batch updates the cache attributes of a list of pages in three passes.
10288 *
10289 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10290 * In pass two, TLB entries are flushed for each page in the list if necessary.
10291 * In pass three, caches are cleaned for each page in the list if necessary.
10292 *
10293 * When running in PPL, this function may decide to return to the caller in response
10294 * to AST_URGENT.
10295 *
10296 * @param user_page_list List of pages to be updated.
10297 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10298 * @param page_cnt Number of pages in total in user_page_list.
10299 * @param cacheattr The new cache attributes.
10300 *
10301 * @return The new state of the state machine.
10302 */
10303 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10304 pmap_batch_set_cache_attributes_internal(
10305 #if XNU_MONITOR
10306 volatile upl_page_info_t *user_page_list,
10307 #else /* !XNU_MONITOR */
10308 upl_page_info_array_t user_page_list,
10309 #endif /* XNU_MONITOR */
10310 batch_set_cache_attr_state_t states,
10311 unsigned int page_cnt,
10312 unsigned int cacheattr)
10313 {
10314 uint64_t page_index = states.page_index;
10315 uint64_t state = states.state;
10316 bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10317 bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10318
10319 /* For verifying progress. */
10320 __assert_only const uint64_t page_index_old = page_index;
10321 __assert_only const uint64_t state_old = state;
10322
10323 /* Assert page_index and state are within their range. */
10324 if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10325 panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10326 }
10327
10328 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10329 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10330 /* Update cache attributes of the pages until there's an urgent AST or it's done. */
10331 while (page_index < page_cnt) {
10332 const ppnum_t pn = user_page_list[page_index].phys_addr;
10333 const pmap_paddr_t paddr = ptoa(pn);
10334
10335 if (!pa_valid(paddr)) {
10336 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10337 }
10338
10339 const unsigned int pai = pa_index(paddr);
10340
10341 /* Lock the page. */
10342 pvh_lock(pai);
10343
10344 #if XNU_MONITOR
10345 if (ppattr_pa_test_monitor(paddr)) {
10346 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10347 }
10348 #endif /* XNU_MONITOR */
10349 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10350
10351 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10352 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10353 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10354 }
10355
10356 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10357
10358 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10359 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10360 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10361 }
10362
10363 /* Update the cache attributes in PTE and PP_ATTR table. */
10364 if (wimg_bits_new != wimg_bits_prev) {
10365 tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10366 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10367 }
10368
10369 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10370 rt_cache_flush_pass_needed = true;
10371 }
10372
10373 pvh_unlock(pai);
10374
10375 page_index++;
10376
10377 #if XNU_MONITOR
10378 /**
10379 * Check for AST_URGENT every page, as the pve list search in cache
10380 * update can take non-constant time.
10381 */
10382 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10383 goto pbscai_exit;
10384 }
10385 #endif /* XNU_MONITOR */
10386 }
10387
10388 /* page_index == page_cnt && !pmap_pending_preemption() */
10389 if (tlb_flush_pass_needed) {
10390 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10391 } else if (rt_cache_flush_pass_needed) {
10392 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10393 } else {
10394 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10395 }
10396 page_index = 0;
10397
10398 /* Sync the PTE writes before potential TLB/Cache flushes. */
10399 FLUSH_PTE_STRONG();
10400
10401 #if XNU_MONITOR
10402 if (__improbable(pmap_pending_preemption())) {
10403 goto pbscai_exit;
10404 }
10405 #endif /* XNU_MONITOR */
10406 }
10407
10408 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10409 /**
10410 * Pass 2: for each physical page and for each mapping, we need to flush
10411 * the TLB for it.
10412 */
10413 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10414 while (page_index < page_cnt) {
10415 const ppnum_t pn = user_page_list[page_index].phys_addr;
10416
10417 const pmap_paddr_t paddr = ptoa(pn);
10418 if (!pa_valid(paddr)) {
10419 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10420 }
10421
10422 const unsigned int pai = pa_index(paddr);
10423
10424 pvh_lock(pai);
10425 pmap_flush_tlb_for_paddr_locked_async(paddr);
10426 pvh_unlock(pai);
10427
10428 page_index++;
10429
10430 #if XNU_MONITOR
10431 /**
10432 * Check for AST_URGENT every page, as the pve list search in cache
10433 * update can take non-constant time.
10434 */
10435 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10436 goto pbscai_exit;
10437 }
10438 #endif /* XNU_MONITOR */
10439 }
10440
10441 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10442
10443 if (rt_cache_flush_pass_needed) {
10444 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10445 } else {
10446 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10447 }
10448 page_index = 0;
10449
10450 #if XNU_MONITOR
10451 if (__improbable(pmap_pending_preemption())) {
10452 goto pbscai_exit;
10453 }
10454 #endif /* XNU_MONITOR */
10455 }
10456
10457 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
10458 /* Pass 3: Flush the cache if the page is recently set to RT */
10459 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
10460 #if !XNU_MONITOR
10461 /**
10462 * On non-PPL platforms, we disable preemption to ensure we are not preempted
10463 * in the state where DC by VA instructions remain enabled.
10464 */
10465 disable_preemption();
10466 #endif /* !XNU_MONITOR */
10467
10468 assert(get_preemption_level() > 0);
10469
10470 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10471 /**
10472 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10473 * and the host will handle cache maintenance for it. So we don't need to
10474 * worry about enabling the ops here for AVP.
10475 */
10476 enable_dc_mva_ops();
10477 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10478
10479 while (page_index < page_cnt) {
10480 const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
10481
10482 if (!pa_valid(paddr)) {
10483 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10484 }
10485
10486 CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
10487
10488 page_index++;
10489
10490 #if XNU_MONITOR
10491 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10492 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10493 disable_dc_mva_ops();
10494 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10495 goto pbscai_exit;
10496 }
10497 #endif /* XNU_MONITOR */
10498 }
10499
10500 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10501 disable_dc_mva_ops();
10502 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10503
10504 #if !XNU_MONITOR
10505 enable_preemption();
10506 #endif /* !XNU_MONITOR */
10507
10508 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10509 page_index = 0;
10510 }
10511
10512 #if XNU_MONITOR
10513 pbscai_exit:
10514 #endif /* XNU_MONITOR */
10515 /* Assert page_index and state are within their range. */
10516 assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
10517
10518 /* Make sure we are making progress in this call. */
10519 assert(page_index > page_index_old || state > state_old);
10520
10521 batch_set_cache_attr_state_t states_new;
10522 states_new.page_index = page_index;
10523 states_new.state = state;
10524 states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
10525 states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
10526 return states_new;
10527 }
10528
/**
 * Common implementation for setting a single page's cache attributes: update
 * the pp_attr table entry atomically, rewrite the page's mappings when the
 * effective attributes changed, then perform any needed cache maintenance.
 *
 * @param pn the physical page number to update; silently ignored when the
 *        page is not managed
 * @param cacheattr the new cache attribute (VM_WIMG_USE_DEFAULT is
 *        normalized to VM_WIMG_DEFAULT)
 * @param external whether the request originated outside the PPL; used only
 *        for the ownership sanity checks on XNU_MONITOR systems
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may only touch non-PPL pages, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Absent WIMG bits mean the page currently has VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the page's mappings when the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance required by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10591
/**
 * Entry point used on behalf of the kernel proper (external == TRUE);
 * thin wrapper around pmap_set_cache_attributes_priv().
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10599
/**
 * Public entry point to set the cache attributes of a physical page,
 * dispatching into the PPL when the monitor is enabled.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10611
10612 /**
10613 * Updates the page numbered ppnum to have attribute specified by attributes.
10614 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10615 * The necessity of the TLB flush is returned in case this function is called
10616 * in a batched manner and the TLB flush is intended to be done at a different
10617 * timing.
10618 *
10619 * @param ppnum Page Number of the page to be updated.
10620 * @param attributes The new cache attributes.
10621 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10622 * immediately.
10623 *
10624 * @return Returns true if a TLB flush is needed for this update regardless of
10625 * whether a flush has occurred already.
10626 */
10627 MARK_AS_PMAP_TEXT bool
10628 pmap_update_cache_attributes_locked(
10629 ppnum_t ppnum,
10630 unsigned attributes,
10631 bool perform_tlbi)
10632 {
10633 pmap_paddr_t phys = ptoa(ppnum);
10634 pv_entry_t *pve_p;
10635 pt_entry_t *pte_p;
10636 pv_entry_t **pv_h;
10637 pt_entry_t tmplate;
10638 unsigned int pai;
10639 boolean_t tlb_flush_needed = false;
10640
10641 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10642
10643 if (pmap_panic_dev_wimg_on_managed) {
10644 switch (attributes & VM_WIMG_MASK) {
10645 case VM_WIMG_IO: // nGnRnE
10646 case VM_WIMG_POSTED: // nGnRE
10647 /* supported on DRAM, but slow, so we disallow */
10648
10649 case VM_WIMG_POSTED_REORDERED: // nGRE
10650 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10651 /* unsupported on DRAM */
10652
10653 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10654 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10655 break;
10656
10657 default:
10658 /* not device type memory, all good */
10659
10660 break;
10661 }
10662 }
10663
10664 #if __ARM_PTE_PHYSMAP__
10665 vm_offset_t kva = phystokv(phys);
10666 pte_p = pmap_pte(kernel_pmap, kva);
10667
10668 tmplate = *pte_p;
10669 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10670 #if XNU_MONITOR
10671 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10672 #else
10673 tmplate |= wimg_to_pte(attributes, phys);
10674 #endif
10675 if (tmplate & ARM_PTE_HINT_MASK) {
10676 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10677 __FUNCTION__, pte_p, (void *)kva, tmplate);
10678 }
10679
10680 if (perform_tlbi) {
10681 write_pte_strong(pte_p, tmplate);
10682 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10683 } else {
10684 write_pte_fast(pte_p, tmplate);
10685 }
10686 tlb_flush_needed = true;
10687 #endif
10688
10689 pai = pa_index(phys);
10690
10691 pv_h = pai_to_pvh(pai);
10692
10693 pte_p = PT_ENTRY_NULL;
10694 pve_p = PV_ENTRY_NULL;
10695 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10696 pte_p = pvh_ptep(pv_h);
10697 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10698 pve_p = pvh_pve_list(pv_h);
10699 pte_p = PT_ENTRY_NULL;
10700 }
10701
10702 int pve_ptep_idx = 0;
10703 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10704 vm_map_address_t va;
10705 pmap_t pmap;
10706
10707 if (pve_p != PV_ENTRY_NULL) {
10708 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10709 if (pte_p == PT_ENTRY_NULL) {
10710 goto cache_skip_pve;
10711 }
10712 }
10713
10714 #ifdef PVH_FLAG_IOMMU
10715 if (pvh_ptep_is_iommu(pte_p)) {
10716 goto cache_skip_pve;
10717 }
10718 #endif
10719 pmap = ptep_get_pmap(pte_p);
10720 va = ptep_get_va(pte_p);
10721
10722 tmplate = *pte_p;
10723 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10724 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
10725
10726 if (perform_tlbi) {
10727 write_pte_strong(pte_p, tmplate);
10728 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10729 } else {
10730 write_pte_fast(pte_p, tmplate);
10731 }
10732 tlb_flush_needed = true;
10733
10734 cache_skip_pve:
10735 pte_p = PT_ENTRY_NULL;
10736 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10737 pve_ptep_idx = 0;
10738 pve_p = pve_next(pve_p);
10739 }
10740 }
10741 if (perform_tlbi && tlb_flush_needed) {
10742 arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
10743 }
10744
10745 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
10746
10747 return tlb_flush_needed;
10748 }
10749
10750 /**
10751 * Mark a pmap as being dedicated to use for a commpage mapping.
10752 * The pmap itself will never be activated on a CPU; its mappings will
10753 * only be embedded in userspace pmaps at a fixed virtual address.
10754 *
10755 * @param pmap the pmap to mark as belonging to a commpage.
10756 */
10757 static void
10758 pmap_set_commpage(pmap_t pmap)
10759 {
10760 #if XNU_MONITOR
10761 assert(!pmap_ppl_locked_down);
10762 #endif
10763 assert(pmap->type == PMAP_TYPE_USER);
10764 pmap->type = PMAP_TYPE_COMMPAGE;
10765 /*
10766 * Free the pmap's ASID. This pmap should not ever be directly
10767 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
10768 * ASID space contention but will also cause pmap_switch() to panic
10769 * if an attacker tries to activate this pmap. Disable preemption to
10770 * accommodate the *_nopreempt spinlock in free_asid().
10771 */
10772 mp_disable_preemption();
10773 pmap_get_pt_ops(pmap)->free_id(pmap);
10774 mp_enable_preemption();
10775 }
10776
10777 static void
10778 pmap_update_tt3e(
10779 pmap_t pmap,
10780 vm_address_t address,
10781 tt_entry_t template)
10782 {
10783 tt_entry_t *ptep, pte;
10784
10785 ptep = pmap_tt3e(pmap, address);
10786 if (ptep == NULL) {
10787 panic("%s: no ptep?", __FUNCTION__);
10788 }
10789
10790 pte = *ptep;
10791 pte = tte_to_pa(pte) | template;
10792 write_pte_strong(ptep, pte);
10793 }
10794
/*
 * PTE template for the commpage data mappings: read-only, writeback-cacheable,
 * never executable. Note absence of non-global bit — the commpage is mapped
 * global so it is not thrown out by ASID-scoped TLB flushes.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	| ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text page: same as above but user-executable.
 * Note absence of non-global bit and no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	| ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10806
/**
 * Allocate the commpage backing pages and build the dedicated commpage
 * pmap(s) whose translation tables will later be nested into user pmaps.
 *
 * @param kernel_data_addr [out] KVA of the writable commpage data page.
 * @param kernel_text_addr [out] KVA of the commpage text page, or 0 when
 *                         CONFIG_ARM_PFZ is disabled.
 * @param kernel_ro_data_addr [out] KVA of the kernel RO-data commpage page
 *                            (aliases the data page on non-PPL systems).
 * @param user_text_addr [out] randomized user VA chosen for the commpage
 *                       text page, or 0 when CONFIG_ARM_PFZ is disabled.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, each commpage gets its own PPL-allocated, zeroed page. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index to slide the text commpage within its L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
10957
10958
10959 /*
10960 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10961 * with user controlled TTEs for regions that aren't explicitly reserved by the
10962 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10963 */
10964 #if (ARM_PGSHIFT == 14)
10965 /**
10966 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10967 * commpage completely above the maximum 32-bit userspace VA.
10968 */
10969 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10970
10971 /**
10972 * Normally there'd be an assert to check that 64-bit devices with 64-bit
10973 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
10975 * commpage lives within the userspace VA range, but is protected by the VM as
10976 * a reserved region (see vm_reserved_regions[] definition for more info).
10977 */
10978
10979 #elif (ARM_PGSHIFT == 12)
10980 /**
10981 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10982 * above the maximum userspace VA.
10983 */
10984 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10985 #else
10986 #error Nested shared page mapping is unsupported on this config
10987 #endif
10988
/**
 * Nest the commpage translation tables into 'pmap' by copying the commpage
 * pmap's twig-level TTE into the corresponding slot of the target pmap,
 * expanding the target's page tables first if necessary.
 *
 * @param pmap The (validated, mutable) pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_RESOURCE_SHORTAGE (PPL only) or
 *         KERN_ABORTED when pmap_expand() fails retryably — callers loop on
 *         these (see pmap_insert_commpage()).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* The PPL cannot block on allocation; report shortage to the caller instead. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock across pmap_expand(), then re-check under the lock. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/*
	 * NOTE(review): this compares a twig TTE against ARM_PTE_EMPTY while
	 * pmap_unmap_commpage() uses ARM_TTE_EMPTY — presumably the same value;
	 * confirm against the PTE/TTE definitions.
	 */
	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Share the commpage pmap's twig entry (and thus its leaf tables). */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11091
/**
 * Remove the commpage nesting from 'pmap' by clearing the twig-level TTE that
 * points at the shared commpage page tables, then flushing the TLB for the
 * commpage VA. No-op if the twig entry does not exist.
 *
 * @param pmap The pmap to strip of its commpage mapping.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false);
	sync_tlb_flush();
}
11148
/**
 * Public entry point to map the commpage into 'pmap'. Retries on retryable
 * failures: KERN_ABORTED always, and (under the PPL) KERN_RESOURCE_SHORTAGE
 * after donating a page to the PPL. Panics on any other failure.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL ran out of pages; feed it one and retry. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11177
/* Return whether this pmap manages a 64-bit address space. */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11184
/*
 * Whether the pmap uses an "exotic" translation configuration.
 * Always false in this build; the "exotic" semantics are defined by
 * configurations not present here.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11191
11192
/* ARMTODO -- an implementation that accounts for
 * holes in the physical map, if any.
 */
boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	/* True iff the physical page is managed by the pmap layer (pa_valid). */
	return pa_valid(ptoa(pn));
}
11202
11203 boolean_t
11204 pmap_bootloader_page(
11205 ppnum_t pn)
11206 {
11207 pmap_paddr_t paddr = ptoa(pn);
11208
11209 if (pa_valid(paddr)) {
11210 return FALSE;
11211 }
11212 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11213 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11214 }
11215
/**
 * Check whether the VA range [va_start, va_end) of 'pmap' contains no valid
 * leaf mappings.
 *
 * @param pmap Pmap to inspect; NULL is treated as trivially empty.
 * @param va_start Start of the range to check.
 * @param va_end End of the range to check.
 *
 * @return TRUE when no PTE in the range is non-empty; FALSE as soon as one
 *         mapped entry is found.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot the flag once so the lock and unlock decisions agree. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Only take the lock outside the kernel debugger and for user pmaps. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig (page-table page) at a time. */
	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		/* Clip this iteration to the end of the current twig, then to va_end. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a mapping: unlock (if locked) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11276
/**
 * Public wrapper for pmap_is_empty_internal(), dispatching into the PPL
 * when the monitor is enabled.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11289
11290 vm_map_offset_t
11291 pmap_max_offset(
11292 boolean_t is64,
11293 unsigned int option)
11294 {
11295 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11296 }
11297
11298 vm_map_offset_t
11299 pmap_max_64bit_offset(
11300 __unused unsigned int option)
11301 {
11302 vm_map_offset_t max_offset_ret = 0;
11303
11304 #if defined(__arm64__)
11305 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11306 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11307 max_offset_ret = arm64_pmap_max_offset_default;
11308 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11309 max_offset_ret = min_max_offset;
11310 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11311 max_offset_ret = MACH_VM_MAX_ADDRESS;
11312 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11313 if (arm64_pmap_max_offset_default) {
11314 max_offset_ret = arm64_pmap_max_offset_default;
11315 } else if (max_mem > 0xC0000000) {
11316 // devices with > 3GB of memory
11317 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11318 } else if (max_mem > 0x40000000) {
11319 // devices with > 1GB and <= 3GB of memory
11320 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11321 } else {
11322 // devices with <= 1 GB of memory
11323 max_offset_ret = min_max_offset;
11324 }
11325 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11326 if (arm64_pmap_max_offset_default) {
11327 // Allow the boot-arg to override jumbo size
11328 max_offset_ret = arm64_pmap_max_offset_default;
11329 } else {
11330 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11331 }
11332 } else {
11333 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11334 }
11335
11336 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11337 assert(max_offset_ret >= min_max_offset);
11338 #else
11339 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11340 #endif
11341
11342 return max_offset_ret;
11343 }
11344
/**
 * Return the maximum 32-bit user VA for the requested ARM_PMAP_MAX_OFFSET_*
 * policy. Panics on an unrecognized option.
 */
vm_map_offset_t
pmap_max_32bit_offset(
	unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		if (arm_pmap_max_offset_default) {
			max_offset_ret = arm_pmap_max_offset_default;
		} else if (max_mem > 0x20000000) {
			max_offset_ret = VM_MAX_ADDRESS;
		} else {
			/*
			 * NOTE(review): both memory-size branches yield VM_MAX_ADDRESS;
			 * the split presumably remains for parity with the 64-bit path —
			 * confirm whether the small-memory value was meant to differ.
			 */
			max_offset_ret = VM_MAX_ADDRESS;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else {
		panic("pmap_max_32bit_offset illegal option 0x%x", option);
	}

	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	return max_offset_ret;
}
11374
11375 #if CONFIG_DTRACE
11376 /*
11377 * Constrain DTrace copyin/copyout actions
11378 */
11379 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11380 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11381
11382 kern_return_t
11383 dtrace_copyio_preflight(
11384 __unused addr64_t va)
11385 {
11386 if (current_map() == kernel_map) {
11387 return KERN_FAILURE;
11388 } else {
11389 return KERN_SUCCESS;
11390 }
11391 }
11392
/* No post-copy cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11399 #endif /* CONFIG_DTRACE */
11400
11401
/* No-op: flush contexts carry no per-call state on this architecture (see pmap_flush). */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11406
11407
/* Deferred-flush entry point; currently a stub on this architecture. */
void
pmap_flush(
	__unused pmap_flush_context *cpus_to_flush)
{
	/* not implemented yet */
	return;
}
11415
11416 #if XNU_MONITOR
11417
11418 /*
11419 * Enforce that the address range described by kva and nbytes is not currently
11420 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11421 * unintentionally writing to PPL-owned memory.
11422 */
/*
 * Enforce that the address range described by kva and nbytes is not currently
 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
 * unintentionally writing to PPL-owned memory.
 */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Walk the range one page at a time (round_page(ckva + 1) advances to the next page). */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Pinning through the static physical-aperture mapping is disallowed. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * CAS loop: atomically set PP_ATTR_NO_MONITOR (which blocks the page
		 * from becoming PPL-owned) while verifying it is not already PPL-owned.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11445
/*
 * Release a pin previously taken by pmap_pin_kernel_pages() on the same
 * range, clearing PP_ATTR_NO_MONITOR on each page. Panics if any page in
 * the range was not pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page can never have become PPL-owned in the meantime. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11463
11464 /**
11465 * Lock down a page, making all mappings read-only, and preventing further
11466 * mappings or removal of this particular kva's mapping. Effectively, it makes
11467 * the physical page at kva immutable (see the ppl_writable parameter for an
11468 * exception to this).
11469 *
11470 * @param kva Valid address to any mapping of the physical page to lockdown.
11471 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11472 * @param ppl_writable True if the PPL should still be able to write to the page
11473 * using the physical aperture mapping. False will make the
11474 * page read-only for both the kernel and PPL in the
11475 * physical aperture.
11476 */
11477
11478 MARK_AS_PMAP_TEXT static void
11479 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11480 {
11481 pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
11482 }
11483
11484 /**
11485 * Lock down a page, giving all mappings the specified maximum permissions, and
11486 * preventing further mappings or removal of this particular kva's mapping.
11487 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11488 * parameter for an exception to this).
11489 *
11490 * @param kva Valid address to any mapping of the physical page to lockdown.
11491 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11492 * @param ppl_writable True if the PPL should still be able to write to the page
11493 * using the physical aperture mapping. False will make the
11494 * page read-only for both the kernel and PPL in the
11495 * physical aperture.
11496 * @param prot Maximum permissions to allow in existing alias mappings
11497 */
11498 MARK_AS_PMAP_TEXT static void
11499 pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
11500 {
11501 const pmap_paddr_t pa = kvtophys_nofail(kva);
11502 const unsigned int pai = pa_index(pa);
11503
11504 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11505 pvh_lock(pai);
11506 pv_entry_t **pvh = pai_to_pvh(pai);
11507 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11508
11509 if (__improbable(ppattr_pa_test_monitor(pa))) {
11510 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11511 }
11512
11513 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11514 panic("%s: %#lx already locked down/executable (%#llx)",
11515 __func__, kva, (uint64_t)pvh_flags);
11516 }
11517
11518
11519 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11520
11521 /* Update the physical aperture mapping to prevent kernel write access. */
11522 const unsigned int new_xprr_perm =
11523 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11524 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11525
11526 pvh_unlock(pai);
11527
11528 pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);
11529
11530 /**
11531 * Double-check that the mapping didn't change physical addresses before the
11532 * LOCKDOWN flag was set (there is a brief window between the above
11533 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11534 *
11535 * This doesn't solve the ABA problem, but this doesn't have to since once
11536 * the pvh_lock() is grabbed no new mappings can be created on this physical
11537 * page without the LOCKDOWN flag already set (so any future mappings can
11538 * only be RO, and no existing mappings can be removed).
11539 */
11540 if (kvtophys_nofail(kva) != pa) {
11541 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11542 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11543 }
11544 }
11545
11546 /**
11547 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11548 * kernel once again.
11549 *
11550 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11551 * to unlockdown a page that was never locked down, will panic.
11552 *
11553 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11554 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11555 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11556 * passed to the paired pmap_ppl_lockdown_page() call. Any
11557 * deviation will result in a panic.
11558 */
11559 MARK_AS_PMAP_TEXT static void
11560 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11561 {
11562 pvh_assert_locked(pai);
11563 pv_entry_t **pvh = pai_to_pvh(pai);
11564 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11565
11566 if (__improbable(!(pvh_flags & lockdown_flag))) {
11567 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11568 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11569 }
11570
11571
11572 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11573
11574 /* Restore the pre-lockdown physical aperture mapping permissions. */
11575 const unsigned int old_xprr_perm =
11576 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11577 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11578 }
11579
11580 /**
11581 * Release a page from being locked down to the PPL, making it writable to the
11582 * kernel once again.
11583 *
11584 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11585 * to unlockdown a page that was never locked down, will panic.
11586 *
11587 * @param kva Valid address to any mapping of the physical page to unlockdown.
11588 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11589 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11590 * passed to the paired pmap_ppl_lockdown_page() call. Any
11591 * deviation will result in a panic.
11592 */
11593 MARK_AS_PMAP_TEXT static void
11594 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11595 {
11596 const pmap_paddr_t pa = kvtophys_nofail(kva);
11597 const unsigned int pai = pa_index(pa);
11598
11599 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11600 pvh_lock(pai);
11601 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11602 pvh_unlock(pai);
11603 }
11604
11605 #else /* XNU_MONITOR */
11606
/* Pinning only matters when the PPL (XNU_MONITOR) exists; no-op otherwise. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11611
/* Unpinning only matters when the PPL (XNU_MONITOR) exists; no-op otherwise. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11616
11617 #endif /* !XNU_MONITOR */
11618
11619
/*
 * Lock down a range of pages on behalf of code signing.  On PPL systems the
 * pages are tagged with the PVH_FLAG_LOCKDOWN_CS reason; on non-PPL systems
 * no reason flag exists, so 0 is passed through.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11629
/*
 * Release a code-signing lockdown established by pmap_cs_lockdown_pages().
 * The lockdown reason mirrors the lockdown path: PVH_FLAG_LOCKDOWN_CS on PPL
 * systems, 0 otherwise.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11639
11640 /**
11641 * Perform basic validation checks on the destination only and
11642 * corresponding offset/sizes prior to writing to a read only allocation.
11643 *
11644 * @note Should be called before writing to an allocation from the read
11645 * only allocator.
11646 *
11647 * @param zid The ID of the zone the allocation belongs to.
11648 * @param va VA of element being modified (destination).
11649 * @param offset Offset being written to, in the element.
11650 * @param new_data_size Size of modification.
11651 *
11652 */
11653
11654 MARK_AS_PMAP_TEXT static void
11655 pmap_ro_zone_validate_element_dst(
11656 zone_id_t zid,
11657 vm_offset_t va,
11658 vm_offset_t offset,
11659 vm_size_t new_data_size)
11660 {
11661 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11662 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11663 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11664 }
11665
11666 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
11667
11668 /* Check element is from correct zone and properly aligned */
11669 zone_require_ro(zid, elem_size, (void*)va);
11670
11671 if (__improbable(new_data_size > (elem_size - offset))) {
11672 panic("%s: New data size %lu too large for elem size %lu at addr %p",
11673 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11674 }
11675 if (__improbable(offset >= elem_size)) {
11676 panic("%s: Offset %lu too large for elem size %lu at addr %p",
11677 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11678 }
11679 }
11680
11681
11682 /**
11683 * Perform basic validation checks on the source, destination and
11684 * corresponding offset/sizes prior to writing to a read only allocation.
11685 *
11686 * @note Should be called before writing to an allocation from the read
11687 * only allocator.
11688 *
11689 * @param zid The ID of the zone the allocation belongs to.
11690 * @param va VA of element being modified (destination).
11691 * @param offset Offset being written to, in the element.
11692 * @param new_data Pointer to new data (source).
11693 * @param new_data_size Size of modification.
11694 *
11695 */
11696
11697 MARK_AS_PMAP_TEXT static void
11698 pmap_ro_zone_validate_element(
11699 zone_id_t zid,
11700 vm_offset_t va,
11701 vm_offset_t offset,
11702 const vm_offset_t new_data,
11703 vm_size_t new_data_size)
11704 {
11705 vm_offset_t sum = 0;
11706
11707 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11708 panic("%s: Integer addition overflow %p + %lu = %lu",
11709 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11710 }
11711
11712 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11713 }
11714
11715 /**
11716 * Ensure that physical page is locked down and pinned, before writing to it.
11717 *
11718 * @note Should be called before writing to an allocation from the read
11719 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
11720 * ensure that it is called after the modification.
11721 *
11722 *
11723 * @param pa Physical address of the element being modified.
11724 * @param va Virtual address of element being modified.
11725 * @param size Size of the modification.
11726 *
11727 */
11728
11729 MARK_AS_PMAP_TEXT static void
11730 pmap_ro_zone_lock_phy_page(
11731 const pmap_paddr_t pa,
11732 vm_offset_t va,
11733 vm_size_t size)
11734 {
11735 const unsigned int pai = pa_index(pa);
11736 pvh_lock(pai);
11737
11738 /* Ensure that the physical page is locked down */
11739 #if XNU_MONITOR
11740 pv_entry_t **pvh = pai_to_pvh(pai);
11741 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
11742 panic("%s: Physical page not locked down %llx", __func__, pa);
11743 }
11744 #endif /* XNU_MONITOR */
11745
11746 /* Ensure page can't become PPL-owned memory before the memcpy occurs */
11747 pmap_pin_kernel_pages(va, size);
11748 }
11749
11750 /**
11751 * Unlock and unpin physical page after writing to it.
11752 *
11753 * @note Should be called after writing to an allocation from the read
11754 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
11755 * ensure that it has been called prior to the modification.
11756 *
11757 * @param pa Physical address of the element that was modified.
11758 * @param va Virtual address of element that was modified.
11759 * @param size Size of the modification.
11760 *
11761 */
11762
11763 MARK_AS_PMAP_TEXT static void
11764 pmap_ro_zone_unlock_phy_page(
11765 const pmap_paddr_t pa,
11766 vm_offset_t va,
11767 vm_size_t size)
11768 {
11769 const unsigned int pai = pa_index(pa);
11770 pmap_unpin_kernel_pages(va, size);
11771 pvh_unlock(pai);
11772 }
11773
11774 /**
11775 * Function to copy kauth_cred from new_data to kv.
11776 * Function defined in "kern_prot.c"
11777 *
11778 * @note Will be removed upon completion of
11779 * <rdar://problem/72635194> Compiler PAC support for memcpy.
11780 *
11781 * @param kv Address to copy new data to.
11782 * @param new_data Pointer to new data.
11783 *
11784 */
11785
11786 extern void
11787 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11788
11789 /**
11790 * Zalloc-specific memcpy that writes through the physical aperture
11791 * and ensures the element being modified is from a read-only zone.
11792 *
11793 * @note Designed to work only with the zone allocator's read-only submap.
11794 *
11795 * @param zid The ID of the zone to allocate from.
11796 * @param va VA of element to be modified.
11797 * @param offset Offset from element.
11798 * @param new_data Pointer to new data.
11799 * @param new_data_size Size of modification.
11800 *
11801 */
11802
11803 void
11804 pmap_ro_zone_memcpy(
11805 zone_id_t zid,
11806 vm_offset_t va,
11807 vm_offset_t offset,
11808 const vm_offset_t new_data,
11809 vm_size_t new_data_size)
11810 {
11811 #if XNU_MONITOR
11812 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
11813 #else /* XNU_MONITOR */
11814 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11815 #endif /* XNU_MONITOR */
11816 }
11817
11818 MARK_AS_PMAP_TEXT void
11819 pmap_ro_zone_memcpy_internal(
11820 zone_id_t zid,
11821 vm_offset_t va,
11822 vm_offset_t offset,
11823 const vm_offset_t new_data,
11824 vm_size_t new_data_size)
11825 {
11826 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11827
11828 if (!new_data || new_data_size == 0) {
11829 return;
11830 }
11831
11832 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11833 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11834 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11835 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11836 }
11837
11838 /**
11839 * Zalloc-specific function to atomically mutate fields of an element that
11840 * belongs to a read-only zone, via the physcial aperture.
11841 *
11842 * @note Designed to work only with the zone allocator's read-only submap.
11843 *
11844 * @param zid The ID of the zone the element belongs to.
11845 * @param va VA of element to be modified.
11846 * @param offset Offset in element.
11847 * @param op Atomic operation to perform.
11848 * @param value Mutation value.
11849 *
11850 */
11851
11852 uint64_t
11853 pmap_ro_zone_atomic_op(
11854 zone_id_t zid,
11855 vm_offset_t va,
11856 vm_offset_t offset,
11857 zro_atomic_op_t op,
11858 uint64_t value)
11859 {
11860 #if XNU_MONITOR
11861 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
11862 #else /* XNU_MONITOR */
11863 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11864 #endif /* XNU_MONITOR */
11865 }
11866
/*
 * Implementation of pmap_ro_zone_atomic_op(): validates the target, pins and
 * locks the backing physical page, then applies the atomic mutation through
 * the physical aperture.  Returns the result of the atomic operation.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* NOTE(review): the low 4 bits of `op` appear to encode the operand
	 * size in bytes (consumed by __zalloc_ro_mut_atomic) — confirm against
	 * the zro_atomic_op_t definition in zalloc. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
11885
11886 /**
11887 * bzero for allocations from read only zones, that writes through the
11888 * physical aperture.
11889 *
11890 * @note This is called by the zfree path of all allocations from read
11891 * only zones.
11892 *
11893 * @param zid The ID of the zone the allocation belongs to.
11894 * @param va VA of element to be zeroed.
11895 * @param offset Offset in the element.
11896 * @param size Size of allocation.
11897 *
11898 */
11899
11900 void
11901 pmap_ro_zone_bzero(
11902 zone_id_t zid,
11903 vm_offset_t va,
11904 vm_offset_t offset,
11905 vm_size_t size)
11906 {
11907 #if XNU_MONITOR
11908 pmap_ro_zone_bzero_ppl(zid, va, offset, size);
11909 #else /* XNU_MONITOR */
11910 pmap_ro_zone_bzero_internal(zid, va, offset, size);
11911 #endif /* XNU_MONITOR */
11912 }
11913
/*
 * Implementation of pmap_ro_zone_bzero(): validates the target element, pins
 * and locks the backing physical page, then zeroes it through the physical
 * aperture.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* new_data == 0: there is no source buffer, only a destination to zero. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
11927
11928 /**
11929 * Removes write access from the Physical Aperture.
11930 *
11931 * @note For non-PPL devices, it simply makes all virtual mappings RO.
11932 * @note Designed to work only with the zone allocator's read-only submap.
11933 *
11934 * @param va VA of the page to restore write access to.
11935 *
11936 */
11937 MARK_AS_PMAP_TEXT static void
11938 pmap_phys_write_disable(vm_address_t va)
11939 {
11940 #if XNU_MONITOR
11941 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
11942 #else /* XNU_MONITOR */
11943 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
11944 #endif /* XNU_MONITOR */
11945 }
11946
11947 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
11948
/*
 * Count resident and compressed bytes in [start, end) for `pmap`.  The range
 * must be page-aligned and must fit within a single twig-level (L2) table
 * entry.  Returns PMAP_RESIDENT_INVALID when the pmap is NULL or no page
 * table covers the range; otherwise returns the resident byte count and
 * accumulates compressed bytes into *compressed_bytes_p (when non-NULL).
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Walk the leaf PTEs, classifying each page as compressed or resident. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the output so its page can't become PPL-owned mid-write. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12010
12011 mach_vm_size_t
12012 pmap_query_resident(
12013 pmap_t pmap,
12014 vm_map_address_t start,
12015 vm_map_address_t end,
12016 mach_vm_size_t *compressed_bytes_p)
12017 {
12018 mach_vm_size_t total_resident_bytes;
12019 mach_vm_size_t compressed_bytes;
12020 vm_map_address_t va;
12021
12022
12023 if (pmap == PMAP_NULL) {
12024 if (compressed_bytes_p) {
12025 *compressed_bytes_p = 0;
12026 }
12027 return 0;
12028 }
12029
12030 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12031
12032 total_resident_bytes = 0;
12033 compressed_bytes = 0;
12034
12035 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12036 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12037 VM_KERNEL_ADDRHIDE(end));
12038
12039 va = start;
12040 while (va < end) {
12041 vm_map_address_t l;
12042 mach_vm_size_t resident_bytes;
12043
12044 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12045
12046 if (l > end) {
12047 l = end;
12048 }
12049 #if XNU_MONITOR
12050 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12051 #else
12052 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12053 #endif
12054 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12055 break;
12056 }
12057
12058 total_resident_bytes += resident_bytes;
12059
12060 va = l;
12061 }
12062
12063 if (compressed_bytes_p) {
12064 *compressed_bytes_p = compressed_bytes;
12065 }
12066
12067 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12068 total_resident_bytes);
12069
12070 return total_resident_bytes;
12071 }
12072
#if MACH_ASSERT
/*
 * Debug-only sanity check that a pmap's ledger entries are balanced when the
 * pmap is torn down.  The per-entry verification is delegated to
 * vm_map_pmap_check_ledgers().
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int pid;
	char *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()). Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
#endif /* MACH_ASSERT */
12103
/* Page-zero advice is not needed on this architecture; intentionally a no-op. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12108
12109 /**
12110 * The minimum shared region nesting size is used by the VM to determine when to
12111 * break up large mappings to nested regions. The smallest size that these
12112 * mappings can be broken into is determined by what page table level those
12113 * regions are being nested in at and the size of the page tables.
12114 *
12115 * For instance, if a nested region is nesting at L2 for a process utilizing
12116 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12117 * block entry).
12118 *
12119 * @param pmap The target pmap to determine the block size based on whether it's
12120 * using 16KB or 4KB page tables.
12121 */
12122 uint64_t
12123 pmap_shared_region_size_min(__unused pmap_t pmap)
12124 {
12125 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12126
12127 /**
12128 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12129 * 4KB pages). This means that a target pmap will contain L2 entries that
12130 * point to shared L3 page tables in the shared region pmap.
12131 */
12132 return pt_attr_twig_size(pt_attr);
12133 }
12134
12135 boolean_t
12136 pmap_enforces_execute_only(
12137 pmap_t pmap)
12138 {
12139 return pmap != kernel_pmap;
12140 }
12141
/* Record whether the VM map owning this pmap enforces code signing. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12150
/*
 * Public entry point for updating the per-pmap CS-enforcement flag; dispatches
 * into the PPL on XNU_MONITOR systems.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12162
12163 extern int cs_process_enforcement_enable;
12164 bool
12165 pmap_get_vm_map_cs_enforced(
12166 pmap_t pmap)
12167 {
12168 if (cs_process_enforcement_enable) {
12169 return true;
12170 }
12171 return pmap->pmap_vm_map_cs_enforced;
12172 }
12173
/* JIT entitlement is not tracked in this configuration; intentionally a no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12180
/*
 * Public entry point for marking a pmap JIT-entitled; dispatches into the PPL
 * on XNU_MONITOR systems.
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12191
/* JIT entitlement is not tracked in this configuration; always false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12198
/* TPRO is not supported in this configuration; intentionally a no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12205
/*
 * Public entry point for enabling TPRO on a pmap; dispatches into the PPL on
 * XNU_MONITOR systems.
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12216
/* TPRO is not supported in this configuration; always false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12223
12224 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12225
/*
 * Report the disposition of the page mapped at `va` in `pmap` through
 * *disp_p: some combination of PMAP_QUERY_PAGE_PRESENT / _COMPRESSED /
 * _COMPRESSED_ALTACCT / _ALTACCT / _REUSABLE / _INTERNAL.  Returns
 * KERN_INVALID_ARGUMENT for a NULL or kernel pmap (with *disp_p set to 0),
 * KERN_SUCCESS otherwise.  Retries (counting into
 * pmap_query_page_info_retries) if the PTE changes between the unlocked read
 * and acquiring the PVH lock.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		/* Pin the output so its page can't become PPL-owned mid-write. */
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be concurrently modified. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* Not a managed page: no pv/attr state to consult. */
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		/* Find the pv entry corresponding to this mapping, if any. */
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the output so its page can't become PPL-owned mid-write. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12304
/*
 * Public entry point for querying a page's disposition; dispatches into the
 * PPL on XNU_MONITOR systems.
 */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12317
12318
12319
/*
 * Number of valid user virtual-address bits for this pmap, derived from the
 * TCR T0SZ field (per-pmap with mixed page sizes, boot-time value otherwise).
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12330
/* Number of valid kernel virtual-address bits, from the boot-time T1SZ. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12336
/* Size, in bytes, of the user virtual address space for this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12342
12343
12344
/* This build has no PPL, so execution can never be inside it. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12351
/* I/O-filtered protected writes are unsupported here; calling this is fatal. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12358
/* No PPL page reserve exists in this configuration; always returns NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12365
/* No PPL page reserve exists in this configuration; intentionally a no-op. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12371
12372
12373 #if PMAP_CS_PPL_MONITOR
12374
12375 /* Immutable part of the trust cache runtime */
12376 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12377
12378 /* Mutable part of the trust cache runtime */
12379 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12380
12381 /* Lock for the trust cache runtime */
12382 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12383
12384 MARK_AS_PMAP_TEXT kern_return_t
12385 pmap_check_trust_cache_runtime_for_uuid_internal(
12386 const uint8_t check_uuid[kUUIDSize])
12387 {
12388 kern_return_t ret = KERN_DENIED;
12389
12390 if (amfi->TrustCache.version < 3) {
12391 /* AMFI change hasn't landed in the build */
12392 pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12393 return KERN_NOT_SUPPORTED;
12394 }
12395
12396 /* Lock the runtime as shared */
12397 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12398
12399 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12400 &ppl_trust_cache_rt,
12401 check_uuid,
12402 NULL);
12403
12404 /* Unlock the runtime */
12405 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12406
12407 if (tc_ret.error == kTCReturnSuccess) {
12408 ret = KERN_SUCCESS;
12409 } else if (tc_ret.error == kTCReturnNotFound) {
12410 ret = KERN_NOT_FOUND;
12411 } else {
12412 ret = KERN_FAILURE;
12413 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12414 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12415 }
12416
12417 return ret;
12418 }
12419
/* Public entry point: the check always runs inside the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12426
12427 MARK_AS_PMAP_TEXT kern_return_t
12428 pmap_load_trust_cache_with_type_internal(
12429 TCType_t type,
12430 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12431 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12432 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12433 {
12434 kern_return_t ret = KERN_DENIED;
12435 pmap_img4_payload_t *payload = NULL;
12436 size_t img4_payload_len = 0;
12437 size_t payload_len_aligned = 0;
12438 size_t manifest_len_aligned = 0;
12439
12440 /* Ignore the auxiliary manifest until we add support for it */
12441 (void)img4_aux_manifest;
12442 (void)img4_aux_manifest_len;
12443
12444
12445 #if PMAP_CS_INCLUDE_CODE_SIGNING
12446 if (pmap_cs) {
12447 if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
12448 panic("trust cache type not loadable from interface: %u", type);
12449 } else if (type >= kTCTypeTotal) {
12450 panic("attempted to load an unsupported trust cache type: %u", type);
12451 }
12452
12453 /* Validate entitlement for the calling process */
12454 if (TCTypeConfig[type].entitlementValue != NULL) {
12455 const bool entitlement_satisfied = check_entitlement_pmap(
12456 NULL,
12457 "com.apple.private.pmap.load-trust-cache",
12458 TCTypeConfig[type].entitlementValue,
12459 false,
12460 true);
12461
12462 if (entitlement_satisfied == false) {
12463 panic("attempted to load trust cache without entitlement: %u", type);
12464 }
12465 }
12466 }
12467 #endif
12468
12469 /* AppleImage4 validation uses CoreCrypto -- requires a spare page */
12470 ret = pmap_reserve_ppl_page();
12471 if (ret != KERN_SUCCESS) {
12472 if (ret != KERN_RESOURCE_SHORTAGE) {
12473 pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
12474 }
12475 return ret;
12476 }
12477
12478 /* Align the passed in lengths to the page size -- round_page is overflow safe */
12479 payload_len_aligned = round_page(pmap_img4_payload_len);
12480 manifest_len_aligned = round_page(img4_manifest_len);
12481
12482 /* Ensure we have valid data passed in */
12483 pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
12484 pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);
12485
12486 /*
12487 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
12488 * data structure used by libTrustCache to manage the payload. We need to be able to
12489 * write to that data structure, so we keep the payload PPL writable.
12490 */
12491 pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
12492 pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);
12493
12494 /* Should be safe to read from this now */
12495 payload = (pmap_img4_payload_t*)pmap_img4_payload;
12496
12497 /* Acquire a writable version of the trust cache data structure */
12498 TrustCache_t *trust_cache = &payload->trust_cache;
12499 trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));
12500
12501 /* Calculate the correct length of the img4 payload */
12502 if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
12503 panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
12504 }
12505
12506 /* Exclusively lock the runtime */
12507 lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);
12508
12509 /* Load the trust cache */
12510 TCReturn_t tc_ret = amfi->TrustCache.load(
12511 &ppl_trust_cache_rt,
12512 type,
12513 trust_cache,
12514 (const uintptr_t)payload->img4_payload, img4_payload_len,
12515 (const uintptr_t)img4_manifest, img4_manifest_len);
12516
12517 /* Unlock the runtime */
12518 lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);
12519
12520 if (tc_ret.error == kTCReturnSuccess) {
12521 ret = KERN_SUCCESS;
12522 } else {
12523 if (tc_ret.error == kTCReturnDuplicate) {
12524 ret = KERN_ALREADY_IN_SET;
12525 } else {
12526 pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
12527 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12528
12529 ret = KERN_FAILURE;
12530 }
12531
12532 /* Unlock the payload data */
12533 pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
12534 trust_cache = NULL;
12535 payload = NULL;
12536 }
12537
12538 /* Unlock the manifest since it is no longer needed */
12539 pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);
12540
12541 /* Return the CoreCrypto reserved page back to the free list */
12542 pmap_release_reserved_ppl_page();
12543
12544 return ret;
12545 }
12546
12547 kern_return_t
12548 pmap_load_trust_cache_with_type(
12549 TCType_t type,
12550 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12551 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12552 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12553 {
12554 kern_return_t ret = KERN_DENIED;
12555
12556 ret = pmap_load_trust_cache_with_type_ppl(
12557 type,
12558 pmap_img4_payload, pmap_img4_payload_len,
12559 img4_manifest, img4_manifest_len,
12560 img4_aux_manifest, img4_aux_manifest_len);
12561
12562 while (ret == KERN_RESOURCE_SHORTAGE) {
12563 /* Allocate a page from the free list */
12564 pmap_alloc_page_for_ppl(0);
12565
12566 /* Attempt the call again */
12567 ret = pmap_load_trust_cache_with_type_ppl(
12568 type,
12569 pmap_img4_payload, pmap_img4_payload_len,
12570 img4_manifest, img4_manifest_len,
12571 img4_aux_manifest, img4_aux_manifest_len);
12572 }
12573
12574 return ret;
12575 }
12576
12577 MARK_AS_PMAP_TEXT kern_return_t
12578 pmap_query_trust_cache_safe(
12579 TCQueryType_t query_type,
12580 const uint8_t cdhash[kTCEntryHashSize],
12581 TrustCacheQueryToken_t *query_token)
12582 {
12583 kern_return_t ret = KERN_NOT_FOUND;
12584
12585 /* Validate the query type preemptively */
12586 if (query_type >= kTCQueryTypeTotal) {
12587 pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12588 return KERN_INVALID_ARGUMENT;
12589 }
12590
12591 /* Lock the runtime as shared */
12592 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12593
12594 TCReturn_t tc_ret = amfi->TrustCache.query(
12595 &ppl_trust_cache_rt,
12596 query_type,
12597 cdhash,
12598 query_token);
12599
12600 /* Unlock the runtime */
12601 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12602
12603 if (tc_ret.error == kTCReturnSuccess) {
12604 ret = KERN_SUCCESS;
12605 } else if (tc_ret.error == kTCReturnNotFound) {
12606 ret = KERN_NOT_FOUND;
12607 } else {
12608 ret = KERN_FAILURE;
12609 pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12610 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12611 }
12612
12613 return ret;
12614 }
12615
12616 MARK_AS_PMAP_TEXT kern_return_t
12617 pmap_query_trust_cache_internal(
12618 TCQueryType_t query_type,
12619 const uint8_t cdhash[kTCEntryHashSize],
12620 TrustCacheQueryToken_t *query_token)
12621 {
12622 kern_return_t ret = KERN_NOT_FOUND;
12623 TrustCacheQueryToken_t query_token_safe = {0};
12624 uint8_t cdhash_safe[kTCEntryHashSize] = {0};
12625
12626 /* Copy in the CDHash into PPL storage */
12627 memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
12628
12629 /* Query through the safe API since we're in the PPL now */
12630 ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
12631
12632 if (query_token != NULL) {
12633 pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12634 memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
12635 pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12636 }
12637
12638 return ret;
12639 }
12640
12641 kern_return_t
12642 pmap_query_trust_cache(
12643 TCQueryType_t query_type,
12644 const uint8_t cdhash[kTCEntryHashSize],
12645 TrustCacheQueryToken_t *query_token)
12646 {
12647 kern_return_t ret = KERN_NOT_FOUND;
12648
12649 ret = pmap_query_trust_cache_ppl(
12650 query_type,
12651 cdhash,
12652 query_token);
12653
12654 return ret;
12655 }
12656
/* Whether the developer mode state has ever been explicitly set */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
/* Current developer mode state of the system (PPL-protected storage) */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12659
/*
 * Update the developer mode state of the system from within the PPL.
 * Enabling developer mode after it has been explicitly disabled is treated
 * as a fatal error.
 *
 * @param state The new developer mode state to store.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	/* Whether developer mode has ever been explicitly set before */
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
12688
/* Kernel-side wrapper which trampolines the developer mode toggle into the PPL */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
12695
12696 #endif /* PMAP_CS_PPL_MONITOR */
12697
12698 #if PMAP_CS_INCLUDE_CODE_SIGNING
12699
12700 static int
12701 pmap_cs_profiles_rbtree_compare(
12702 void *profile0,
12703 void *profile1)
12704 {
12705 if (profile0 < profile1) {
12706 return -1;
12707 } else if (profile0 > profile1) {
12708 return 1;
12709 }
12710 return 0;
12711 }
12712
/* Red-black tree for managing provisioning profiles */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the tree functions -- nodes are linked through their "link" field */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
12722
12723 void
12724 pmap_initialize_provisioning_profiles(void)
12725 {
12726 /* Initialize the profiles red-black tree lock */
12727 lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
12728 pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
12729
12730 /* Initialize the red-black tree itself */
12731 RB_INIT(&pmap_cs_registered_profiles);
12732
12733 printf("initialized PPL provisioning profile data\n");
12734 }
12735
12736 static bool
12737 pmap_is_testflight_profile(
12738 pmap_cs_profile_t *profile_obj)
12739 {
12740 const char *entitlement_name = "beta-reports-active";
12741 const size_t entitlement_length = strlen(entitlement_name);
12742 CEQueryOperation_t query[2] = {0};
12743
12744 /* If the profile provisions no entitlements, then it isn't a test flight one */
12745 if (profile_obj->entitlements_ctx == NULL) {
12746 return false;
12747 }
12748
12749 /* Build our CoreEntitlements query */
12750 query[0].opcode = kCEOpSelectKey;
12751 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
12752 query[0].parameters.stringParameter.length = entitlement_length;
12753 query[1] = CEMatchBool(true);
12754
12755 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
12756 profile_obj->entitlements_ctx,
12757 query, 2);
12758
12759 if (ce_err == amfi->CoreEntitlements.kNoError) {
12760 return true;
12761 }
12762
12763 return false;
12764 }
12765
12766 static bool
12767 pmap_is_development_profile(
12768 pmap_cs_profile_t *profile_obj)
12769 {
12770 /* Check for UPP */
12771 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
12772 *profile_obj->profile_ctx,
12773 CESelectDictValue("ProvisionsAllDevices"));
12774 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
12775 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
12776 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
12777 return false;
12778 }
12779 }
12780
12781 /* Check for TestFlight profile */
12782 if (pmap_is_testflight_profile(profile_obj) == true) {
12783 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
12784 return false;
12785 }
12786
12787 pmap_cs_log_info("%p: development profile", profile_obj);
12788 return true;
12789 }
12790
/*
 * Locate, validate, and set up the "Entitlements" dictionary provisioned by
 * a profile. On success, the profile object's entitlements context points at
 * storage within the profile object itself.
 *
 * @param profile_obj A PPL-writable pointer to the profile object.
 *
 * Returns KERN_SUCCESS when the entitlements context is set up,
 * KERN_NOT_FOUND when the profile provisions no entitlements (a benign
 * condition), and KERN_ABORTED when validation or context acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Select the entitlements dictionary out of the profile's DER context */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements -- clear the storage so stale data can't be read */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the raw entitlements DER through CoreEntitlements */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Acquire a query context for the validated entitlements */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
12841
/*
 * Validate and register a provisioning profile with the PPL.
 *
 * The payload is locked down from the kernel, validated through CoreTrust,
 * parsed through CoreEntitlements, and finally inserted into the red-black
 * tree of registered profiles. Validation failures after lockdown are
 * treated as fatal (panic) since they indicate either tampering or a kernel
 * logic error.
 *
 * @param payload_addr Kernel address of a pmap_profile_payload_t.
 * @param payload_size Size of the payload allocation, in bytes.
 *
 * Returns KERN_SUCCESS on registration, or KERN_RESOURCE_SHORTAGE when the
 * caller must donate a page to the PPL and retry.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature chain through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements context over the validated profile content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
12945
12946 kern_return_t
12947 pmap_register_provisioning_profile(
12948 const vm_address_t payload_addr,
12949 const vm_size_t payload_size)
12950 {
12951 kern_return_t ret = KERN_DENIED;
12952
12953 ret = pmap_register_provisioning_profile_ppl(
12954 payload_addr,
12955 payload_size);
12956
12957 while (ret == KERN_RESOURCE_SHORTAGE) {
12958 /* Allocate a page from the free list */
12959 pmap_alloc_page_for_ppl(0);
12960
12961 /* Attempt the call again */
12962 ret = pmap_register_provisioning_profile_ppl(
12963 payload_addr,
12964 payload_size);
12965 }
12966
12967 return ret;
12968 }
12969
/*
 * Unregister a previously-registered provisioning profile and return its
 * locked-down payload pages to the kernel.
 *
 * @param profile_obj The profile to unregister; panics if unknown.
 *
 * Returns KERN_SUCCESS when the profile was removed, or KERN_FAILURE when
 * the profile is still referenced by one or more code signatures.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A profile still associated with a signature cannot be unregistered */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The payload (and the profile object within it) is gone now */
		profile_obj = NULL;
	}
	return ret;
}
13017
/* Kernel-side wrapper which trampolines profile unregistration into the PPL */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13024
/*
 * Associate a registered provisioning profile with a code signature object.
 * Association is only allowed while the signature is still untrusted and has
 * no profile yet; an unknown or unvalidated profile is a fatal error.
 *
 * @param cd_entry The code directory object to associate with.
 * @param profile_obj The registered profile to associate.
 *
 * Returns KERN_SUCCESS on association, KERN_DENIED otherwise.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* The code directory lock was taken exclusive by pmap_cs_lock_code_directory */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13073
/* Kernel-side wrapper which trampolines profile association into the PPL */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13081
/*
 * Disassociate a provisioning profile from a code signature object and drop
 * the profile's reference count. The reference count is dropped only after
 * the code directory lock has been released.
 *
 * @param cd_entry The code directory object to disassociate from.
 *
 * Returns KERN_SUCCESS when a profile was disassociated, KERN_NOT_FOUND when
 * the signature had no associated profile.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	/* Keep a local pointer so the refcount can be dropped after unlocking */
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13117
/* Kernel-side wrapper which trampolines profile disassociation into the PPL */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13124
13125 kern_return_t
13126 pmap_associate_kernel_entitlements_internal(
13127 pmap_cs_code_directory_t *cd_entry,
13128 const void *kernel_entitlements)
13129 {
13130 kern_return_t ret = KERN_DENIED;
13131
13132 if (kernel_entitlements == NULL) {
13133 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13134 }
13135
13136 /* Acquire the lock on the code directory */
13137 pmap_cs_lock_code_directory(cd_entry);
13138
13139 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13140 ret = KERN_DENIED;
13141 goto out;
13142 } else if (cd_entry->kernel_entitlements != NULL) {
13143 ret = KERN_DENIED;
13144 goto out;
13145 }
13146 cd_entry->kernel_entitlements = kernel_entitlements;
13147
13148 /* Association was a success */
13149 ret = KERN_SUCCESS;
13150
13151 out:
13152 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13153 return ret;
13154 }
13155
/* Kernel-side wrapper which trampolines entitlement association into the PPL */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13163
/*
 * Resolve the kernel entitlements object associated with a pmap's main code
 * signature and, when requested, write the pointer out to the caller.
 *
 * @param pmap The pmap whose main region's signature to consult.
 * @param kernel_entitlements Optional out-pointer for the entitlements object.
 *
 * Returns KERN_SUCCESS when entitlements were resolved, KERN_NOT_FOUND when
 * the pmap is the kernel pmap, has no main-region signature, or the signature
 * has no associated entitlements.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Take a shared lock on the PMAP */
	pmap_lock(pmap, PMAP_LOCK_SHARED);

	/* The kernel pmap has no code signature to resolve against */
	if (pmap == kernel_pmap) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13224
/* Kernel-side wrapper which trampolines entitlement resolution into the PPL */
kern_return_t
pmap_resolve_kernel_entitlements(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	return pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
}
13232
/*
 * Build a CoreEntitlements acceleration index for a signature's entitlements.
 *
 * The acceleration buffer is placed, in order of preference, in unused space
 * within the locked-down code signature region, in a PPL blob bucket, or in a
 * whole page from the PPL free list. The buffer is handed to CoreEntitlements
 * through the global pmap_cs_acceleration_buf, protected by
 * pmap_cs_acceleration_buf_lock.
 *
 * @param cd_entry The code directory whose entitlements should be accelerated.
 *
 * Returns KERN_SUCCESS when the index is built (or not needed), KERN_DENIED
 * for non-reconstituted signatures, KERN_ABORTED when the index exceeds a
 * page, and KERN_RESOURCE_SHORTAGE when the caller must donate pages.
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements, or already accelerated -- nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Place the buffer in the slack after the superblob, pointer-aligned */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Not separately allocated -- lives within the signature region */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		/* Separately allocated -- must eventually be returned to its allocator */
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13379
13380 kern_return_t
13381 pmap_accelerate_entitlements(
13382 pmap_cs_code_directory_t *cd_entry)
13383 {
13384 kern_return_t ret = KERN_DENIED;
13385
13386 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13387 while (ret == KERN_RESOURCE_SHORTAGE) {
13388 /* Allocate a page for the PPL */
13389 pmap_alloc_page_for_ppl(0);
13390
13391 /* Try again */
13392 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13393 }
13394
13395 return ret;
13396 }
13397
13398 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13399
13400 MARK_AS_PMAP_TEXT bool
13401 pmap_lookup_in_loaded_trust_caches_internal(
13402 const uint8_t cdhash[CS_CDHASH_LEN])
13403 {
13404 kern_return_t kr = KERN_NOT_FOUND;
13405
13406 #if PMAP_CS_PPL_MONITOR
13407 /*
13408 * If we have the PPL monitor, then this function can only be called from
13409 * within the PPL. Calling it directly would've caused a panic, so we can
13410 * assume that we're in the PPL here.
13411 */
13412 uint8_t cdhash_safe[CS_CDHASH_LEN];
13413 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13414
13415 kr = pmap_query_trust_cache_safe(
13416 kTCQueryTypeLoadable,
13417 cdhash_safe,
13418 NULL);
13419 #else
13420 kr = query_trust_cache(
13421 kTCQueryTypeLoadable,
13422 cdhash,
13423 NULL);
13424 #endif
13425
13426 if (kr == KERN_SUCCESS) {
13427 return true;
13428 }
13429 return false;
13430 }
13431
/*
 * Check whether a CDHash is present in any loaded (loadable) trust cache.
 * Dispatches into the PPL when the monitor is built in; otherwise calls the
 * internal implementation directly.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13442
13443 MARK_AS_PMAP_TEXT uint32_t
13444 pmap_lookup_in_static_trust_cache_internal(
13445 const uint8_t cdhash[CS_CDHASH_LEN])
13446 {
13447 TrustCacheQueryToken_t query_token = {0};
13448 kern_return_t kr = KERN_NOT_FOUND;
13449 uint64_t flags = 0;
13450 uint8_t hash_type = 0;
13451
13452 #if PMAP_CS_PPL_MONITOR
13453 /*
13454 * If we have the PPL monitor, then this function can only be called from
13455 * within the PPL. Calling it directly would've caused a panic, so we can
13456 * assume that we're in the PPL here.
13457 */
13458 uint8_t cdhash_safe[CS_CDHASH_LEN];
13459 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13460
13461 kr = pmap_query_trust_cache_safe(
13462 kTCQueryTypeStatic,
13463 cdhash_safe,
13464 &query_token);
13465 #else
13466 kr = query_trust_cache(
13467 kTCQueryTypeStatic,
13468 cdhash,
13469 &query_token);
13470 #endif
13471
13472 if (kr == KERN_SUCCESS) {
13473 amfi->TrustCache.queryGetFlags(&query_token, &flags);
13474 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
13475
13476 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
13477 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
13478 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
13479 }
13480
13481 return 0;
13482 }
13483
/*
 * Look up a CDHash in the static trust cache, returning the packed
 * TC_LOOKUP_* result word (0 when not found). Dispatches into the PPL when
 * the monitor is built in.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13493
13494 #if PMAP_CS_INCLUDE_CODE_SIGNING
13495
13496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
13497 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13498
13499 MARK_AS_PMAP_TEXT void
13500 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13501 {
13502
13503 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13504 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13505 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13506
13507 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13508 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13509 }
13510
13511 MARK_AS_PMAP_TEXT bool
13512 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13513 {
13514 bool match = false;
13515
13516 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13517 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13518 match = true;
13519 }
13520 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13521
13522 if (match) {
13523 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13524 }
13525
13526 return match;
13527 }
13528
/*
 * Store the compilation service CDHash. Dispatches into the PPL when the
 * monitor is built in.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13538
/*
 * Check a CDHash against the stored compilation service CDHash. Dispatches
 * into the PPL when the monitor is built in.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13548
13549 /*
13550 * As part of supporting local signing on the device, we need the PMAP layer
13551 * to store the local signing key so that PMAP_CS can validate with it. We
13552 * store it at the PMAP layer such that it is accessible to both AMFI and
13553 * PMAP_CS should they need it.
13554 */
13555 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13556 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13557
13558 MARK_AS_PMAP_TEXT void
13559 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13560 {
13561 bool key_set = false;
13562
13563 /*
13564 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13565 * a successful exchange means that the local signing public key has _not_ been
13566 * set. In case the key has been set, we panic as we would never expect the
13567 * kernel to attempt to set the key more than once.
13568 */
13569 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13570
13571 if (key_set) {
13572 panic("attempted to set the local signing public key multiple times");
13573 }
13574
13575 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13576 pmap_cs_log_info("set local signing public key");
13577 }
13578
/*
 * Store the local-signing public key. Dispatches into the PPL when the
 * monitor is built in.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13588
13589 uint8_t*
13590 pmap_get_local_signing_public_key(void)
13591 {
13592 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13593
13594 if (key_set) {
13595 return pmap_local_signing_public_key;
13596 }
13597
13598 return NULL;
13599 }
13600
13601 /*
13602 * Locally signed applications need to be explicitly authorized by an entitled application
13603 * before we allow them to run.
13604 */
13605 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
13606 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13607
/*
 * Record the CDHash of the single locally-signed binary that is currently
 * authorized to run. Overwrites any previously unrestricted CDHash; the
 * stored value is later consumed by pmap_local_signing_restricted().
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	/* Log only a 5-byte prefix of the hash */
	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13620
/*
 * Authorize a locally-signed binary (identified by CDHash) to run.
 * Dispatches into the PPL when the monitor is built in.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
13631
13632 #if PMAP_CS
/*
 * Clear the stored locally-signed CDHash, revoking any previously granted
 * authorization (an all-zero hash matches nothing meaningful).
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13640
13641 MARK_AS_PMAP_TEXT static bool
13642 pmap_local_signing_restricted(
13643 const uint8_t cdhash[CS_CDHASH_LEN])
13644 {
13645 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13646 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13647 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13648
13649 return ret != 0;
13650 }
13651
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13654
/*
 * Toggle footprint-accounting suspension for the current thread.
 * DEVELOPMENT/DEBUG only; a no-op on RELEASE.
 *
 * Note: the pmap's footprint_was_suspended flag is set on suspend but
 * deliberately left set on resume -- it records that a suspension ever
 * happened, not the current state.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t map,
	boolean_t suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
13672
/*
 * Suspend or resume footprint accounting for the current thread on the given
 * map. Dispatches into the PPL when the monitor is built in.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
13684
/*
 * Deliberate no-op whose only effect is validating the pmap pointer; useful
 * for exercising the PPL call path and pmap validation machinery.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
13690
/*
 * No-op pmap call (pmap validation only). Dispatches into the PPL when the
 * monitor is built in.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
13700
13701 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
13702
/*
 * Header emitted into the dump buffer ahead of each copied translation
 * table by pmap_dump_page_tables_recurse().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by the table */
	uint64_t end_va;      /* VA one past the table's coverage */
};
13709
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer.
 *
 * For each table at a level selected by level_mask, a page_table_dump_header
 * followed by the raw table contents is appended at buf_start + *bytes_copied.
 * Table entries that are valid and of table (not block) type are followed
 * into the next level.
 *
 * @param pmap          pmap being dumped (used for pt geometry and root size).
 * @param ttp           kernel-virtual pointer to the table at this level.
 * @param cur_level     current translation level.
 * @param level_mask    bitmask of levels whose tables should be emitted.
 * @param start_va      first VA translated by this table.
 * @param buf_start     start of the output buffer.
 * @param buf_end       end of the output buffer (exclusive).
 * @param bytes_copied  in/out running total of bytes written.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer cannot
 *         hold this table plus its header. Panics on a corrupt leaf-level
 *         entry.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and entry-decoding masks */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be smaller than a full page */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Require room for a header + full table copy before proceeding */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit header + raw table contents for this level */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Walk each entry, descending into table-type entries */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mapping: nothing below it to recurse into */
			continue;
		} else {
			/* A table-type entry at the leaf level is corruption */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
13783
/*
 * Dump a pmap's translation tables (levels selected by level_mask) into
 * [bufp, buf_end), reporting the bytes written via bytes_copied.
 * Only legal from kernel-debugger context; panics otherwise.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
13793
13794 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13795
/* Page-table dumping is only supported on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
13802 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13803
13804
13805 #ifdef CONFIG_XNUPOST
13806 #ifdef __arm64__
13807 static volatile bool pmap_test_took_fault = false;
13808
13809 static bool
13810 pmap_test_fault_handler(arm_saved_state_t * state)
13811 {
13812 bool retval = false;
13813 uint32_t esr = get_saved_state_esr(state);
13814 esr_exception_class_t class = ESR_EC(esr);
13815 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
13816
13817 if ((class == ESR_EC_DABORT_EL1) &&
13818 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
13819 pmap_test_took_fault = true;
13820 /* return to the instruction immediately after the call to NX page */
13821 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
13822 retval = true;
13823 }
13824
13825 return retval;
13826 }
13827
13828 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write access at va, optionally under a different
 * pmap, and report whether the fault behavior matched expectations.
 *
 * @param pmap          pmap to switch to for the access, or NULL to use the
 *                      current one.
 * @param va            address to access.
 * @param should_fault  whether the access is expected to fault.
 * @param is_write      true for a store, false for a load.
 *
 * @return true iff (access faulted) == should_fault.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler for this one access */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
13880
13881 static bool
13882 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
13883 {
13884 bool retval = pmap_test_access(pmap, va, should_fault, false);
13885
13886 if (!retval) {
13887 T_FAIL("%s: %s, "
13888 "pmap=%p, va=%p, should_fault=%u",
13889 __func__, should_fault ? "did not fault" : "faulted",
13890 pmap, (void*)va, (unsigned)should_fault);
13891 }
13892
13893 return retval;
13894 }
13895
13896 static bool
13897 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
13898 {
13899 bool retval = pmap_test_access(pmap, va, should_fault, true);
13900
13901 if (!retval) {
13902 T_FAIL("%s: %s, "
13903 "pmap=%p, va=%p, should_fault=%u",
13904 __func__, should_fault ? "did not fault" : "faulted",
13905 pmap, (void*)va, (unsigned)should_fault);
13906 }
13907
13908 return retval;
13909 }
13910
13911 static bool
13912 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
13913 {
13914 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13915 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
13916
13917 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
13918
13919 if (!retval) {
13920 T_FAIL("%s: bits=%u, "
13921 "pa=%p, should_be_set=%u",
13922 __func__, bits,
13923 (void*)pa, should_be_set);
13924 }
13925
13926 return retval;
13927 }
13928
13929 static __attribute__((noinline)) bool
13930 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
13931 {
13932 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
13933 return retval;
13934 }
13935
/*
 * Exercise one pmap configuration end-to-end: mapping creation, PTE/VA
 * round-trips, permission enforcement, the ref/mod state machine, shared
 * RO/RW aliases of a page, protect/page_protect downgrades, and disconnect.
 *
 * @param flags pmap_create_options() flags selecting the configuration
 *              under test (e.g. forced 4K pages).
 *
 * @return 0 on success. Individual check failures are reported via T_FAIL;
 *         unrecoverable setup problems panic.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	if (pmap_page_size <= native_page_size) {
		/* Number of pmap-sized pages needed to cover one kernel page */
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	/*
	 * NOTE(review): reads of the execute-only mapping are expected not to
	 * fault here -- presumably X-only implies kernel-readable on the
	 * configurations this test runs on; confirm against the pt_attr
	 * protection encodings.
	 */
	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
14200 #endif /* __arm64__ */
14201
/*
 * XNUPOST entry point for the pmap tests. On arm64, runs the full test
 * configuration once per supported page size (4K and 16K when mixed page
 * sizes are available); a no-op elsewhere. Always returns KERN_SUCCESS --
 * individual failures are reported through T_FAIL.
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
14223 #endif /* CONFIG_XNUPOST */
14224
14225 /*
14226 * The following function should never make it to RELEASE code, since
14227 * it provides a way to get the PPL to modify text pages.
14228 */
14229 #if DEVELOPMENT || DEBUG
14230
14231 #define ARM_UNDEFINED_INSN 0xe7f000f0
14232 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14233
14234 /**
14235 * Forcibly overwrite executable text with an illegal instruction.
14236 *
14237 * @note Only used for xnu unit testing.
14238 *
14239 * @param pa The physical address to corrupt.
14240 *
14241 * @return KERN_SUCCESS on success.
14242 */
/**
 * Forcibly overwrite executable text with an illegal instruction.
 * Dispatches into the PPL when the monitor is built in.
 *
 * @note Only used for xnu unit testing.
 *
 * @param pa The physical address to corrupt.
 *
 * @return KERN_SUCCESS on success.
 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14252
/*
 * Write an undefined-instruction encoding over the text at physical address
 * pa, temporarily making the physical aperture mapping writable when the
 * page is marked executable, then invalidate the icache for the touched
 * region. The pv-head lock for the page is held across the write.
 *
 * @param pa physical address to corrupt; bit 0 set selects a Thumb
 *           (16-bit) encoding, clear selects the 32-bit ARM encoding.
 *
 * @return KERN_SUCCESS on success (asserts that pa is a managed page).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages are mapped read-only in the physical aperture;
	 * temporarily flip the aperture mapping to RW for the write. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore the read-only aperture mapping */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Drop the stale instruction from the icache before it can execute */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14295
14296 #endif /* DEVELOPMENT || DEBUG */
14297