1 /*
2 * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 #include <machine/machine_routines.h>
75
76 #include <arm/caches_internal.h>
77 #include <arm/cpu_data.h>
78 #include <arm/cpu_data_internal.h>
79 #include <arm/cpu_capabilities.h>
80 #include <arm/cpu_number.h>
81 #include <arm/machine_cpu.h>
82 #include <arm/misc_protos.h>
83 #include <arm/pmap/pmap_internal.h>
84 #include <arm/trap_internal.h>
85
86 #include <arm64/proc_reg.h>
87 #include <pexpert/arm64/boot.h>
88 #include <arm64/ppl/sart.h>
89 #include <arm64/ppl/uat.h>
90
91 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
92 #include <arm64/amcc_rorgn.h>
93 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94
95 #include <pexpert/device_tree.h>
96
97 #include <san/kasan.h>
98 #include <sys/cdefs.h>
99
100 #if defined(HAS_APPLE_PAC)
101 #include <ptrauth.h>
102 #endif
103
104 #ifdef CONFIG_XNUPOST
105 #include <tests/xnupost.h>
106 #endif
107
108
109
110 #if HIBERNATION
111 #include <IOKit/IOHibernatePrivate.h>
112 #endif /* HIBERNATION */
113
114 #define PMAP_L1_MAX_ENTRY (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) >> ARM_TT_L1_SHIFT)
115 #define PMAP_ROOT_ALLOC_SIZE ((PMAP_L1_MAX_ENTRY + 1) * sizeof(tt_entry_t))
116
117 #ifndef __ARM64_PMAP_SUBPAGE_L1__
118 _Static_assert(ARM_PGBYTES == PMAP_ROOT_ALLOC_SIZE, "Unexpected L1 Size");
119 #endif
120
121 #if __ARM_VMSA__ != 8
122 #error Unknown __ARM_VMSA__
123 #endif
124
/*
 * Number of elements in a statically-declared array. Only valid on true
 * arrays, never on pointers (where sizeof yields the pointer size).
 * The argument is fully parenthesized so expressions like ARRAY_LEN(p->tab)
 * expand safely.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
126
127 extern u_int32_t random(void); /* from <libkern/libkern.h> */
128
129 static bool alloc_asid(pmap_t pmap);
130 static void free_asid(pmap_t pmap);
131 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
132 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
133 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
134
/*
 * Native (host-format) page table operations. Installed in a
 * page_table_attr's pta_ops field so pmap code can dispatch ASID
 * allocation, TLB maintenance, and WIMG-to-PTE conversion without
 * knowing the concrete table format.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
143
/*
 * Per-level translation table geometry for the 16KB granule, indexed by
 * level (L0-L3). L3 holds leaf PTEs, so its valid/type encodings differ
 * from the table/block levels above it.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not table descriptors. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
183
/*
 * Per-level translation table geometry for the 4KB granule, indexed by
 * level (L0-L3). Mirrors pmap_table_level_info_16k but with 4KB-granule
 * sizes, shifts, and index masks.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not table descriptors. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
223
/*
 * Per-level table geometry for 4KB-granule stage-2 tables. L0 is unused,
 * and L1 may be concatenated, which gives it a wider index mask when the
 * 40-bit concatenated mask is available.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not table descriptors. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
267
/*
 * Page table attributes for the 4KB translation granule: level geometry,
 * access-permission and execute-never PTE bit encodings, and the ops
 * vector used to manage tables of this format.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
};
296
/*
 * Page table attributes for the 16KB translation granule. Same AP/XN bit
 * encodings as the 4KB variant; only the geometry and TCR value differ.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
};
317
318 #if __ARM_16K_PG__
319 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
320 #else /* !__ARM_16K_PG__ */
321 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
322 #endif /* !__ARM_16K_PG__ */
323
324
325 #if DEVELOPMENT || DEBUG
326 int vm_footprint_suspend_allowed = 1;
327
328 extern int pmap_ledgers_panic;
329 extern int pmap_ledgers_panic_leeway;
330
331 #endif /* DEVELOPMENT || DEBUG */
332
333 #if DEVELOPMENT || DEBUG
334 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
335 (current_thread()->pmap_footprint_suspended)
336 #else /* DEVELOPMENT || DEBUG */
337 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
338 #endif /* DEVELOPMENT || DEBUG */
339
340
/*
 * Represents a tlb range that will be flushed before exiting
 * the ppl.
 * Used by phys_attribute_clear_range to defer flushing pages in
 * this range until the end of the operation.
 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;               /* pmap owning the deferred-flush range */
	vm_map_address_t ptfr_start;    /* start VA of the range */
	vm_map_address_t ptfr_end;      /* end VA of the range */
	bool ptfr_flush_needed;         /* set when the range actually requires a TLB flush */
} pmap_tlb_flush_range_t;
353
#if XNU_MONITOR
/*
 * PPL External References.
 * Segment bases/sizes for the PPL text, data, and const-data regions,
 * defined by the link editor / platform startup code.
 */
extern vm_offset_t segPPLDATAB;
extern unsigned long segSizePPLDATA;
extern vm_offset_t segPPLTEXTB;
extern unsigned long segSizePPLTEXT;
extern vm_offset_t segPPLDATACONSTB;
extern unsigned long segSizePPLDATACONST;


/*
 * PPL Global Variables
 */

#if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
/* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
#else
/* Release builds: mapping-policy enforcement can never be disabled. */
const boolean_t pmap_ppl_disable = FALSE;
#endif

/*
 * Indicates if the PPL has started applying APRR.
 * This variable is accessed from various assembly trampolines, so be sure to change
 * those if you change the size or layout of this variable.
 */
boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;

extern void *pmap_stacks_start;
extern void *pmap_stacks_end;

#endif /* XNU_MONITOR */
388
389
390
391 /* Virtual memory region for early allocation */
392 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
393 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
394 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
395
396 extern uint8_t bootstrap_pagetables[];
397
398 extern unsigned int not_in_kdp;
399
400 extern vm_offset_t first_avail;
401
402 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
403 extern vm_offset_t virtual_space_end; /* End of kernel address space */
404 extern vm_offset_t static_memory_end;
405
406 extern const vm_map_address_t physmap_base;
407 extern const vm_map_address_t physmap_end;
408
409 extern int maxproc, hard_maxproc;
410
411 /* The number of address bits one TTBR can cover. */
412 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
413
414 /*
415 * The bounds on our TTBRs. These are for sanity checking that
416 * an address is accessible by a TTBR before we attempt to map it.
417 */
418
419 /* The level of the root of a page table. */
420 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
421
422 /* The number of entries in the root TT of a page table. */
423 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
424
425 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
426 const pmap_t kernel_pmap = &kernel_pmap_store;
427
428 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
429
430 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
431 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
432 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
433
434 typedef struct tt_free_entry {
435 struct tt_free_entry *next;
436 } tt_free_entry_t;
437
438 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
439
440 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
441 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
442 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
443 #define FREE_PAGE_SIZE_TT_MAX 4
444 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
445 unsigned int free_tt_count MARK_AS_PMAP_DATA;
446 unsigned int free_tt_max MARK_AS_PMAP_DATA;
447
448 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
449
450 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
451 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
452 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
453 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
454 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
455 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
456
457 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
458 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
459
460 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
462
463 /* Lock group used for all pmap object locks. */
464 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
465
466 #if DEVELOPMENT || DEBUG
467 int nx_enabled = 1; /* enable no-execute protection */
468 int allow_data_exec = 0; /* No apps may execute data */
469 int allow_stack_exec = 0; /* No apps may execute from the stack */
470 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
471 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
472 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
473 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
474 #else /* DEVELOPMENT || DEBUG */
475 const int nx_enabled = 1; /* enable no-execute protection */
476 const int allow_data_exec = 0; /* No apps may execute data */
477 const int allow_stack_exec = 0; /* No apps may execute from the stack */
478 #endif /* DEVELOPMENT || DEBUG */
479
480 /**
481 * This variable is set true during hibernation entry to protect pmap data structures
482 * during image copying, and reset false on hibernation exit.
483 */
484 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
485
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger validation is compiled out on non-MACH_ASSERT builds. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
494
495 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
496
497 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
498 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
499
500 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
501
502 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
503 #if defined(__arm64__)
504 /* end of shared region + 512MB for various purposes */
505 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
506 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
507 "Minimum address space size outside allowable range");
508
509 // Max offset is 15.375GB for devices with "large" memory config
510 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
511 // Max offset is 11.375GB for devices with "small" memory config
512 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
513
514
515 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
516 "Large device address space size outside allowable range");
517 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
518 "Small device address space size outside allowable range");
519
520 # ifdef XNU_TARGET_OS_OSX
521 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
522 # else
523 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
524 # endif
525 #endif /* __arm64__ */
526
527 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
528 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
529 #else
530 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
531 #endif
532
533 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
534 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
535 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
536 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
537 #if !HAS_16BIT_ASID
538 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
539 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
540 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
541 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
542 #else
543 static uint16_t last_allocated_asid = 0;
544 #endif /* !HAS_16BIT_ASID */
545
546 #if HAS_SPECRES_DEBUGGING
547 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
548 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
549 #endif /* HAS_SPECRES_DEBUGGING */
550
551
552 #if __ARM_MIXED_PAGE_SIZE__
553 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
554 #endif
555 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
556 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
557 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
558
/* PTE Define Macros */

/*
 * Evaluates true for a PTE that encodes a "compressed" (paged-out) mapping.
 * Panics if the compressed marker appears alongside unexpected bits, since
 * that indicates PTE corruption.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) &&          /* PTE is not valid... */ \
	((x) & ARM_PTE_COMPRESSED) &&   /* ...has "compressed" marker" */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True if the PTE's wired bits are set (mapping is pinned). */
#define pte_is_wired(pte)                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True if the software "was writeable" bit is set (used for fast-fault/COW tracking). */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit in a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
582
583 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)584 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
585 {
586 if (wired) {
587 *ptep |= ARM_PTE_WIRED;
588 } else {
589 *ptep &= ~ARM_PTE_WIRED;
590 }
591 /*
592 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
593 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
594 * never reclaimed.
595 */
596 if (pmap == kernel_pmap) {
597 return;
598 }
599 unsigned short *ptd_wiredcnt_ptr;
600 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
601 if (wired) {
602 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
603 } else {
604 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
605 if (__improbable(prev_wired == 0)) {
606 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
607 }
608 }
609 }
610
611 #if HAS_FEAT_XS
612
613 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)614 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
615 {
616 if (__improbable(pt_attr->stage2)) {
617 return false;
618 }
619 switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
620 case CACHE_ATTRINDX_DISABLE_XS:
621 case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
622 return true;
623 default:
624 return false;
625 }
626 }
627
628 #endif /* HAS_FEAT_XS */
629
/*
 * Issue an asynchronous TLB flush for the VA range [s, e) in the given pmap,
 * then wait for completion via arm64_sync_tlb().
 * NOTE(review): this is a brace-block macro, not do/while(0) — it is not
 * safe as the sole statement of an unbraced if/else; keep callers braced.
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
	arm64_sync_tlb(strong); \
}

/*
 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
 * will observe the updated PTE.
 */
#define FLUSH_PTE()                                                     \
	__builtin_arm_dmb(DMB_ISH);

/*
 * Synchronize updates to PTEs that were previously valid and thus may be cached in
 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
 * TLBI. This should only require a store-store barrier, as subsequent accesses in
 * program order will not issue until the DSB completes. Prior loads may be reordered
 * after the barrier, but their behavior should not be materially affected by the
 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
 * matter for loads until the access is re-driven well after the TLB update is
 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
 * we should be in a position to handle access faults. For "voluntary" PTE access
 * restriction due to unmapping or protection, the decision to restrict access should
 * have a data dependency on prior loads in order to avoid a data race.
 */
#define FLUSH_PTE_STRONG()                                              \
	__builtin_arm_dsb(DSB_ISHST);
658
659 /**
660 * Write enough page table entries to map a single VM page. On systems where the
661 * VM page size does not match the hardware page size, multiple page table
662 * entries will need to be written.
663 *
664 * @note This function does not emit a barrier to ensure these page table writes
665 * have completed before continuing. This is commonly needed. In the case
666 * where a DMB or DSB barrier is needed, then use the write_pte() and
667 * write_pte_strong() functions respectively instead of this one.
668 *
669 * @param ptep Pointer to the first page table entry to update.
670 * @param pte The value to write into each page table entry. In the case that
671 * multiple PTEs are updated to a non-empty value, then the address
672 * in this value will automatically be incremented for each PTE
673 * write.
674 */
675 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)676 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
677 {
678 /**
679 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
680 * systems, which is why it's checked at runtime instead of compile time.
681 * The "unreachable" warning needs to be suppressed because it still is a
682 * compile time constant on some systems.
683 */
684 __unreachable_ok_push
685 if (TEST_PAGE_RATIO_4) {
686 if (((uintptr_t)ptep) & 0x1f) {
687 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
688 __func__, ptep, (void*)pte);
689 }
690
691 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
692 /**
693 * If we're writing an empty/compressed PTE value, then don't
694 * auto-increment the address for each PTE write.
695 */
696 *ptep = pte;
697 *(ptep + 1) = pte;
698 *(ptep + 2) = pte;
699 *(ptep + 3) = pte;
700 } else {
701 *ptep = pte;
702 *(ptep + 1) = pte | 0x1000;
703 *(ptep + 2) = pte | 0x2000;
704 *(ptep + 3) = pte | 0x3000;
705 }
706 } else {
707 *ptep = pte;
708 }
709 __unreachable_ok_pop
710 }
711
712 /**
713 * Writes enough page table entries to map a single VM page and then ensures
714 * those writes complete by executing a Data Memory Barrier.
715 *
716 * @note The DMB issued by this function is not strong enough to protect against
717 * TLB invalidates from being reordered above the PTE writes. If a TLBI
718 * instruction is going to immediately be called after this write, it's
719 * recommended to call write_pte_strong() instead of this function.
720 *
721 * See the function header for write_pte_fast() for more details on the
722 * parameters.
723 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DMB: make the PTE store(s) visible before subsequent loads/stores. */
	FLUSH_PTE();
}
730
731 /**
732 * Writes enough page table entries to map a single VM page and then ensures
733 * those writes complete by executing a Data Synchronization Barrier. This
734 * barrier provides stronger guarantees than the DMB executed by write_pte().
735 *
736 * @note This function is useful if you're going to immediately flush the TLB
737 * after making the PTE write. A DSB is required to protect against the
738 * TLB invalidate being reordered before the PTE write.
739 *
740 * See the function header for write_pte_fast() for more details on the
741 * parameters.
742 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DSB: ensure the PTE store(s) complete before any immediately-following TLBI. */
	FLUSH_PTE_STRONG();
}
749
750 /**
751 * Retrieve the pmap structure for the thread running on the current CPU.
752 */
753 pmap_t
current_pmap()754 current_pmap()
755 {
756 const pmap_t current = vm_map_pmap(current_thread()->map);
757
758 assert(current != NULL);
759
760 #if XNU_MONITOR
761 /**
762 * On PPL-enabled systems, it's important that PPL policy decisions aren't
763 * decided by kernel-writable memory. This function is used in various parts
764 * of the PPL, and besides validating that the pointer returned by this
765 * function is indeed a pmap structure, it's also important to ensure that
766 * it's actually the current thread's pmap. This is because different pmaps
767 * will have access to different entitlements based on the code signature of
768 * their loaded process. So if a different user pmap is set in the current
769 * thread structure (in an effort to bypass code signing restrictions), even
770 * though the structure would validate correctly as it is a real pmap
771 * structure, it should fail here.
772 *
773 * This only needs to occur for user pmaps because the kernel pmap's root
774 * page table is always the same as TTBR1 (it's set during bootstrap and not
775 * changed so it'd be redundant to check), and its code signing fields are
776 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
777 * it shouldn't be possible to set those fields. Due to that, an attacker
778 * setting the current thread's pmap to the kernel pmap as a way to bypass
779 * this check won't accomplish anything as it doesn't provide any extra code
780 * signing entitlements.
781 */
782 if ((current != kernel_pmap) &&
783 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
784 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
785 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
786 }
787 #endif /* XNU_MONITOR */
788
789 return current;
790 }
791
792 #if DEVELOPMENT || DEBUG
793
794 /*
795 * Trace levels are controlled by a bitmask in which each
796 * level can be enabled/disabled by the (1<<level) position
797 * in the boot arg
798 * Level 0: PPL extension functionality
799 * Level 1: pmap lifecycle (create/destroy/switch)
800 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
801 * Level 3: internal state management (attributes/fast-fault)
802 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
803 */
804
805 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
806
/*
 * Emit a kdebug trace event when the given trace level is enabled in
 * pmap_trace_mask. Wrapped in do/while(0) so the macro behaves as a single
 * statement and cannot steal an `else` from an enclosing unbraced if/else.
 */
#define PMAP_TRACE(level, ...)                                          \
	do {                                                            \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) {   \
	                KDBG_RELEASE(__VA_ARGS__);                      \
	        }                                                       \
	} while (0)
811 #else /* DEVELOPMENT || DEBUG */
812
813 #define PMAP_TRACE(level, ...)
814
815 #endif /* DEVELOPMENT || DEBUG */
816
817
818 /*
819 * Internal function prototypes (forward declarations).
820 */
821
822 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
823
824 static void pmap_set_reference(ppnum_t pn);
825
826 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
827
828 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
829
830 static kern_return_t pmap_expand(
831 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
832
833 static int pmap_remove_range(
834 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
835
836 static tt_entry_t *pmap_tt1_allocate(
837 pmap_t, vm_size_t, unsigned int);
838
839 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
840
841 static void pmap_tt1_deallocate(
842 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
843
844 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
845
846 static kern_return_t pmap_tt_allocate(
847 pmap_t, tt_entry_t **, unsigned int, unsigned int);
848
849 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
850
851 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
852 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
853 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
854
855 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
856
857
858 static void pmap_unmap_commpage(
859 pmap_t pmap);
860
861 static boolean_t
862 pmap_is_64bit(pmap_t);
863
864
865 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
866
867 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
868
869 static bool pmap_update_cache_attributes_locked(
870 ppnum_t, unsigned, bool);
871
872 static boolean_t arm_clear_fast_fault(
873 ppnum_t ppnum,
874 vm_prot_t fault_type,
875 pt_entry_t *pte_p);
876
877 static void pmap_trim_self(pmap_t pmap);
878 static void pmap_trim_subord(pmap_t subord);
879
880
881 /*
882 * Temporary prototypes, while we wait for pmap_enter to move to taking an
883 * address instead of a page number.
884 */
885 static kern_return_t
886 pmap_enter_addr(
887 pmap_t pmap,
888 vm_map_address_t v,
889 pmap_paddr_t pa,
890 vm_prot_t prot,
891 vm_prot_t fault_type,
892 unsigned int flags,
893 boolean_t wired);
894
895 kern_return_t
896 pmap_enter_options_addr(
897 pmap_t pmap,
898 vm_map_address_t v,
899 pmap_paddr_t pa,
900 vm_prot_t prot,
901 vm_prot_t fault_type,
902 unsigned int flags,
903 boolean_t wired,
904 unsigned int options,
905 __unused void *arg,
906 __unused pmap_mapping_type_t mapping_type);
907
908 #ifdef CONFIG_XNUPOST
909 kern_return_t pmap_test(void);
910 #endif /* CONFIG_XNUPOST */
911
912 PMAP_SUPPORT_PROTOTYPES(
913 kern_return_t,
914 arm_fast_fault, (pmap_t pmap,
915 vm_map_address_t va,
916 vm_prot_t fault_type,
917 bool was_af_fault,
918 bool from_user), ARM_FAST_FAULT_INDEX);
919
920 PMAP_SUPPORT_PROTOTYPES(
921 boolean_t,
922 arm_force_fast_fault, (ppnum_t ppnum,
923 vm_prot_t allow_mode,
924 int options), ARM_FORCE_FAST_FAULT_INDEX);
925
926 MARK_AS_PMAP_TEXT static boolean_t
927 arm_force_fast_fault_with_flush_range(
928 ppnum_t ppnum,
929 vm_prot_t allow_mode,
930 int options,
931 pmap_tlb_flush_range_t *flush_range);
932
933 /**
934 * Definition of the states driving the batch cache attributes update
935 * state machine.
936 */
937 typedef struct {
938 uint64_t page_index : 32, /* The page index to be operated on */
939 state : 8, /* The current state of the update machine */
940 tlb_flush_pass_needed : 1, /* Tracking whether the tlb flush pass is necessary */
941 rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
942 :0;
943 } batch_set_cache_attr_state_t;
944
945 /* Possible values of the "state" field. */
946 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS 1
947 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS 2
948 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
949 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE 4
950
951 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
952
953 PMAP_SUPPORT_PROTOTYPES(
954 batch_set_cache_attr_state_t,
955 pmap_batch_set_cache_attributes, (
956 #if XNU_MONITOR
957 volatile upl_page_info_t *user_page_list,
958 #else /* !XNU_MONITOR */
959 upl_page_info_array_t user_page_list,
960 #endif /* XNU_MONITOR */
961 batch_set_cache_attr_state_t state,
962 unsigned int page_cnt,
963 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
964
965 PMAP_SUPPORT_PROTOTYPES(
966 kern_return_t,
967 pmap_change_wiring, (pmap_t pmap,
968 vm_map_address_t v,
969 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
970
971 PMAP_SUPPORT_PROTOTYPES(
972 pmap_t,
973 pmap_create_options, (ledger_t ledger,
974 vm_map_size_t size,
975 unsigned int flags,
976 kern_return_t * kr), PMAP_CREATE_INDEX);
977
978 PMAP_SUPPORT_PROTOTYPES(
979 void,
980 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
981
982 PMAP_SUPPORT_PROTOTYPES(
983 kern_return_t,
984 pmap_enter_options, (pmap_t pmap,
985 vm_map_address_t v,
986 pmap_paddr_t pa,
987 vm_prot_t prot,
988 vm_prot_t fault_type,
989 unsigned int flags,
990 boolean_t wired,
991 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
992
993 PMAP_SUPPORT_PROTOTYPES(
994 pmap_paddr_t,
995 pmap_find_pa, (pmap_t pmap,
996 addr64_t va), PMAP_FIND_PA_INDEX);
997
998 PMAP_SUPPORT_PROTOTYPES(
999 kern_return_t,
1000 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
1001
1002
1003 PMAP_SUPPORT_PROTOTYPES(
1004 boolean_t,
1005 pmap_is_empty, (pmap_t pmap,
1006 vm_map_offset_t va_start,
1007 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1008
1009
1010 PMAP_SUPPORT_PROTOTYPES(
1011 unsigned int,
1012 pmap_map_cpu_windows_copy, (ppnum_t pn,
1013 vm_prot_t prot,
1014 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1015
1016 PMAP_SUPPORT_PROTOTYPES(
1017 void,
1018 pmap_ro_zone_memcpy, (zone_id_t zid,
1019 vm_offset_t va,
1020 vm_offset_t offset,
1021 const vm_offset_t new_data,
1022 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1023
1024 PMAP_SUPPORT_PROTOTYPES(
1025 uint64_t,
1026 pmap_ro_zone_atomic_op, (zone_id_t zid,
1027 vm_offset_t va,
1028 vm_offset_t offset,
1029 zro_atomic_op_t op,
1030 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1031
1032 PMAP_SUPPORT_PROTOTYPES(
1033 void,
1034 pmap_ro_zone_bzero, (zone_id_t zid,
1035 vm_offset_t va,
1036 vm_offset_t offset,
1037 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1038
1039 PMAP_SUPPORT_PROTOTYPES(
1040 kern_return_t,
1041 pmap_set_shared_region, (pmap_t grand,
1042 pmap_t subord,
1043 addr64_t vstart,
1044 uint64_t size), PMAP_SET_SHARED_REGION_INDEX);
1045
1046 PMAP_SUPPORT_PROTOTYPES(
1047 vm_map_offset_t,
1048 pmap_nest, (pmap_t grand,
1049 pmap_t subord,
1050 addr64_t vstart,
1051 uint64_t size,
1052 vm_map_offset_t vrestart,
1053 kern_return_t * krp), PMAP_NEST_INDEX);
1054
1055 PMAP_SUPPORT_PROTOTYPES(
1056 void,
1057 pmap_page_protect_options, (ppnum_t ppnum,
1058 vm_prot_t prot,
1059 unsigned int options,
1060 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1061
1062 PMAP_SUPPORT_PROTOTYPES(
1063 vm_map_address_t,
1064 pmap_protect_options, (pmap_t pmap,
1065 vm_map_address_t start,
1066 vm_map_address_t end,
1067 vm_prot_t prot,
1068 unsigned int options,
1069 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1070
1071 PMAP_SUPPORT_PROTOTYPES(
1072 kern_return_t,
1073 pmap_query_page_info, (pmap_t pmap,
1074 vm_map_offset_t va,
1075 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1076
1077 PMAP_SUPPORT_PROTOTYPES(
1078 mach_vm_size_t,
1079 pmap_query_resident, (pmap_t pmap,
1080 vm_map_address_t start,
1081 vm_map_address_t end,
1082 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1083
1084 PMAP_SUPPORT_PROTOTYPES(
1085 void,
1086 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1087
1088 PMAP_SUPPORT_PROTOTYPES(
1089 vm_map_address_t,
1090 pmap_remove_options, (pmap_t pmap,
1091 vm_map_address_t start,
1092 vm_map_address_t end,
1093 int options), PMAP_REMOVE_OPTIONS_INDEX);
1094
1095
1096 PMAP_SUPPORT_PROTOTYPES(
1097 void,
1098 pmap_set_cache_attributes, (ppnum_t pn,
1099 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1100
1101 PMAP_SUPPORT_PROTOTYPES(
1102 void,
1103 pmap_update_compressor_page, (ppnum_t pn,
1104 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1105
1106 PMAP_SUPPORT_PROTOTYPES(
1107 void,
1108 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1109
1110 #if MACH_ASSERT || XNU_MONITOR
1111 PMAP_SUPPORT_PROTOTYPES(
1112 void,
1113 pmap_set_process, (pmap_t pmap,
1114 int pid,
1115 char *procname), PMAP_SET_PROCESS_INDEX);
1116 #endif
1117
1118 PMAP_SUPPORT_PROTOTYPES(
1119 void,
1120 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1121
1122 PMAP_SUPPORT_PROTOTYPES(
1123 vm_map_offset_t,
1124 pmap_unnest_options, (pmap_t grand,
1125 addr64_t vaddr,
1126 uint64_t size,
1127 vm_map_offset_t vrestart,
1128 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1129
1130 PMAP_SUPPORT_PROTOTYPES(
1131 void,
1132 phys_attribute_set, (ppnum_t pn,
1133 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1134
1135 PMAP_SUPPORT_PROTOTYPES(
1136 void,
1137 phys_attribute_clear, (ppnum_t pn,
1138 unsigned int bits,
1139 int options,
1140 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1141
1142 #if __ARM_RANGE_TLBI__
1143 PMAP_SUPPORT_PROTOTYPES(
1144 vm_map_address_t,
1145 phys_attribute_clear_range, (pmap_t pmap,
1146 vm_map_address_t start,
1147 vm_map_address_t end,
1148 unsigned int bits,
1149 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1150 #endif /* __ARM_RANGE_TLBI__ */
1151
1152
1153 PMAP_SUPPORT_PROTOTYPES(
1154 void,
1155 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1156
1157 PMAP_SUPPORT_PROTOTYPES(
1158 void,
1159 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1160
1161 PMAP_SUPPORT_PROTOTYPES(
1162 void,
1163 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1164
1165 PMAP_SUPPORT_PROTOTYPES(
1166 void,
1167 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1168
1169 PMAP_SUPPORT_PROTOTYPES(
1170 void,
1171 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1172
1173 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1174 PMAP_SUPPORT_PROTOTYPES(
1175 void,
1176 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1177 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1178
/*
 * Definition of the states used by pmap_trim().
 *
 * pmap_trim() both accepts and returns a pmap_trim_state_t (see its
 * PMAP_SUPPORT_PROTOTYPES declaration below), so trimming is structured as a
 * resumable state machine rather than a single monolithic operation.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1199
1200 PMAP_SUPPORT_PROTOTYPES(
1201 pmap_trim_state_t,
1202 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1203
1204 #if HAS_APPLE_PAC
1205 PMAP_SUPPORT_PROTOTYPES(
1206 void *,
1207 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1208 PMAP_SUPPORT_PROTOTYPES(
1209 void *,
1210 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1211 #endif /* HAS_APPLE_PAC */
1212
1213
1214
1215
1216 PMAP_SUPPORT_PROTOTYPES(
1217 kern_return_t,
1218 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1219 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1220
1221 PMAP_SUPPORT_PROTOTYPES(
1222 kern_return_t,
1223 pmap_load_trust_cache_with_type, (TCType_t type,
1224 const vm_address_t pmap_img4_payload,
1225 const vm_size_t pmap_img4_payload_len,
1226 const vm_address_t img4_manifest,
1227 const vm_size_t img4_manifest_len,
1228 const vm_address_t img4_aux_manifest,
1229 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1230
1231 PMAP_SUPPORT_PROTOTYPES(
1232 void,
1233 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1234
1235 PMAP_SUPPORT_PROTOTYPES(
1236 kern_return_t,
1237 pmap_query_trust_cache, (TCQueryType_t query_type,
1238 const uint8_t cdhash[kTCEntryHashSize],
1239 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1240
1241 PMAP_SUPPORT_PROTOTYPES(
1242 errno_t,
1243 pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1244 const void *input_data,
1245 size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1246
1247 #if PMAP_CS_INCLUDE_CODE_SIGNING
1248
1249 PMAP_SUPPORT_PROTOTYPES(
1250 kern_return_t,
1251 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1252 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1253
1254 PMAP_SUPPORT_PROTOTYPES(
1255 kern_return_t,
1256 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1257 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1258
1259 PMAP_SUPPORT_PROTOTYPES(
1260 kern_return_t,
1261 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1262 pmap_cs_profile_t * profile_obj),
1263 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1264
1265 PMAP_SUPPORT_PROTOTYPES(
1266 kern_return_t,
1267 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1268 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1269
1270 PMAP_SUPPORT_PROTOTYPES(
1271 kern_return_t,
1272 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1273 const void *kernel_entitlements),
1274 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1275
1276 PMAP_SUPPORT_PROTOTYPES(
1277 kern_return_t,
1278 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1279 const void **kernel_entitlements),
1280 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1281
1282 PMAP_SUPPORT_PROTOTYPES(
1283 kern_return_t,
1284 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1285 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1286
1287 PMAP_SUPPORT_PROTOTYPES(
1288 kern_return_t,
1289 pmap_cs_allow_invalid, (pmap_t pmap),
1290 PMAP_CS_ALLOW_INVALID_INDEX);
1291
1292 PMAP_SUPPORT_PROTOTYPES(
1293 void,
1294 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1295 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1296
1297 PMAP_SUPPORT_PROTOTYPES(
1298 bool,
1299 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1300 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1301
1302 PMAP_SUPPORT_PROTOTYPES(
1303 void,
1304 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1305 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1306
1307 PMAP_SUPPORT_PROTOTYPES(
1308 void,
1309 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1310 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1311
1312 #endif
1313
1314 PMAP_SUPPORT_PROTOTYPES(
1315 uint32_t,
1316 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1317
1318 PMAP_SUPPORT_PROTOTYPES(
1319 bool,
1320 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1321
1322 PMAP_SUPPORT_PROTOTYPES(
1323 void,
1324 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1325
1326 void pmap_footprint_suspend(vm_map_t map,
1327 boolean_t suspend);
1328 PMAP_SUPPORT_PROTOTYPES(
1329 void,
1330 pmap_footprint_suspend, (vm_map_t map,
1331 boolean_t suspend),
1332 PMAP_FOOTPRINT_SUSPEND_INDEX);
1333
1334
1335
1336
1337 #if DEVELOPMENT || DEBUG
1338 PMAP_SUPPORT_PROTOTYPES(
1339 kern_return_t,
1340 pmap_test_text_corruption, (pmap_paddr_t),
1341 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1342 #endif /* DEVELOPMENT || DEBUG */
1343
1344 /*
1345 * The low global vector page is mapped at a fixed alias.
1346 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1347 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1348 * to check both addresses anyway for backward compatibility. So for now
1349 * we leave H6 and H7 where they were.
1350 */
1351 #if (ARM_PGSHIFT == 14)
1352 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1353 #else
1354 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1355 #endif
1356
1357
/*
 * Page-table allocation counters. NOTE(review): presumably incremented by the
 * translation-table allocation paths (root tables, twig/TTE pages, and
 * leaf/PTE pages respectively) — the update sites are not visible in this
 * part of the file; confirm before relying on exact semantics.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1361
#if XNU_MONITOR

/*
 * When pointer authentication is available, each stored handler pointer is
 * signed with the function-pointer key, address diversity enabled, and a
 * constant (0) extra discriminator; otherwise the qualifier is a no-op.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX constants; each entry is the *_internal
 * implementation of the corresponding pmap interface. The table itself and
 * the pointers within it are const.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_SET_SHARED_REGION_INDEX] = pmap_set_shared_region_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1456
1457 #if XNU_MONITOR
1458 /**
1459 * A convenience function for setting protections on a single physical
1460 * aperture or static region mapping without invalidating the TLB.
1461 *
1462 * @note This function does not perform any TLB invalidations. That must be done
1463 * separately to be able to safely use the updated mapping.
1464 *
1465 * @note This function understands the difference between the VM page size and
1466 * the kernel page size and will update multiple PTEs if the sizes differ.
1467 * In other words, enough PTEs will always get updated to change the
1468 * permissions on a PAGE_SIZE amount of memory.
1469 *
1470 * @note The PVH lock for the physical page represented by this mapping must
1471 * already be locked.
1472 *
1473 * @note This function assumes the caller has already verified that the PTE
1474 * pointer does indeed point to a physical aperture or static region page
1475 * table. Please validate your inputs before passing it along to this
1476 * function.
1477 *
1478 * @param ptep Pointer to the physical aperture or static region page table to
1479 * update with a new XPRR index.
1480 * @param expected_perm The XPRR index that is expected to already exist at the
1481 * current mapping. If the current index doesn't match this
1482 * then the system will panic.
1483 * @param new_perm The new XPRR index to update the mapping with.
1484 */
1485 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1486 pmap_set_pte_xprr_perm(
1487 pt_entry_t * const ptep,
1488 unsigned int expected_perm,
1489 unsigned int new_perm)
1490 {
1491 assert(ptep != NULL);
1492
1493 pt_entry_t spte = *ptep;
1494 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1495
1496 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1497 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1498 __func__, ptep, new_perm, expected_perm);
1499 }
1500
1501 /**
1502 * The PTE involved should be valid, should not have the hint bit set, and
1503 * should have the expected XPRR index.
1504 */
1505 if (__improbable(!pte_is_valid(spte))) {
1506 panic_plain("%s: physical aperture or static region PTE is invalid, "
1507 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1508 __func__, ptep, spte, new_perm, expected_perm);
1509 }
1510
1511 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1512 panic_plain("%s: physical aperture or static region PTE has hint bit "
1513 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1514 __func__, ptep, spte, new_perm, expected_perm);
1515 }
1516
1517 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1518 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1519 "ptep=%p, new_perm=%u, expected_perm=%u",
1520 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1521 }
1522
1523 pt_entry_t template = spte;
1524 template &= ~ARM_PTE_XPRR_MASK;
1525 template |= xprr_perm_to_pte(new_perm);
1526
1527 write_pte_strong(ptep, template);
1528 }
1529
1530 /**
1531 * Update the protections on a single physical aperture mapping and invalidate
1532 * the TLB so the mapping can be used.
1533 *
1534 * @note The PVH lock for the physical page must already be locked.
1535 *
1536 * @param pai The physical address index of the page whose physical aperture
1537 * mapping will be updated with new permissions.
1538 * @param expected_perm The XPRR index that is expected to already exist at the
1539 * current mapping. If the current index doesn't match this
1540 * then the system will panic.
1541 * @param new_perm The new XPRR index to update the mapping with.
1542 */
1543 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1544 pmap_set_xprr_perm(
1545 unsigned int pai,
1546 unsigned int expected_perm,
1547 unsigned int new_perm)
1548 {
1549 pvh_assert_locked(pai);
1550
1551 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1552 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1553
1554 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1555
1556 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1557 sync_tlb_flush();
1558 }
1559
1560 /**
1561 * Update the protections on a range of physical aperture or static region
1562 * mappings and invalidate the TLB so the mappings can be used.
1563 *
1564 * @note Static region mappings can only be updated before machine_lockdown().
1565 * Physical aperture mappings can be updated at any time.
1566 *
1567 * @param start The starting virtual address of the static region or physical
1568 * aperture range whose permissions will be updated.
1569 * @param end The final (inclusive) virtual address of the static region or
1570 * physical aperture range whose permissions will be updated.
1571 * @param expected_perm The XPRR index that is expected to already exist at the
1572 * current mappings. If the current indices don't match
1573 * this then the system will panic.
1574 * @param new_perm The new XPRR index to update the mappings with.
1575 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 *
	 * NOTE(review): the header comment above describes "end" as inclusive,
	 * but the checks below ("end < physmap_end") and the loop condition
	 * ("va < end") treat it as an exclusive, page-aligned upper bound —
	 * confirm the intended contract with callers.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physical aperture or entirely
	 * within the static (kernelcache) region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (possibly partial) twig to the requested range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Block (section) mappings cannot be updated this way; the twig must
		 * point at a leaf page table. */
		if (!tte_is_valid_table(tte)) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the PVH lock for the page
			 * being modified; take and drop it around each update. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Invalidate the TLB for the whole updated range in one pass. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1671
1672 #endif /* XNU_MONITOR */
1673
1674 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1675 PMAP_ZINFO_PALLOC(
1676 pmap_t pmap, int bytes)
1677 {
1678 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1679 }
1680
1681 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1682 PMAP_ZINFO_PFREE(
1683 pmap_t pmap,
1684 int bytes)
1685 {
1686 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1687 }
1688
1689 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1690 pmap_tt_ledger_credit(
1691 pmap_t pmap,
1692 vm_size_t size)
1693 {
1694 if (pmap != kernel_pmap) {
1695 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1696 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1697 }
1698 }
1699
1700 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1701 pmap_tt_ledger_debit(
1702 pmap_t pmap,
1703 vm_size_t size)
1704 {
1705 if (pmap != kernel_pmap) {
1706 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1707 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1708 }
1709 }
1710
/*
 * Record a use of the given hardware ASID in the pseudo-LRU tracking state.
 *
 * Each 64-bit word of asid_plru_bitmap tracks 64 ASIDs; a set bit means
 * "not recently used". Clearing this ASID's bit marks it recently used.
 * When a word drains to all-zeroes, its generation counter is bumped and
 * the word is refilled, restarting tracking for that group of ASIDs.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = asid_index >> 6;
		/* Atomically clear this ASID's bit; a zero result means every ASID
		 * in this word has now been used since the last refill. */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/* Refill the word. In the word containing bit MAX_HW_ASIDS, the
			 * top bit is kept permanently clear — presumably that index is
			 * not a valid allocatable ASID; confirm against the allocator. */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1724
/*
 * Allocate a virtual ASID for the given pmap and derive its hardware ASID.
 *
 * On non-16-bit-ASID targets a pseudo-LRU scheme is tried first: the
 * 64-ASID group with the lowest generation (least recently exhausted) is
 * located, then the virtual ASID bitmap is probed at positions congruent to
 * that group so the resulting hardware ASID falls within it.
 *
 * On failure (or on 16-bit-ASID targets after the lsb_next probe), falls
 * back to the first free bit in the virtual ASID bitmap.
 *
 * Returns false if no virtual ASID is available; otherwise sets
 * pmap->sw_asid and pmap->hw_asid and returns true.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the 64-ASID group with the lowest (oldest) generation. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/* Probe the virtual ASID bitmap only at words whose hardware ASID
		 * positions alias the chosen group (stride of one ASID chunk). */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No virtual ASID available at all. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID (within-chunk position)
	 * and a software ASID (chunk index). */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1809
/*
 * Release the pmap's ASID back to the allocator.
 *
 * Atomically swaps pmap->hw_asid to 0 so concurrent callers cannot double-
 * free; a zero hw_asid means nothing was allocated. The encoding applied by
 * alloc_asid() (kernel-reserved ASID 0 offset, optional EL0/EL1 doubling,
 * chunk/sw_asid split, and the reserved-VASID remap for sw_asid ==
 * UINT8_MAX) is inverted to recover the virtual ASID, whose bit is then set
 * free again in asid_bitmap under asid_lock.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		/* Nothing allocated (or another thread already freed it). */
		return;
	}

#if __ARM_KERNEL_PROTECT__
	/* Undo the EL0/EL1 pairing shift applied at allocation. */
	hw_asid >>= 1;
#endif
	/* Undo the +1 that accounts for kernel-reserved ASID 0. */
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved VASID: invert the high-end remap done in alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again in the pLRU state. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1842
1843
1844 boolean_t
pmap_valid_address(pmap_paddr_t addr)1845 pmap_valid_address(
1846 pmap_paddr_t addr)
1847 {
1848 return pa_valid(addr);
1849 }
1850
1851
1852
1853
1854
1855
1856 /*
1857 * Map memory at initialization. The physical addresses being
1858 * mapped are not managed and are never unmapped.
1859 *
1860 * For now, VM is already on, we only need to map the
1861 * specified memory.
1862 */
1863 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1864 pmap_map(
1865 vm_map_address_t virt,
1866 vm_offset_t start,
1867 vm_offset_t end,
1868 vm_prot_t prot,
1869 unsigned int flags)
1870 {
1871 kern_return_t kr;
1872 vm_size_t ps;
1873
1874 ps = PAGE_SIZE;
1875 while (start < end) {
1876 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1877 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1878
1879 if (kr != KERN_SUCCESS) {
1880 panic("%s: failed pmap_enter, "
1881 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1882 __FUNCTION__,
1883 (void *) virt, (void *) start, (void *) end, prot, flags);
1884 }
1885
1886 virt += ps;
1887 start += ps;
1888 }
1889 return virt;
1890 }
1891
1892 #if XNU_MONITOR
1893 /**
1894 * Remove kernel writeablity from an IO PTE value if the page is owned by
1895 * guarded mode software.
1896 *
1897 * @param paddr The physical address of the page which has to be non-DRAM.
1898 * @param tmplate The PTE value to be evaluated.
1899 *
1900 * @return A new PTE value with permission bits modified.
1901 */
1902 static inline
1903 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1904 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1905 {
1906 assert(!pa_valid(paddr));
1907
1908 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1909
1910 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1911 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1912 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1913 switch (xprr_perm) {
1914 case XPRR_KERN_RO_PERM:
1915 break;
1916 case XPRR_KERN_RW_PERM:
1917 tmplate &= ~ARM_PTE_XPRR_MASK;
1918 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1919 break;
1920 default:
1921 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1922 }
1923 }
1924
1925 return tmplate;
1926 }
1927 #endif /* XNU_MONITOR */
1928
/*
 * Map the physical range [start, end) at virtual address `virt` in the
 * kernel pmap, with memory attributes selected by `options`.
 *
 * Both `start` and `end` must be page aligned, and the whole range must be
 * uniformly DRAM or uniformly non-DRAM (the attribute choice for WCOMB
 * depends on which). Mappings are written directly into existing kernel
 * page tables via write_pte_strong(); panics if any PTE slot is missing.
 *
 * Returns the first virtual address past the new mappings.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t mem_attr;

	if (__improbable(start & PAGE_MASK)) {
		panic("%s: start 0x%lx is not page aligned", __func__, start);
	}

	if (__improbable(end & PAGE_MASK)) {
		panic("%s: end 0x%lx is not page aligned", __func__, end);
	}

	if (__improbable(!gDramBase || !gDramSize)) {
		panic("%s: gDramBase/gDramSize not initialized", __func__);
	}

	/* Reject ranges that mix DRAM and non-DRAM pages: a single attribute
	 * value is chosen below for the entire range. */
	const bool first_page_is_dram = is_dram_addr(start);
	for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
		if (first_page_is_dram != is_dram_addr(pa)) {
			panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
			    __func__, pa, first_page_is_dram ? "is not" : "is");
		}
	}

	/* Select cacheability / device-memory attributes for the whole range. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		if (is_dram_addr(start)) {
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
			/* Non-DRAM write-combined falls back to a posted type. */
#if HAS_FEAT_XS
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* Non-DRAM page: may need KERN_RW demoted to PPL_RW. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the freshly written range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
2038
2039 /*
2040 * Back-door routine for mapping kernel VM at initialization.
2041 * Useful for mapping memory outside the range
2042 * [vm_first_phys, vm_last_phys] (i.e., devices).
2043 * Otherwise like pmap_map.
2044 */
2045 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2046 pmap_map_bd(
2047 vm_map_address_t virt,
2048 vm_offset_t start,
2049 vm_offset_t end,
2050 vm_prot_t prot)
2051 {
2052 return pmap_map_bd_with_options(virt, start, end, prot, 0);
2053 }
2054
2055 /*
2056 * Back-door routine for mapping kernel VM at initialization.
2057 * Useful for mapping memory specific physical addresses in early
2058 * boot (i.e., before kernel_map is initialized).
2059 *
2060 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
2061 */
2062
2063 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2064 pmap_map_high_window_bd(
2065 vm_offset_t pa_start,
2066 vm_size_t len,
2067 vm_prot_t prot)
2068 {
2069 pt_entry_t *ptep, pte;
2070 vm_map_address_t va_start = VREGION1_START;
2071 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
2072 vm_map_address_t va_end;
2073 vm_map_address_t va;
2074 vm_size_t offset;
2075
2076 offset = pa_start & PAGE_MASK;
2077 pa_start -= offset;
2078 len += offset;
2079
2080 if (len > (va_max - va_start)) {
2081 panic("%s: area too large, "
2082 "pa_start=%p, len=%p, prot=0x%x",
2083 __FUNCTION__,
2084 (void*)pa_start, (void*)len, prot);
2085 }
2086
2087 scan:
2088 for (; va_start < va_max; va_start += PAGE_SIZE) {
2089 ptep = pmap_pte(kernel_pmap, va_start);
2090 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2091 if (!pte_is_valid(*ptep)) {
2092 break;
2093 }
2094 }
2095 if (va_start > va_max) {
2096 panic("%s: insufficient pages, "
2097 "pa_start=%p, len=%p, prot=0x%x",
2098 __FUNCTION__,
2099 (void*)pa_start, (void*)len, prot);
2100 }
2101
2102 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2103 ptep = pmap_pte(kernel_pmap, va_end);
2104 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2105 if (pte_is_valid(*ptep)) {
2106 va_start = va_end + PAGE_SIZE;
2107 goto scan;
2108 }
2109 }
2110
2111 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2112 ptep = pmap_pte(kernel_pmap, va);
2113 pte = pa_to_pte(pa_start)
2114 | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2115 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2116 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2117 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2118 #if __ARM_KERNEL_PROTECT__
2119 pte |= ARM_PTE_NG;
2120 #endif /* __ARM_KERNEL_PROTECT__ */
2121 write_pte_strong(ptep, pte);
2122 }
2123 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2124 #if KASAN
2125 kasan_notify_address(va_start, len);
2126 #endif
2127 return va_start;
2128 }
2129
2130 static uint32_t
pmap_compute_max_asids(void)2131 pmap_compute_max_asids(void)
2132 {
2133 DTEntry entry;
2134 void const *prop = NULL;
2135 uint32_t max_asids;
2136 int err;
2137 unsigned int prop_size;
2138
2139 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2140 assert(err == kSuccess);
2141
2142 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2143 /* TODO: consider allowing maxproc limits to be scaled earlier so that
2144 * we can choose a more flexible default value here. */
2145 return MAX_ASIDS;
2146 }
2147
2148 if (prop_size != sizeof(max_asids)) {
2149 panic("pmap-max-asids property is not a 32-bit integer");
2150 }
2151
2152 max_asids = *((uint32_t const *)prop);
2153 #if HAS_16BIT_ASID
2154 if (max_asids > MAX_HW_ASIDS) {
2155 panic("pmap-max-asids 0x%x too large", max_asids);
2156 }
2157 #else
2158 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2159 max_asids = (max_asids + 63) & ~63UL;
2160
2161 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2162 /* currently capped by size of pmap->sw_asid */
2163 panic("pmap-max-asids 0x%x too large", max_asids);
2164 }
2165 #endif /* HAS_16BIT_ASID */
2166 if (max_asids == 0) {
2167 panic("pmap-max-asids cannot be zero");
2168 }
2169 return max_asids;
2170 }
2171
2172 #if __arm64__
2173 /*
2174 * pmap_get_arm64_prot
2175 *
2176 * return effective armv8 VMSA block protections including
2177 * table AP/PXN/XN overrides of a pmap entry
2178 *
2179 */
2180
/*
 * Walk the pmap's translation tables for `addr` and return the effective
 * AP/XN/PXN protection bits of the leaf mapping, with table-level overrides
 * from intermediate levels folded in. Returns 0 if any level is invalid.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Descend from the root level, accumulating table-descriptor overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Address not mapped at this level: no effective protection. */
			return 0;
		}

		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_is_table(tte)) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated table-level override fields. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2225 #endif /* __arm64__ */
2226
2227 /**
2228 * Helper macros for accessing the "unnested" and "in-progress" bits in
2229 * pmap->nested_region_unnested_table_bitmap.
2230 */
2231 #define UNNEST_BIT(index) ((index) * 2)
2232 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2233
2234
2235 /*
2236 * Bootstrap the system enough to run with virtual memory.
2237 *
2238 * The early VM initialization code has already allocated
2239 * the first CPU's translation table and made entries for
2240 * all the one-to-one mappings to be found there.
2241 *
2242 * We must set up the kernel pmap structures, the
2243 * physical-to-virtual translation lookup tables for the
2244 * physical memory to be managed (between avail_start and
2245 * avail_end).
2246 *
2247 * Map the kernel's code and data, and allocate the system page table.
2248 * Page_size must already be set.
2249 *
2250 * Parameters:
2251 * first_avail first available physical page -
2252 * after kernel page tables
2253 * avail_start PA of first managed physical page
2254 * avail_end PA of last managed physical page
2255 */
2256
2257 void
pmap_bootstrap(vm_offset_t vstart)2258 pmap_bootstrap(
2259 vm_offset_t vstart)
2260 {
2261 vm_map_offset_t maxoffset;
2262
2263 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2264
2265 #if XNU_MONITOR
2266 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
2267 pmap_ppl_disable = ml_unsafe_kernel_text();
2268 #endif
2269
2270 #endif /* XNU_MONITOR */
2271
2272 #if DEVELOPMENT || DEBUG
2273 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2274 kprintf("Kernel traces for pmap operations enabled\n");
2275 }
2276 #endif
2277
2278 /*
2279 * Initialize the kernel pmap.
2280 */
2281 #if ARM_PARAMETERIZED_PMAP
2282 kernel_pmap->pmap_pt_attr = native_pt_attr;
2283 #endif /* ARM_PARAMETERIZED_PMAP */
2284 #if HAS_APPLE_PAC
2285 kernel_pmap->disable_jop = 0;
2286 #endif /* HAS_APPLE_PAC */
2287 kernel_pmap->tte = cpu_tte;
2288 kernel_pmap->ttep = cpu_ttep;
2289 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2290 kernel_pmap->max = UINTPTR_MAX;
2291 os_atomic_init(&kernel_pmap->ref_count, 1);
2292 #if XNU_MONITOR
2293 os_atomic_init(&kernel_pmap->nested_count, 0);
2294 #endif
2295 kernel_pmap->nx_enabled = TRUE;
2296 #ifdef __arm64__
2297 kernel_pmap->is_64bit = TRUE;
2298 #else
2299 kernel_pmap->is_64bit = FALSE;
2300 #endif
2301 #if CONFIG_ROSETTA
2302 kernel_pmap->is_rosetta = FALSE;
2303 #endif
2304
2305 kernel_pmap->nested_region_addr = 0x0ULL;
2306 kernel_pmap->nested_region_size = 0x0ULL;
2307 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2308 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2309 kernel_pmap->type = PMAP_TYPE_KERNEL;
2310
2311 kernel_pmap->hw_asid = 0;
2312 kernel_pmap->sw_asid = 0;
2313
2314 pmap_lock_init(kernel_pmap);
2315
2316 pmap_max_asids = pmap_compute_max_asids();
2317 #if HAS_16BIT_ASID
2318 asid_chunk_size = MAX_HW_ASIDS;
2319 #else
2320 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2321 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2322 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2323 * masking used by the PLRU scheme. This means we must handle the case in which
2324 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2325 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2326 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2327 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2328 #endif /* HAS_16BIT_ASIDS */
2329
2330 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2331
2332 #if HAS_SPECRES_DEBUGGING
2333 PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2334
2335 if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2336 panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2337 }
2338 #endif /* HAS_SPECRES_DEBUGGING */
2339
2340 /**
2341 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2342 * pp_attr_table, etc). This function will use `avail_start` to allocate
2343 * space for these data structures.
2344 */
2345 pmap_data_bootstrap();
2346
2347 /**
2348 * Bootstrap any necessary SART data structures and values needed from the device tree.
2349 */
2350 sart_bootstrap();
2351
2352 /**
2353 * Don't make any assumptions about the alignment of avail_start before this
2354 * point (i.e., pmap_data_bootstrap() performs allocations).
2355 */
2356 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2357
2358 const pmap_paddr_t pmap_struct_start = avail_start;
2359
2360 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2361 avail_start = round_page(avail_start + asid_table_size);
2362
2363 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2364
2365 vm_first_phys = gPhysBase;
2366 vm_last_phys = trunc_page(avail_end);
2367
2368 queue_init(&map_pmap_list);
2369 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2370 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2371 free_page_size_tt_count = 0;
2372 free_page_size_tt_max = 0;
2373 free_tt_list = TT_FREE_ENTRY_NULL;
2374 free_tt_count = 0;
2375 free_tt_max = 0;
2376
2377 virtual_space_start = vstart;
2378 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2379
2380 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2381 #if !HAS_16BIT_ASID
2382 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2383 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2384 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2385 #endif /* !HAS_16BIT_ASID */
2386
2387
2388
2389 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2390 maxoffset = trunc_page(maxoffset);
2391 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2392 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2393 arm_pmap_max_offset_default = maxoffset;
2394 }
2395 }
2396 #if defined(__arm64__)
2397 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2398 maxoffset = trunc_page(maxoffset);
2399 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2400 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2401 arm64_pmap_max_offset_default = maxoffset;
2402 }
2403 }
2404 #endif
2405
2406 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2407
2408
2409 #if PMAP_CS_PPL_MONITOR
2410 /* Initialize the PPL trust cache read-write lock */
2411 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2412 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2413 #endif
2414
2415 #if DEVELOPMENT || DEBUG
2416 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2417 &vm_footprint_suspend_allowed,
2418 sizeof(vm_footprint_suspend_allowed));
2419 #endif /* DEVELOPMENT || DEBUG */
2420
2421 #if KASAN
2422 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2423 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2424 #endif /* KASAN */
2425
2426 /**
2427 * Ensure that avail_start is always left on a page boundary. The calling
2428 * code might not perform any alignment before allocating page tables so
2429 * this is important.
2430 */
2431 avail_start = round_page(avail_start);
2432 }
2433
2434 #if XNU_MONITOR
2435
2436 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2437 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2438 {
2439 pmap_paddr_t cur_pa;
2440 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2441 assert(pa_valid(cur_pa));
2442 ppattr_pa_set_monitor(cur_pa);
2443 }
2444 }
2445
2446 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2447 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2448 pmap_paddr_t end_pa,
2449 unsigned int expected_perm,
2450 unsigned int new_perm)
2451 {
2452 vm_offset_t start_va = phystokv(start_pa);
2453 vm_offset_t end_va = start_va + (end_pa - start_pa);
2454
2455 pa_set_range_monitor(start_pa, end_pa);
2456 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2457 }
2458
/*
 * Apply the PVH_FLAG_LOCKDOWN_KC flag to every physical page backing the
 * kernelcache, preventing those pages from being remapped. Pages whose
 * physical address does not map linearly back to the kernelcache VA are
 * skipped (they will be reclaimed by the OS). Panics if any page already
 * carries a lockdown flag.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() trims range_size to the contiguous run it returns. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable; strip their lockdown flag. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2504
/*
 * Finalize protection of the pmap's static (bootstrap-time) allocations:
 * marks bootstrap page tables, PPL data/text segments, and PPL stacks as
 * monitor-owned with their final xPRR permissions, then locks down the
 * kernelcache pages via pmap_lockdown_kc().
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permissions unchanged (RO -> RO); this marks the pages monitor-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2592
2593
/*
 * Transition the PPL into its locked-down state: lock down the commpage
 * mappings (RO data, and the text commpage when present) so they can no
 * longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): the unconditional #error below appears to be the fallback arm
	 * of a stripped #if/#else configuration block -- confirm against the complete
	 * source before building with XNU_MONITOR enabled. */
#error "XPRR configuration error"
}
2610 #endif /* XNU_MONITOR */
2611
2612 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2613 pmap_virtual_space(
2614 vm_offset_t *startp,
2615 vm_offset_t *endp
2616 )
2617 {
2618 *startp = virtual_space_start;
2619 *endp = virtual_space_end;
2620 }
2621
2622
/*
 * Enumerate reserved kernel virtual regions that the VM must not allocate
 * from. Called repeatedly with increasing region_select values; returns TRUE
 * and fills *startp/*size while a region exists for the given index, FALSE
 * once the index is out of range. The set of regions depends on the
 * KTRR/CTRR and ARM_LARGE_MEMORY configuration.
 */
__mockable boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K pages: align/round to a 32MB (L2 block) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K pages: align/round to an 8MB boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only report the low-globals region if region 0 did not already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
	return ret;
}
2716
2717 /*
2718 * Routines to track and allocate physical pages during early boot.
2719 * On most systems that memory runs from first_avail through to avail_end
2720 * with no gaps.
2721 *
2722 * If the system supports ECC and ecc_bad_pages_count > 0, we
2723 * need to skip those pages.
2724 */
2725
/* Number of physical pages still handed out by the early-boot allocator. */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() runs (lazily, on first allocator use). */
static bool need_ram_ranges_init = true;
2728
2729
2730 /**
2731 * Checks to see if a given page is in
2732 * the array of known bad pages
2733 *
2734 * @param ppn page number to check
2735 */
2736 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2737 pmap_is_bad_ram(__unused ppnum_t ppn)
2738 {
2739 return false;
2740 }
2741
2742 /**
2743 * Prepare bad ram pages to be skipped.
2744 */
2745
2746 /*
2747 * Initialize the count of available pages. No lock needed here,
2748 * as this code is called while kernel boot up is single threaded.
2749 */
2750 static void
initialize_ram_ranges(void)2751 initialize_ram_ranges(void)
2752 {
2753 pmap_paddr_t first = first_avail;
2754 pmap_paddr_t end = avail_end;
2755
2756 assert(first <= end);
2757 assert(first == (first & ~PAGE_MASK));
2758 assert(end == (end & ~PAGE_MASK));
2759 avail_page_count = atop(end - first);
2760
2761 need_ram_ranges_init = false;
2762 }
2763
2764 unsigned int
pmap_free_pages(void)2765 pmap_free_pages(
2766 void)
2767 {
2768 if (need_ram_ranges_init) {
2769 initialize_ram_ranges();
2770 }
2771 return avail_page_count;
2772 }
2773
2774 unsigned int
pmap_free_pages_span(void)2775 pmap_free_pages_span(
2776 void)
2777 {
2778 if (need_ram_ranges_init) {
2779 initialize_ram_ranges();
2780 }
2781 return (unsigned int)atop(avail_end - first_avail);
2782 }
2783
2784
2785 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2786 pmap_next_page_hi(
2787 ppnum_t * pnum,
2788 __unused boolean_t might_free)
2789 {
2790 return pmap_next_page(pnum);
2791 }
2792
2793
2794 boolean_t
pmap_next_page(ppnum_t * pnum)2795 pmap_next_page(
2796 ppnum_t *pnum)
2797 {
2798 if (need_ram_ranges_init) {
2799 initialize_ram_ranges();
2800 }
2801
2802
2803 if (first_avail != avail_end) {
2804 *pnum = (ppnum_t)atop(first_avail);
2805 first_avail += PAGE_SIZE;
2806 assert(avail_page_count > 0);
2807 --avail_page_count;
2808 return TRUE;
2809 }
2810 assert(avail_page_count == 0);
2811 return FALSE;
2812 }
2813
2814
2815 /**
2816 * Helper function to check wheter the given physical
2817 * page number is a restricted page.
2818 *
2819 * @param pn the physical page number to query.
2820 */
2821 bool
pmap_is_page_restricted(__unused ppnum_t pn)2822 pmap_is_page_restricted(__unused ppnum_t pn)
2823 {
2824 return false;
2825 }
2826
2827 /*
2828 * Initialize the pmap module.
2829 * Called by vm_init, to initialize any structures that the pmap
2830 * system needs to map virtual memory.
2831 */
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent transltion
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 */
	/* ZC_ZFREE_CLEARMEM zeroes pmap structs on free to avoid stale data. */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	/* Each process needs its own ASID, so ASID capacity bounds maxproc. */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2872
2873 /**
2874 * Verify that a given physical page contains no mappings (outside of the
2875 * default physical aperture mapping).
2876 *
2877 * @param ppnum Physical page number to check there are no mappings to.
2878 *
2879 * @return True if there are no mappings, false otherwise or if the page is not
2880 * kernel-managed.
2881 */
2882 bool
pmap_verify_free(ppnum_t ppnum)2883 pmap_verify_free(ppnum_t ppnum)
2884 {
2885 const pmap_paddr_t pa = ptoa(ppnum);
2886
2887 assert(pa != vm_page_fictitious_addr);
2888
2889 /* Only mappings to kernel-managed physical memory are tracked. */
2890 if (!pa_valid(pa)) {
2891 return false;
2892 }
2893
2894 const unsigned int pai = pa_index(pa);
2895 pv_entry_t **pvh = pai_to_pvh(pai);
2896
2897 return pvh_test_type(pvh, PVH_TYPE_NULL);
2898 }
2899
2900 #if MACH_ASSERT
2901 /**
2902 * Verify that a given physical page contains no mappings (outside of the
2903 * default physical aperture mapping) and if it does, then panic.
2904 *
2905 * @note It's recommended to use pmap_verify_free() directly when operating in
2906 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2907 * normally being called from outside of the PPL, and the pv_head_table
2908 * can't be modified outside of the PPL).
2909 *
2910 * @param ppnum Physical page number to check there are no mappings to.
2911 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists: find and report it before panicking. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH holds the PTE pointer directly. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		/* Multi-mapping case: walk the first PVE on the list. */
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2984 #endif /* MACH_ASSERT */
2985
2986 inline void
pmap_recycle_page(ppnum_t pn)2987 pmap_recycle_page(ppnum_t pn)
2988 {
2989 const bool is_freed = pmap_verify_free(pn);
2990
2991 if (__improbable(!is_freed)) {
2992 /*
2993 * There is a redundancy here, but we are going to panic anyways,
2994 * and ASSERT_PMAP_FREE traces useful information. So, we keep this
2995 * behavior.
2996 */
2997 #if MACH_ASSERT
2998 pmap_assert_free(pn);
2999 #endif /* MACH_ASSERT */
3000 panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn));
3001 }
3002 }
3003
3004
/*
 * Compute the size in bytes of the root translation table for the given
 * pmap: the number of root-level entries spanned by the valid VA mask,
 * times the size of one table entry.
 */
static vm_size_t
pmap_root_alloc_size(pmap_t pmap)
{
#pragma unused(pmap)
	/* NOTE(review): pmap IS referenced below; the pragma presumably covers
	 * configurations where pmap_get_pt_attr() expands without using its
	 * argument — confirm before removing. */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	unsigned int root_level = pt_attr_root_level(pt_attr);
	const uint64_t index = pt_attr_va_valid_mask(pt_attr);
	return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
}
3014
3015
3016 /*
3017 * Create and return a physical map.
3018 *
3019 * If the size specified for the map
3020 * is zero, the map is an actual physical
3021 * map, and may be referenced by the
3022 * hardware.
3023 *
3024 * If the size specified is non-zero,
3025 * the map will be used in software only, and
3026 * is bounded by that size.
3027 */
/*
 * Allocate and initialize a new user pmap: the pmap structure itself, an
 * ASID, and the root (TT1) translation table. On failure, unwinds whatever
 * was allocated and reports the reason through *kr. Returns PMAP_NULL on
 * failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone. Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	/* Record whether this address space runs under binary translation. */
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve an ASID for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Error unwind: release resources in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3218
/*
 * Public entry point for pmap creation. Takes a ledger reference for the
 * new pmap and dispatches to the PPL or in-kernel implementation. On a PPL
 * resource shortage, donates a page to the PPL and retries.
 */
__mockable pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	/* Retry on shortage: feed the PPL a page each time, until it succeeds
	 * or fails for a different reason. */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		/* Creation failed: drop the reference taken above. */
		ledger_dereference(ledger);
	}

	/* NOTE(review): this trace dereferences pmap even when it is PMAP_NULL;
	 * presumably PMAP_VASID/PMAP_TRACE tolerate or compile that out — confirm. */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3254
3255 #if XNU_MONITOR
3256 /*
3257 * This symbol remains in place when the PPL is enabled so that the dispatch
3258 * table does not change from development to release configurations.
3259 */
3260 #endif
3261 #if MACH_ASSERT || XNU_MONITOR
3262 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3263 pmap_set_process_internal(
3264 __unused pmap_t pmap,
3265 __unused int pid,
3266 __unused char *procname)
3267 {
3268 #if MACH_ASSERT
3269 if (pmap == NULL || pmap->pmap_pid == -1) {
3270 return;
3271 }
3272
3273 validate_pmap_mutable(pmap);
3274
3275 pmap->pmap_pid = pid;
3276 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3277 #endif /* MACH_ASSERT */
3278 }
3279 #endif /* MACH_ASSERT || XNU_MONITOR */
3280
3281 #if MACH_ASSERT
/*
 * Debug helper: associate a pid/procname with a pmap. Dispatches to the
 * PPL when XNU_MONITOR is enabled, otherwise calls the kernel-side
 * implementation directly.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3294 #endif /* MACH_ASSERT */
3295
3296 /*
3297 * pmap_deallocate_all_leaf_tts:
3298 *
3299 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3300 * removing and deallocating all TTEs.
3301 */
/*
 * pmap_deallocate_all_leaf_tts:
 *
 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
 * removing and deallocating all TTEs.
 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Recursion must stop before the leaf (PTE) level. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry in a table at this level (~0 = max VA). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block (superpage) mappings cannot appear in a pmap being torn
		 * down this way; treat one as corruption. */
		if (tte_is_block(tte)) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching unlock here — pmap_tte_deallocate()
		 * presumably drops the exclusive lock on behalf of the caller;
		 * confirm against its contract. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3340
3341 /*
3342 * We maintain stats and ledgers so that a task's physical footprint is:
3343 * phys_footprint = ((internal - alternate_accounting)
3344 * + (internal_compressed - alternate_accounting_compressed)
3345 * + iokit_mapped
3346 * + purgeable_nonvolatile
3347 * + purgeable_nonvolatile_compressed
3348 * + page_table)
3349 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3350 */
3351
3352 /*
3353 * Retire the given physical map from service.
3354 * Should only be called if the map contains
3355 * no valid mappings.
3356 */
/*
 * Retire the given physical map from service.
 * Should only be called if the map contains
 * no valid mappings.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop a reference; only the final release tears the pmap down. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Verify no CPU still has this pmap active or in flight. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps have no ASID of their own; just
		 * flush the nested region's TLB entries. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3470
/*
 * Public entry point: drop a reference on a pmap, destroying it when the
 * last reference is released, and drop the ledger reference taken at
 * creation time.
 */
__mockable void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger before the pmap struct may be freed below.
	 * NOTE(review): pmap is dereferenced without a NULL check here;
	 * presumably callers never pass PMAP_NULL — confirm. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3491
3492
3493 /*
3494 * Add a reference to the specified pmap.
3495 */
3496 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3497 pmap_reference_internal(
3498 pmap_t pmap)
3499 {
3500 if (pmap != PMAP_NULL) {
3501 validate_pmap_mutable(pmap);
3502 os_atomic_inc(&pmap->ref_count, acquire);
3503 }
3504 }
3505
/*
 * Public entry point for taking a pmap reference; dispatches to the PPL
 * when XNU_MONITOR is enabled.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3516
/*
 * Allocate a root (TT1) translation table of the given size for a pmap.
 * Sub-page allocations are satisfied from a free list of sub-page chunks;
 * full pages come from pmap_pages_alloc_zeroed(). Returns NULL on
 * resource shortage. Credits the pmap's page-table ledger on success.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Only the root-allocation size may be sub-page; round anything else up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: satisfy the request from the appropriate free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate a fresh (zeroed) physical page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/* Carve the remainder of the page into sub-page chunks and chain
		 * them onto the free list; the first chunk is returned below. */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3597
/*
 * Return a root (TT1) translation table to the appropriate free list, and —
 * unless PMAP_TT_DEALLOCATE_NOBLOCK is set — trim the page-sized free list
 * back down to FREE_PAGE_SIZE_TT_MAX by freeing whole pages. Debits the
 * pmap's page-table ledger.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size rounding done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	if (size < PAGE_SIZE) {
		/* Sub-page chunk: push onto the sub-page free list. */
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		/* Whole page: push onto the page-sized free list. */
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller cannot block: skip the trim loop below, which may free
		 * pages back to the system. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/* Trim the page-sized free list, dropping the lock around each free
	 * since pmap_pages_free() may block. */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3656
/**
 * Allocate a zeroed translation table for use as a page table at the given
 * level, either by popping a previously-carved sub-page table off the pmap's
 * free list or by allocating (and carving up) a fresh VM page.
 *
 * @param pmap The pmap that will own the new page table.
 * @param ttp Output parameter receiving the KVA of the new table; set to NULL
 *            on entry and left NULL on failure.
 * @param level The page table level this table is destined for; used only to
 *              pick the twig-vs-leaf accounting counters.
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid
 *                blocking on memory shortage.
 *
 * @return KERN_SUCCESS on success, KERN_RESOURCE_SHORTAGE when a non-blocking
 *         allocation could not be satisfied, or KERN_ABORTED when the pmap
 *         lock could not be taken without violating preemption constraints.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the pmap's private free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 * Allocate a VM page for the level x page table entries.
		 *
		 * NOTE(review): alloc_flags above is derived from
		 * PMAP_TT_ALLOCATE_NOWAIT, but the early-return below tests
		 * PMAP_OPTIONS_NOWAIT — confirm both flags are intentionally
		 * honored here.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Twig-and-above tables and leaf tables are accounted separately. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Point the page's PV head at the new descriptor, marking it as a page-table page. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			/*
			 * The pmap's tables are smaller than a VM page: carve the
			 * remainder of the page into sub-page tables and stash them
			 * on the pmap's free list for future allocations.
			 */
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3757
3758
/**
 * Return a page table previously obtained from pmap_tt_allocate() to the
 * pmap's free list, and free the backing VM page once every sub-page table
 * residing on that page is unreferenced and on the free list.
 *
 * @note The table being released must have a zero refcount (after the
 *       non-leaf sentinel PT_DESC_REFCOUNT is normalized away); a non-zero
 *       residual refcount panics.
 *
 * @param pmap The pmap that owns the page table being released.
 * @param ttp KVA of the table to release.
 * @param level The level the table was used at; used only to normalize the
 *              non-leaf sentinel refcount and to pick accounting counters.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized page tables that fit in one VM page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry a sentinel refcount; normalize it to zero. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of all sub-page tables sharing this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No table on this page holds references.  Count how many of the
		 * page's sub-page tables (including this one) are already on the
		 * pmap's free list.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * The whole VM page is now free: unlink every entry that
			 * lives on this page from the free list so the page can be
			 * returned to the VM below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Page not fully free yet; just push this table onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables on the page still hold references; recycle just this one. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Tear down the descriptor, clear the PV head, and return the page to the VM. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3847
3848 /**
3849 * Safely clear out a translation table entry.
3850 *
3851 * @note If the TTE to clear out points to a leaf table, then that leaf table
3852 * must have a refcnt of zero before the TTE can be removed.
3853 * @note This function expects to be called with pmap locked exclusive, and will
3854 * return with pmap unlocked.
3855 *
3856 * @param pmap The pmap containing the page table whose TTE is being removed.
3857 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3858 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3859 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3860 * @param ttep Pointer to the TTE that should be cleared out.
3861 * @param level The level of the page table that contains the TTE to be removed.
3862 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and ensure the write is visible before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU still to be confirmed idle w.r.t. pmap_disconnect(). */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* WFE pairs with the exclusive-monitor load above; wake on the flag's store. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* This CPU was observed outside pmap_disconnect(); stop polling it. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
4037
4038 /**
4039 * Given a pointer to an entry within a `level` page table, delete the
4040 * page table at `level` + 1 that is represented by that entry. For instance,
4041 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
4042 * contains the PA of the L3 table, and `level` would be "2".
4043 *
4044 * @note If the table getting deallocated is a leaf table, then that leaf table
4045 * must have a refcnt of zero before getting deallocated. All other levels
4046 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
4047 * @note This function expects to be called with pmap locked exclusive and will
4048 * return with pmap unlocked.
4049 *
4050 * @param pmap The pmap that owns the page table to be deallocated.
4051 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
4052 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
4053 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
4054 * @param ttep Pointer to the `level` TTE to remove.
4055 * @param level The level of the table that contains an entry pointing to the
4056 * table to be removed. The deallocated page table will be a
4057 * `level` + 1 table (so if `level` is 2, then an L3 table will be
4058 * deleted).
4059 */
4060 void
4061 pmap_tte_deallocate(
4062 pmap_t pmap,
4063 vm_offset_t va_start,
4064 vm_offset_t va_end,
4065 bool need_strong_sync,
4066 tt_entry_t *ttep,
4067 unsigned int level)
4068 {
4069 tt_entry_t tte;
4070
4071 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4072
4073 tte = *ttep;
4074
4075 if (tte_get_ptd(tte)->pmap != pmap) {
4076 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4077 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4078 }
4079
4080 assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
4081 (unsigned long long)tte);
4082
4083 /* pmap_tte_remove() will drop the pmap lock */
4084 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4085
4086 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4087 }
4088
4089 /*
4090 * Remove a range of hardware page-table entries.
4091 * The entries given are the first (inclusive)
4092 * and last (exclusive) entries for the VM pages.
4093 * The virtual address is the va for the first pte.
4094 *
4095 * The pmap must be locked.
4096 * If the pmap is not the kernel pmap, the range must lie
4097 * entirely within one pte-page. This is NOT checked.
4098 * Assumes that the pte-page exists.
4099 *
4100 * Returns the number of PTE changed
4101 */
4102 MARK_AS_PMAP_TEXT static int
4103 pmap_remove_range(
4104 pmap_t pmap,
4105 vm_map_address_t va,
4106 pt_entry_t *bpte,
4107 pt_entry_t *epte)
4108 {
4109 bool need_strong_sync = false;
4110 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4111 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4112 if (num_changed > 0) {
4113 PMAP_UPDATE_TLBS(pmap, va,
4114 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4115 }
4116 return num_changed;
4117 }
4118
4119
4120 #ifdef PVH_FLAG_EXEC
4121
4122 /*
4123 * Update the access protection bits of the physical aperture mapping for a page.
 * This is useful, for example, in guaranteeing that a verified executable page
4125 * has no writable mappings anywhere in the system, including the physical
4126 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4127 * synchronization overhead in cases where the call to this function is
4128 * guaranteed to be followed by other TLB operations.
4129 */
4130 void
4131 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4132 {
4133 #if __ARM_PTE_PHYSMAP__
4134 pvh_assert_locked(pai);
4135 vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4136 pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4137
4138 pt_entry_t tmplate = *pte_p;
4139 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4140 return;
4141 }
4142 tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4143 if (tmplate & ARM_PTE_HINT_MASK) {
4144 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4145 __func__, pte_p, (void *)kva, tmplate);
4146 }
4147 write_pte_strong(pte_p, tmplate);
4148 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
4149 if (!flush_tlb_async) {
4150 sync_tlb_flush();
4151 }
4152 #endif
4153 }
4154 #endif /* defined(PVH_FLAG_EXEC) */
4155
4156
4157
/**
 * Remove the mappings held in the PTE range [bpte, epte), which must lie
 * entirely within one page table, updating PV lists, refcounts, and ledgers.
 * TLB invalidation is left to the caller.
 *
 * @param pmap The pmap whose mappings are being removed.
 * @param va VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE to remove.
 * @param eva If non-NULL, the loop may stop early when preemption is pending;
 *            the VA it stopped at is stored here.
 * @param need_strong_sync Output: set to true when TLB maintenance for this
 *                         range requires a strong DSB (XS mappings).
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_REMOVE enables clearing
 *                of compressed markers.
 *
 * @return The number of PTEs whose contents were changed.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	/* Accumulated (negative) delta to apply to the page table's refcount at the end. */
	int16_t refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until a stable PTE snapshot is taken with the matching PVH
		 * lock held (or the PTE is determined to be unmanaged/compressed).
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_EMPTY);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read under the PVH lock and confirm the PTE still targets the same page. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_EMPTY) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf(pte_is_valid(*cpte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_EMPTY);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_EMPTY) && (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf(pte_is_valid(spte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta; going to/below zero here is a bug. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4405
4406
4407 /*
4408 * Remove the given range of addresses
4409 * from the specified map.
4410 *
4411 * It is assumed that the start and end are properly
4412 * rounded to the hardware page size.
4413 */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* Equivalent to pmap_remove_options() with the default removal options. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4422
/**
 * Remove the mappings in [start, end) from the pmap.  The range must be
 * leaf-page aligned and, given how callers batch requests, is expected to
 * lie within a single twig-level (last-level-table) region.
 *
 * @param pmap The pmap from which to remove mappings.
 * @param start Leaf-aligned start VA of the range.
 * @param end Leaf-aligned, non-inclusive end VA of the range.
 * @param options PMAP_OPTIONS_* removal options.
 *
 * @return The VA actually reached: `end` normally, or an earlier VA if the
 *         removal stopped early due to pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if (tte_p == (tt_entry_t *) NULL) {
		/* No twig-level table covers this range; nothing to remove. */
		goto done;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty, tear it out of the hierarchy. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4489
/**
 * Remove the mappings in [start, end) from the pmap, batching the work into
 * chunks (bounded by twig-table boundaries) so each chunk runs in one
 * (possibly PPL-entered) non-preemptible call.
 *
 * @param pmap The pmap from which to remove mappings; PMAP_NULL is a no-op.
 * @param start Start VA of the range (page-rounded by the caller).
 * @param end Non-inclusive end VA of the range.
 * @param options PMAP_OPTIONS_* removal options.
 */
__mockable void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance at most to the next twig-table boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* The internal routine may return early on preemption; resume from where it stopped. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4556
4557
4558 /*
4559 * Remove phys addr if mapped in specified map
4560 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentionally a no-op stub. Implement to support working set code. */
}
4568
4569 /*
4570 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4571 * switch a thread onto a new vm_map.
4572 */
4573 void
4574 pmap_switch_user(thread_t thread, vm_map_t new_map)
4575 {
4576 pmap_t new_pmap = new_map->pmap;
4577
4578
4579 thread->map = new_map;
4580 pmap_set_pmap(new_pmap, thread);
4581
4582 }
4583
/*
 * Activate the given pmap on the given thread: switch the hardware
 * translation context and, where user-protection is enabled, cache the
 * user TTBR/ASID on the thread's machine state.
 */
void
pmap_set_pmap(
	pmap_t pmap,
	thread_t thread)
{
	pmap_switch(pmap, thread);
#if __ARM_USER_PROTECT__
	/* Cache the user TTBR and ASID for fast restoration on context switch. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4595
/*
 * Issue an asynchronous local-core TLB invalidation for all entries
 * tagged with this pmap's hardware ASID.  The caller is responsible
 * for the subsequent synchronization (e.g. sync_tlb_flush_local()).
 */
static void
pmap_flush_core_tlb_asid_async(pmap_t pmap)
{
	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
}
4601
#if HAS_SPECRES
/*
 * Restrict control-flow prediction (CFP RCTX) for EL0 contexts tagged
 * with this pmap's hardware ASID.  Asynchronous; the caller must issue
 * the appropriate synchronization barrier afterwards.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
}

#if REQUIRES_DVP_RCTX
/*
 * Restrict data-value prediction (DVP RCTX) for EL0 contexts tagged
 * with this pmap's hardware ASID, on cores that require DVP maintenance
 * in addition to CFP.  Asynchronous, like the CFP variant above.
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4619
4620 static inline bool
4621 pmap_user_ttb_is_clear(void)
4622 {
4623 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4624 }
4625
/*
 * Per-CPU core of pmap_switch(): activate the given pmap's address space
 * on the current CPU.  Determines which maintenance operations are needed
 * (ASID TLB flush, shared-region flush, commpage flush, CFP/DVP
 * speculation restriction), performs a break-before-make by clearing the
 * user TTB when required, issues the asynchronous maintenance, waits for
 * it to complete, and finally installs the new user TTB.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* Only the kernel pmap may legitimately carry hardware ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* Switching to a different nested (shared-region) pmap requires flushing its global entries. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing. We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides may force speculation-restriction maintenance on or off. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (e.g. different page-size geometry) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes. Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Break-before-make: park the user TTB on the invalid table before maintenance. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform a CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region. Those mappings are global
	 * and will not be protected by the ASID. It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
		 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false; /* the full local flush below subsumes the ASID flush */
			flush_core_tlb_async();
		}
#else
		do_asid_flush = false; /* the full local flush below subsumes the ASID flush */
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		/* Range-invalidate the outgoing commpage nesting region at its own page size. */
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Wait for all asynchronous maintenance issued above before installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4803
/*
 * Activate the given pmap's address space on the current CPU,
 * dispatching into the PPL on XNU_MONITOR configurations.
 * The thread argument is currently unused on this path.
 */
void
pmap_switch(
	pmap_t pmap,
	thread_t thread __unused)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4817
/*
 * Lower the permission for all mappings to a given page.
 * Convenience wrapper around pmap_page_protect_options() with no
 * options and no flush-hint argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4825
4826 /*
4827 * Routine: pmap_page_protect_options
4828 *
4829 * Function:
4830 * Lower the permission for all mappings to a given
4831 * page.
4832 */
4833 MARK_AS_PMAP_TEXT static void
4834 pmap_page_protect_options_with_flush_range(
4835 ppnum_t ppnum,
4836 vm_prot_t prot,
4837 unsigned int options,
4838 pmap_tlb_flush_range_t *flush_range)
4839 {
4840 pmap_paddr_t phys = ptoa(ppnum);
4841 pv_entry_t **pv_h;
4842 pv_entry_t *pve_p, *orig_pve_p;
4843 pv_entry_t *pveh_p;
4844 pv_entry_t *pvet_p;
4845 pt_entry_t *pte_p, *orig_pte_p;
4846 pv_entry_t *new_pve_p;
4847 pt_entry_t *new_pte_p;
4848 vm_offset_t pvh_flags;
4849 unsigned int pai;
4850 bool remove;
4851 bool set_NX;
4852 unsigned int pvh_cnt = 0;
4853 unsigned int pass1_updated = 0;
4854 unsigned int pass2_updated = 0;
4855
4856 assert(ppnum != vm_page_fictitious_addr);
4857
4858 /* Only work with managed pages. */
4859 if (!pa_valid(phys)) {
4860 return;
4861 }
4862
4863 /*
4864 * Determine the new protection.
4865 */
4866 switch (prot) {
4867 case VM_PROT_ALL:
4868 return; /* nothing to do */
4869 case VM_PROT_READ:
4870 case VM_PROT_READ | VM_PROT_EXECUTE:
4871 remove = false;
4872 break;
4873 default:
4874 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4875 options = options & ~PMAP_OPTIONS_NOFLUSH;
4876 remove = true;
4877 break;
4878 }
4879
4880 pmap_cpu_data_t *pmap_cpu_data = NULL;
4881 if (remove) {
4882 #if !XNU_MONITOR
4883 mp_disable_preemption();
4884 #endif
4885 pmap_cpu_data = pmap_get_cpu_data();
4886 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4887 /*
4888 * Ensure the store to inflight_disconnect will be observed before any of the
4889 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4890 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4891 * another CPU, in between this function's clearing a PTE and dropping the
4892 * corresponding pagetable refcount. That can lead to a panic if the
4893 * destroying thread observes a non-zero refcount. For this we need a store-
4894 * store barrier; a store-release operation would not be sufficient.
4895 */
4896 os_atomic_thread_fence(release);
4897 }
4898
4899 pai = pa_index(phys);
4900 pvh_lock(pai);
4901 pv_h = pai_to_pvh(pai);
4902 pvh_flags = pvh_get_flags(pv_h);
4903
4904 #if XNU_MONITOR
4905 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4906 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4907 }
4908 if (__improbable(ppattr_pa_test_monitor(phys))) {
4909 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4910 }
4911 if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4912 panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4913 }
4914 #endif
4915
4916
4917 orig_pte_p = pte_p = PT_ENTRY_NULL;
4918 orig_pve_p = pve_p = PV_ENTRY_NULL;
4919 pveh_p = PV_ENTRY_NULL;
4920 pvet_p = PV_ENTRY_NULL;
4921 new_pve_p = PV_ENTRY_NULL;
4922 new_pte_p = PT_ENTRY_NULL;
4923
4924
4925 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4926 orig_pte_p = pte_p = pvh_ptep(pv_h);
4927 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4928 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4929 pveh_p = pve_p;
4930 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4931 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4932 }
4933
4934 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4935 int pve_ptep_idx = 0;
4936
4937 /*
4938 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4939 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4940 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4941 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4942 * operation, TLB invalidation may be handled by the caller so it's possible for
4943 * tlb_flush_needed to be true while issue_tlbi is false.
4944 */
4945 bool issue_tlbi = false;
4946 bool tlb_flush_needed = false;
4947 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4948 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4949 pt_entry_t tmplate = ARM_PTE_EMPTY;
4950 bool update = false;
4951
4952 if (pve_p != PV_ENTRY_NULL) {
4953 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4954 if (pte_p == PT_ENTRY_NULL) {
4955 goto protect_skip_pve_pass1;
4956 }
4957 }
4958
4959 #ifdef PVH_FLAG_IOMMU
4960 if (pvh_ptep_is_iommu(pte_p)) {
4961 #if XNU_MONITOR
4962 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4963 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4964 ppnum, ptep_get_iommu(pte_p), pve_p);
4965 }
4966 #endif
4967 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4968 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4969 ppnum, ptep_get_iommu(pte_p), pve_p);
4970 }
4971 goto protect_skip_pve_pass1;
4972 }
4973 #endif
4974 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4975 const pmap_t pmap = ptdp->pmap;
4976 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4977
4978 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4979 #if MACH_ASSERT
4980 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4981 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4982 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4983 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4984
4985 pv_entry_t *check_pvep = pve_p;
4986
4987 do {
4988 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4989 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4990 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4991 }
4992 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4993
4994 /* Restore previous PTEP value. */
4995 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4996 }
4997 #endif
4998 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4999 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5000 }
5001
5002 #if DEVELOPMENT || DEBUG
5003 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5004 #else
5005 if ((prot & VM_PROT_EXECUTE))
5006 #endif
5007 {
5008 set_NX = false;
5009 } else {
5010 set_NX = true;
5011 }
5012
5013 #if HAS_FEAT_XS
5014 /**
5015 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
5016 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
5017 */
5018 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
5019 #endif /* HAS_FEAT_XS */
5020
5021 /* Remove the mapping if new protection is NONE */
5022 if (remove) {
5023 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
5024 panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
5025 __func__, pmap, ppnum);
5026 }
5027
5028 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5029 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5030 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5031 pt_entry_t spte = *pte_p;
5032
5033 if (pte_is_wired(spte)) {
5034 pte_set_wired(pmap, pte_p, 0);
5035 spte = *pte_p;
5036 if (pmap != kernel_pmap) {
5037 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5038 }
5039 }
5040
5041 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
5042 (uint64_t)spte, pte_p, ppnum);
5043
5044 if (compress && is_internal && (pmap != kernel_pmap)) {
5045 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
5046 /* mark this PTE as having been "compressed" */
5047 tmplate = ARM_PTE_COMPRESSED;
5048 if (is_altacct) {
5049 tmplate |= ARM_PTE_COMPRESSED_ALT;
5050 }
5051 } else {
5052 tmplate = ARM_PTE_EMPTY;
5053 }
5054
5055 assert(spte != tmplate);
5056 write_pte_fast(pte_p, tmplate);
5057 update = true;
5058 ++pass1_updated;
5059
5060 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5061
5062 if (pmap != kernel_pmap) {
5063 if (ppattr_test_reusable(pai) &&
5064 is_internal &&
5065 !is_altacct) {
5066 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5067 } else if (!is_internal) {
5068 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5069 }
5070
5071 if (is_altacct) {
5072 assert(is_internal);
5073 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5074 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5075 if (options & PMAP_OPTIONS_COMPRESSOR) {
5076 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5077 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5078 }
5079 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5080 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5081 } else if (ppattr_test_reusable(pai)) {
5082 assert(is_internal);
5083 if (options & PMAP_OPTIONS_COMPRESSOR) {
5084 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5085 /* was not in footprint, but is now */
5086 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5087 }
5088 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5089 } else if (is_internal) {
5090 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5091
5092 /*
5093 * Update all stats related to physical footprint, which only
5094 * deals with internal pages.
5095 */
5096 if (options & PMAP_OPTIONS_COMPRESSOR) {
5097 /*
5098 * This removal is only being done so we can send this page to
5099 * the compressor; therefore it mustn't affect total task footprint.
5100 */
5101 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5102 } else {
5103 /*
5104 * This internal page isn't going to the compressor, so adjust stats to keep
5105 * phys_footprint up to date.
5106 */
5107 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5108 }
5109 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5110 } else {
5111 /* external page: no impact on ledgers */
5112 }
5113 }
5114 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5115 } else {
5116 pt_entry_t spte = *pte_p;
5117 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5118
5119 if (pmap == kernel_pmap) {
5120 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5121 } else {
5122 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5123 }
5124
5125 /*
5126 * While the naive implementation of this would serve to add execute
5127 * permission, this is not how the VM uses this interface, or how
5128 * x86_64 implements it. So ignore requests to add execute permissions.
5129 */
5130 if (set_NX) {
5131 tmplate |= pt_attr_leaf_xn(pt_attr);
5132 }
5133
5134
5135 assert(spte != ARM_PTE_EMPTY);
5136 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5137
5138 if (spte != tmplate) {
5139 /*
5140 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5141 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5142 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
5143 * should always be cleared by this function.
5144 */
5145 pte_set_was_writeable(tmplate, true);
5146 write_pte_fast(pte_p, tmplate);
5147 update = true;
5148 ++pass1_updated;
5149 } else if (pte_was_writeable(tmplate)) {
5150 /*
5151 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5152 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
5153 * write access to a page, this function should always at least clear that flag for
5154 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5155 * these mappings go through vm_fault(). We therefore don't want those accesses to
5156 * be handled through arm_fast_fault().
5157 */
5158 pte_set_was_writeable(tmplate, false);
5159 write_pte_fast(pte_p, tmplate);
5160 }
5161 }
5162
5163 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5164 tlb_flush_needed = true;
5165 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5166 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5167 issue_tlbi = true;
5168 }
5169 }
5170 protect_skip_pve_pass1:
5171 pte_p = PT_ENTRY_NULL;
5172 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5173 pve_ptep_idx = 0;
5174 pve_p = pve_next(pve_p);
5175 }
5176 }
5177
5178 if (tlb_flush_needed) {
5179 FLUSH_PTE_STRONG();
5180 }
5181
5182 if (!remove && !issue_tlbi) {
5183 goto protect_finish;
5184 }
5185
5186 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5187 pv_entry_t **pve_pp = pv_h;
5188 pve_p = orig_pve_p;
5189 pte_p = orig_pte_p;
5190 pve_ptep_idx = 0;
5191
5192 /*
5193 * We need to keep track of whether a particular PVE list contains IOMMU
5194 * mappings when removing entries, because we should only remove CPU
5195 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5196 * it around.
5197 */
5198 bool iommu_mapping_in_pve = false;
5199 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5200 if (pve_p != PV_ENTRY_NULL) {
5201 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5202 if (pte_p == PT_ENTRY_NULL) {
5203 goto protect_skip_pve_pass2;
5204 }
5205 }
5206
5207 #ifdef PVH_FLAG_IOMMU
5208 if (pvh_ptep_is_iommu(pte_p)) {
5209 iommu_mapping_in_pve = true;
5210 if (remove && (pve_p == PV_ENTRY_NULL)) {
5211 /*
5212 * We've found an IOMMU entry and it's the only entry in the PV list.
5213 * We don't discard IOMMU entries, so simply set up the new PV list to
5214 * contain the single IOMMU PTE and exit the loop.
5215 */
5216 new_pte_p = pte_p;
5217 break;
5218 }
5219 goto protect_skip_pve_pass2;
5220 }
5221 #endif
5222 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5223 const pmap_t pmap = ptdp->pmap;
5224 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5225
5226 if (remove) {
5227 if (!compress && (pmap != kernel_pmap)) {
5228 /*
5229 * We must wait to decrement the refcount until we're completely finished using the PTE
5230 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5231 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5232 * under us.
5233 */
5234 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5235 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5236 }
5237 }
5238 /* Remove this CPU mapping from PVE list. */
5239 if (pve_p != PV_ENTRY_NULL) {
5240 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5241 }
5242 } else {
5243 pt_entry_t spte = *pte_p;
5244 if (pte_was_writeable(spte)) {
5245 pte_set_was_writeable(spte, false);
5246 write_pte_fast(pte_p, spte);
5247 } else {
5248 goto protect_skip_pve_pass2;
5249 }
5250 }
5251 ++pass2_updated;
5252 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5253 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5254 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5255 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5256 }
5257
5258 protect_skip_pve_pass2:
5259 pte_p = PT_ENTRY_NULL;
5260 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5261 pve_ptep_idx = 0;
5262
5263 if (remove) {
5264 /**
5265 * If there are any IOMMU mappings in the PVE list, preserve
5266 * those mappings in a new PVE list (new_pve_p) which will later
5267 * become the new PVH entry. Keep track of the CPU mappings in
5268 * pveh_p/pvet_p so they can be deallocated later.
5269 */
5270 if (iommu_mapping_in_pve) {
5271 iommu_mapping_in_pve = false;
5272 pv_entry_t *temp_pve_p = pve_next(pve_p);
5273 pve_remove(pv_h, pve_pp, pve_p);
5274 pveh_p = pvh_pve_list(pv_h);
5275 pve_p->pve_next = new_pve_p;
5276 new_pve_p = pve_p;
5277 pve_p = temp_pve_p;
5278 continue;
5279 } else {
5280 pvet_p = pve_p;
5281 pvh_cnt++;
5282 }
5283 }
5284
5285 pve_pp = pve_next_ptr(pve_p);
5286 pve_p = pve_next(pve_p);
5287 iommu_mapping_in_pve = false;
5288 }
5289 }
5290
5291 protect_finish:
5292
5293 #ifdef PVH_FLAG_EXEC
5294 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5295 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5296 }
5297 #endif
5298 if (__improbable(pass1_updated != pass2_updated)) {
5299 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5300 __func__, pass1_updated, pass2_updated);
5301 }
5302 /* if we removed a bunch of entries, take care of them now */
5303 if (remove) {
5304 if (new_pve_p != PV_ENTRY_NULL) {
5305 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5306 pvh_set_flags(pv_h, pvh_flags);
5307 } else if (new_pte_p != PT_ENTRY_NULL) {
5308 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5309 pvh_set_flags(pv_h, pvh_flags);
5310 } else {
5311 if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5312 pmap_flush_noncoherent_page(phys);
5313 }
5314 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5315 }
5316 }
5317
5318 if (flush_range && tlb_flush_needed) {
5319 if (!remove) {
5320 flush_range->ptfr_flush_needed = true;
5321 tlb_flush_needed = false;
5322 }
5323 }
5324
5325 /*
5326 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5327 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5328 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5329 * a page to be repurposed while it is still live in the TLBs.
5330 */
5331 if (remove && tlb_flush_needed) {
5332 sync_tlb_flush();
5333 }
5334
5335
5336 pvh_unlock(pai);
5337
5338 if (remove) {
5339 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5340 #if !XNU_MONITOR
5341 mp_enable_preemption();
5342 #endif
5343 }
5344
5345 if (!remove && tlb_flush_needed) {
5346 sync_tlb_flush();
5347 }
5348
5349 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5350 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5351 }
5352 }
5353
/*
 * Internal implementation of pmap_page_protect_options() (entered via the
 * PPL on XNU_MONITOR configurations).  arg is the VM layer's TLB flush
 * hint: when non-NULL, the flush is forced to happen inline here.
 */
MARK_AS_PMAP_TEXT void
pmap_page_protect_options_internal(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	if (arg != NULL) {
		/*
		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
		 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
		 * In that case, force the flush to take place.
		 */
		options &= ~PMAP_OPTIONS_NOFLUSH;
	}
	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
}
5373
/*
 * Lower the permission for all mappings to a given page.
 * The cheap early-out checks (unmanaged page, VM_PROT_ALL) are done here,
 * before dispatching to the internal/PPL implementation.
 */
void
pmap_page_protect_options(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pmap_paddr_t phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	if (prot == VM_PROT_ALL) {
		return;         /* nothing to do */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);

#if XNU_MONITOR
	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
#else
	pmap_page_protect_options_internal(ppnum, prot, options, arg);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
}
5407
5408
5409 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Mark the given user pmap as having user JOP (pointer authentication)
 * disabled.  Must not be called on the kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}

/*
 * External entry point for disabling user JOP on a pmap; dispatches to
 * the PPL on XNU_MONITOR configurations.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5429 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5430
5431 /*
5432 * Indicates if the pmap layer enforces some additional restrictions on the
5433 * given set of protections.
5434 */
5435 bool
5436 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5437 {
5438 return false;
5439 }
5440
5441 /*
5442 * Set the physical protection on the
5443 * specified range of this map as requested.
5444 * VERY IMPORTANT: Will not increase permissions.
5445 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5446 */
5447 void
5448 pmap_protect(
5449 pmap_t pmap,
5450 vm_map_address_t b,
5451 vm_map_address_t e,
5452 vm_prot_t prot)
5453 {
5454 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5455 }
5456
/*
 * Per-twig worker for pmap_protect_options(): reduce the protection on the
 * leaf mappings covering [start, end) of the given pmap.  The range must be
 * contained within a single twig (next-to-leaf) table entry.
 *
 * Returns the VA just past the last PTE actually processed.  This may be
 * less than `end` if the loop bailed out early because preemption was
 * pending; the caller is expected to call back in with the returned VA.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The caller must hand us a range that does not cross a twig boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end; /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Revoking all access is a remove, not a protect; the caller filters this. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* Decide whether the demoted mappings will be non-executable. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && tte_is_valid_table(*tte_p)) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Periodically bail out (returning va) when preemption is pending. */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Nothing to demote in an empty or compressed slot. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/*
			 * If the page is managed, take its PVH lock, re-reading the PTE
			 * under the lock to confirm it still refers to the same page.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have been emptied/compressed while we took the lock. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			/* Build the demoted PTE: read-only unless an immediate write was requested. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						/* Execute-only: strip user read permission as well. */
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			/* XS-attributed mappings require a stronger TLB synchronization. */
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish all PTE updates, then invalidate the TLB over [start, va). */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No valid leaf table here: nothing is mapped, so the range is done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5702
5703 void
5704 pmap_protect_options(
5705 pmap_t pmap,
5706 vm_map_address_t b,
5707 vm_map_address_t e,
5708 vm_prot_t prot,
5709 unsigned int options,
5710 __unused void *args)
5711 {
5712 vm_map_address_t l, beg;
5713
5714 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5715
5716 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5717 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5718 pmap, (uint64_t)b, (uint64_t)e);
5719 }
5720
5721 /*
5722 * We allow single-page requests to execute non-preemptibly,
5723 * as it doesn't make sense to sample AST_URGENT for a single-page
5724 * operation, and there are a couple of special use cases that
5725 * require a non-preemptible single-page operation.
5726 */
5727 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5728 pmap_verify_preemptible();
5729 }
5730
5731 #if DEVELOPMENT || DEBUG
5732 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5733 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5734 pmap_remove_options(pmap, b, e, options);
5735 return;
5736 }
5737 } else
5738 #endif
5739 {
5740 /* Determine the new protection. */
5741 switch (prot) {
5742 case VM_PROT_EXECUTE:
5743 case VM_PROT_READ:
5744 case VM_PROT_READ | VM_PROT_EXECUTE:
5745 break;
5746 case VM_PROT_READ | VM_PROT_WRITE:
5747 case VM_PROT_ALL:
5748 return; /* nothing to do */
5749 default:
5750 pmap_remove_options(pmap, b, e, options);
5751 return;
5752 }
5753 }
5754
5755 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5756 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5757 VM_KERNEL_ADDRHIDE(e));
5758
5759 beg = b;
5760
5761 while (beg < e) {
5762 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5763
5764 if (l > e) {
5765 l = e;
5766 }
5767
5768 #if XNU_MONITOR
5769 beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5770 #else
5771 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5772 #endif
5773 }
5774
5775 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5776 }
5777
5778 /**
5779 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5780 *
5781 * @param pmap pmap to insert the pages into.
5782 * @param va virtual address to map the pages into.
5783 * @param pa page number of the first physical page to map.
5784 * @param size block size, in number of pages.
5785 * @param prot mapping protection attributes.
5786 * @param attr flags to pass to pmap_enter().
5787 *
5788 * @return KERN_SUCCESS.
5789 */
5790 kern_return_t
5791 pmap_map_block(
5792 pmap_t pmap,
5793 addr64_t va,
5794 ppnum_t pa,
5795 uint32_t size,
5796 vm_prot_t prot,
5797 int attr,
5798 unsigned int flags)
5799 {
5800 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5801 }
5802
5803 /**
5804 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5805 * As opposed to pmap_map_block(), this function takes
5806 * a physical address as an input and operates using the
5807 * page size associated with the input pmap.
5808 *
5809 * @param pmap pmap to insert the pages into.
5810 * @param va virtual address to map the pages into.
5811 * @param pa physical address of the first physical page to map.
5812 * @param size block size, in number of pages.
5813 * @param prot mapping protection attributes.
5814 * @param attr flags to pass to pmap_enter().
5815 *
5816 * @return KERN_SUCCESS.
5817 */
5818 kern_return_t
5819 pmap_map_block_addr(
5820 pmap_t pmap,
5821 addr64_t va,
5822 pmap_paddr_t pa,
5823 uint32_t size,
5824 vm_prot_t prot,
5825 int attr,
5826 unsigned int flags)
5827 {
5828 #if __ARM_MIXED_PAGE_SIZE__
5829 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5830 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5831 #else
5832 const uint64_t pmap_page_size = PAGE_SIZE;
5833 #endif
5834
5835 for (ppnum_t page = 0; page < size; page++) {
5836 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5837 panic("%s: failed pmap_enter_addr, "
5838 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5839 __FUNCTION__,
5840 pmap, va, (uint64_t)pa, size, prot, flags);
5841 }
5842
5843 va += pmap_page_size;
5844 pa += pmap_page_size;
5845 }
5846
5847 return KERN_SUCCESS;
5848 }
5849
5850 kern_return_t
5851 pmap_enter_addr(
5852 pmap_t pmap,
5853 vm_map_address_t v,
5854 pmap_paddr_t pa,
5855 vm_prot_t prot,
5856 vm_prot_t fault_type,
5857 unsigned int flags,
5858 boolean_t wired)
5859 {
5860 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5861 }
5862
5863 /*
5864 * Insert the given physical page (p) at
5865 * the specified virtual address (v) in the
5866 * target physical map with the protection requested.
5867 *
5868 * If specified, the page will be wired down, meaning
5869 * that the related pte can not be reclaimed.
5870 *
5871 * NB: This is the only routine which MAY NOT lazy-evaluate
5872 * or lose information. That is, this routine must actually
5873 * insert this page into the given map eventually (must make
5874 * forward progress eventually.
5875 */
5876 kern_return_t
5877 pmap_enter(
5878 pmap_t pmap,
5879 vm_map_address_t v,
5880 ppnum_t pn,
5881 vm_prot_t prot,
5882 vm_prot_t fault_type,
5883 unsigned int flags,
5884 boolean_t wired,
5885 __unused pmap_mapping_type_t mapping_type)
5886 {
5887 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5888 }
5889
/*
 * Attempt to commit the pte.
 * Succeeds iff able to change *pte_p from old_pte to new_pte.
 * Performs no page table or accounting writes on failures.
 *
 * On success, performs the required PTE visibility/TLB maintenance and
 * updates the page-table wired count and wired-memory ledger when the
 * mapping's wired state changed.  On return, *old_pte holds the PTE value
 * that was observed prior to the attempted update.
 */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    pte_is_valid(*old_pte)) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/* Single-PTE case: commit atomically with compare-and-swap. */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	if (success && *old_pte != new_pte) {
		if (pte_is_valid(*old_pte)) {
			/* We replaced a live mapping: flush the stale TLB entry for this VA. */
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			/* XS-attributed mappings require a stronger TLB synchronization. */
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			/* No prior valid mapping: just make the new PTE visible to the walker. */
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/*
		 * A compressed marker carries no meaningful wired bit, so in that case
		 * wiring "changed" iff the new PTE is wired; otherwise compare the
		 * wired bits of the old and new PTEs.
		 */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		/* Kernel mappings are not tracked in the per-PTD wired count or ledger. */
		if (pmap != kernel_pmap && changed_wiring) {
			SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5960
/*
 * Translate a VM_WIMG_* cacheability/ordering code into PTE attribute bits
 * (memory attribute index, shareability, and NX/PNX for device-type memory).
 *
 * @param wimg  WIMG flags; only the bits covered by VM_WIMG_MASK are examined.
 * @param pa    Physical address of the mapping; used to distinguish DRAM from
 *              non-DRAM (device/IO) addresses when selecting the attribute index.
 *
 * @return The attribute bits to OR into the PTE under construction.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			// I/O ranges flagged for strong sync get the XS variant of the
			// cache-disabled attribute index.
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		// Device-type mappings are never executable at any EL.
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		} else {
			// RT on a non-DRAM address falls back to a posted/reordered index.
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			// Count these fallbacks for diagnostics on debug kernels.
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		// Non-DRAM addresses use the XS variant of this index.
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
			// Write-combined on a non-DRAM address falls back to posted/reordered.
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Unrecognized WIMG codes get the default (cacheable) attribute.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
6060
6061
/*
 * Construct a PTE (and the physical page attributes) for the given virtual to
 * physical mapping.
 *
 * This function has no side effects and is safe to call so that it is safe to
 * call while attempting a pmap_enter transaction.
 *
 * @param pmap          Target pmap; selects kernel vs. user permission encodings
 *                      and nested-region handling.
 * @param va            Virtual address being mapped; only used for the
 *                      nested-region unnest check.
 * @param pa            Physical address to map.
 * @param prot          Requested protection for the mapping.
 * @param fault_type    Access type driving this entry; a write fault on an
 *                      unmodified managed page makes the mapping writable
 *                      immediately, otherwise writable mappings may be entered
 *                      read-only to catch the first write via arm_fast_fault().
 * @param wired         Whether the mapping should be marked wired.
 * @param pt_attr       Page table attribute block for the pmap.
 * @param pp_attr_bits  [OUT] PP_ATTR_* bits the caller should set on the
 *                      physical page once the PTE is committed.
 *
 * @return The constructed PTE value (valid, with AF set).
 */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	uint16_t *pp_attr_bits /* OUTPUT */
	)
{
	bool set_NX = false, set_XO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/* Decide executability; DEVELOPMENT/DEBUG kernels honor the nx_enabled overrides. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only request: remember to drop user read access below. */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable mappings must still be user-NX. */
			pte |= ARM_PTE_NX;
		} else {
			/* User-executable mappings must still be kernel-NX. */
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		if (pmap->type != PMAP_TYPE_NESTED) {
			/* Regular user mappings are non-global (per-ASID). */
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_unnested_table_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			/*
			 * Nested (shared-region) pmap: a mapping in a twig that has been
			 * unnested must be non-global, since that twig is now per-task.
			 */
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));

			if ((pmap->nested_region_unnested_table_bitmap)
			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					/* Actively being written: enter writable and mark modified now. */
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				/* Unmanaged or already-modified page: enter writable directly. */
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (set_XO) {
				/* Execute-only: no user read access. */
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Pre-set the Access Flag so the first access does not fault. */
	pte |= ARM_PTE_AF;
	return pte;
}
6171
6172 MARK_AS_PMAP_TEXT kern_return_t
6173 pmap_enter_options_internal(
6174 pmap_t pmap,
6175 vm_map_address_t v,
6176 pmap_paddr_t pa,
6177 vm_prot_t prot,
6178 vm_prot_t fault_type,
6179 unsigned int flags,
6180 boolean_t wired,
6181 unsigned int options)
6182 {
6183 ppnum_t pn = (ppnum_t)atop(pa);
6184 pt_entry_t pte;
6185 pt_entry_t spte;
6186 pt_entry_t *pte_p;
6187 bool refcnt_updated;
6188 bool wiredcnt_updated;
6189 bool ro_va = false;
6190 unsigned int wimg_bits;
6191 bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6192 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6193 kern_return_t kr = KERN_SUCCESS;
6194 uint16_t pp_attr_bits;
6195 volatile uint16_t *refcnt;
6196 volatile uint16_t *wiredcnt;
6197 pv_free_list_t *local_pv_free;
6198
6199 validate_pmap_mutable(pmap);
6200
6201 #if XNU_MONITOR
6202 if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6203 panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6204 }
6205 #endif
6206
6207 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6208
6209 if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6210 panic("%s: pmap %p v 0x%llx not page-aligned",
6211 __func__, pmap, (unsigned long long)v);
6212 }
6213
6214 if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6215 panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6216 }
6217
6218 /* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6219 if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6220 panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6221 pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6222 }
6223
6224 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6225 panic("pmap_enter_options() pmap %p pa 0x%llx",
6226 pmap, (uint64_t)pa);
6227 }
6228
6229 /* The PA should not extend beyond the architected physical address space */
6230 pa &= ARM_PTE_PAGE_MASK;
6231
6232 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6233 #if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
6234 extern vm_offset_t ctrr_test_page;
6235 if (__probable(v != ctrr_test_page))
6236 #endif
6237 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6238 }
6239 if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6240 if (__improbable(prot != VM_PROT_READ)) {
6241 panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6242 __func__, (unsigned long long)v, prot);
6243 }
6244 ro_va = true;
6245 }
6246 assert(pn != vm_page_fictitious_addr);
6247
6248 refcnt_updated = false;
6249 wiredcnt_updated = false;
6250
6251 if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6252 /*
6253 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6254 *
6255 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6256 */
6257 lock_mode = PMAP_LOCK_EXCLUSIVE;
6258 }
6259
6260 if (!pmap_lock_preempt(pmap, lock_mode)) {
6261 return KERN_ABORTED;
6262 }
6263
6264 /*
6265 * Expand pmap to include this pte. Assume that
6266 * pmap is always expanded to include enough hardware
6267 * pages to map one VM page.
6268 */
6269 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6270 /* Must unlock to expand the pmap. */
6271 pmap_unlock(pmap, lock_mode);
6272
6273 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6274
6275 if (kr != KERN_SUCCESS) {
6276 return kr;
6277 }
6278
6279 if (!pmap_lock_preempt(pmap, lock_mode)) {
6280 return KERN_ABORTED;
6281 }
6282 }
6283
6284 if (options & PMAP_OPTIONS_NOENTER) {
6285 pmap_unlock(pmap, lock_mode);
6286 return KERN_SUCCESS;
6287 }
6288
6289 /*
6290 * Since we may not hold the pmap lock exclusive, updating the pte is
6291 * done via a cmpxchg loop.
6292 * We need to be careful about modifying non-local data structures before commiting
6293 * the new pte since we may need to re-do the transaction.
6294 */
6295 spte = os_atomic_load(pte_p, relaxed);
6296 while (!committed) {
6297 refcnt = NULL;
6298 wiredcnt = NULL;
6299 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6300 had_valid_mapping = pte_is_valid(spte);
6301
6302 if (pmap != kernel_pmap) {
6303 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6304 refcnt = &ptd_info->refcnt;
6305 wiredcnt = &ptd_info->wiredcnt;
6306 /*
6307 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6308 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6309 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6310 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6311 * have PTDs, so we can't use the check there.
6312 */
6313 if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6314 panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6315 __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6316 }
6317 /*
6318 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
6319 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6320 * or acquire the pmap lock exclusive.
6321 */
6322 if (!wiredcnt_updated) {
6323 OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6324 wiredcnt_updated = true;
6325 }
6326 if (!refcnt_updated) {
6327 OSAddAtomic16(1, (volatile int16_t*)refcnt);
6328 refcnt_updated = true;
6329 drop_refcnt = true;
6330 }
6331 }
6332
6333 if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6334 /*
6335 * There is already a mapping here & it's for a different physical page.
6336 * First remove that mapping.
6337 *
6338 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6339 */
6340 if (lock_mode == PMAP_LOCK_SHARED) {
6341 if (pmap_lock_shared_to_exclusive(pmap)) {
6342 lock_mode = PMAP_LOCK_EXCLUSIVE;
6343 } else {
6344 /*
6345 * We failed to upgrade to an exclusive lock.
6346 * As a result we no longer hold the lock at all,
6347 * so we need to re-acquire it and restart the transaction.
6348 */
6349 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6350 lock_mode = PMAP_LOCK_EXCLUSIVE;
6351 /* pmap might have changed after we dropped the lock. Try again. */
6352 spte = os_atomic_load(pte_p, relaxed);
6353 continue;
6354 }
6355 }
6356 pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6357 spte = ARM_PTE_EMPTY;
6358 assert(os_atomic_load(pte_p, acquire) == ARM_PTE_EMPTY);
6359 }
6360
6361 /*
6362 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6363 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6364 * read-write protection. The PMAP layer though still needs to use the right
6365 * index, which is the older XO-now-TPRO one and that is specially selected
6366 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6367 */
6368 if (options & PMAP_OPTIONS_MAP_TPRO) {
6369 if (__improbable(pmap == kernel_pmap)) {
6370 panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6371 __func__);
6372 }
6373 pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6374 } else {
6375 pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6376 }
6377
6378 if (pa_valid(pa)) {
6379 unsigned int pai;
6380 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6381
6382 is_internal = FALSE;
6383 is_altacct = FALSE;
6384
6385 pai = pa_index(pa);
6386
6387 pvh_lock(pai);
6388
6389 /*
6390 * Make sure that the current per-cpu PV free list has
6391 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6392 * if the transaction succeeds. We're either in the
6393 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6394 * Note that we can still be interrupted, but a primary
6395 * interrupt handler can never enter the pmap.
6396 */
6397 #if !XNU_MONITOR
6398 assert(get_preemption_level() > 0);
6399 #endif
6400 local_pv_free = &pmap_get_cpu_data()->pv_free;
6401 pv_entry_t **pv_h = pai_to_pvh(pai);
6402 const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6403 !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6404
6405 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6406 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6407 int new_allocated_pves = 0;
6408
6409 while (new_allocated_pves < 2) {
6410 local_pv_free = &pmap_get_cpu_data()->pv_free;
6411 pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6412 if (pv_status == PV_ALLOC_FAIL) {
6413 break;
6414 } else if (pv_status == PV_ALLOC_RETRY) {
6415 /*
6416 * In the case that pv_alloc() had to grab a new page of PVEs,
6417 * it will have dropped the pmap lock while doing so.
6418 * On non-PPL devices, dropping the lock re-enables preemption so we may
6419 * be on a different CPU now.
6420 */
6421 local_pv_free = &pmap_get_cpu_data()->pv_free;
6422 } else {
6423 /* If we've gotten this far then a node should've been allocated. */
6424 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6425
6426 new_allocated_pves++;
6427 }
6428 }
6429
6430 for (int i = 0; i < new_allocated_pves; i++) {
6431 pv_free(new_pve_p[i]);
6432 }
6433 }
6434
6435 if (pv_status == PV_ALLOC_FAIL) {
6436 pvh_unlock(pai);
6437 kr = KERN_RESOURCE_SHORTAGE;
6438 break;
6439 } else if (pv_status == PV_ALLOC_RETRY) {
6440 pvh_unlock(pai);
6441 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6442 spte = os_atomic_load(pte_p, relaxed);
6443 continue;
6444 }
6445
6446 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6447 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6448 } else {
6449 wimg_bits = pmap_cache_attributes(pn);
6450 }
6451
6452 /* We may be retrying this operation after dropping the PVH lock.
6453 * Cache attributes for the physical page may have changed while the lock
6454 * was dropped, so clear any cache attributes we may have previously set
6455 * in the PTE template. */
6456 pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6457 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6458
6459 #if XNU_MONITOR
6460 /* The regular old kernel is not allowed to remap PPL pages. */
6461 if (__improbable(ppattr_pa_test_monitor(pa))) {
6462 panic("%s: page belongs to PPL, "
6463 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6464 __FUNCTION__,
6465 pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6466 }
6467
6468 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6469 panic("%s: page locked down, "
6470 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6471 __FUNCTION__,
6472 pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6473 }
6474 #endif
6475
6476
6477
6478 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6479 if (!committed) {
6480 pvh_unlock(pai);
6481 continue;
6482 }
6483 had_valid_mapping = pte_is_valid(spte);
6484 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6485
6486 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6487 /*
6488 * If there was already a valid pte here then we reuse its reference
6489 * on the ptd and drop the one that we took above.
6490 */
6491 drop_refcnt = had_valid_mapping;
6492
6493 if (!had_valid_mapping) {
6494 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6495 int pve_ptep_idx = 0;
6496 pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6497 /* We did all the allocations up top. So this shouldn't be able to fail. */
6498 if (pv_status != PV_ALLOC_SUCCESS) {
6499 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6500 __func__, pv_status, new_pve_p, pmap);
6501 }
6502
6503 if (pmap != kernel_pmap) {
6504 if (options & PMAP_OPTIONS_INTERNAL) {
6505 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6506 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6507 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6508 /*
6509 * Make a note to ourselves that this
6510 * mapping is using alternative
6511 * accounting. We'll need this in order
6512 * to know which ledger to debit when
6513 * the mapping is removed.
6514 *
6515 * The altacct bit must be set while
6516 * the pv head is locked. Defer the
6517 * ledger accounting until after we've
6518 * dropped the lock.
6519 */
6520 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6521 is_altacct = TRUE;
6522 }
6523 }
6524 if (ppattr_test_reusable(pai) &&
6525 !is_altacct) {
6526 is_reusable = TRUE;
6527 } else if (options & PMAP_OPTIONS_INTERNAL) {
6528 is_internal = TRUE;
6529 } else {
6530 is_external = TRUE;
6531 }
6532 }
6533 }
6534
6535 pvh_unlock(pai);
6536
6537 if (pp_attr_bits != 0) {
6538 ppattr_pa_set_bits(pa, pp_attr_bits);
6539 }
6540
6541 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6542 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6543
6544 if (is_internal) {
6545 /*
6546 * Make corresponding adjustments to
6547 * phys_footprint statistics.
6548 */
6549 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6550 if (is_altacct) {
6551 /*
6552 * If this page is internal and
6553 * in an IOKit region, credit
6554 * the task's total count of
6555 * dirty, internal IOKit pages.
6556 * It should *not* count towards
6557 * the task's total physical
6558 * memory footprint, because
6559 * this entire region was
6560 * already billed to the task
6561 * at the time the mapping was
6562 * created.
6563 *
6564 * Put another way, this is
6565 * internal++ and
6566 * alternate_accounting++, so
6567 * net effect on phys_footprint
6568 * is 0. That means: don't
6569 * touch phys_footprint here.
6570 */
6571 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6572 } else {
6573 if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6574 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6575 skip_footprint_debit = true;
6576 } else {
6577 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6578 }
6579 }
6580 }
6581 if (is_reusable) {
6582 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6583 } else if (is_external) {
6584 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6585 }
6586 }
6587 } else {
6588 if (prot & VM_PROT_EXECUTE) {
6589 kr = KERN_FAILURE;
6590 break;
6591 }
6592
6593 wimg_bits = pmap_cache_attributes(pn);
6594 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6595 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6596 }
6597
6598 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6599
6600 #if XNU_MONITOR
6601 pte = pmap_construct_io_pte(pa, pte);
6602
6603 /**
6604 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6605 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6606 * created and later removed. We must therefore prevent an attacker from downgrading a
6607 * a writable mapping in order to allow it to be removed and remapped to something else.
6608 */
6609 if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6610 pte_is_valid(spte) &&
6611 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6612 (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6613 panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6614 __func__, (uint64_t)pte_to_pa(spte));
6615 }
6616 #endif
6617
6618 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6619 if (committed) {
6620 had_valid_mapping = pte_is_valid(spte);
6621 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6622
6623 /**
6624 * If there was already a valid pte here then we reuse its
6625 * reference on the ptd and drop the one that we took above.
6626 */
6627 drop_refcnt = had_valid_mapping;
6628 }
6629 }
6630 if (committed) {
6631 if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6632 assert(pmap != kernel_pmap);
6633
6634 /* One less "compressed" */
6635 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6636 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6637
6638 if (spte & ARM_PTE_COMPRESSED_ALT) {
6639 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6640 } else if (!skip_footprint_debit) {
6641 /* Was part of the footprint */
6642 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6643 }
6644 /* The old entry held a reference so drop the extra one that we took above. */
6645 drop_refcnt = true;
6646 }
6647 }
6648 }
6649
6650 if (drop_refcnt && refcnt != NULL) {
6651 assert(refcnt_updated);
6652 if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6653 panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6654 }
6655 }
6656
6657 if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6658 panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6659 }
6660
6661 pmap_unlock(pmap, lock_mode);
6662
6663 if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6664 pmap_phys_write_disable(v);
6665 }
6666
6667 return kr;
6668 }
6669
/*
 * Routine:	pmap_enter_options_addr
 * Function:	Enter a translation for virtual address 'v' to physical address
 *		'pa' in the given pmap, retrying the underlying operation until
 *		it either succeeds or fails for a non-transient reason.
 *
 * KERN_RESOURCE_SHORTAGE (page-table/PV allocation failure) and KERN_ABORTED
 * (operation bailed out due to pending preemption) are treated as transient
 * and retried, unless the caller passed PMAP_OPTIONS_NOWAIT, in which case a
 * resource shortage is reported back immediately.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * The PPL cannot block on allocations, so it is always invoked
		 * with NOWAIT; any waiting happens out here in the kernel.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Donate a fresh page to the PPL free list before retrying. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller asked us not to wait; report the shortage. */
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6716
6717 kern_return_t
6718 pmap_enter_options(
6719 pmap_t pmap,
6720 vm_map_address_t v,
6721 ppnum_t pn,
6722 vm_prot_t prot,
6723 vm_prot_t fault_type,
6724 unsigned int flags,
6725 boolean_t wired,
6726 unsigned int options,
6727 __unused void *arg,
6728 pmap_mapping_type_t mapping_type)
6729 {
6730 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6731 }
6732
6733 /*
6734 * Routine: pmap_change_wiring
6735 * Function: Change the wiring attribute for a map/virtual-address
6736 * pair.
6737 * In/out conditions:
6738 * The mapping must already exist in the pmap.
6739 */
6740 MARK_AS_PMAP_TEXT kern_return_t
6741 pmap_change_wiring_internal(
6742 pmap_t pmap,
6743 vm_map_address_t v,
6744 boolean_t wired)
6745 {
6746 pt_entry_t *pte_p;
6747 pmap_paddr_t pa;
6748
6749 validate_pmap_mutable(pmap);
6750
6751 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6752 return KERN_ABORTED;
6753 }
6754
6755 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6756
6757 pte_p = pmap_pte(pmap, v);
6758 if (pte_p == PT_ENTRY_NULL) {
6759 if (!wired) {
6760 /*
6761 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6762 * may have been freed by a remove operation.
6763 */
6764 goto pmap_change_wiring_return;
6765 } else {
6766 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6767 }
6768 }
6769 /*
6770 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6771 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6772 */
6773 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6774
6775 while (pa_valid(pa)) {
6776 pmap_paddr_t new_pa;
6777
6778 pvh_lock(pa_index(pa));
6779 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6780
6781 if (pa == new_pa) {
6782 break;
6783 }
6784
6785 pvh_unlock(pa_index(pa));
6786 pa = new_pa;
6787 }
6788
6789 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6790 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6791 if (!wired) {
6792 /* PTE cleared by prior remove/disconnect operation */
6793 goto pmap_change_wiring_cleanup;
6794 } else {
6795 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6796 __func__, pte_p, (uint64_t)*pte_p, pmap);
6797 }
6798 }
6799
6800 assertf(pte_is_valid(*pte_p), "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6801 if (wired != pte_is_wired(*pte_p)) {
6802 pte_set_wired(pmap, pte_p, wired);
6803 if (pmap != kernel_pmap) {
6804 if (wired) {
6805 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6806 } else if (!wired) {
6807 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6808 }
6809 }
6810 }
6811
6812 pmap_change_wiring_cleanup:
6813 if (pa_valid(pa)) {
6814 pvh_unlock(pa_index(pa));
6815 }
6816
6817 pmap_change_wiring_return:
6818 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6819
6820 return KERN_SUCCESS;
6821 }
6822
/*
 * Kernel-facing entry point for changing the wiring attribute of an
 * existing mapping. Retries the PPL call on KERN_ABORTED (pending
 * preemption); any other failure is fatal.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6850
6851 MARK_AS_PMAP_TEXT pmap_paddr_t
6852 pmap_find_pa_internal(
6853 pmap_t pmap,
6854 addr64_t va)
6855 {
6856 pmap_paddr_t pa = 0;
6857
6858 validate_pmap(pmap);
6859
6860 if (pmap != kernel_pmap) {
6861 pmap_lock(pmap, PMAP_LOCK_SHARED);
6862 }
6863
6864 pa = pmap_vtophys(pmap, va);
6865
6866 if (pmap != kernel_pmap) {
6867 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6868 }
6869
6870 return pa;
6871 }
6872
6873 pmap_paddr_t
6874 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6875 {
6876 pmap_paddr_t pa = 0;
6877
6878 if (pmap == kernel_pmap) {
6879 pa = mmu_kvtop(va);
6880 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6881 /*
6882 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6883 * translation even if PAN would prevent kernel access through the translation.
6884 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6885 */
6886 pa = mmu_uvtop(va);
6887 }
6888 return pa;
6889 }
6890
/*
 * Translate 'va' to a physical address: first via a fast hardware probe,
 * then (if that fails) via a locked software table walk — except in the
 * debugger, where locks may already be held and the walk is done directly.
 * Returns 0 if no valid mapping exists.
 */
pmap_paddr_t
pmap_find_pa(
	pmap_t pmap,
	addr64_t va)
{
	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);

	if (pa != 0) {
		return pa;
	}

	if (not_in_kdp) {
#if XNU_MONITOR
		return pmap_find_pa_ppl(pmap, va);
#else
		return pmap_find_pa_internal(pmap, va);
#endif
	} else {
		/* Debugger context: walk the tables without taking locks. */
		return pmap_vtophys(pmap, va);
	}
}
6912
6913 ppnum_t
6914 pmap_find_phys_nofault(
6915 pmap_t pmap,
6916 addr64_t va)
6917 {
6918 ppnum_t ppn;
6919 ppn = atop(pmap_find_pa_nofault(pmap, va));
6920 return ppn;
6921 }
6922
6923 ppnum_t
6924 pmap_find_phys(
6925 pmap_t pmap,
6926 addr64_t va)
6927 {
6928 ppnum_t ppn;
6929 ppn = atop(pmap_find_pa(pmap, va));
6930 return ppn;
6931 }
6932
6933 /**
6934 * Translate a kernel virtual address into a physical address.
6935 *
6936 * @param va The kernel virtual address to translate. Does not work on user
6937 * virtual addresses.
6938 *
6939 * @return The physical address if the translation was successful, or zero if
6940 * no valid mappings were found for the given virtual address.
6941 */
6942 pmap_paddr_t
6943 kvtophys(vm_offset_t va)
6944 {
6945 /**
6946 * Attempt to do the translation first in hardware using the AT (address
6947 * translation) instruction. This will attempt to use the MMU to do the
6948 * translation for us.
6949 */
6950 pmap_paddr_t pa = mmu_kvtop(va);
6951
6952 if (pa) {
6953 return pa;
6954 }
6955
6956 /* If the MMU can't find the mapping, then manually walk the page tables. */
6957 return pmap_vtophys(kernel_pmap, va);
6958 }
6959
6960 /**
6961 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6962 * points to a non-kernel-managed physical page, then this call will panic().
6963 *
6964 * @note The output of this function is guaranteed to be a kernel-managed
6965 * physical page, which means it's safe to pass the output directly to
6966 * pa_index() to create a physical address index for various pmap data
6967 * structures.
6968 *
6969 * @param va The kernel virtual address to translate. Does not work on user
6970 * virtual addresses.
6971 *
6972 * @return The translated physical address for the given virtual address.
6973 */
6974 pmap_paddr_t
6975 kvtophys_nofail(vm_offset_t va)
6976 {
6977 pmap_paddr_t pa = kvtophys(va);
6978
6979 if (!pa_valid(pa)) {
6980 panic("%s: Invalid or non-kernel-managed physical page returned, "
6981 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6982 }
6983
6984 return pa;
6985 }
6986
/**
 * Perform a software page-table walk to translate a virtual address in the
 * given pmap to a physical address. Takes no locks; callers must ensure the
 * tables cannot be freed out from under the walk (or tolerate stale results).
 *
 * @param pmap The pmap whose translation tables should be walked.
 * @param va The virtual address to translate.
 *
 * @return The translated physical address, or 0 if 'va' is outside the
 *         pmap's range or no valid mapping exists.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Walk from the root table level down toward the leaf level. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at any level means no mapping. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the offset within the block/page. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: descend into the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
7031
7032 /*
7033 * pmap_init_pte_page - Initialize a page table page.
7034 */
7035 MARK_AS_PMAP_TEXT void
7036 pmap_init_pte_page(
7037 pmap_t pmap,
7038 pt_entry_t *pte_p,
7039 vm_offset_t va,
7040 unsigned int ttlevel,
7041 boolean_t alloc_ptd)
7042 {
7043 pt_desc_t *ptdp = NULL;
7044 pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
7045
7046 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
7047 if (alloc_ptd) {
7048 /*
7049 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
7050 * on 4KB hardware, we may already have allocated a page table descriptor for a
7051 * bootstrap request, so we check for an existing PTD here.
7052 */
7053 ptdp = ptd_alloc(pmap);
7054 if (ptdp == NULL) {
7055 panic("%s: unable to allocate PTD", __func__);
7056 }
7057 pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
7058 /* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
7059 pvh_set_flags(pvh, 0);
7060 } else {
7061 panic("pmap_init_pte_page(): pte_p %p", pte_p);
7062 }
7063 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
7064 ptdp = pvh_ptd(pvh);
7065 } else {
7066 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
7067 }
7068
7069 // below barrier ensures previous updates to the page are visible to PTW before
7070 // it is linked to the PTE of previous level
7071 __builtin_arm_dmb(DMB_ISHST);
7072 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
7073 }
7074
7075 /*
7076 * Routine: pmap_expand
7077 *
7078 * Expands a pmap to be able to map the specified virtual address.
7079 *
7080 * Allocates new memory for the default (COARSE) translation table
7081 * entry, initializes all the pte entries to ARM_PTE_EMPTY and
7082 * also allocates space for the corresponding pv entries.
7083 *
7084 * Nothing should be locked.
7085 */
7086 MARK_AS_PMAP_TEXT static kern_return_t
7087 pmap_expand(
7088 pmap_t pmap,
7089 vm_map_address_t v,
7090 unsigned int options,
7091 unsigned int level)
7092 {
7093 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7094
7095 if (__improbable((v < pmap->min) || (v >= pmap->max))) {
7096 return KERN_INVALID_ADDRESS;
7097 }
7098 pmap_paddr_t pa;
7099 unsigned int ttlevel = pt_attr_root_level(pt_attr);
7100 tt_entry_t *tte_p;
7101 tt_entry_t *tt_p;
7102
7103 pa = 0x0ULL;
7104 tt_p = (tt_entry_t *)NULL;
7105
7106 for (; ttlevel < level; ttlevel++) {
7107 if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
7108 return KERN_ABORTED;
7109 }
7110
7111 if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
7112 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7113 kern_return_t ret;
7114 while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
7115 if (options & PMAP_OPTIONS_NOWAIT) {
7116 /* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
7117 return ret;
7118 }
7119 #if XNU_MONITOR
7120 panic("%s: failed to allocate tt, "
7121 "pmap=%p, v=%p, options=0x%x, level=%u",
7122 __FUNCTION__,
7123 pmap, (void *)v, options, level);
7124 #else
7125 VM_PAGE_WAIT();
7126 #endif
7127 }
7128
7129 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
7130 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7131 return KERN_ABORTED;
7132 }
7133
7134 if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
7135 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
7136 pa = kvtophys_nofail((vm_offset_t)tt_p);
7137 tte_p = pmap_ttne(pmap, ttlevel, v);
7138 *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7139 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7140 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
7141 pa = 0x0ULL;
7142 tt_p = (tt_entry_t *)NULL;
7143 }
7144 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7145 } else {
7146 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7147 }
7148
7149 if (tt_p != (tt_entry_t *)NULL) {
7150 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7151 tt_p = (tt_entry_t *)NULL;
7152 }
7153 }
7154
7155 return KERN_SUCCESS;
7156 }
7157
7158 /*
7159 * Routine: pmap_gc
7160 * Function:
7161 * Pmap garbage collection
7162 * Called by the pageout daemon when pages are scarce.
7163 *
7164 */
7165 void
7166 pmap_gc(void)
7167 {
7168 /*
7169 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
7170 * We can't just destroy any old pmap on the chance that it may be active on a CPU
7171 * or may contain wired mappings. However, with the relatively recent change to
7172 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
7173 * page, it may make sense to call that function here.
7174 */
7175 }
7176
7177 /*
7178 * By default, don't attempt pmap GC more frequently
7179 * than once / 1 minutes.
7180 */
7181
/*
 * No-op stub: pmap GC throttling is not implemented on this architecture
 * (pmap_gc() itself does nothing here).
 */
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7187
7188 /*
7189 * pmap_attribute_cache_sync(vm_offset_t pa)
7190 *
7191 * Invalidates all of the instruction cache on a physical page and
7192 * pushes any dirty data from the data cache for the same physical page
7193 */
7194
7195 kern_return_t
7196 pmap_attribute_cache_sync(
7197 ppnum_t pp,
7198 vm_size_t size,
7199 __unused vm_machine_attribute_t attribute,
7200 __unused vm_machine_attribute_val_t * value)
7201 {
7202 if (size > PAGE_SIZE) {
7203 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7204 } else {
7205 cache_sync_page(pp);
7206 }
7207
7208 return KERN_SUCCESS;
7209 }
7210
7211 /*
7212 * pmap_sync_page_data_phys(ppnum_t pp)
7213 *
7214 * Invalidates all of the instruction cache on a physical page and
7215 * pushes any dirty data from the data cache for the same physical page
7216 */
7217 void
7218 pmap_sync_page_data_phys(
7219 ppnum_t pp)
7220 {
7221 cache_sync_page(pp);
7222 }
7223
7224 /*
7225 * pmap_sync_page_attributes_phys(ppnum_t pp)
7226 *
7227 * Write back and invalidate all cachelines on a physical page.
7228 */
7229 void
7230 pmap_sync_page_attributes_phys(
7231 ppnum_t pp)
7232 {
7233 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7234 }
7235
7236 #if CONFIG_COREDUMP
7237 /* temporary workaround */
7238 boolean_t
7239 coredumpok(
7240 vm_map_t map,
7241 mach_vm_offset_t va)
7242 {
7243 pt_entry_t *pte_p;
7244 pt_entry_t spte;
7245
7246 pte_p = pmap_pte(map->pmap, va);
7247 if (0 == pte_p) {
7248 return FALSE;
7249 }
7250 if (vm_map_entry_has_device_pager(map, va)) {
7251 return FALSE;
7252 }
7253 spte = *pte_p;
7254 return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
7255 }
7256 #endif
7257
7258 void
7259 fillPage(
7260 ppnum_t pn,
7261 unsigned int fill)
7262 {
7263 unsigned int *addr;
7264 int count;
7265
7266 addr = (unsigned int *) phystokv(ptoa(pn));
7267 count = PAGE_SIZE / sizeof(unsigned int);
7268 while (count--) {
7269 *addr++ = fill;
7270 }
7271 }
7272
7273 extern void mapping_set_mod(ppnum_t pn);
7274
/*
 * VM-layer alias: mark the given physical page's software 'modified'
 * attribute (see pmap_set_modify()).
 */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7281
7282 extern void mapping_set_ref(ppnum_t pn);
7283
/*
 * VM-layer alias: mark the given physical page's software 'referenced'
 * attribute (see pmap_set_reference()).
 */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7290
7291 /*
7292 * Clear specified attribute bits.
7293 *
7294 * Try to force an arm_fast_fault() for all mappings of
7295 * the page - to force attributes to be set again at fault time.
7296 * If the forcing succeeds, clear the cached bits at the head.
7297 * Otherwise, something must have been wired, so leave the cached
7298 * attributes alone.
7299 */
/*
 * Clear the given PP_ATTR_* bits for physical page 'pn' (see the block
 * comment above). 'arg'/'flush_range' being non-NULL indicates the caller
 * will handle TLB flushing, so PMAP_OPTIONS_NOFLUSH is stripped.
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t pa = ptoa(pn);
	vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* The PPL owns these bits; the kernel may not clear them. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Revoke the access modes whose next use should re-set the attribute. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return. On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping could be demoted. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7373
/*
 * Single-page entry point for clearing PP_ATTR_* bits; no caller-managed
 * TLB flush range, so flushing (if needed) happens inside.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7383
7384 #if __ARM_RANGE_TLBI__
/*
 * Clear PP_ATTR_* bits for every managed page mapped in [start, end),
 * which must lie within a single twig (last-level table) of 'pmap'.
 *
 * Returns the VA up to which processing completed: 'end' normally, or an
 * earlier address if the loop yielded to pending preemption; the caller
 * resumes from the returned address.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	if (tte_p == (tt_entry_t *) NULL) {
		/* No table here at all: nothing to clear in this twig. */
		return end;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield after at least one page if preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				/* Only kernel-managed pages carry software attribute bits. */
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7428
/*
 * Clear PP_ATTR_* bits for all managed pages mapped in [start, end),
 * processing one twig at a time under the shared pmap lock and issuing a
 * single ranged TLB flush at the end if any mapping was modified.
 *
 * Returns the VA up to which processing completed; the caller loops until
 * the whole range is covered (see phys_attribute_clear_range()).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		/* Lock attempt aborted for pending preemption; caller retries from 'va'. */
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to 'end'. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			/* Stopped early: return progress so the caller can resume. */
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7481
/*
 * Clear PP_ATTR_* bits across a VA range, looping on the (PPL or in-kernel)
 * helper, which may return early for preemption; each call resumes where
 * the previous one stopped.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7512 #endif /* __ARM_RANGE_TLBI__ */
7513
/*
 * Single-page front end for clearing PP_ATTR_* bits: dispatches to the PPL
 * on monitor-enabled systems, otherwise calls the internal helper directly.
 */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint? It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7535
7536 /*
7537 * Set specified attribute bits.
7538 *
7539 * Set cached value in the pv head because we have
7540 * no per-mapping hardware support for referenced and
7541 * modify bits.
7542 */
7543 MARK_AS_PMAP_TEXT void
7544 phys_attribute_set_internal(
7545 ppnum_t pn,
7546 unsigned int bits)
7547 {
7548 pmap_paddr_t pa = ptoa(pn);
7549 assert(pn != vm_page_fictitious_addr);
7550
7551 #if XNU_MONITOR
7552 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7553 panic("%s: illegal request, "
7554 "pn=%u, bits=%#x",
7555 __FUNCTION__,
7556 pn, bits);
7557 }
7558 #endif
7559
7560 ppattr_pa_set_bits(pa, (uint16_t)bits);
7561
7562 return;
7563 }
7564
/*
 * Front end for setting PP_ATTR_* bits on a physical page: dispatches to
 * the PPL on monitor-enabled systems, otherwise calls the helper directly.
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7576
7577
7578 /*
7579 * Check specified attribute bits.
7580 *
7581 * use the software cached bits (since no hw support).
7582 */
7583 static boolean_t
7584 phys_attribute_test(
7585 ppnum_t pn,
7586 unsigned int bits)
7587 {
7588 pmap_paddr_t pa = ptoa(pn);
7589 assert(pn != vm_page_fictitious_addr);
7590 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7591 }
7592
7593
7594 /*
7595 * Set the modify/reference bits on the specified physical page.
7596 */
7597 void
7598 pmap_set_modify(ppnum_t pn)
7599 {
7600 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7601 }
7602
7603
7604 /*
7605 * Clear the modify bits on the specified physical page.
7606 */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clear the software-cached "modified" attribute for the page. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7613
7614
7615 /*
7616 * pmap_is_modified:
7617 *
7618 * Return whether or not the specified physical page is modified
7619 * by any physical maps.
7620 */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Read the software-cached "modified" attribute for the page. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7627
7628
7629 /*
7630 * Set the reference bit on the specified physical page.
7631 */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* "referenced" is tracked purely in software; set the cached attribute. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7638
7639 /*
7640 * Clear the reference bits on the specified physical page.
7641 */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clear the software-cached "referenced" attribute for the page. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7648
7649
7650 /*
7651 * pmap_is_referenced:
7652 *
7653 * Return whether or not the specified physical page is referenced
7654 * by any physical maps.
7655 */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Read the software-cached "referenced" attribute for the page. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7662
7663 /*
7664 * pmap_get_refmod(phys)
7665 * returns the referenced and modified bits of the specified
7666 * physical page.
7667 */
7668 unsigned int
7669 pmap_get_refmod(
7670 ppnum_t pn)
7671 {
7672 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7673 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7674 }
7675
7676 static inline unsigned int
7677 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7678 {
7679 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7680 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7681 }
7682
7683 /*
7684 * pmap_clear_refmod(phys, mask)
7685 * clears the referenced and modified bits as specified by the mask
7686 * of the specified physical page.
7687 */
7688 void
7689 pmap_clear_refmod_options(
7690 ppnum_t pn,
7691 unsigned int mask,
7692 unsigned int options,
7693 void *arg)
7694 {
7695 unsigned int bits;
7696
7697 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7698 phys_attribute_clear(pn, bits, options, arg);
7699 }
7700
7701 /*
7702 * Perform pmap_clear_refmod_options on a virtual address range.
7703 * The operation will be performed in bulk & tlb flushes will be coalesced
7704 * if possible.
7705 *
7706 * Returns true if the operation is supported on this platform.
7707 * If this function returns false, the operation is not supported and
7708 * nothing has been modified in the pmap.
7709 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses, which is a large performance improvement
	 * on platforms that support ranged TLBI instructions.  On older platforms
	 * we can only flush per-page or the entire ASID, so we currently support
	 * this operation only where ranged TLBI is available.  On other platforms,
	 * we require that the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7737
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper: no options, no per-call argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7745
7746 unsigned int
7747 pmap_disconnect_options(
7748 ppnum_t pn,
7749 unsigned int options,
7750 void *arg)
7751 {
7752 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7753 /*
7754 * On ARM, the "modified" bit is managed by software, so
7755 * we know up-front if the physical page is "modified",
7756 * without having to scan all the PTEs pointing to it.
7757 * The caller should have made the VM page "busy" so noone
7758 * should be able to establish any new mapping and "modify"
7759 * the page behind us.
7760 */
7761 if (pmap_is_modified(pn)) {
7762 /*
7763 * The page has been modified and will be sent to
7764 * the VM compressor.
7765 */
7766 options |= PMAP_OPTIONS_COMPRESSOR;
7767 } else {
7768 /*
7769 * The page hasn't been modified and will be freed
7770 * instead of compressed.
7771 */
7772 }
7773 }
7774
7775 /* disconnect the page */
7776 pmap_page_protect_options(pn, 0, options, arg);
7777
7778 /* return ref/chg status */
7779 return pmap_get_refmod(pn);
7780 }
7781
7782 /*
7783 * Routine:
7784 * pmap_disconnect
7785 *
7786 * Function:
7787 * Disconnect all mappings for this page and return reference and change status
7788 * in generic format.
7789 *
7790 */
7791 unsigned int
7792 pmap_disconnect(
7793 ppnum_t pn)
7794 {
7795 pmap_page_protect(pn, 0); /* disconnect the page */
7796 return pmap_get_refmod(pn); /* return ref/chg status */
7797 }
7798
7799 boolean_t
7800 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7801 {
7802 if (ptoa(first) >= vm_last_phys) {
7803 return FALSE;
7804 }
7805 if (ptoa(last) < vm_first_phys) {
7806 return FALSE;
7807 }
7808
7809 return TRUE;
7810 }
7811
7812 /*
7813 * The state maintained by the noencrypt functions is used as a
7814 * debugging aid on ARM. This incurs some overhead on the part
7815 * of the caller. A special case check in phys_attribute_clear
7816 * (the most expensive path) currently minimizes this overhead,
7817 * but stubbing these functions out on RELEASE kernels yields
7818 * further wins.
7819 */
7820 boolean_t
7821 pmap_is_noencrypt(
7822 ppnum_t pn)
7823 {
7824 #if DEVELOPMENT || DEBUG
7825 boolean_t result = FALSE;
7826
7827 if (!pa_valid(ptoa(pn))) {
7828 return FALSE;
7829 }
7830
7831 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7832
7833 return result;
7834 #else
7835 #pragma unused(pn)
7836 return FALSE;
7837 #endif
7838 }
7839
void
pmap_set_noencrypt(
	ppnum_t pn)
{
	/* NOENCRYPT is a debugging aid; only tracked on DEVELOPMENT/DEBUG. */
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7854
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
	/* NOENCRYPT is a debugging aid; only tracked on DEVELOPMENT/DEBUG. */
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7869
#if XNU_MONITOR
/*
 * Report whether the given managed page is owned by the PPL (monitor).
 */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7878
void
pmap_lock_phys_page(ppnum_t pn)
{
	/*
	 * Lock the page: managed pages use their PV head lock; everything else
	 * (and all pages under XNU_MONITOR, where PVH locks are PPL-private)
	 * falls back to the single global phys_backup_lock.
	 *
	 * Note the unusual structure: the dangling `} else` above the `#else`
	 * deliberately pairs with the braced statement after the `#endif`.
	 */
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7895
7896
void
pmap_unlock_phys_page(ppnum_t pn)
{
	/*
	 * Counterpart of pmap_lock_phys_page(): release the PV head lock for a
	 * managed page, or the global phys_backup_lock otherwise.  The dangling
	 * `} else` pairs with the braced statement after the `#endif`.
	 */
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7913
/*
 * Program the user translation table base (TTBR0) and related per-CPU state
 * for the given pmap.  For the kernel pmap, the user TTB is cleared instead.
 * Statement order here matters; do not reorder the hardware register writes.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Publish the (possibly NULL) nested pmap state for this CPU. */
		pmap_t nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap = nested_pmap;
		if (nested_pmap != NULL) {
			cpu_data_ptr->cpu_nested_pmap_attr = pmap_get_pt_attr(nested_pmap);
			/**
			 * Obtain the full shared region bounds from the nested pmap. If the top-level pmap
			 * hasn't been fully nested yet, its bounds may not yet be configured, or may be in the
			 * process of being configured on another core.
			 */
			cpu_data_ptr->cpu_nested_region_addr = nested_pmap->nested_region_addr;
			cpu_data_ptr->cpu_nested_region_size = nested_pmap->nested_region_size;
		}
#if __ARM_MIXED_PAGE_SIZE__
		/* Record the pmap's leaf page shift for commpage mapping decisions. */
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the target pmap's value differs from the live one. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the pmap's table root and ASID into TTBR0. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7949
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	/* Point TTBR0 at the invalid (empty) translation table. */
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7955
void
pmap_clear_user_ttb(void)
{
	/* Traced wrapper that clears the user TTB, via the PPL when enabled. */
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7967
7968
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 *
 * The static asserts below guard the arm64 assumption: the marker must not
 * alias the SW "was writeable" or "wired" bits, which pass 1 may also touch.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7982
7983
/*
 * Core of arm_force_fast_fault(): walk every mapping of the given physical
 * page and downgrade PTE permissions (clear AF and/or write permission) so
 * the next disallowed access traps, letting the pmap regather ref/mod state
 * in software.  Also maintains per-task "reusable"/"internal" ledgers when
 * PMAP_OPTIONS_{SET,CLEAR}_REUSABLE is passed.
 *
 * Runs in two passes over the PV list: pass 1 rewrites PTEs and tags those
 * needing TLB invalidation with ARM_PTE_FF_MARKER; pass 2 clears the marker
 * and issues the invalidations.  With a flush_range whose pmap/VA range
 * covers a mapping, its TLBI is delegated to the caller instead.
 *
 * Returns TRUE unless a wired mapping was skipped (and FALSE for unmanaged
 * pages).  Unless PMAP_OPTIONS_FF_LOCKED is set, takes the PVH lock.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* PV head is either a single inline PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon. Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless PMAP_OPTIONS_FF_WIRED is set. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Only issue a TLBI ourselves if the caller's flush_range won't cover this VA. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize all prior PTE writes before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Both passes must have visited exactly the same marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that this page now has pending SW ref/mod faults outstanding. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8297
8298 MARK_AS_PMAP_TEXT boolean_t
8299 arm_force_fast_fault_internal(
8300 ppnum_t ppnum,
8301 vm_prot_t allow_mode,
8302 int options)
8303 {
8304 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8305 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8306 }
8307 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8308 }
8309
8310 /*
8311 * Routine: arm_force_fast_fault
8312 *
8313 * Function:
8314 * Force all mappings for this page to fault according
8315 * to the access modes allowed, so we can gather ref/modify
8316 * bits again.
8317 */
8318
8319 boolean_t
8320 arm_force_fast_fault(
8321 ppnum_t ppnum,
8322 vm_prot_t allow_mode,
8323 int options,
8324 __unused void *arg)
8325 {
8326 pmap_paddr_t phys = ptoa(ppnum);
8327
8328 assert(ppnum != vm_page_fictitious_addr);
8329
8330 if (!pa_valid(phys)) {
8331 return FALSE; /* Not a managed page. */
8332 }
8333
8334 #if XNU_MONITOR
8335 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8336 #else
8337 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8338 #endif
8339 }
8340
8341 /*
8342 * Routine: arm_clear_fast_fault
8343 *
8344 * Function:
8345 * Clear pending force fault for all mappings for this page based on
8346 * the observed fault type, update ref/modify bits.
8347 */
/*
 * Restore access/write permission on mappings of the given page that were
 * previously demoted by arm_force_fast_fault(), updating the software
 * ref/mod attributes according to the observed fault type.
 *
 * If pte_p is non-NULL only that single PTE is fixed up; otherwise the whole
 * PV list is walked (in chunks of PMAP_MAX_PV_LIST_CHUNK_SIZE entries, to
 * bound time spent with the PVH lock held).  Uses the same two-pass
 * ARM_PTE_FF_MARKER protocol as arm_force_fast_fault_with_flush_range().
 *
 * Caller must hold the PVH lock for the page.  Returns TRUE if at least one
 * PTE was updated.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no target PTE was supplied, walk the page's entire PV list. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to fix up. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Write fault on a demoted-RW mapping: restore write permission... */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			/* ...and record both referenced and modified in software. */
			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault on an AF-cleared mapping: set AF and record referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_EMPTY);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound PVH lock hold time: bail out after a chunk of PVEs. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Synchronize all prior PTE writes before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must break at the same chunk boundary as pass 1. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited exactly the same marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8549
8550 /*
8551 * Determine if the fault was induced by software tracking of
8552 * modify/reference bits. If so, re-enable the mapping (and set
8553 * the appropriate bits).
8554 *
8555 * Returns KERN_SUCCESS if the fault was induced and was
8556 * successfully handled.
8557 *
8558 * Returns KERN_FAILURE if the fault was not induced and
8559 * the function was unable to deal with it.
8560 *
8561 * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8562 * disallows this type of access.
8563 *
8564 * Returns KERN_ABORTED if the pmap lock is taken and a
8565 * preemption is pending.
8566 *
8567 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_EMPTY;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Bail out (caller retries) rather than blocking with preemption pending. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Lock-ordering dance: we need the PVH lock for the page the PTE
		 * currently points at, but the PTE may change under us until we
		 * hold that lock.  So read, lock, re-check, and retry on a miss.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_EMPTY) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Accesses to PPL-owned pages are permission failures, not fixable faults. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type. arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access. If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE? The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8707
/*
 * Attempt to handle a "fast" (access-flag / ref-mod tracking) fault on va in
 * the given pmap without involving the VM layer.
 *
 * Returns KERN_SUCCESS if the fault was fully handled here, or a failure
 * code (e.g. KERN_FAILURE) if the caller must fall back to vm_fault().
 */
kern_return_t
arm_fast_fault(
    pmap_t pmap,
    vm_map_address_t va,
    vm_prot_t fault_type,
    bool was_af_fault,
    __unused bool from_user)
{
    kern_return_t result = KERN_FAILURE;

    /* Addresses outside the pmap's VA range cannot be handled here. */
    if (va < pmap->min || va >= pmap->max) {
        return result;
    }

    PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
        VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
        from_user);

    /*
     * Redrive the fault while the handler reports KERN_ABORTED, which
     * indicates it bailed out early (presumably to bound the time spent
     * with preemption disabled on PPL systems — confirm against the
     * internal implementation).
     */
    do {
#if XNU_MONITOR
        result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
        result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
    } while (result == KERN_ABORTED);

    PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

    return result;
}
8738
8739 void
8740 pmap_copy_page(
8741 ppnum_t psrc,
8742 ppnum_t pdst,
8743 int options)
8744 {
8745 bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8746 (addr64_t) (ptoa(pdst)),
8747 PAGE_SIZE,
8748 options);
8749 }
8750
8751
/*
 * pmap_copy_part_page copies part of the specified (machine independent) pages.
 */
8755 void
8756 pmap_copy_part_page(
8757 ppnum_t psrc,
8758 vm_offset_t src_offset,
8759 ppnum_t pdst,
8760 vm_offset_t dst_offset,
8761 vm_size_t len)
8762 {
8763 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8764 (addr64_t) (ptoa(pdst) + dst_offset),
8765 len);
8766 }
8767
8768
8769 /*
8770 * pmap_zero_page zeros the specified (machine independent) page.
8771 */
8772 void
8773 pmap_zero_page(
8774 ppnum_t pn)
8775 {
8776 assert(pn != vm_page_fictitious_addr);
8777 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8778 }
8779
8780 void
8781 pmap_zero_page_with_options(
8782 ppnum_t pn,
8783 int options)
8784 {
8785 assert(pn != vm_page_fictitious_addr);
8786 bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8787 }
8788
8789 /*
8790 * pmap_zero_part_page
8791 * zeros the specified (machine independent) part of a page.
8792 */
8793 void
8794 pmap_zero_part_page(
8795 ppnum_t pn,
8796 vm_offset_t offset,
8797 vm_size_t len)
8798 {
8799 assert(pn != vm_page_fictitious_addr);
8800 assert(offset + len <= PAGE_SIZE);
8801 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8802 }
8803
/*
 * Establish the LOWGLOBAL_ALIAS mapping: a fixed kernel-VA alias of the
 * lowGlo structure, mapped read-only and never-executable (presumably so
 * external tools/panic paths can locate it at a well-known address —
 * confirm against lowGlo consumers).
 */
void
pmap_map_globals(
    void)
{
    pt_entry_t *ptep, pte;

    ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
    assert(ptep != PT_ENTRY_NULL);
    /* The alias slot must not already be mapped. */
    assert(*ptep == ARM_PTE_EMPTY);

    /*
     * Kernel read-only, PXN+UXN, with the access flag preset so the mapping
     * never takes an AF fault.
     */
    pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
#if __ARM_KERNEL_PROTECT__
    /* Kernel mappings are non-global under __ARM_KERNEL_PROTECT__. */
    pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
    pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
    pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
    *ptep = pte;
    /* Make the PTE visible, then drop any stale translation for the page. */
    FLUSH_PTE();
    PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
    /* Tell KASan about the new addressable range so accesses aren't flagged. */
    kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8828
8829 vm_offset_t
8830 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8831 {
8832 if (__improbable(index >= CPUWINDOWS_MAX)) {
8833 panic("%s: invalid index %u", __func__, index);
8834 }
8835 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8836 }
8837
/*
 * Map physical page 'pn' into a free per-CPU copy window on the current CPU
 * with the requested protection and WIMG (cache) attributes.
 *
 * Returns the index of the window used; the caller must release it with
 * pmap_unmap_cpu_windows_copy(). Panics if every window on this CPU is
 * already in use.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
    ppnum_t pn,
    vm_prot_t prot,
    unsigned int wimg_bits)
{
    pt_entry_t *ptep = NULL, pte;
    pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
    unsigned int cpu_num;
    unsigned int i;
    vm_offset_t cpu_copywindow_vaddr = 0;
    bool need_strong_sync = false;

#if XNU_MONITOR
    /* Cache attributes are only looked up for non-managed (I/O) pages. */
    unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
    need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
    /* The PPL forbids mapping managed pages through CPU copy windows. */
    if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
        panic("%s: attempted to map a managed page, "
            "pn=%u, prot=0x%x, wimg_bits=0x%x",
            __FUNCTION__,
            pn, prot, wimg_bits);
    }
    /* PPL-protected I/O ranges may only be mapped read-only. */
    if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
        panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
    }

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
    cpu_num = pmap_cpu_data->cpu_number;

    /* Find the first window on this CPU whose PTE is currently invalid. */
    for (i = 0; i < CPUWINDOWS_MAX; i++) {
        cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
        ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
        assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
        if (!pte_is_valid(*ptep)) {
            break;
        }
    }
    if (i == CPUWINDOWS_MAX) {
        panic("pmap_map_cpu_windows_copy: out of window");
    }

    /* Kernel-only, never-executable mapping with the access flag preset. */
    pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
    pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

    pte |= wimg_to_pte(wimg_bits, ptoa(pn));

    if (prot & VM_PROT_WRITE) {
        pte |= ARM_PTE_AP(AP_RWNA);
    } else {
        pte |= ARM_PTE_AP(AP_RONA);
    }
#if HAS_FEAT_XS
    /* XS-attribute mappings require strong (XS-qualified) TLBI sequences. */
    need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
    write_pte_fast(ptep, pte);
    /*
     * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
     * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
     */
    FLUSH_PTE_STRONG();
    /* Note: the TLBI uses the *previous* occupant's strong-sync requirement. */
    PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
    pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

    return i;
}
8912
/*
 * Map physical page 'pn' into a per-CPU copy window; dispatches to the PPL
 * on monitor-enabled systems, otherwise calls the implementation directly.
 * Returns the window index for later unmapping.
 */
unsigned int
pmap_map_cpu_windows_copy(
    ppnum_t pn,
    vm_prot_t prot,
    unsigned int wimg_bits)
{
#if XNU_MONITOR
    return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
    return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8925
/*
 * Tear down the current CPU's copy window at 'index': clear its PTE and
 * invalidate the corresponding TLB entry.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
    unsigned int index)
{
    pt_entry_t *ptep;
    unsigned int cpu_num;
    vm_offset_t cpu_copywindow_vaddr = 0;
    pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

    cpu_num = pmap_cpu_data->cpu_number;

    /* pmap_cpu_windows_copy_addr() panics on an out-of-range index. */
    cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
    /* Issue full-system DSB to ensure prior operations on the per-CPU window
     * (which are likely to have been on I/O memory) are complete before
     * tearing down the mapping. */
    __builtin_arm_dsb(DSB_SY);
    ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
    write_pte_strong(ptep, ARM_PTE_EMPTY);
    /* Use the strong-sync requirement recorded when the window was mapped. */
    PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8946
/*
 * Release a per-CPU copy window previously obtained from
 * pmap_map_cpu_windows_copy(); dispatches to the PPL when enabled.
 */
void
pmap_unmap_cpu_windows_copy(
    unsigned int index)
{
#if XNU_MONITOR
    return pmap_unmap_cpu_windows_copy_ppl(index);
#else
    return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8957
8958 #if XNU_MONITOR
8959
/*
 * Stub: on this configuration the callback is never invoked and all
 * arguments are ignored.
 */
MARK_AS_PMAP_TEXT void
pmap_invoke_with_page(
    ppnum_t page_number,
    void *ctx,
    void (*callback)(void *ctx, ppnum_t page_number, const void *page))
{
#pragma unused(page_number, ctx, callback)
}
8968
8969 /*
8970 * Loop over every pmap_io_range (I/O ranges marked as owned by
8971 * the PPL in the device tree) and conditionally call callback() on each range
8972 * that needs to be included in the hibernation image.
8973 *
8974 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8975 * context is needed in the callback.
8976 * @param callback Callback function invoked on each range (gated by flag).
8977 */
8978 MARK_AS_PMAP_TEXT void
8979 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8980 {
8981 extern const pmap_io_range_t* io_attr_table;
8982 extern const unsigned int num_io_rgns;
8983 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8984 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8985 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8986 }
8987 }
8988 }
8989
8990 /**
8991 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8992 * PPL-owned page. Otherwise, do nothing.
8993 *
8994 * @param addr Physical address of the page to set the HASHED flag on.
8995 */
8996 MARK_AS_PMAP_TEXT void
8997 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8998 {
8999 /* Ignore non-managed kernel memory. */
9000 if (!pa_valid(addr)) {
9001 return;
9002 }
9003
9004 const unsigned int pai = pa_index(addr);
9005 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
9006 pv_entry_t **pv_h = pai_to_pvh(pai);
9007
9008 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
9009 pvh_lock(pai);
9010 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
9011 pvh_unlock(pai);
9012 }
9013 }
9014
9015 /**
9016 * Loop through every physical page in the system and clear out the HASHED flag
9017 * on every PPL-owned page. That flag is used to keep track of which pages have
9018 * been hashed into the hibernation image during the hibernation entry process.
9019 *
9020 * The HASHED flag needs to be cleared out between hibernation cycles because the
9021 * pv_head_table and pp_attr_table's might have been copied into the hibernation
9022 * image with the HASHED flag set on certain pages. It's important to clear the
9023 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
9024 * into the hibernation image can't be compromised across hibernation cycles.
9025 */
9026 MARK_AS_PMAP_TEXT void
9027 pmap_clear_ppl_hashed_flag_all(void)
9028 {
9029 const unsigned int last_index = pa_index(vm_last_phys);
9030 pv_entry_t **pv_h = NULL;
9031
9032 for (int pai = 0; pai < last_index; ++pai) {
9033 pv_h = pai_to_pvh(pai);
9034
9035 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
9036 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
9037 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
9038 pvh_lock(pai);
9039 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
9040 pvh_unlock(pai);
9041 }
9042 }
9043 }
9044
9045 /**
9046 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9047 * ppl_hib driver will call this after all wired pages have been copied into the
9048 * hibernation image.
9049 */
9050 MARK_AS_PMAP_TEXT void
9051 pmap_check_ppl_hashed_flag_all(void)
9052 {
9053 const unsigned int last_index = pa_index(vm_last_phys);
9054 pv_entry_t **pv_h = NULL;
9055
9056 for (int pai = 0; pai < last_index; ++pai) {
9057 pv_h = pai_to_pvh(pai);
9058
9059 /**
9060 * The PMAP stacks are explicitly not saved into the image so skip checking
9061 * the pages that contain the PMAP stacks.
9062 */
9063 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9064 (pai < pa_index(pmap_stacks_end_pa));
9065
9066 if (!is_pmap_stack &&
9067 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9068 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9069 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9070 }
9071 }
9072 }
9073
9074 #endif /* XNU_MONITOR */
9075
9076 /*
9077 * Indicate that a pmap is intended to be used as a nested pmap
9078 * within one or more larger address spaces. This must be set
9079 * before pmap_nest() is called with this pmap as the 'subordinate'.
9080 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
    pmap_t pmap)
{
    validate_pmap_mutable(pmap);
    /*
     * Only an ordinary user pmap may become nested, and the CAS guarantees
     * the conversion happens at most once even under concurrent callers.
     */
    if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
        panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
            __func__, pmap, pmap->type);
    }

#if XNU_MONITOR
    /**
     * The "seq_cst" ordering of the atomic load here guarantees
     * the check below is performed after the type update above
     * is observed. Together with similar order guarantee at
     * pmap_switch_internal(), it makes sure a pmap is never
     * active-and-nested:
     *
     * pmap_set_nested() | pmap_switch()
     * --------------------------------------
     * set nested | set active
     * store-load barrier| store-load barrier
     * assert !active | assert !nested
     */
    const int max_cpu = ml_get_max_cpu_number();
    for (unsigned int i = 0; i <= max_cpu; ++i) {
        /* Per-CPU data may not exist for every possible CPU number. */
        const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
        if (cpu_data == NULL) {
            continue;
        }
        if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
            panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
        }
    }
#endif /* XNU_MONITOR */

    /**
     * Ensure that a (potentially concurrent) call to pmap_set_shared_region() hasn't tried
     * to give this pmap its own nested pmap.
     */
    if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
        panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
    }

    /*
     * Release the pmap's address-space ID via the per-type ops table
     * (NOTE(review): presumably a nested pmap is never directly activated
     * and so no longer needs its own ID — confirm against free_id impl).
     */
    pmap_get_pt_ops(pmap)->free_id(pmap);
}
9127
/*
 * Public entry point for marking a pmap as nested; dispatches to the PPL on
 * monitor-enabled systems. See the comment above pmap_set_nested_internal().
 */
__mockable void
pmap_set_nested(
    pmap_t pmap)
{
#if XNU_MONITOR
    pmap_set_nested_ppl(pmap);
#else
    pmap_set_nested_internal(pmap);
#endif
}
9138
9139 bool
9140 pmap_is_nested(
9141 pmap_t pmap)
9142 {
9143 return pmap->type == PMAP_TYPE_NESTED;
9144 }
9145
9146 /*
9147 * pmap_trim_range(pmap, start, end)
9148 *
9149 * pmap = pmap to operate on
9150 * start = start of the range
9151 * end = end of the range
9152 *
9153 * Attempts to deallocate TTEs for the given range in the nested range.
9154 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
    pmap_t pmap,
    addr64_t start,
    addr64_t end)
{
    addr64_t cur;
    addr64_t nested_region_start;
    addr64_t nested_region_end;
    addr64_t adjusted_start;
    addr64_t adjusted_end;
    addr64_t adjust_offmask;
    tt_entry_t * tte_p;
    pt_entry_t * pte_p;
    __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

    if (__improbable(end < start)) {
        panic("%s: invalid address range, "
            "pmap=%p, start=%p, end=%p",
            __func__,
            pmap, (void*)start, (void*)end);
    }

    nested_region_start = pmap->nested_region_addr;
    nested_region_end = nested_region_start + pmap->nested_region_size;

    /* The range to trim must lie entirely within the nested region. */
    if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
        panic("%s: range outside nested region %p-%p, "
            "pmap=%p, start=%p, end=%p",
            __func__, (void *)nested_region_start, (void *)nested_region_end,
            pmap, (void*)start, (void*)end);
    }

    /* Contract the range to TT page boundaries. */
    adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
    adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
    adjusted_end = end & ~adjust_offmask;

    /*
     * Iterate over the range, trying to remove TTEs.
     * The 'cur >= adjusted_start' term guards against wrap-around of the
     * loop variable.
     */
    for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
        pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

        tte_p = pmap_tte(pmap, cur);

        if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
            pte_p = (pt_entry_t *) ttetokv(*tte_p);

            /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
            if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
                /* Deallocate for the nested map. */
                pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
            } else if (pmap->type == PMAP_TYPE_USER) {
                /**
                 * Just remove for the parent map. If the leaf table pointed
                 * to by the TTE being removed (owned by the nested pmap)
                 * has any mappings, then this call will panic. This
                 * enforces the policy that tables being trimmed must be
                 * empty to prevent possible use-after-free attacks.
                 */
                pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
            } else {
                panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
            }
        } else {
            /* No valid table at this TTE; only this branch must unlock. */
            pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
        }
    }

    /* Remove empty L2 TTs. */
    adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
    adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

    for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
        /* For each L1 entry in our range... */
        pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

        bool remove_tt1e = true;
        tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
        tt_entry_t * tt2e_start;
        tt_entry_t * tt2e_end;
        tt_entry_t * tt2e_p;
        tt_entry_t tt1e;

        if (tt1e_p == NULL) {
            pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
            continue;
        }

        tt1e = *tt1e_p;

        if (tt1e == ARM_TTE_TYPE_FAULT) {
            pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
            continue;
        }

        /* Scan the full L2 table for any live entries. */
        tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
        tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

        for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
            if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
                /*
                 * If any TTEs are populated, don't remove the
                 * L1 TT.
                 */
                remove_tt1e = false;
            }
        }

        if (remove_tt1e) {
            /* pmap_tte_deallocate() drops the pmap lock. */
            pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
        } else {
            pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
        }
    }
}
9270
9271 /**
9272 * State machine for multi-step pmap trimming. Trimming is the action of
9273 * deallocating the TTEs of the shared region of pmaps down to a given range.
9274 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9275 * disabling preemption for too long. These steps include computing the bounds
9276 * of the shared region, trimming the head of the "grand", trimming the tail of
9277 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9278 * different conditions.
9279 *
9280 * @param grand the pmap in which the pages are nested
9281 * @param subord the pmap from which the pages are shared, or nested
9282 * @param vstart start of the used range in "grand"
9283 * @param size size of the used range
9284 * @param state the current state of the state machine
9285 *
9286 * @return the next state of the state machine, to be used in the next call
9287 * into this function.
9288 */
9289 MARK_AS_PMAP_TEXT pmap_trim_state_t
9290 pmap_trim_internal(
9291 pmap_t grand,
9292 pmap_t subord,
9293 addr64_t vstart,
9294 uint64_t size,
9295 pmap_trim_state_t state)
9296 {
9297 /* Validation needs to be done regardless of state. */
9298 addr64_t vend;
9299
9300 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9301 panic("%s: grand addr wraps around, "
9302 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9303 __func__, grand, subord, (void*)vstart, size, state);
9304 }
9305
9306 validate_pmap_mutable(grand);
9307 validate_pmap(subord);
9308
9309 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9310 panic("%s: subord is of non-nestable type 0x%hhx, "
9311 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9312 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9313 }
9314
9315 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9316 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9317 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9318 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9319 }
9320
9321 if (__improbable(grand->nested_pmap != subord)) {
9322 panic("%s: grand->nested != subord, "
9323 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9324 __func__, grand, subord, (void*)vstart, size, state);
9325 }
9326
9327 if (__improbable((size != 0) &&
9328 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9329 panic("%s: grand range not in nested region, "
9330 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9331 __func__, grand, subord, (void*)vstart, size, state);
9332 }
9333
9334
9335 /* Trimming starts with figuring out the bounds for the grand. */
9336 if (state == PMAP_TRIM_STATE_START) {
9337 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9338
9339 /**
9340 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9341 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9342 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9343 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9344 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9345 * PMAP_TRIM_STATE_DONE.
9346 */
9347 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9348 assert(subord->nested_bounds_set);
9349
9350 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9351 if (!grand->nested_bounds_set) {
9352 /* Inherit the bounds from subord. */
9353 grand->nested_region_true_start = subord->nested_region_true_start;
9354 grand->nested_region_true_end = subord->nested_region_true_end;
9355 grand->nested_bounds_set = true;
9356 }
9357
9358 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9359
9360 /* Now that the grand has bounds, we are done. */
9361 return PMAP_TRIM_STATE_DONE;
9362 }
9363
9364 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9365 if ((!subord->nested_bounds_set) && size) {
9366 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9367 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9368
9369 subord->nested_region_true_start = vstart;
9370 subord->nested_region_true_end = vend;
9371 subord->nested_region_true_start &= ~adjust_offmask;
9372
9373 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9374 panic("%s: padded true end wraps around, "
9375 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9376 __func__, grand, subord, (void*)vstart, size, state);
9377 }
9378
9379 subord->nested_region_true_end &= ~adjust_offmask;
9380 subord->nested_bounds_set = true;
9381 }
9382
9383 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9384 if (subord->nested_bounds_set) {
9385 /* Inherit the bounds from subord. */
9386 grand->nested_region_true_start = subord->nested_region_true_start;
9387 grand->nested_region_true_end = subord->nested_region_true_end;
9388 grand->nested_bounds_set = true;
9389
9390 /* If we know the bounds, we can trim the pmap. */
9391 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9392
9393 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9394 } else {
9395 /* Don't trim if we don't know the bounds. */
9396 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9397
9398 return PMAP_TRIM_STATE_DONE;
9399 }
9400 }
9401
9402 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9403 if (!grand->nested_bounds_set) {
9404 panic("%s: !grand->nested_bounds_set, "
9405 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9406 __func__, grand, subord, (void*)vstart, size, state);
9407 }
9408
9409 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9410 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9411 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9412 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9413 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9414 (unsigned int)grand->nested_no_bounds_ref_state);
9415 }
9416
9417 #if XNU_MONITOR
9418 if (pmap_pending_preemption()) {
9419 return PMAP_TRIM_STATE_GRAND_AFTER;
9420 }
9421 #endif
9422
9423 state = PMAP_TRIM_STATE_GRAND_AFTER;
9424 }
9425
9426 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9427 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9428 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9429 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9430 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9431 (unsigned int)grand->nested_no_bounds_ref_state);
9432 }
9433
9434 #if XNU_MONITOR
9435 if (pmap_pending_preemption()) {
9436 return PMAP_TRIM_STATE_SUBORD;
9437 }
9438 #endif
9439
9440 state = PMAP_TRIM_STATE_SUBORD;
9441 }
9442
9443 /* START state is guaranteed to compute the bounds for the subord. */
9444 if (!subord->nested_bounds_set) {
9445 panic("%s: !subord->nested_bounds_set, "
9446 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9447 __func__, grand, subord, (void*)vstart, size, state);
9448 }
9449
9450 if (state == PMAP_TRIM_STATE_SUBORD) {
9451 /**
9452 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9453 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9454 * called once grand's nested tables have been fully trimmed, and can only be called once
9455 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9456 * the state update is visible only once the preceding trim operation is complete. An
9457 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9458 * but locking within pmap_trim_range() should make that harmless (and all but one will
9459 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9460 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9461 * of the state CAS.
9462 */
9463 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9464 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9465 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9466 (unsigned int)grand->nested_no_bounds_ref_state);
9467 }
9468 pmap_trim_subord(subord);
9469 }
9470
9471 return PMAP_TRIM_STATE_DONE;
9472 }
9473
/*
 * Drop this pmap's "no-bounds" reference on its nested pmap, if it holds one,
 * trimming this pmap's page tables outside the nested region's true bounds
 * and then giving the nested pmap itself a chance to be trimmed.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
    if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
        /* If we have a no bounds ref, we need to drop it. */
        pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
        pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
        /* Snapshot the nested pmap's bounds while holding its lock. */
        boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
        vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
        vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
        pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

        if (nested_bounds_set) {
            /* Trim the regions before the true start and after the true end. */
            pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
            pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
        }
        /*
         * Try trimming the nested pmap, in case we had the
         * last reference.
         */
        pmap_trim_subord(pmap->nested_pmap);
    }
}
9497
9498 /*
9499 * pmap_trim_subord(grand, subord)
9500 *
9501 * grand = pmap that we have nested subord in
9502 * subord = nested pmap we are attempting to trim
9503 *
9504 * Trims subord if possible
9505 */
MARK_AS_PMAP_TEXT static void
pmap_trim_subord(pmap_t subord)
{
    bool contract_subord = false;

    pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

    /* Drop one no-bounds reference held against this subordinate pmap. */
    subord->nested_no_bounds_refcnt--;

    if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
        /* If this was the last no bounds reference, trim subord. */
        contract_subord = true;
    }

    pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

    /* The actual trim happens outside the lock; pmap_trim_range() relocks. */
    if (contract_subord) {
        pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
        pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
    }
}
9527
9528 /**
9529 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9530 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9531 * disabling preemption for too long.
9532 *
9533 * @note When we load the shared region we always create pages tables for the
9534 * entire region. In practice, the shared cache may use just a portion
9535 * of that. Before we know the bounds of the shared region, it can
9536 * already be mapped into processes. Therefore, once the bounds are
9537 * known, "trimming" comes in handy to remove the unnecessary page
9538 * tables in the processes the shared region is mapped in, and eventually
9539 * those in the shared region itself. Note that the shared region must
9540 * be trimmed after the user processes because it has the L3 entries
9541 * everyone else is pointing to.
9542 *
9543 * @param grand the pmap in which the pages are nested
9544 * @param subord the pmap from which the pages are shared, or nested
9545 * @param vstart start of the used range in "grand"
9546 * @param size size of the used range
9547 */
void
pmap_trim(
    pmap_t grand,
    pmap_t subord,
    addr64_t vstart,
    uint64_t size)
{
    pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
    /* On PPL systems, drives the state machine until its done. */
    while (state != PMAP_TRIM_STATE_DONE) {
        __assert_only pmap_trim_state_t old_state = state;
        state = pmap_trim_ppl(grand, subord, vstart, size, state);

        /* Are we making progress? */
        assert(old_state != state);
    }

    /* Trimming frees page tables; verify the ledgers still balance. */
    pmap_ledger_check_balance(grand);
    pmap_ledger_check_balance(subord);
#else
    state = pmap_trim_internal(grand, subord, vstart, size, state);

    /* On non-PPL systems, we expect the implementation to finish in one call. */
    assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9576
9577 #if HAS_APPLE_PAC
/*
 * Sign a user-space pointer with a process-independent PAC key (IA or DA)
 * under the given user JOP key. Interrupts are disabled so the temporary
 * switch to the user JOP key cannot be observed or interrupted mid-window.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
    /* Only the process-independent keys may be used here. */
    if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
        panic("attempt to sign user pointer without process independent key");
    }

    void *res = NULL;
    uint64_t current_intr_state = pmap_interrupts_disable();

    uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

    /*
     * The compiler barriers pin the sign operation inside the
     * enable/disable-JOP-key window so it can't be hoisted or sunk.
     */
    __compiler_materialize_and_prevent_reordering_on(value);
    switch (key) {
    case ptrauth_key_asia:
        res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
        break;
    case ptrauth_key_asda:
        res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
        break;
    default:
        /* Unreachable: key was validated above. */
        __builtin_unreachable();
    }
    __compiler_materialize_and_prevent_reordering_on(res);

    ml_disable_user_jop_key(jop_key, saved_jop_state);

    pmap_interrupts_restore(current_intr_state);

    return res;
}
9609
9610 void *
9611 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9612 {
9613 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9614 }
9615
/**
 * Authenticate a user-space pointer with the given key and discriminator while
 * the caller-supplied user JOP key is temporarily installed.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent keys (IA/DA) may be used to auth user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/* Mask interrupts for the window in which the user JOP key is installed. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Barriers keep the auth operation inside the user-JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9636
9637 void *
9638 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9639 {
9640 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9641 }
9642 #endif /* HAS_APPLE_PAC */
9643
9644 /*
9645 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9646 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9647 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9648 * return value, to indicate where a preempted [un]nest operation should resume.
9649 * When the return value contains the ending address of the nested region with
9650 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9651 */
9652 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9653
9654 /**
9655 * Establishes the pmap associated with a shared region as the nested pmap
9656 * for a top-level user pmap.
9657 *
9658 * @param grand The top-level user pmap
9659 * @param subord The pmap to be set as [grand]'s nested pmap
9660 * @param vstart The base VA of the region to be nested.
9661 * @param size The size (in bytes) of the region to be nested.
9662 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_set_shared_region_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	addr64_t vend;
	uint64_t nested_region_unnested_table_bitmap_size;
	unsigned int* nested_region_unnested_table_bitmap = NULL;
	kern_return_t kr = KERN_SUCCESS;

	validate_pmap_mutable(grand);
	validate_pmap(subord);

#if XNU_MONITOR
	/*
	 * Ordering is important here. validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed. If destruction is already committed,
	 * then the check of ref_count below will cover us. If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord; dropped via pmap_destroy_internal() on failure below. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* Base address and size must both be leaf-table aligned. */
	if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("%s: pmap %p unaligned set_shared_region request 0x%llx, 0x%llx",
		    __func__, grand, vstart, size);
	}
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

	/*
	 * First caller to observe a NULL bitmap allocates it and publishes it,
	 * together with subord's nested-region bounds, under subord's lock.
	 */
	if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		/**
		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
		 */
		nested_region_unnested_table_bitmap_size <<= 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		/* The PPL allocation below is a single page; the bitmap must fit in it. */
		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		/* NOWAIT inside the PPL: on shortage, return and let the caller donate a page. */
		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto done;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto done;
		}

		/* Re-check under the lock; a concurrent caller may have published first. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Use a store-release operation to ensure that the rest of the subord->nested_region_*
			 * fields are initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
	}

	/* Atomically install subord as grand's one and only nested pmap. */
	if (__improbable(!os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst))) {
		panic("%s: attempt to nest pmap %p into pmap %p which already has a nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}
	/**
	 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
	 * into a nested pmap, which would then produce multiple levels of nesting.
	 */
	if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
		panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
	}

done:
	/* Free the local bitmap if we lost the publication race or bailed out early. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
		nested_region_unnested_table_bitmap = NULL;
	}

	if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		/* Drop the reference taken at the top of this function. */
		pmap_destroy_internal(subord);
	}

	return kr;
}
9806
/**
 * Kernel-side entry point that associates the shared-region pmap [subord] with
 * the user pmap [grand], retrying as needed on PPL systems.
 */
__mockable void
pmap_set_shared_region(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);

	pmap_verify_preemptible();
#if XNU_MONITOR
	/*
	 * Retry the PPL call until it succeeds: donate a page on resource
	 * shortage, loop again on KERN_ABORTED (lock contention), and panic on
	 * anything else.
	 */
	do {
		kr = pmap_set_shared_region_ppl(grand, subord, vstart, size);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		} else if ((kr != KERN_SUCCESS) && (kr != KERN_ABORTED)) {
			panic("%s: unexpected return code 0x%x from pmap_set_shared_region_ppl",
			    __func__, kr);
		}
	} while (kr != KERN_SUCCESS);

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_set_shared_region_internal()
	 * will wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	kr = pmap_set_shared_region_internal(grand, subord, vstart, size);
	assert3u(kr, ==, KERN_SUCCESS);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
}
9845
9846 /**
9847 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9848 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9849 * This function operates in 3 main phases:
9850 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9851 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9852 * the mapping range are present in subord.
9853 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9854 * contains pointers to subord's leaf-level pagetable pages for the specified
9855 * VA range.
9856 *
9857 * This function may return early due to pending AST_URGENT preemption; if so
9858 * it will indicate the need to be re-entered.
9859 *
9860 * @note This function requires that [subord] has already been associated with
9861 * [grand] through a call to pmap_set_shared_region().
9862 *
9863 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9864 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9865 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9866 * @param size twig-aligned size of the nesting range
9867 * @param vrestart the twig-aligned starting address of the current call. May contain
9868 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9869 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9870 * KERN_RESOURCE_SHORTAGE on allocation failure.
9871 *
9872 * @return the virtual address at which to restart the operation, possibly including
9873 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9874 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9875 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t *stte_p;
	tt_entry_t *gtte_p;
	int expand_options = 0;
	bool grand_locked = false;

	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (flag bit stripped) must lie within [vstart, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* pmap_set_shared_region() must already have paired subord with grand. */
	if (__improbable(subord != grand->nested_pmap)) {
		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}

#if XNU_MONITOR
	/* Inside the PPL we can't block for pages; fail and let the caller donate one. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	if (__improbable((subord->nested_region_addr + subord->nested_region_size) < vend) ||
	    (subord->nested_region_addr > vstart)) {
		panic("%s: attempt to nest [0x%llx, 0x%llx) in pmap %p outside nested pmap %p bounds [0x%llx, 0x%llx)\n",
		    __func__, vstart, vend, grand, subord, subord->nested_region_addr, subord->nested_region_addr + subord->nested_region_size);
	}
	if (grand->nested_region_size == 0) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared region's bounds
			 * being known. We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		/**
		 * Ensure that we won't exceed the nested_region_unnested_table bitmap bounds established
		 * in pmap_set_shared_region_internal().
		 */
		if (__improbable((vstart < subord->nested_region_addr) ||
		    (vend > (subord->nested_region_addr + subord->nested_region_size)))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			/* Grow grand's recorded nested region to cover the new range. */
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor to subord's "true" (possibly trimmed) bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* A prior preempted call already finished expanding subord; go copy TTEs. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase: expand subord so every twig in the range has a leaf table. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* pmap_expand() may block/allocate, so drop the lock around it. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			/* Yield to pending preemption; caller re-enters at vrestart. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Don't let a lock abort mask a pmap_expand() failure code. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page. Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate. In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	if (vaddr >= true_end) {
		/* Fully done: report the end of the requested range with the flag set. */
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	if (kr != KERN_SUCCESS) {
		/* krp points outside the PPL; pin it while writing the failure code. */
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	return vrestart;
}
10103
/**
 * Kernel-side entry point for nesting [subord]'s mappings into [grand],
 * driving the (possibly preempted/re-entered) implementation to completion.
 */
__mockable kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/* (vend | PMAP_NEST_GRAND) is the completion marker returned by the PPL. */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL could not allocate; donate a page and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor; otherwise we'd spin. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10162
10163 /*
10164 * kern_return_t pmap_unnest(grand, vaddr)
10165 *
10166 * grand = the pmap that will have the virtual range unnested
10167 * vaddr = start of range in pmap to be unnested
10168 * size = size of range in pmap to be unnested
10169 *
10170 */
10171
10172 kern_return_t
10173 pmap_unnest(
10174 pmap_t grand,
10175 addr64_t vaddr,
10176 uint64_t size)
10177 {
10178 return pmap_unnest_options(grand, vaddr, size, 0);
10179 }
10180
10181 /**
10182 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10183 * from a top-level pmap ('grand'). The corresponding mappings in the nested
10184 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10185 * still have the region nested. The mappings in 'grand' will be left empty
10186 * with the assumption that they will be demand-filled by subsequent access faults.
10187 *
10188 * This function operates in 2 main phases:
10189 * 1. Iteration over the nested pmap's mappings for the specified range to mark
10190 * them non-global.
10191 * 2. Clearing of the twig-level TTEs for the address range in grand.
10192 *
10193 * This function may return early due to pending AST_URGENT preemption; if so
10194 * it will indicate the need to be re-entered.
10195 *
10196 * @param grand pmap from which to unnest mappings
10197 * @param vaddr twig-aligned virtual address for the beginning of the nested range
10198 * @param size twig-aligned size of the nested range
10199 * @param vrestart the page-aligned starting address of the current call. May contain
10200 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10201 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10202 * grand is being torn down and step 1) above is not needed.
10203 *
10204 * @return the virtual address at which to restart the operation, possibly including
10205 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
10206 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10207 */
10208 MARK_AS_PMAP_TEXT vm_map_offset_t
10209 pmap_unnest_options_internal(
10210 pmap_t grand,
10211 addr64_t vaddr,
10212 uint64_t size,
10213 vm_map_offset_t vrestart,
10214 unsigned int option)
10215 {
10216 vm_map_offset_t start;
10217 vm_map_offset_t addr;
10218 tt_entry_t *tte_p;
10219 unsigned int current_index;
10220 unsigned int start_index;
10221 unsigned int max_index;
10222 unsigned int entry_count = 0;
10223
10224 addr64_t vend;
10225 addr64_t true_end;
10226 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
10227 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
10228 }
10229 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
10230 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
10231 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
10232 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
10233 }
10234
10235 validate_pmap_mutable(grand);
10236
10237 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10238 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10239 }
10240
10241 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
10242
10243 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10244 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
10245 (unsigned long long)vaddr, (unsigned long long)size);
10246 }
10247
10248 if (__improbable(grand->nested_pmap == NULL)) {
10249 panic("%s: %p has no nested pmap", __func__, grand);
10250 }
10251
10252 true_end = vend;
10253 if (true_end > grand->nested_pmap->nested_region_true_end) {
10254 true_end = grand->nested_pmap->nested_region_true_end;
10255 }
10256
10257 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10258 if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10259 return vrestart;
10260 }
10261
10262 start = vrestart;
10263 if (start < grand->nested_pmap->nested_region_true_start) {
10264 start = grand->nested_pmap->nested_region_true_start;
10265 }
10266 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10267 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10268 bool flush_tlb = false;
10269
10270 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10271 pt_entry_t *bpte, *cpte;
10272
10273 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10274
10275 bpte = pmap_pte(grand->nested_pmap, addr);
10276
10277 /*
10278 * If we've re-entered this function partway through unnesting a leaf region, the
10279 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10280 * the run of PTEs and the adjacent "in-progress" bit will be set.
10281 */
10282 if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10283 testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
10284 /*
10285 * Mark the 'twig' region as being unnested. Every mapping entered within
10286 * the nested pmap in this region will now be marked non-global. Do this
10287 * before marking any of the PTEs within the region as non-global to avoid
10288 * the possibility of pmap_enter() subsequently inserting a global mapping
10289 * in the region, which could lead to a TLB conflict if a non-global entry
10290 * is later inserted for the same VA in a pmap which has fully unnested this
10291 * region.
10292 */
10293 setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10294 setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10295 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10296 pmap_paddr_t pa;
10297 unsigned int pai = 0;
10298 boolean_t managed = FALSE;
10299 pt_entry_t spte;
10300
10301 if (pte_is_valid(*cpte) && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10302 spte = *((volatile pt_entry_t*)cpte);
10303 while (!managed) {
10304 pa = pte_to_pa(spte);
10305 if (!pa_valid(pa)) {
10306 break;
10307 }
10308 pai = pa_index(pa);
10309 pvh_lock(pai);
10310 spte = *((volatile pt_entry_t*)cpte);
10311 pa = pte_to_pa(spte);
10312 if (pai == pa_index(pa)) {
10313 managed = TRUE;
10314 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10315 }
10316 pvh_unlock(pai);
10317 }
10318
10319 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10320 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10321 flush_tlb = true;
10322 }
10323
10324 if (managed) {
10325 pvh_assert_locked(pai);
10326 pvh_unlock(pai);
10327 }
10328 }
10329
10330 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10331 vrestart = addr;
10332 ++entry_count;
10333 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10334 pmap_pending_preemption())) {
10335 goto unnest_subord_done;
10336 }
10337 }
10338 clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10339 }
10340 addr = vlim;
10341 vrestart = addr;
10342 ++entry_count;
10343 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10344 pmap_pending_preemption())) {
10345 break;
10346 }
10347 }
10348
10349 unnest_subord_done:
10350 if (flush_tlb) {
10351 FLUSH_PTE_STRONG();
10352 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10353 }
10354
10355 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10356 if (current_index < max_index) {
10357 return vrestart;
10358 }
10359 }
10360
10361 /*
10362 * invalidate all pdes for segment at vaddr in pmap grand
10363 */
10364 if (vrestart & PMAP_NEST_GRAND) {
10365 addr = vrestart & ~PMAP_NEST_GRAND;
10366 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10367 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10368 }
10369 } else {
10370 addr = vaddr;
10371 vrestart = vaddr | PMAP_NEST_GRAND;
10372 }
10373
10374 /**
10375 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10376 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10377 * upon reentry.
10378 */
10379 if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10380 return vrestart;
10381 }
10382
10383 if (addr < grand->nested_pmap->nested_region_true_start) {
10384 addr = grand->nested_pmap->nested_region_true_start;
10385 }
10386
10387 start = addr;
10388
10389 while (addr < true_end) {
10390 tte_p = pmap_tte(grand, addr);
10391 /*
10392 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10393 * so it's possible that a region we're trying to unnest may not have been
10394 * nested in the first place.
10395 */
10396 if (tte_p != NULL) {
10397 *tte_p = ARM_TTE_TYPE_FAULT;
10398 }
10399 addr += pt_attr_twig_size(pt_attr);
10400 vrestart = addr | PMAP_NEST_GRAND;
10401 ++entry_count;
10402 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10403 pmap_pending_preemption())) {
10404 break;
10405 }
10406 }
10407 if (addr >= true_end) {
10408 vrestart = vend | PMAP_NEST_GRAND;
10409 }
10410
10411 FLUSH_PTE_STRONG();
10412 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10413
10414 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10415
10416 return vrestart;
10417 }
10418
10419 __mockable kern_return_t
10420 pmap_unnest_options(
10421 pmap_t grand,
10422 addr64_t vaddr,
10423 uint64_t size,
10424 unsigned int option)
10425 {
10426 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10427 vm_map_offset_t vend = vaddr + size;
10428
10429 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10430 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10431
10432 pmap_verify_preemptible();
10433 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10434 #if XNU_MONITOR
10435 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10436 #else
10437 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10438 #endif
10439 }
10440
10441 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10442
10443 return KERN_SUCCESS;
10444 }
10445
10446 boolean_t
10447 pmap_adjust_unnest_parameters(
10448 __unused pmap_t p,
10449 __unused vm_map_offset_t *s,
10450 __unused vm_map_offset_t *e)
10451 {
10452 return TRUE; /* to get to log_unnest_badness()... */
10453 }
10454
10455 /**
10456 * Perform any necessary pre-nesting of the parent's shared region at fork()
10457 * time.
10458 *
10459 * @note This should only be called from vm_map_fork().
10460 *
10461 * @param old_pmap The pmap of the parent task.
10462 * @param new_pmap The pmap of the child task.
10463 *
10464 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10465 * KERN_INVALID_ARGUMENT if the arguments were not valid.
10466 */
10467 kern_return_t
10468 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
10469 {
10470 if (old_pmap == NULL || new_pmap == NULL) {
10471 return KERN_INVALID_ARGUMENT;
10472 }
10473 if (old_pmap->nested_pmap == NULL) {
10474 return KERN_SUCCESS;
10475 }
10476 /**
10477 * Obtain the full shared region bounds from the nested pmap. If old_pmap
10478 * hasn't been fully nested yet, its bounds may not yet be configured.
10479 */
10480 pmap_set_shared_region(new_pmap,
10481 old_pmap->nested_pmap,
10482 old_pmap->nested_pmap->nested_region_addr,
10483 old_pmap->nested_pmap->nested_region_size);
10484 return KERN_SUCCESS;
10485 }
10486
10487 /*
10488 * disable no-execute capability on
10489 * the specified pmap
10490 */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Allow future mappings in this pmap to be created executable. */
	pmap->nx_enabled = FALSE;
}
#else
/* On RELEASE kernels, disabling NX is not permitted; this is a no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10505
10506 /*
10507 * flush a range of hardware TLB entries.
10508 * NOTE: assumes the smallest TLB entry in use will be for
10509 * an ARM small page (4K).
10510 */
10511
10512 #if __ARM_RANGE_TLBI__
10513 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10514 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10515 #else
10516 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10517 #endif // __ARM_RANGE_TLBI__
10518 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10519 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10520 "of npages to 32 bits below may truncate.");
10521
/**
 * Asynchronously flush the TLB entries covering [va, va + length) for the
 * given pmap, choosing between a full flush, an ASID flush, a range flush
 * (where __ARM_RANGE_TLBI__ is available), or per-entry flushes based on the
 * number of pages involved. The caller is responsible for the completing
 * barrier (e.g. sync_tlb_flush()/arm64_sync_tlb()).
 *
 * @param va Start of the virtual range to flush.
 * @param length Length of the range in bytes.
 * @param pmap The pmap whose mappings are flushed; its hw_asid tags the TLBIs.
 * @param last_level_only If true, only last-level (leaf) entries need flushing.
 * @param strong Whether a "strong" (prefetcher-draining) flush is required.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * ASID 0 (kernel/global) and nested pmaps (whose entries are
		 * tagged with other pmaps' ASIDs) cannot be flushed by ASID.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: issue one TLBI per page, encoding the ASID in each. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10574
10575 MARK_AS_PMAP_TEXT static void
10576 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10577 {
10578 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10579 }
10580
/**
 * Synchronously flush the kernel TLB entries covering [va, va + length):
 * issue the async per-region flush against the kernel pmap, then wait for
 * completion with sync_tlb_flush().
 *
 * @param va Start of the kernel virtual range to flush.
 * @param length Length of the range in bytes.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10589
10590 unsigned int
10591 pmap_cache_attributes(
10592 ppnum_t pn)
10593 {
10594 pmap_paddr_t paddr;
10595 unsigned int pai;
10596 unsigned int result;
10597 pp_attr_t pp_attr_current;
10598
10599 paddr = ptoa(pn);
10600
10601 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10602
10603 if (!pa_valid(paddr)) {
10604 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10605 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10606 }
10607
10608 result = VM_WIMG_DEFAULT;
10609
10610 pai = pa_index(paddr);
10611
10612 pp_attr_current = pp_attr_table[pai];
10613 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10614 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10615 }
10616 return result;
10617 }
10618
/**
 * Perform the cache maintenance required when a page's WIMG (cacheability)
 * attribute transitions from wimg_bits_prev to wimg_bits_new.
 *
 * @param pn The page whose attribute changed.
 * @param wimg_bits_prev The page's previous WIMG attribute.
 * @param wimg_bits_new The page's new WIMG attribute.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Sync the page's contents when moving away from a cacheable type, so no
	 * stale lines remain once the page is accessed with the new attribute.
	 *
	 * NOTE(review): the final subclause `(new != COPYBACK) || (new != INNERWBACK)`
	 * is a tautology (always true), so *any* transition away from WTHRU syncs.
	 * `&&` may have been intended; the current form is conservatively safe
	 * (extra sync, never a missed one) — confirm before changing.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* A page newly mapped RT (uncacheable) must have its dcache lines cleaned. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10635
/**
 * Change the cache attribute of a single compressor-owned managed page from
 * prev_cacheattr to new_cacheattr, then perform any required cache maintenance.
 *
 * @param pn The page to update; must be a managed (pa_valid) page.
 * @param prev_cacheattr The page's previous cache attribute.
 * @param new_cacheattr The cache attribute to install.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock serializes attribute/mapping updates for this page. */
	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* perform_tlbi == true: flush the TLB immediately rather than deferring. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance runs after the mappings carry the new attribute. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10660
10661 void *
10662 pmap_map_compressor_page(ppnum_t pn)
10663 {
10664 #if __ARM_PTE_PHYSMAP__
10665 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10666 if (cacheattr != VM_WIMG_DEFAULT) {
10667 #if XNU_MONITOR
10668 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10669 #else
10670 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10671 #endif
10672 }
10673 #endif
10674 return (void*)phystokv(ptoa(pn));
10675 }
10676
10677 void
10678 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10679 {
10680 #if __ARM_PTE_PHYSMAP__
10681 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10682 if (cacheattr != VM_WIMG_DEFAULT) {
10683 #if XNU_MONITOR
10684 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10685 #else
10686 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10687 #endif
10688 }
10689 #endif
10690 }
10691
10692 /**
10693 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10694 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10695 *
10696 * @param page_list List of pages to be updated.
10697 * @param cacheattr The new cache attribute.
10698 */
10699 void
10700 pmap_batch_set_cache_attributes(
10701 const unified_page_list_t *page_list,
10702 unsigned int cacheattr)
10703 {
10704 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10705
10706 if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
10707 /**
10708 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
10709 * In an ideal world we would just use these iterator functions within
10710 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
10711 * that means we'll need to take special care to handle pending preemption and
10712 * if necessary return the iterator position out to this function and then re-enter
10713 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
10714 * secure manner. Not impossible, but also not trivial, so unless someone asks for
10715 * this perf improvement on the PPL I'm going to take the lazy approach here.
10716 */
10717 unified_page_list_iterator_t iter;
10718
10719 for (unified_page_list_iterator_init(page_list, &iter);
10720 !unified_page_list_iterator_end(&iter);
10721 unified_page_list_iterator_next(&iter)) {
10722 bool is_fictitious = false;
10723 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10724 if (__probable(!is_fictitious)) {
10725 #if XNU_MONITOR
10726 pmap_set_cache_attributes_ppl(pn, cacheattr);
10727 #else /* !XNU_MONITOR */
10728 pmap_set_cache_attributes_internal(pn, cacheattr);
10729 #endif /* XNU_MONITOR */
10730 }
10731 }
10732 return;
10733 }
10734
10735 if (page_list->upl.upl_size == 0) {
10736 return;
10737 }
10738
10739 batch_set_cache_attr_state_t states;
10740 states.page_index = 0;
10741 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10742 states.tlb_flush_pass_needed = false;
10743 states.rt_cache_flush_pass_needed = false;
10744
10745 /* Verify we are being called from a preemptible context. */
10746 pmap_verify_preemptible();
10747
10748 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10749 #if XNU_MONITOR
10750 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
10751 states, page_list->upl.upl_size, cacheattr);
10752 #else /* !XNU_MONITOR */
10753 states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
10754 states, page_list->upl.upl_size, cacheattr);
10755 #endif /* XNU_MONITOR */
10756 }
10757
10758 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10759 }
10760
10761 /**
10762 * Flushes TLB entries associated with the page specified by paddr, but do not
10763 * issue barriers yet.
10764 *
10765 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10766 */
10767 MARK_AS_PMAP_TEXT static void
10768 pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
10769 {
10770 #if __ARM_PTE_PHYSMAP__
10771 /* Flush the physical aperture mappings. */
10772 const vm_offset_t kva = phystokv(paddr);
10773 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10774 #endif /* __ARM_PTE_PHYSMAP__ */
10775
10776 /* Flush the mappings tracked in the ptes. */
10777 const unsigned int pai = pa_index(paddr);
10778 pv_entry_t **pv_h = pai_to_pvh(pai);
10779
10780 pt_entry_t *pte_p = PT_ENTRY_NULL;
10781 pv_entry_t *pve_p = PV_ENTRY_NULL;
10782
10783 pvh_assert_locked(pai);
10784
10785 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10786 pte_p = pvh_ptep(pv_h);
10787 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10788 pve_p = pvh_pve_list(pv_h);
10789 pte_p = PT_ENTRY_NULL;
10790 }
10791
10792 int pve_ptep_idx = 0;
10793 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10794 if (pve_p != PV_ENTRY_NULL) {
10795 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10796 if (pte_p == PT_ENTRY_NULL) {
10797 goto flush_tlb_skip_pte;
10798 }
10799 }
10800
10801 #ifdef PVH_FLAG_IOMMU
10802 if (pvh_ptep_is_iommu(pte_p)) {
10803 goto flush_tlb_skip_pte;
10804 }
10805 #endif /* PVH_FLAG_IOMMU */
10806 pmap_t pmap = ptep_get_pmap(pte_p);
10807 vm_map_address_t va = ptep_get_va(pte_p);
10808
10809 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
10810 pmap, true, false);
10811
10812 flush_tlb_skip_pte:
10813 pte_p = PT_ENTRY_NULL;
10814 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10815 pve_ptep_idx = 0;
10816 pve_p = pve_next(pve_p);
10817 }
10818 }
10819 }
10820
10821 /**
10822 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10823 *
10824 * @param pai The Physical Address Index of the entry.
10825 * @param cacheattr The new cache attribute.
10826 */
10827 MARK_AS_PMAP_TEXT static void
10828 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10829 {
10830 pvh_assert_locked(pai);
10831
10832 pp_attr_t pp_attr_current, pp_attr_template;
10833 do {
10834 pp_attr_current = pp_attr_table[pai];
10835 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10836
10837 /**
10838 * WIMG bits should only be updated under the PVH lock, but we should do
10839 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10840 */
10841 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10842 }
10843
10844 /**
10845 * Batch updates the cache attributes of a list of pages in three passes.
10846 *
10847 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10848 * In pass two, TLB entries are flushed for each page in the list if necessary.
10849 * In pass three, caches are cleaned for each page in the list if necessary.
10850 *
10851 * When running in PPL, this function may decide to return to the caller in response
10852 * to AST_URGENT.
10853 *
10854 * @param user_page_list List of pages to be updated.
10855 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10856 * @param page_cnt Number of pages in total in user_page_list.
10857 * @param cacheattr The new cache attributes.
10858 *
10859 * @return The new state of the state machine.
10860 */
10861 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10862 pmap_batch_set_cache_attributes_internal(
10863 #if XNU_MONITOR
10864 volatile upl_page_info_t *user_page_list,
10865 #else /* !XNU_MONITOR */
10866 upl_page_info_array_t user_page_list,
10867 #endif /* XNU_MONITOR */
10868 batch_set_cache_attr_state_t states,
10869 unsigned int page_cnt,
10870 unsigned int cacheattr)
10871 {
10872 uint64_t page_index = states.page_index;
10873 uint64_t state = states.state;
10874 bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10875 bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10876
10877 /* For verifying progress. */
10878 __assert_only const uint64_t page_index_old = page_index;
10879 __assert_only const uint64_t state_old = state;
10880
10881 /* Assert page_index and state are within their range. */
10882 if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10883 panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10884 }
10885
10886 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10887 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10888 /* Update cache attributes of the pages until there's an urgent AST or it's done. */
10889 while (page_index < page_cnt) {
10890 const ppnum_t pn = user_page_list[page_index].phys_addr;
10891 const pmap_paddr_t paddr = ptoa(pn);
10892
10893 if (!pa_valid(paddr)) {
10894 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10895 }
10896
10897 const unsigned int pai = pa_index(paddr);
10898
10899 /* Lock the page. */
10900 pvh_lock(pai);
10901
10902 #if XNU_MONITOR
10903 if (ppattr_pa_test_monitor(paddr)) {
10904 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10905 }
10906 #endif /* XNU_MONITOR */
10907 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10908
10909 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10910 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10911 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10912 }
10913
10914 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10915
10916 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10917 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10918 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10919 }
10920
10921 /* Update the cache attributes in PTE and PP_ATTR table. */
10922 if (wimg_bits_new != wimg_bits_prev) {
10923 tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10924 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10925 }
10926
10927 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10928 rt_cache_flush_pass_needed = true;
10929 }
10930
10931 pvh_unlock(pai);
10932
10933 page_index++;
10934
10935 #if XNU_MONITOR
10936 /**
10937 * Check for AST_URGENT every page, as the pve list search in cache
10938 * update can take non-constant time.
10939 */
10940 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10941 goto pbscai_exit;
10942 }
10943 #endif /* XNU_MONITOR */
10944 }
10945
10946 /* page_index == page_cnt && !pmap_pending_preemption() */
10947 if (tlb_flush_pass_needed) {
10948 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10949 } else if (rt_cache_flush_pass_needed) {
10950 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10951 } else {
10952 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10953 }
10954 page_index = 0;
10955
10956 /* Sync the PTE writes before potential TLB/Cache flushes. */
10957 FLUSH_PTE_STRONG();
10958
10959 #if XNU_MONITOR
10960 if (__improbable(pmap_pending_preemption())) {
10961 goto pbscai_exit;
10962 }
10963 #endif /* XNU_MONITOR */
10964 }
10965
10966 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10967 /**
10968 * Pass 2: for each physical page and for each mapping, we need to flush
10969 * the TLB for it.
10970 */
10971 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10972 while (page_index < page_cnt) {
10973 const ppnum_t pn = user_page_list[page_index].phys_addr;
10974
10975 const pmap_paddr_t paddr = ptoa(pn);
10976 if (!pa_valid(paddr)) {
10977 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10978 }
10979
10980 const unsigned int pai = pa_index(paddr);
10981
10982 pvh_lock(pai);
10983 pmap_flush_tlb_for_paddr_locked_async(paddr);
10984 pvh_unlock(pai);
10985
10986 page_index++;
10987
10988 #if XNU_MONITOR
10989 /**
10990 * Check for AST_URGENT every page, as the pve list search in cache
10991 * update can take non-constant time.
10992 */
10993 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10994 goto pbscai_exit;
10995 }
10996 #endif /* XNU_MONITOR */
10997 }
10998
10999 #if HAS_FEAT_XS
11000 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11001 arm64_sync_tlb(false);
11002 #else
11003 /**
11004 * For targets that distinguish between mild and strong DSB, mild DSB
11005 * will not drain the prefetcher. This can lead to prefetch-driven
11006 * cache fills that defeat the uncacheable requirement of the RT memory type.
11007 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11008 */
11009 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
11010 #endif
11011
11012 if (rt_cache_flush_pass_needed) {
11013 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
11014 } else {
11015 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
11016 }
11017 page_index = 0;
11018
11019 #if XNU_MONITOR
11020 if (__improbable(pmap_pending_preemption())) {
11021 goto pbscai_exit;
11022 }
11023 #endif /* XNU_MONITOR */
11024 }
11025
11026 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
11027 /* Pass 3: Flush the cache if the page is recently set to RT */
11028 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
11029 #if !XNU_MONITOR
11030 /**
11031 * On non-PPL platforms, we disable preemption to ensure we are not preempted
11032 * in the state where DC by VA instructions remain enabled.
11033 */
11034 disable_preemption();
11035 #endif /* !XNU_MONITOR */
11036
11037 assert(get_preemption_level() > 0);
11038
11039 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11040 /**
11041 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
11042 * and the host will handle cache maintenance for it. So we don't need to
11043 * worry about enabling the ops here for AVP.
11044 */
11045 enable_dc_mva_ops();
11046 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11047
11048 while (page_index < page_cnt) {
11049 const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
11050
11051 if (!pa_valid(paddr)) {
11052 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
11053 }
11054
11055 CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
11056
11057 page_index++;
11058
11059 #if XNU_MONITOR
11060 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
11061 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11062 disable_dc_mva_ops();
11063 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11064 goto pbscai_exit;
11065 }
11066 #endif /* XNU_MONITOR */
11067 }
11068
11069 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11070 disable_dc_mva_ops();
11071 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11072
11073 #if !XNU_MONITOR
11074 enable_preemption();
11075 #endif /* !XNU_MONITOR */
11076
11077 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
11078 page_index = 0;
11079 }
11080
11081 #if XNU_MONITOR
11082 pbscai_exit:
11083 #endif /* XNU_MONITOR */
11084 /* Assert page_index and state are within their range. */
11085 assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
11086
11087 /* Make sure we are making progress in this call. */
11088 assert(page_index > page_index_old || state > state_old);
11089
11090 batch_set_cache_attr_state_t states_new;
11091 states_new.page_index = page_index;
11092 states_new.state = state;
11093 states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
11094 states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
11095 return states_new;
11096 }
11097
/**
 * Set the cache attribute of a single managed page: install the new WIMG bits
 * in the pp_attr_table, rewrite all mappings if the effective attribute
 * changed, and perform any required cache maintenance.
 *
 * @param pn The page to update; silently ignored if not managed.
 * @param cacheattr The new cache attribute (VM_WIMG_USE_DEFAULT is honored).
 * @param external Whether the request originated outside the PPL; used on
 *                 XNU_MONITOR builds to police PPL page ownership.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;             /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may not touch PPL pages, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Zero WIMG bits in the attribute table mean the default type. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite mappings (and flush TLBs) if the effective type changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance runs after the mappings carry the new attribute. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11160
/* Entry point for externally-originated requests (external == TRUE). */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
11168
/**
 * Set the cache attribute of a single page, dispatching to the PPL on
 * XNU_MONITOR builds and to the internal helper otherwise.
 *
 * @param pn The page to update.
 * @param cacheattr The new cache attribute.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
11180
11181 /**
11182 * Updates the page numbered ppnum to have attribute specified by attributes.
11183 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11184 * The necessity of the TLB flush is returned in case this function is called
11185 * in a batched manner and the TLB flush is intended to be done at a different
11186 * timing.
11187 *
11188 * @param ppnum Page Number of the page to be updated.
11189 * @param attributes The new cache attributes.
11190 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11191 * immediately.
11192 *
11193 * @return Returns true if a TLB flush is needed for this update regardless of
11194 * whether a flush has occurred already.
11195 */
11196 MARK_AS_PMAP_TEXT bool
11197 pmap_update_cache_attributes_locked(
11198 ppnum_t ppnum,
11199 unsigned attributes,
11200 bool perform_tlbi)
11201 {
11202 pmap_paddr_t phys = ptoa(ppnum);
11203 pv_entry_t *pve_p;
11204 pt_entry_t *pte_p;
11205 pv_entry_t **pv_h;
11206 pt_entry_t tmplate;
11207 unsigned int pai;
11208 boolean_t tlb_flush_needed = false;
11209
11210 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
11211
11212 if (pmap_panic_dev_wimg_on_managed) {
11213 switch (attributes & VM_WIMG_MASK) {
11214 case VM_WIMG_IO: // nGnRnE
11215 case VM_WIMG_POSTED: // nGnRE
11216 /* supported on DRAM, but slow, so we disallow */
11217
11218 case VM_WIMG_POSTED_REORDERED: // nGRE
11219 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
11220 /* unsupported on DRAM */
11221
11222 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
11223 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
11224 break;
11225
11226 default:
11227 /* not device type memory, all good */
11228
11229 break;
11230 }
11231 }
11232
11233 #if __ARM_PTE_PHYSMAP__
11234 vm_offset_t kva = phystokv(phys);
11235 pte_p = pmap_pte(kernel_pmap, kva);
11236
11237 tmplate = *pte_p;
11238 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11239 #if XNU_MONITOR
11240 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
11241 #else
11242 tmplate |= wimg_to_pte(attributes, phys);
11243 #endif
11244 if (tmplate & ARM_PTE_HINT_MASK) {
11245 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
11246 __FUNCTION__, pte_p, (void *)kva, tmplate);
11247 }
11248
11249 if (perform_tlbi) {
11250 write_pte_strong(pte_p, tmplate);
11251 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
11252 } else {
11253 write_pte_fast(pte_p, tmplate);
11254 }
11255 tlb_flush_needed = true;
11256 #endif
11257
11258 pai = pa_index(phys);
11259
11260 pv_h = pai_to_pvh(pai);
11261
11262 pte_p = PT_ENTRY_NULL;
11263 pve_p = PV_ENTRY_NULL;
11264 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11265 pte_p = pvh_ptep(pv_h);
11266 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11267 pve_p = pvh_pve_list(pv_h);
11268 pte_p = PT_ENTRY_NULL;
11269 }
11270
11271 int pve_ptep_idx = 0;
11272 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11273 vm_map_address_t va;
11274 pmap_t pmap;
11275
11276 if (pve_p != PV_ENTRY_NULL) {
11277 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11278 if (pte_p == PT_ENTRY_NULL) {
11279 goto cache_skip_pve;
11280 }
11281 }
11282
11283 #ifdef PVH_FLAG_IOMMU
11284 if (pvh_ptep_is_iommu(pte_p)) {
11285 goto cache_skip_pve;
11286 }
11287 #endif
11288 pmap = ptep_get_pmap(pte_p);
11289 #if HAS_FEAT_XS
11290 /**
11291 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11292 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11293 */
11294 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11295 #endif /* HAS_FEAT_XS */
11296 va = ptep_get_va(pte_p);
11297
11298 tmplate = *pte_p;
11299 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11300 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11301
11302 if (perform_tlbi) {
11303 write_pte_strong(pte_p, tmplate);
11304 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11305 pmap, true, false);
11306 } else {
11307 write_pte_fast(pte_p, tmplate);
11308 }
11309 tlb_flush_needed = true;
11310
11311 cache_skip_pve:
11312 pte_p = PT_ENTRY_NULL;
11313 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11314 pve_ptep_idx = 0;
11315 pve_p = pve_next(pve_p);
11316 }
11317 }
11318 if (perform_tlbi && tlb_flush_needed) {
11319 #if HAS_FEAT_XS
11320 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11321 arm64_sync_tlb(false);
11322 #else
11323 /**
11324 * For targets that distinguish between mild and strong DSB, mild DSB
11325 * will not drain the prefetcher. This can lead to prefetch-driven
11326 * cache fills that defeat the uncacheable requirement of the RT memory type.
11327 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11328 */
11329 arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11330 #endif
11331 }
11332
11333 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11334
11335 return tlb_flush_needed;
11336 }
11337
11338 /**
11339 * Mark a pmap as being dedicated to use for a commpage mapping.
11340 * The pmap itself will never be activated on a CPU; its mappings will
11341 * only be embedded in userspace pmaps at a fixed virtual address.
11342 *
11343 * @param pmap the pmap to mark as belonging to a commpage.
11344 */
11345 static void
11346 pmap_set_commpage(pmap_t pmap)
11347 {
11348 #if XNU_MONITOR
11349 assert(!pmap_ppl_locked_down);
11350 #endif
11351 assert(pmap->type == PMAP_TYPE_USER);
11352 pmap->type = PMAP_TYPE_COMMPAGE;
11353 /*
11354 * Free the pmap's ASID. This pmap should not ever be directly
11355 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
11356 * ASID space contention but will also cause pmap_switch() to panic
11357 * if an attacker tries to activate this pmap. Disable preemption to
11358 * accommodate the *_nopreempt spinlock in free_asid().
11359 */
11360 mp_disable_preemption();
11361 pmap_get_pt_ops(pmap)->free_id(pmap);
11362 mp_enable_preemption();
11363 }
11364
11365 static void
11366 pmap_update_tt3e(
11367 pmap_t pmap,
11368 vm_address_t address,
11369 tt_entry_t template)
11370 {
11371 tt_entry_t *ptep, pte;
11372
11373 ptep = pmap_tt3e(pmap, address);
11374 if (ptep == NULL) {
11375 panic("%s: no ptep?", __FUNCTION__);
11376 }
11377
11378 pte = *ptep;
11379 pte = tte_to_pa(pte) | template;
11380 write_pte_strong(ptep, pte);
11381 }
11382
/*
 * PTE template for commpage data pages: writeback-cacheable, read-only,
 * non-executable at both EL1 (PNX) and EL0 (NX). Note absence of non-global
 * bit — the mapping is shared across all ASIDs.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	| ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text page. Note absence of non-global bit
 * and no-execute bit (ARM_PTE_NX), allowing EL0 execution; PNX still
 * forbids kernel execution.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	| ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11394
/**
 * Allocate the physical page(s) backing the commpage and build the dedicated
 * commpage pmap(s) whose translation tables are later nested into every user
 * pmap (see pmap_insert_commpage_internal()).
 *
 * @param kernel_data_addr [out] kernel VA of the commpage data page.
 * @param kernel_text_addr [out] kernel VA of the commpage text page, or 0 when
 *        no text page is configured (CONFIG_ARM_PFZ disabled).
 * @param kernel_ro_data_addr [out] kernel VA of the kernel RO-data commpage
 *        (aliases the data page on non-PPL configurations).
 * @param user_text_addr [out] user VA of the commpage text page, or 0 when no
 *        text page is configured.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, the backing pages are allocated and zeroed explicitly. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the PTE with the global (non-nG) read-only commpage template. */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index to slide the text page within its L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11545
11546
11547 /*
11548 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11549 * with user controlled TTEs for regions that aren't explicitly reserved by the
11550 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11551 */
11552 #if (ARM_PGSHIFT == 14)
11553 /**
11554 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11555 * commpage completely above the maximum 32-bit userspace VA.
11556 */
11557 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11558
11559 /**
11560 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11561 * userspace VAs can nest the commpage completely above the maximum 64-bit
11562 * userpace VA, but that technically isn't true on macOS. On those systems, the
11563 * commpage lives within the userspace VA range, but is protected by the VM as
11564 * a reserved region (see vm_reserved_regions[] definition for more info).
11565 */
11566
11567 #elif (ARM_PGSHIFT == 12)
11568 /**
11569 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11570 * above the maximum userspace VA.
11571 */
11572 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11573 #else
11574 #error Nested shared page mapping is unsupported on this config
11575 #endif
11576
/**
 * Nest the commpage pmap's translation tables into the given user pmap at the
 * fixed commpage virtual address, sharing the pre-built page tables rather
 * than allocating new ones per process.
 *
 * @param pmap the user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success. KERN_RESOURCE_SHORTAGE (XNU_MONITOR only)
 *         or KERN_ABORTED may be returned if pmap_expand() fails with those
 *         codes; any other expansion failure panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL we cannot block for memory; the caller retries on shortage. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() may block (or fail under NOWAIT), so drop the lock first. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Point the user pmap's twig entry at the commpage pmap's shared table. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11679
/**
 * Remove the nested commpage mapping from a user pmap by clearing the twig
 * entry that points at the shared commpage page tables, then flushing the TLB
 * for the commpage VA.
 *
 * @param pmap the user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	if (ttep == NULL) {
		/* No twig table here at all, so nothing is nested; nothing to undo. */
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any stale commpage translations for this pmap's ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11736
/**
 * Kernel-facing wrapper that nests the commpage into a user pmap, retrying
 * transient failures until the mapping is established. Panics if the insert
 * fails with a non-retryable error.
 *
 * @param pmap the user pmap to receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/*
	 * The PPL cannot allocate pages itself; on KERN_RESOURCE_SHORTAGE, donate
	 * a page to the PPL from outside and retry. KERN_ABORTED is also retried.
	 */
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Non-PPL: only KERN_ABORTED is transient; retry until it clears. */
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11765
11766 static boolean_t
11767 pmap_is_64bit(
11768 pmap_t pmap)
11769 {
11770 return pmap->is_64bit;
11771 }
11772
11773 bool
11774 pmap_is_exotic(
11775 pmap_t pmap __unused)
11776 {
11777 return false;
11778 }
11779
11780
11781 /* ARMTODO -- an implementation that accounts for
11782 * holes in the physical map, if any.
11783 */
11784 boolean_t
11785 pmap_valid_page(
11786 ppnum_t pn)
11787 {
11788 return pa_valid(ptoa(pn));
11789 }
11790
11791 boolean_t
11792 pmap_bootloader_page(
11793 ppnum_t pn)
11794 {
11795 pmap_paddr_t paddr = ptoa(pn);
11796
11797 if (pa_valid(paddr)) {
11798 return FALSE;
11799 }
11800 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11801 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11802 }
11803
/**
 * Determine whether the VA range [va_start, va_end) in the given pmap contains
 * no valid leaf mappings.
 *
 * @param pmap the pmap to scan; NULL is treated as trivially empty.
 * @param va_start inclusive start of the range.
 * @param va_end exclusive end of the range.
 *
 * @return TRUE if no non-empty PTE exists in the range, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample not_in_kdp once so lock and unlock decisions stay consistent. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-table window at a time. */
	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11863
/**
 * Public wrapper for pmap_is_empty_internal(); on PPL systems the check is
 * performed inside the PPL.
 *
 * @param pmap the pmap to scan (NULL is trivially empty).
 * @param va_start inclusive start of the range.
 * @param va_end exclusive end of the range.
 *
 * @return TRUE if the range contains no valid mappings, FALSE otherwise.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11876
11877 vm_map_offset_t
11878 pmap_max_offset(
11879 boolean_t is64,
11880 unsigned int option)
11881 {
11882 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11883 }
11884
/**
 * Return the maximum user VA offset for a 64-bit address space according to
 * the requested policy option. Panics on an unrecognized option; only
 * compiled meaningfully for arm64.
 *
 * @param option one of the ARM_PMAP_MAX_OFFSET_* selectors.
 * @return the maximum 64-bit user VA offset for that policy.
 */
vm_map_offset_t
pmap_max_64bit_offset(
	__unused unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

#if defined(__arm64__)
	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm64_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = min_max_offset;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		/* Device policy scales the VA ceiling with the amount of physical memory. */
		if (arm64_pmap_max_offset_default) {
			max_offset_ret = arm64_pmap_max_offset_default;
		} else if (max_mem > 0xC0000000) {
			// devices with > 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
		} else if (max_mem > 0x40000000) {
			// devices with > 1GB and <= 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
		} else {
			// devices with <= 1 GB of memory
			max_offset_ret = min_max_offset;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		if (arm64_pmap_max_offset_default) {
			// Allow the boot-arg to override jumbo size
			max_offset_ret = arm64_pmap_max_offset_default;
		} else {
			max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
		}
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	} else {
		panic("pmap_max_64bit_offset illegal option 0x%x", option);
	}

	/* DEFAULT may legitimately return 0 (no boot-arg); all others have a floor. */
	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
		assert(max_offset_ret >= min_max_offset);
	}
#else
	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
#endif

	return max_offset_ret;
}
11937
11938 vm_map_offset_t
11939 pmap_max_32bit_offset(
11940 unsigned int option)
11941 {
11942 vm_map_offset_t max_offset_ret = 0;
11943
11944 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11945 max_offset_ret = arm_pmap_max_offset_default;
11946 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11947 max_offset_ret = VM_MAX_ADDRESS;
11948 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11949 max_offset_ret = VM_MAX_ADDRESS;
11950 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11951 if (arm_pmap_max_offset_default) {
11952 max_offset_ret = arm_pmap_max_offset_default;
11953 } else if (max_mem > 0x20000000) {
11954 max_offset_ret = VM_MAX_ADDRESS;
11955 } else {
11956 max_offset_ret = VM_MAX_ADDRESS;
11957 }
11958 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11959 max_offset_ret = VM_MAX_ADDRESS;
11960 } else {
11961 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11962 }
11963
11964 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11965 return max_offset_ret;
11966 }
11967
11968 #if CONFIG_DTRACE
11969 /*
11970 * Constrain DTrace copyin/copyout actions
11971 */
11972 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11973 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11974
11975 kern_return_t
11976 dtrace_copyio_preflight(
11977 __unused addr64_t va)
11978 {
11979 if (current_map() == kernel_map) {
11980 return KERN_FAILURE;
11981 } else {
11982 return KERN_SUCCESS;
11983 }
11984 }
11985
/**
 * DTrace copyin/copyout postflight hook: no cleanup is required on this
 * architecture, so always succeed.
 *
 * @param va unused target address.
 * @return KERN_SUCCESS unconditionally.
 */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11992 #endif /* CONFIG_DTRACE */
11993
11994
/**
 * Initialize a deferred-flush context. Deferred TLB flushing is not
 * implemented on this architecture (see pmap_flush()), so there is no state
 * to set up.
 *
 * @param pfc unused flush context.
 */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11999
12000
12001 void
12002 pmap_flush(
12003 __unused pmap_flush_context *cpus_to_flush)
12004 {
12005 /* not implemented yet */
12006 return;
12007 }
12008
12009 #if XNU_MONITOR
12010
12011 /*
12012 * Enforce that the address range described by kva and nbytes is not currently
12013 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
12014 * unintentionally writing to PPL-owned memory.
12015 */
12016 void
12017 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
12018 {
12019 vm_offset_t end;
12020 if (os_add_overflow(kva, nbytes, &end)) {
12021 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12022 }
12023 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12024 pmap_paddr_t pa = kvtophys_nofail(ckva);
12025 unsigned int pai = pa_index(pa);
12026 pp_attr_t attr;
12027 if (__improbable(!pa_valid(pa))) {
12028 panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12029 }
12030 pvh_lock(pai);
12031 if (__improbable(ckva == phystokv(pa))) {
12032 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12033 }
12034 do {
12035 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12036 if (__improbable(attr & PP_ATTR_MONITOR)) {
12037 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12038 }
12039 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12040 pvh_unlock(pai);
12041 if (__improbable(kvtophys_nofail(ckva) != pa)) {
12042 panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12043 }
12044 }
12045 }
12046
/**
 * Release the pin taken by pmap_pin_kernel_pages() on each page in the range,
 * clearing PP_ATTR_NO_MONITOR so the pages may become PPL-owned again.
 * Panics if any page in the range was not pinned.
 *
 * @param kva base kernel VA of the range to unpin.
 * @param nbytes length of the range in bytes.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A page cannot be both pinned against the PPL and PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
12064
12065 /**
12066 * Lock down a page, making all mappings read-only, and preventing further
12067 * mappings or removal of this particular kva's mapping. Effectively, it makes
12068 * the physical page at kva immutable (see the ppl_writable parameter for an
12069 * exception to this).
12070 *
12071 * @param kva Valid address to any mapping of the physical page to lockdown.
12072 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12073 * @param ppl_writable True if the PPL should still be able to write to the page
12074 * using the physical aperture mapping. False will make the
12075 * page read-only for both the kernel and PPL in the
12076 * physical aperture.
12077 */
12078
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown demotes all existing alias mappings to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12084
12085 /**
12086 * Lock down a page, giving all mappings the specified maximum permissions, and
12087 * preventing further mappings or removal of this particular kva's mapping.
12088 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12089 * parameter for an exception to this).
12090 *
12091 * @param kva Valid address to any mapping of the physical page to lockdown.
12092 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12093 * @param ppl_writable True if the PPL should still be able to write to the page
12094 * using the physical aperture mapping. False will make the
12095 * page read-only for both the kernel and PPL in the
12096 * physical aperture.
12097 * @param prot Maximum permissions to allow in existing alias mappings
12098 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must supply exactly a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot additionally be locked down by the kernel. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Double-lockdown and lockdown of executable pages are both disallowed. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote every existing alias mapping of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12146
12147 /**
12148 * Helper for releasing a page from being locked down to the PPL, making it writable to the
12149 * kernel once again.
12150 *
12151 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12152 * to unlockdown a page that was never locked down, will panic.
12153 *
12154 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
12155 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12156 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12157 * passed to the paired pmap_ppl_lockdown_page() call. Any
12158 * deviation will result in a panic.
12159 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlocking a page that was never locked down with this flag is fatal. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12180
12181 /**
12182 * Release a page from being locked down to the PPL, making it writable to the
12183 * kernel once again.
12184 *
12185 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12186 * to unlockdown a page that was never locked down, will panic.
12187 *
12188 * @param kva Valid address to any mapping of the physical page to unlockdown.
12189 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12190 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12191 * passed to the paired pmap_ppl_lockdown_page() call. Any
12192 * deviation will result in a panic.
12193 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Translate the KVA and take the PV head lock before releasing the pin. */
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
12205
12206 #else /* XNU_MONITOR */
12207
/* Without a PPL there is no monitor ownership to pin against; no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12212
/* Counterpart to the non-PPL pmap_pin_kernel_pages() stub; no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12217
12218 #endif /* !XNU_MONITOR */
12219
12220
/**
 * Lock down a range of pages for code-signing purposes. On PPL systems the
 * CS lockdown-reason flag is recorded; otherwise no reason flag is passed.
 *
 * @param kva base kernel VA of the range to lock down.
 * @param size length of the range in bytes.
 * @param ppl_writable whether the PPL retains write access via the physical
 *        aperture after lockdown.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12230
/**
 * Release a code-signing lockdown taken by pmap_cs_lockdown_pages(), using
 * the matching lockdown-reason flag for the configuration.
 *
 * @param kva base kernel VA of the range to release.
 * @param size length of the range in bytes.
 * @param ppl_writable must match the value passed at lockdown time.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12240
12241 /**
12242 * Perform basic validation checks on the destination only and
12243 * corresponding offset/sizes prior to writing to a read only allocation.
12244 *
12245 * @note Should be called before writing to an allocation from the read
12246 * only allocator.
12247 *
12248 * @param zid The ID of the zone the allocation belongs to.
12249 * @param va VA of element being modified (destination).
12250 * @param offset Offset being written to, in the element.
12251 * @param new_data_size Size of modification.
12252 *
12253 */
12254
12255 MARK_AS_PMAP_TEXT static void
12256 pmap_ro_zone_validate_element_dst(
12257 zone_id_t zid,
12258 vm_offset_t va,
12259 vm_offset_t offset,
12260 vm_size_t new_data_size)
12261 {
12262 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12263 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12264 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12265 }
12266
12267 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12268
12269 /* Check element is from correct zone and properly aligned */
12270 zone_require_ro(zid, elem_size, (void*)va);
12271
12272 if (__improbable(new_data_size > (elem_size - offset))) {
12273 panic("%s: New data size %lu too large for elem size %lu at addr %p",
12274 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12275 }
12276 if (__improbable(offset >= elem_size)) {
12277 panic("%s: Offset %lu too large for elem size %lu at addr %p",
12278 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12279 }
12280 }
12281
12282
12283 /**
12284 * Perform basic validation checks on the source, destination and
12285 * corresponding offset/sizes prior to writing to a read only allocation.
12286 *
12287 * @note Should be called before writing to an allocation from the read
12288 * only allocator.
12289 *
12290 * @param zid The ID of the zone the allocation belongs to.
12291 * @param va VA of element being modified (destination).
12292 * @param offset Offset being written to, in the element.
12293 * @param new_data Pointer to new data (source).
12294 * @param new_data_size Size of modification.
12295 *
12296 */
12297
12298 MARK_AS_PMAP_TEXT static void
12299 pmap_ro_zone_validate_element(
12300 zone_id_t zid,
12301 vm_offset_t va,
12302 vm_offset_t offset,
12303 const vm_offset_t new_data,
12304 vm_size_t new_data_size)
12305 {
12306 vm_offset_t sum = 0;
12307
12308 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12309 panic("%s: Integer addition overflow %p + %lu = %lu",
12310 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12311 }
12312
12313 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12314 }
12315
12316 /**
12317 * Ensure that physical page is locked down before writing to it.
12318 *
12319 * @note Should be called before writing to an allocation from the read
12320 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12321 * ensure that it is called after the modification.
12322 *
12323 *
12324 * @param pa Physical address of the element being modified.
12325 * @param va Virtual address of element being modified.
12326 * @param size Size of the modification.
12327 *
12328 */
12329
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t pa,
	vm_offset_t va,
	vm_size_t size)
{
	/* RO-zone modifications operate on a single physical page at a time. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	/* Held until the paired pmap_ro_zone_unlock_phy_page() call. */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12351
12352 /**
12353 * Unlock physical page after writing to it.
12354 *
12355 * @note Should be called after writing to an allocation from the read
12356 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12357 * ensure that it has been called prior to the modification.
12358 *
12359 * @param pa Physical address of the element that was modified.
12360 * @param va Virtual address of element that was modified.
12361 * @param size Size of the modification.
12362 *
12363 */
12364
12365 MARK_AS_PMAP_TEXT static void
12366 pmap_ro_zone_unlock_phy_page(
12367 const pmap_paddr_t pa,
12368 vm_offset_t va __unused,
12369 vm_size_t size __unused)
12370 {
12371 const unsigned int pai = pa_index(pa);
12372 pvh_unlock(pai);
12373 }
12374
12375 /**
12376 * Function to copy kauth_cred from new_data to kv.
12377 * Function defined in "kern_prot.c"
12378 *
12379 * @note Will be removed upon completion of
12380 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12381 *
12382 * @param kv Address to copy new data to.
12383 * @param new_data Pointer to new data.
12384 *
12385 */
12386
12387 extern void
12388 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12389
12390 /**
12391 * Zalloc-specific memcpy that writes through the physical aperture
12392 * and ensures the element being modified is from a read-only zone.
12393 *
12394 * @note Designed to work only with the zone allocator's read-only submap.
12395 *
12396 * @param zid The ID of the zone to allocate from.
12397 * @param va VA of element to be modified.
12398 * @param offset Offset from element.
12399 * @param new_data Pointer to new data.
12400 * @param new_data_size Size of modification.
12401 *
12402 */
12403
void
pmap_ro_zone_memcpy(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size)
{
#if XNU_MONITOR
	/* RO-zone writes must be performed from within the PPL. */
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12418
12419 MARK_AS_PMAP_TEXT void
12420 pmap_ro_zone_memcpy_internal(
12421 zone_id_t zid,
12422 vm_offset_t va,
12423 vm_offset_t offset,
12424 const vm_offset_t new_data,
12425 vm_size_t new_data_size)
12426 {
12427 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12428
12429 if (!new_data || new_data_size == 0) {
12430 return;
12431 }
12432
12433 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12434 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12435 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12436 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12437 }
12438
12439 /**
12440 * Zalloc-specific function to atomically mutate fields of an element that
12441 * belongs to a read-only zone, via the physcial aperture.
12442 *
12443 * @note Designed to work only with the zone allocator's read-only submap.
12444 *
12445 * @param zid The ID of the zone the element belongs to.
12446 * @param va VA of element to be modified.
12447 * @param offset Offset in element.
12448 * @param op Atomic operation to perform.
12449 * @param value Mutation value.
12450 *
12451 */
12452
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
#if XNU_MONITOR
	/* RO-zone mutations must be performed from within the PPL. */
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12467
/*
 * Implementation of pmap_ro_zone_atomic_op(): validate the destination and
 * perform the atomic mutation through the physical aperture under the PV
 * head lock.
 *
 * Returns the value produced by __zalloc_ro_mut_atomic for the operation.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low nibble of op carries the access width (per zro_atomic_op_t encoding). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12486
12487 /**
12488 * bzero for allocations from read only zones, that writes through the
12489 * physical aperture.
12490 *
12491 * @note This is called by the zfree path of all allocations from read
12492 * only zones.
12493 *
12494 * @param zid The ID of the zone the allocation belongs to.
12495 * @param va VA of element to be zeroed.
12496 * @param offset Offset in the element.
12497 * @param size Size of allocation.
12498 *
12499 */
12500
void
pmap_ro_zone_bzero(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
#if XNU_MONITOR
	/* RO-zone writes must be performed from within the PPL. */
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12514
/*
 * Implementation of pmap_ro_zone_bzero(): validate the element, then zero it
 * through the physical aperture under the PV head lock. new_data is passed
 * as 0 to the validator since a bzero has no source buffer.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12528
12529 /**
12530 * Removes write access from the Physical Aperture.
12531 *
12532 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12533 * @note Designed to work only with the zone allocator's read-only submap.
12534 *
 * @param va VA of the page to remove write access from.
12536 *
12537 */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down as RO; the PPL retains write access (ppl_writable). */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: demote every mapping of the physical page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12547
12548 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12549
/**
 * Count resident and compressed bytes for a page-aligned VA range that is
 * covered by a single twig-level (L2) table entry.
 *
 * @param pmap Pmap to query; NULL yields PMAP_RESIDENT_INVALID.
 * @param start Page-aligned start of the range.
 * @param end Page-aligned end; must lie within the same twig region as start.
 * @param compressed_bytes_p If non-NULL, incremented (+=) by the number of
 *        compressed bytes found; the caller is responsible for initializing it.
 *
 * @return Resident bytes in the range, or PMAP_RESIDENT_INVALID when the pmap
 *         is NULL or no twig entry covers the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if (tte_is_valid_table(*tte_p)) {
		/* Walk the leaf PTEs covering [start, end) and classify each page. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin: on PPL builds the output pointer is a kernel page we write to. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12611
12612 mach_vm_size_t
12613 pmap_query_resident(
12614 pmap_t pmap,
12615 vm_map_address_t start,
12616 vm_map_address_t end,
12617 mach_vm_size_t *compressed_bytes_p)
12618 {
12619 mach_vm_size_t total_resident_bytes;
12620 mach_vm_size_t compressed_bytes;
12621 vm_map_address_t va;
12622
12623
12624 if (pmap == PMAP_NULL) {
12625 if (compressed_bytes_p) {
12626 *compressed_bytes_p = 0;
12627 }
12628 return 0;
12629 }
12630
12631 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12632
12633 total_resident_bytes = 0;
12634 compressed_bytes = 0;
12635
12636 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12637 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12638 VM_KERNEL_ADDRHIDE(end));
12639
12640 va = start;
12641 while (va < end) {
12642 vm_map_address_t l;
12643 mach_vm_size_t resident_bytes;
12644
12645 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12646
12647 if (l > end) {
12648 l = end;
12649 }
12650 #if XNU_MONITOR
12651 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12652 #else
12653 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12654 #endif
12655 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12656 break;
12657 }
12658
12659 total_resident_bytes += resident_bytes;
12660
12661 va = l;
12662 }
12663
12664 if (compressed_bytes_p) {
12665 *compressed_bytes_p = compressed_bytes;
12666 }
12667
12668 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12669 total_resident_bytes);
12670
12671 return total_resident_bytes;
12672 }
12673
12674 #if MACH_ASSERT
/*
 * Debug-only (MACH_ASSERT) check that all per-pmap ledgers balance to zero
 * when a pmap is torn down; delegates the actual check to
 * vm_map_pmap_check_ledgers().
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int pid;
	char *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()). Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
12703 #endif /* MACH_ASSERT */
12704
/* No-op on this architecture; the pagezero advice is only meaningful on x86. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12709
12710 /**
12711 * The minimum shared region nesting size is used by the VM to determine when to
12712 * break up large mappings to nested regions. The smallest size that these
12713 * mappings can be broken into is determined by what page table level those
12714 * regions are being nested in at and the size of the page tables.
12715 *
12716 * For instance, if a nested region is nesting at L2 for a process utilizing
12717 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12718 * block entry).
12719 *
12720 * @param pmap The target pmap to determine the block size based on whether it's
12721 * using 16KB or 4KB page tables.
12722 */
12723 uint64_t
12724 pmap_shared_region_size_min(__unused pmap_t pmap)
12725 {
12726 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12727
12728 /**
12729 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12730 * 4KB pages). This means that a target pmap will contain L2 entries that
12731 * point to shared L3 page tables in the shared region pmap.
12732 */
12733 return pt_attr_twig_size(pt_attr);
12734 }
12735
/*
 * Execute-only mappings are enforced for user pmaps only; the kernel pmap
 * does not support them.
 */
boolean_t
pmap_enforces_execute_only(
	pmap_t pmap)
{
	return pmap != kernel_pmap;
}
12742
/* PPL-side implementation: record the VM map's CS-enforcement flag on the pmap. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12751
/* Set the per-pmap code-signing enforcement flag (via the PPL when present). */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12763
12764 extern int cs_process_enforcement_enable;
12765 bool
12766 pmap_get_vm_map_cs_enforced(
12767 pmap_t pmap)
12768 {
12769 if (cs_process_enforcement_enable) {
12770 return true;
12771 }
12772 return pmap->pmap_vm_map_cs_enforced;
12773 }
12774
/* No-op on this configuration; JIT entitlement needs no pmap-side state here. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12781
/* Mark the pmap as JIT-entitled (via the PPL when present). */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12792
/* JIT entitlement state is not tracked on this configuration; always false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12799
/* No-op on this configuration; TPRO needs no pmap-side state here. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12806
/* Enable TPRO for the pmap (via the PPL when present). */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12817
/* TPRO state is not tracked on this configuration; always false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12824
12825 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12826
/**
 * Compute the disposition of the page mapped at va in a user pmap: whether it
 * is present, compressed, alternate-accounted, reusable, or internal.
 *
 * @param pmap User pmap to query; PMAP_NULL or the kernel pmap is rejected
 *             with KERN_INVALID_ARGUMENT (and *disp_p set to 0).
 * @param va Virtual address whose mapping is examined.
 * @param disp_p Output: PMAP_QUERY_PAGE_* disposition bits.
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a rejected pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE; it is re-checked under the PV lock below. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		/* Pages outside managed memory carry no per-page attributes. */
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry (if any) that records this PTE's mapping. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12905
/* Query page disposition bits for va (via the PPL when present). */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12918
12919
12920
/*
 * Number of valid user VA bits for the pmap, derived from the TCR T0SZ
 * field (per-pmap when mixed page sizes are supported, boot-time otherwise).
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12931
/* Number of valid kernel VA bits, from the boot-time TCR T1SZ setting. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12937
/* Size in bytes of the user VA space addressable by the pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12943
12944
12945
12946
/* This configuration has no PPL, so execution is never inside it. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12953
/* I/O-filtered protected writes are not supported here; any call is fatal. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12960
/* No PPL page reserve exists on this configuration; always NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12967
/* No PPL page reserve exists on this configuration; nothing to free. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12973
12974
12975 #if PMAP_CS_PPL_MONITOR
12976
/* Immutable part of the trust cache runtime */
SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;

/* Mutable part of the trust cache runtime */
MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;

/* Lock for the trust cache runtime; shared for queries, exclusive for loads */
MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12985
12986 MARK_AS_PMAP_TEXT kern_return_t
12987 pmap_check_trust_cache_runtime_for_uuid_internal(
12988 const uint8_t check_uuid[kUUIDSize])
12989 {
12990 kern_return_t ret = KERN_DENIED;
12991
12992 /* Lock the runtime as shared */
12993 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12994
12995 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12996 &ppl_trust_cache_rt,
12997 check_uuid,
12998 NULL);
12999
13000 /* Unlock the runtime */
13001 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
13002
13003 if (tc_ret.error == kTCReturnSuccess) {
13004 ret = KERN_SUCCESS;
13005 } else if (tc_ret.error == kTCReturnNotFound) {
13006 ret = KERN_NOT_FOUND;
13007 } else {
13008 ret = KERN_FAILURE;
13009 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
13010 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
13011 }
13012
13013 return ret;
13014 }
13015
/* Kernel-side entry point: the check is serviced from within the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
13022
/**
 * PPL-side implementation of pmap_load_trust_cache_with_type(): validate and
 * lock down the caller-supplied image4 payload and manifest, then hand them to
 * libTrustCache to be loaded into the PPL trust cache runtime.
 *
 * On success the payload pages remain locked down (they are now owned by the
 * runtime); on failure they are unlocked again. The manifest is always
 * unlocked before returning.
 *
 * @return KERN_SUCCESS, KERN_ALREADY_IN_SET for a duplicate, KERN_RESOURCE_SHORTAGE
 *         when no PPL page could be reserved, or KERN_FAILURE.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* KERN_RESOURCE_SHORTAGE is expected; the caller refills and retries. */
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13142
/*
 * Kernel-side entry point for loading a trust cache into the PPL runtime.
 *
 * The PPL call fails with KERN_RESOURCE_SHORTAGE when no reserve page is
 * available for the CoreCrypto work; in that case a page is donated to the
 * PPL and the call is retried.
 *
 * NOTE(review): the retry loop assumes pmap_alloc_page_for_ppl() eventually
 * makes a page available; confirm it cannot spin indefinitely under memory
 * pressure.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
13172
/**
 * Query the PPL trust cache runtime for a CDHash. "Safe" means all arguments
 * are already in PPL-owned storage (see pmap_query_trust_cache_internal for
 * the copy-in path).
 *
 * @return KERN_SUCCESS on a hit, KERN_NOT_FOUND on a miss,
 *         KERN_INVALID_ARGUMENT for a bad query type, KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto a kern_return_t. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13211
/*
 * PPL-side entry point for trust cache queries: copies the kernel-provided
 * CDHash into PPL storage (preventing ToCToU), performs the query, then
 * copies the result token back out under pinning.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13236
13237 kern_return_t
13238 pmap_query_trust_cache(
13239 TCQueryType_t query_type,
13240 const uint8_t cdhash[kTCEntryHashSize],
13241 TrustCacheQueryToken_t *query_token)
13242 {
13243 kern_return_t ret = KERN_NOT_FOUND;
13244
13245 ret = pmap_query_trust_cache_ppl(
13246 query_type,
13247 cdhash,
13248 query_token);
13249
13250 return ret;
13251 }
13252
/* Epoch counter: number of times the developer-mode state has been set. */
MARK_AS_PMAP_DATA uint8_t ppl_developer_mode_set = 0;
/* Current developer-mode state as resolved by the PPL. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13255
/*
 * PPL-side developer-mode toggle. Disabling is always permitted; enabling is
 * only permitted within the first epoch_enable calls into this function, and
 * any later enable attempt panics.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
#if PMAP_CS_INCLUDE_INTERNAL_CODE
	/*
	 * On internal builds, we may call into the PPL twice in order to enable developer
	 * mode during early boot and during data migration. The latter does not happen for
	 * non-internal builds, and thus those only need to support a single transition to
	 * enabling developer mode.
	 */
	const uint8_t epoch_enable = 2;
#else
	const uint8_t epoch_enable = 1;
#endif

	/*
	 * We don't really care if the state is false -- in that case, the transition can
	 * happen as many times as needed. However, we still need to increment whenever we
	 * set the state as such. This is partly because we need to track whether we have
	 * actually resolved the state or not, and also because we expect developer mode
	 * to only be enabled during the first or second (internal-only) call into this
	 * function.
	 */
	uint8_t epoch = os_atomic_inc_orig(&ppl_developer_mode_set, relaxed);

	if (state == os_atomic_load(&ppl_developer_mode_storage, relaxed)) {
		/* No transition required; the epoch was still consumed above. */
		return;
	} else if ((state == true) && (epoch >= epoch_enable)) {
		panic("PMAP_CS: enabling developer mode incorrectly [%u]", epoch);
	}

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13291
/*
 * Kernel entry point for toggling developer mode. Traps into the PPL, which
 * performs the state transition in pmap_toggle_developer_mode_internal.
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13298
/* Whether lockdown mode is enabled; immutable after boot lockdown (SECURITY_READ_ONLY_LATE) */
SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
/* Whether lockdown mode enforces JIT restrictions; immutable after boot lockdown */
SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13301
13302 #pragma mark Image4 - New
13303
/*
 * Pairs an image4 code-signing trap selector with the AppleImage4 handler
 * resolved for it, so the per-selector trap wrappers can dispatch uniformly.
 */
typedef struct _pmap_image4_dispatch {
	/* The image4 code-signing trap selector being serviced */
	image4_cs_trap_t selector;
	/* Handler resolved through image4_cs_trap_resolve_handler() for the selector */
	image4_cs_trap_handler_t handler;
} pmap_image4_dispatch_t;
13308
13309 MARK_AS_PMAP_TEXT static errno_t
13310 _pmap_image4_monitor_trap_set_release_type(
13311 const pmap_image4_dispatch_t *dispatch,
13312 const void *input_data)
13313 {
13314 /*
13315 * csmx_release_type --> __cs_copy
13316 */
13317 image4_cs_trap_argv_kmod_set_release_type_t input = {0};
13318
13319 /* Copy the input data to prevent ToCToU */
13320 memcpy(&input, input_data, sizeof(input));
13321
13322 /* Dispatch to AppleImage4 */
13323 return dispatch->handler(
13324 dispatch->selector,
13325 &input, sizeof(input),
13326 NULL, NULL);
13327 }
13328
13329
13330
13331 MARK_AS_PMAP_TEXT static errno_t
13332 _pmap_image4_monitor_trap_nonce_set(
13333 const pmap_image4_dispatch_t *dispatch,
13334 const void *input_data)
13335 {
13336 /*
13337 * csmx_clear --> __cs_copy
13338 * csmx_cipher --> __cs_copy
13339 */
13340 image4_cs_trap_argv_nonce_set_t input = {0};
13341
13342 /* Copy the input data to prevent ToCToU */
13343 memcpy(&input, input_data, sizeof(input));
13344
13345 /* Dispatch to AppleImage4 */
13346 return dispatch->handler(
13347 dispatch->selector,
13348 &input, sizeof(input),
13349 NULL, NULL);
13350 }
13351
/*
 * Handle the IMAGE4_CS_TRAP_NONCE_ROLL selector. The argument structure is
 * copied into PPL storage before dispatching to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13368
/*
 * Handle the IMAGE4_CS_TRAP_IMAGE_ACTIVATE selector. Both the payload and the
 * manifest regions are validated and locked down before the handler runs. The
 * manifest is always unlocked back to the kernel afterwards; the payload is
 * only returned when activation fails, otherwise it stays monitor-owned.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13424
13425 MARK_AS_PMAP_TEXT static errno_t
13426 _pmap_image4_monitor_trap_passthrough(
13427 __unused const pmap_image4_dispatch_t *dispatch,
13428 __unused const void *input_data,
13429 __unused size_t input_size)
13430 {
13431 #if DEVELOPMENT || DEBUG || KASAN
13432 return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13433 #else
13434 pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13435 return ENOSYS;
13436 #endif
13437 }
13438
/*
 * PPL-side dispatcher for image4 code-signing traps. Resolves the handler for
 * the selector, validates the input vector size, reserves a PPL page for
 * CoreCrypto use, then routes the request to the selector-specific wrapper.
 *
 * Returns the wrapper's errno_t, or:
 *   EINVAL - unknown selector, or input_size does not match the selector
 *   ENOMEM - no PPL page available (kernel wrapper donates a page and retries)
 *   EPERM  - page reservation failed for any other reason
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13504
13505 errno_t
13506 pmap_image4_monitor_trap(
13507 image4_cs_trap_t selector,
13508 const void *input_data,
13509 size_t input_size)
13510 {
13511 errno_t err = EPERM;
13512
13513 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13514 while (err == ENOMEM) {
13515 /* Allocate a page from the free list */
13516 pmap_alloc_page_for_ppl(0);
13517
13518 /* Call the monitor dispatch again */
13519 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13520 }
13521
13522 return err;
13523 }
13524
13525 #endif /* PMAP_CS_PPL_MONITOR */
13526
13527 #if PMAP_CS_INCLUDE_CODE_SIGNING
13528
/*
 * Ordering comparator for the provisioning-profile red-black tree. Profiles
 * are keyed purely on their object address.
 *
 * Returns -1, 0 or 1 as profile0 sorts below, equal to, or above profile1.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13541
/* Red-black tree holding every registered provisioning profile, keyed by address */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the tree operations (insert/find/remove) over the comparator above */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree; configured non-sleeping at init */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13551
13552 void
13553 pmap_initialize_provisioning_profiles(void)
13554 {
13555 /* Initialize the profiles red-black tree lock */
13556 lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13557 pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13558
13559 /* Initialize the red-black tree itself */
13560 RB_INIT(&pmap_cs_registered_profiles);
13561
13562 printf("initialized PPL provisioning profile data\n");
13563 }
13564
/*
 * Check whether a profile is a TestFlight (beta distribution) profile by
 * querying its provisioned entitlements for a true "beta-reports-active"
 * value.
 */
static bool
pmap_is_testflight_profile(
	pmap_cs_profile_t *profile_obj)
{
	const char *entitlement_name = "beta-reports-active";
	const size_t entitlement_length = strlen(entitlement_name);
	CEQueryOperation_t query[2] = {0};

	/* If the profile provisions no entitlements, then it isn't a test flight one */
	if (profile_obj->entitlements_ctx == NULL) {
		return false;
	}

	/*
	 * Build our CoreEntitlements query: select the key, then match boolean true.
	 * NOTE(review): assumes stringParameter.data can hold the 19-byte entitlement
	 * name -- confirm against the CEQueryOperation_t definition.
	 */
	query[0].opcode = kCEOpSelectKey;
	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
	query[0].parameters.stringParameter.length = entitlement_length;
	query[1] = CEMatchBool(true);

	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
		profile_obj->entitlements_ctx,
		query, 2);

	/* The query only succeeds when the entitlement exists and is true */
	if (ce_err == amfi->CoreEntitlements.kNoError) {
		return true;
	}

	return false;
}
13594
13595 static bool
13596 pmap_is_development_profile(
13597 pmap_cs_profile_t *profile_obj)
13598 {
13599 /* Check for UPP */
13600 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13601 *profile_obj->profile_ctx,
13602 CESelectDictValue("ProvisionsAllDevices"));
13603 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13604 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13605 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13606 return false;
13607 }
13608 }
13609
13610 /* Check for TestFlight profile */
13611 if (pmap_is_testflight_profile(profile_obj) == true) {
13612 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13613 return false;
13614 }
13615
13616 pmap_cs_log_info("%p: development profile", profile_obj);
13617 return true;
13618 }
13619
/*
 * Extract the "Entitlements" dictionary from a profile, validate it through
 * CoreEntitlements, and install a query context for it inside the profile
 * object.
 *
 * Returns:
 *   KERN_SUCCESS   - entitlements context set up
 *   KERN_NOT_FOUND - the profile provisions no entitlements (not fatal)
 *   KERN_ABORTED   - CoreEntitlements validation or context acquisition failed
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	/* No entitlements dictionary -- clear the context state and report not-found */
	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13670
/*
 * PPL-side registration of a provisioning profile. The payload (a
 * pmap_profile_payload_t header followed by the raw profile blob) is locked
 * down from the kernel, validated through CoreTrust, wrapped in a
 * CoreEntitlements context, and inserted into the registered-profiles tree.
 *
 * Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when no PPL page could be
 * reserved (the kernel wrapper donates a page and retries). Validation
 * failures panic, since they indicate a tampered or malformed request.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature chain through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13774
13775 kern_return_t
13776 pmap_register_provisioning_profile(
13777 const vm_address_t payload_addr,
13778 const vm_size_t payload_size)
13779 {
13780 kern_return_t ret = KERN_DENIED;
13781
13782 ret = pmap_register_provisioning_profile_ppl(
13783 payload_addr,
13784 payload_size);
13785
13786 while (ret == KERN_RESOURCE_SHORTAGE) {
13787 /* Allocate a page from the free list */
13788 pmap_alloc_page_for_ppl(0);
13789
13790 /* Attempt the call again */
13791 ret = pmap_register_provisioning_profile_ppl(
13792 payload_addr,
13793 payload_size);
13794 }
13795
13796 return ret;
13797 }
13798
/*
 * PPL-side unregistration of a provisioning profile. The profile must be
 * present in the registered tree and must have no outstanding signature
 * associations; on success its payload pages are unlocked back to the kernel.
 *
 * Returns KERN_SUCCESS, or KERN_FAILURE when the reference count is nonzero.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A profile still referenced by a code signature cannot be removed */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile memory now belongs to the kernel again -- stop using it */
		profile_obj = NULL;
	}
	return ret;
}
13846
/*
 * Kernel entry point for unregistering a provisioning profile. Traps into the
 * PPL, which performs the work in pmap_unregister_provisioning_profile_internal.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13853
/*
 * PPL-side association of a registered provisioning profile with a code
 * signature that has not yet been verified. Takes a reference on the profile
 * so it cannot be unregistered while associated.
 *
 * Returns KERN_SUCCESS, or KERN_DENIED when the signature is already trusted
 * or already carries a profile association.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* Drops the exclusive hold taken by pmap_cs_lock_code_directory */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13902
/*
 * Kernel entry point for associating a provisioning profile with a code
 * signature. Traps into the PPL, which performs the work in
 * pmap_associate_provisioning_profile_internal.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13910
/*
 * PPL-side removal of a profile association from a code signature, dropping
 * the reference taken at association time.
 *
 * Returns KERN_SUCCESS, or KERN_NOT_FOUND when no profile is associated.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13946
/*
 * Kernel entry point for removing a profile association from a code
 * signature. Traps into the PPL, which performs the work in
 * pmap_disassociate_provisioning_profile_internal.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13953
13954 kern_return_t
13955 pmap_associate_kernel_entitlements_internal(
13956 pmap_cs_code_directory_t *cd_entry,
13957 const void *kernel_entitlements)
13958 {
13959 kern_return_t ret = KERN_DENIED;
13960
13961 if (kernel_entitlements == NULL) {
13962 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13963 }
13964
13965 /* Acquire the lock on the code directory */
13966 pmap_cs_lock_code_directory(cd_entry);
13967
13968 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13969 ret = KERN_DENIED;
13970 goto out;
13971 } else if (cd_entry->kernel_entitlements != NULL) {
13972 ret = KERN_DENIED;
13973 goto out;
13974 }
13975 cd_entry->kernel_entitlements = kernel_entitlements;
13976
13977 /* Association was a success */
13978 ret = KERN_SUCCESS;
13979
13980 out:
13981 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13982 return ret;
13983 }
13984
/*
 * Kernel entry point for associating a kernel entitlements object with a
 * code signature. Traps into the PPL, which performs the work in
 * pmap_associate_kernel_entitlements_internal.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13992
/*
 * PPL-side lookup of the kernel entitlements object associated with the main
 * region code signature of a pmap. The resolved pointer is written out
 * through kernel_entitlements (when non-NULL) with the pages pinned.
 *
 * Returns:
 *   KERN_SUCCESS   - entitlements resolved (and written out if requested)
 *   KERN_NOT_FOUND - kernel pmap, no main-region signature, or no association
 *   KERN_ABORTED   - pmap lock contended; the kernel wrapper retries
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14055
14056 kern_return_t
14057 pmap_resolve_kernel_entitlements(
14058 pmap_t pmap,
14059 const void **kernel_entitlements)
14060 {
14061 kern_return_t ret = KERN_DENIED;
14062
14063 do {
14064 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14065 } while (ret == KERN_ABORTED);
14066
14067 return ret;
14068 }
14069
/*
 * PPL-side construction of a CoreEntitlements acceleration index for a code
 * directory's entitlements context. The index buffer is carved out of spare
 * space within the locked-down signature when possible, otherwise taken from
 * the blob allocator or a whole PPL page.
 *
 * Returns:
 *   KERN_SUCCESS - context accelerated, or acceleration is unnecessary
 *   KERN_DENIED  - signature was not reconstituted (policy requirement)
 *   KERN_ABORTED - the required index would exceed PAGE_SIZE
 *   otherwise    - whatever pmap_cs_blob_alloc / pmap_pages_alloc_zeroed
 *                  returned on allocation failure (the kernel wrapper retries
 *                  on KERN_RESOURCE_SHORTAGE)
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without a context, or when already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14216
14217 kern_return_t
14218 pmap_accelerate_entitlements(
14219 pmap_cs_code_directory_t *cd_entry)
14220 {
14221 kern_return_t ret = KERN_DENIED;
14222
14223 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14224 while (ret == KERN_RESOURCE_SHORTAGE) {
14225 /* Allocate a page for the PPL */
14226 pmap_alloc_page_for_ppl(0);
14227
14228 /* Try again */
14229 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14230 }
14231
14232 return ret;
14233 }
14234
14235 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14236
14237 MARK_AS_PMAP_TEXT bool
14238 pmap_lookup_in_loaded_trust_caches_internal(
14239 const uint8_t cdhash[CS_CDHASH_LEN])
14240 {
14241 kern_return_t kr = KERN_NOT_FOUND;
14242
14243 #if PMAP_CS_PPL_MONITOR
14244 /*
14245 * If we have the PPL monitor, then this function can only be called from
14246 * within the PPL. Calling it directly would've caused a panic, so we can
14247 * assume that we're in the PPL here.
14248 */
14249 uint8_t cdhash_safe[CS_CDHASH_LEN];
14250 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14251
14252 kr = pmap_query_trust_cache_safe(
14253 kTCQueryTypeLoadable,
14254 cdhash_safe,
14255 NULL);
14256 #else
14257 kr = query_trust_cache(
14258 kTCQueryTypeLoadable,
14259 cdhash,
14260 NULL);
14261 #endif
14262
14263 if (kr == KERN_SUCCESS) {
14264 return true;
14265 }
14266 return false;
14267 }
14268
/*
 * Check whether a CDHash is present in any loaded (loadable) trust cache.
 * Dispatches to the PPL entry point on monitor-enabled builds, otherwise
 * calls the internal implementation directly.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14279
14280 MARK_AS_PMAP_TEXT uint32_t
14281 pmap_lookup_in_static_trust_cache_internal(
14282 const uint8_t cdhash[CS_CDHASH_LEN])
14283 {
14284 TrustCacheQueryToken_t query_token = {0};
14285 kern_return_t kr = KERN_NOT_FOUND;
14286 uint64_t flags = 0;
14287 uint8_t hash_type = 0;
14288
14289 #if PMAP_CS_PPL_MONITOR
14290 /*
14291 * If we have the PPL monitor, then this function can only be called from
14292 * within the PPL. Calling it directly would've caused a panic, so we can
14293 * assume that we're in the PPL here.
14294 */
14295 uint8_t cdhash_safe[CS_CDHASH_LEN];
14296 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14297
14298 kr = pmap_query_trust_cache_safe(
14299 kTCQueryTypeStatic,
14300 cdhash_safe,
14301 &query_token);
14302 #else
14303 kr = query_trust_cache(
14304 kTCQueryTypeStatic,
14305 cdhash,
14306 &query_token);
14307 #endif
14308
14309 if (kr == KERN_SUCCESS) {
14310 amfi->TrustCache.queryGetFlags(&query_token, &flags);
14311 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
14312
14313 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
14314 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
14315 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
14316 }
14317
14318 return 0;
14319 }
14320
/*
 * Look up a CDHash in the static (built-in) trust cache.
 * Returns a packed TC_LOOKUP_* result word, or 0 when not found.
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14330
14331 #if PMAP_CS_INCLUDE_CODE_SIGNING
14332
14333 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
14334 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14335
14336 MARK_AS_PMAP_TEXT void
14337 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14338 {
14339
14340 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14341 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14342 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14343
14344 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14345 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14346 }
14347
14348 MARK_AS_PMAP_TEXT bool
14349 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14350 {
14351 bool match = false;
14352
14353 /* Lockdown mode disallows compilation service */
14354 if (ppl_lockdown_mode_enabled == true) {
14355 return false;
14356 }
14357
14358 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14359 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14360 match = true;
14361 }
14362 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14363
14364 if (match) {
14365 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14366 }
14367
14368 return match;
14369 }
14370
/*
 * Kernel-facing entry point for storing the compilation-service CDHash.
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14380
/*
 * Kernel-facing entry point for matching a CDHash against the stored
 * compilation-service CDHash. Dispatches to the PPL on monitor builds.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14390
14391 /*
14392 * As part of supporting local signing on the device, we need the PMAP layer
14393 * to store the local signing key so that PMAP_CS can validate with it. We
14394 * store it at the PMAP layer such that it is accessible to both AMFI and
14395 * PMAP_CS should they need it.
14396 */
14397 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
14398 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14399
14400 MARK_AS_PMAP_TEXT void
14401 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14402 {
14403 bool key_set = false;
14404
14405 /*
14406 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14407 * a successful exchange means that the local signing public key has _not_ been
14408 * set. In case the key has been set, we panic as we would never expect the
14409 * kernel to attempt to set the key more than once.
14410 */
14411 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14412
14413 if (key_set) {
14414 panic("attempted to set the local signing public key multiple times");
14415 }
14416
14417 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14418 pmap_cs_log_info("set local signing public key");
14419 }
14420
/*
 * Kernel-facing entry point for installing the local-signing public key.
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14430
14431 uint8_t*
14432 pmap_get_local_signing_public_key(void)
14433 {
14434 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14435
14436 if (key_set) {
14437 return pmap_local_signing_public_key;
14438 }
14439
14440 return NULL;
14441 }
14442
14443 /*
14444 * Locally signed applications need to be explicitly authorized by an entitled application
14445 * before we allow them to run.
14446 */
14447 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
14448 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14449
14450 MARK_AS_PMAP_TEXT void
14451 pmap_unrestrict_local_signing_internal(
14452 const uint8_t cdhash[CS_CDHASH_LEN])
14453 {
14454
14455 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14456 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14457 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14458
14459 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
14460 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
14461 }
14462
/*
 * Kernel-facing entry point for authorizing a locally-signed CDHash.
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14473
14474 #if PMAP_CS
14475 MARK_AS_PMAP_TEXT static void
14476 pmap_restrict_local_signing(void)
14477 {
14478 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14479 memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
14480 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14481 }
14482
14483 MARK_AS_PMAP_TEXT static bool
14484 pmap_local_signing_restricted(
14485 const uint8_t cdhash[CS_CDHASH_LEN])
14486 {
14487 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14488 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14489 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14490
14491 return ret != 0;
14492 }
14493
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14496
14497 MARK_AS_PMAP_TEXT void
14498 pmap_footprint_suspend_internal(
14499 vm_map_t map,
14500 boolean_t suspend)
14501 {
14502 #if DEVELOPMENT || DEBUG
14503 if (suspend) {
14504 current_thread()->pmap_footprint_suspended = TRUE;
14505 map->pmap->footprint_was_suspended = TRUE;
14506 } else {
14507 current_thread()->pmap_footprint_suspended = FALSE;
14508 }
14509 #else /* DEVELOPMENT || DEBUG */
14510 (void) map;
14511 (void) suspend;
14512 #endif /* DEVELOPMENT || DEBUG */
14513 }
14514
/*
 * Suspend or resume footprint accounting for the current thread on the
 * given map (effective on DEVELOPMENT/DEBUG builds only).
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14526
/*
 * No-op PPL entry point: performs no work beyond validating that the
 * supplied pmap is a legitimate, mutable pmap object.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14532
/*
 * No-op entry point (useful for exercising the PPL call path).
 * Dispatches to the PPL entry point on monitor-enabled builds.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14542
14543 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14544
/* Header emitted before each raw translation table in a page-table dump. */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the dumped table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* end of the VA range covered by this table */
};
14551
/*
 * Recursively copy the translation tables of 'pmap' into the caller's buffer.
 *
 * For each table at a level selected by 'level_mask', a page_table_dump_header
 * followed by the raw table contents is appended at buf_start + *bytes_copied,
 * and *bytes_copied is advanced. Table entries of table type are descended
 * into; block mappings are not.
 *
 * Returns KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the remaining
 * buffer cannot hold the current table plus its header.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Geometry of entries at this level of the table hierarchy */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table may be a different size than one page; use its real size */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Bail early if the remaining buffer cannot hold this table plus header */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Caller requested this level: emit a header followed by the raw table */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table to descend into */
			continue;
		} else {
			/* A "table" entry at the leaf level cannot exist; the entry is corrupt */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			/* Descend into the next-level table this entry points to */
			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14626
/*
 * Dump the page tables of 'pmap' into [bufp, buf_end), starting from the
 * root level, including only levels selected by 'level_mask'. The number of
 * bytes written is returned through 'bytes_copied'.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	/* Walking live translation tables is only safe with the system halted in the debugger */
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	    level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14636
14637 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14638
/* Page-table dumping is only supported on arm64 DEVELOPMENT/DEBUG builds. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14645 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14646
14647
14648 #ifdef CONFIG_XNUPOST
14649 #ifdef __arm64__
14650 static volatile bool pmap_test_took_fault = false;
14651
14652 static bool
14653 pmap_test_fault_handler(arm_saved_state_t * state)
14654 {
14655 bool retval = false;
14656 uint64_t esr = get_saved_state_esr(state);
14657 esr_exception_class_t class = ESR_EC(esr);
14658 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14659
14660 if ((class == ESR_EC_DABORT_EL1) &&
14661 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14662 pmap_test_took_fault = true;
14663 /* return to the instruction immediately after the call to NX page */
14664 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14665 retval = true;
14666 }
14667
14668 return retval;
14669 }
14670
14671 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a read or write access to 'va' in the context of 'pmap' (or the
 * current pmap if 'pmap' is NULL) with an expected-fault handler installed.
 * Returns true when the access faulted exactly when 'should_fault' says it
 * should have.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;
	thread_t thread = current_thread();

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap, thread);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	ml_expect_fault_begin(pmap_test_fault_handler, va);

	/* The access itself; a fault here is absorbed by the handler above */
	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap, thread);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14724
14725 static bool
14726 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14727 {
14728 bool retval = pmap_test_access(pmap, va, should_fault, false);
14729
14730 if (!retval) {
14731 T_FAIL("%s: %s, "
14732 "pmap=%p, va=%p, should_fault=%u",
14733 __func__, should_fault ? "did not fault" : "faulted",
14734 pmap, (void*)va, (unsigned)should_fault);
14735 }
14736
14737 return retval;
14738 }
14739
14740 static bool
14741 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14742 {
14743 bool retval = pmap_test_access(pmap, va, should_fault, true);
14744
14745 if (!retval) {
14746 T_FAIL("%s: %s, "
14747 "pmap=%p, va=%p, should_fault=%u",
14748 __func__, should_fault ? "did not fault" : "faulted",
14749 pmap, (void*)va, (unsigned)should_fault);
14750 }
14751
14752 return retval;
14753 }
14754
14755 static bool
14756 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14757 {
14758 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14759 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14760
14761 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14762
14763 if (!retval) {
14764 T_FAIL("%s: bits=%u, "
14765 "pa=%p, should_be_set=%u",
14766 __func__, bits,
14767 (void*)pa, should_be_set);
14768 }
14769
14770 return retval;
14771 }
14772
/*
 * Check that 'va' in 'pmap' is readable/writable exactly as allowed.
 * NOTE: deliberately uses bitwise OR (not ||) so that BOTH the read and the
 * write access are always performed, even when the read check fails.
 */
static __attribute__((noinline)) bool
pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
{
	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
	return retval;
}
14779
/*
 * Run the pmap unit-test sequence against a pmap created with 'flags':
 * mapping creation/validation, permission enforcement (read/write/execute),
 * the ref/mod state machine, arm_fast_fault, pmap_protect/pmap_page_protect,
 * and disconnect/teardown. Failures are reported via T_FAIL; returns 0 on
 * completion.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* How many pmap pages fit in one native kernel page */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15044 #endif /* __arm64__ */
15045
15046 kern_return_t
15047 pmap_test(void)
15048 {
15049 T_LOG("Starting pmap_tests");
15050 #ifdef __arm64__
15051 int flags = 0;
15052 flags |= PMAP_CREATE_64BIT;
15053
15054 #if __ARM_MIXED_PAGE_SIZE__
15055 T_LOG("Testing VM_PAGE_SIZE_4KB");
15056 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
15057 T_LOG("Testing VM_PAGE_SIZE_16KB");
15058 pmap_test_test_config(flags);
15059 #else /* __ARM_MIXED_PAGE_SIZE__ */
15060 pmap_test_test_config(flags);
15061 #endif /* __ARM_MIXED_PAGE_SIZE__ */
15062
15063 #endif /* __arm64__ */
15064 T_PASS("completed pmap_test successfully");
15065 return KERN_SUCCESS;
15066 }
15067 #endif /* CONFIG_XNUPOST */
15068
15069 /*
15070 * The following function should never make it to RELEASE code, since
15071 * it provides a way to get the PPL to modify text pages.
15072 */
15073 #if DEVELOPMENT || DEBUG
15074
15075 #define ARM_UNDEFINED_INSN 0xe7f000f0
15076 #define ARM_UNDEFINED_INSN_THUMB 0xde00
15077
15078 /**
15079 * Forcibly overwrite executable text with an illegal instruction.
15080 *
15081 * @note Only used for xnu unit testing.
15082 *
15083 * @param pa The physical address to corrupt.
15084 *
15085 * @return KERN_SUCCESS on success.
15086 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* PPL builds must route the write through the PPL entry point */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
15096
/*
 * Overwrite the instruction at physical address 'pa' with an undefined
 * instruction (ARM or Thumb, selected by the low VA bit), temporarily making
 * the physical-aperture mapping writable if the page is executable, then
 * invalidate the icache for the modified region. Test-only.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the PV head lock while inspecting/modifying the page's mappings */
	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages are mapped read-only in the physical aperture;
	 * temporarily grant kernel write access for the patch. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the read-only physical-aperture mapping */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure the instruction stream observes the new bytes */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15139
15140 #endif /* DEVELOPMENT || DEBUG */
15141