1 /*
2 * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 #include <machine/machine_routines.h>
75
76 #include <arm/caches_internal.h>
77 #include <arm/cpu_data.h>
78 #include <arm/cpu_data_internal.h>
79 #include <arm/cpu_capabilities.h>
80 #include <arm/cpu_number.h>
81 #include <arm/machine_cpu.h>
82 #include <arm/misc_protos.h>
83 #include <arm/pmap/pmap_internal.h>
84 #include <arm/trap_internal.h>
85
86 #include <arm64/proc_reg.h>
87 #include <pexpert/arm64/boot.h>
88 #include <arm64/ppl/sart.h>
89 #include <arm64/ppl/uat.h>
90
91 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
92 #include <arm64/amcc_rorgn.h>
93 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94
95 #include <pexpert/device_tree.h>
96
97 #include <san/kasan.h>
98 #include <sys/cdefs.h>
99
100 #if defined(HAS_APPLE_PAC)
101 #include <ptrauth.h>
102 #endif
103
104 #ifdef CONFIG_XNUPOST
105 #include <tests/xnupost.h>
106 #endif
107
108
109 #if HAS_MTE
110 #error invalid configuration, you must be using CONFIG_SPTM
111 #endif
112
113 #if HIBERNATION
114 #include <IOKit/IOHibernatePrivate.h>
115 #endif /* HIBERNATION */
116
117 #define PMAP_L1_MAX_ENTRY (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) >> ARM_TT_L1_SHIFT)
118 #define PMAP_ROOT_ALLOC_SIZE ((PMAP_L1_MAX_ENTRY + 1) * sizeof(tt_entry_t))
119
120 #ifndef __ARM64_PMAP_SUBPAGE_L1__
121 _Static_assert(ARM_PGBYTES == PMAP_ROOT_ALLOC_SIZE, "Unexpected L1 Size");
122 #endif
123
124 #if __ARM_VMSA__ != 8
125 #error Unknown __ARM_VMSA__
126 #endif
127
128 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
129
130 extern u_int32_t random(void); /* from <libkern/libkern.h> */
131
132 static bool alloc_asid(pmap_t pmap);
133 static void free_asid(pmap_t pmap);
134 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
135 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
136 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
137
/*
 * Operation table for native (stage-1) page tables. Selected via
 * pmap_get_pt_ops() so ASID management, TLB maintenance, and WIMG-to-PTE
 * encoding can be dispatched per page-table flavor.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                                    /* allocate an ASID for a pmap */
	.free_id = free_asid,                                      /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async, /* VA-range TLBI; caller synchronizes */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,          /* whole-ASID TLBI; caller synchronizes */
	.wimg_to_pte = wimg_to_pte,                                /* WIMG cache attributes -> PTE bits */
};
146
/*
 * Per-level translation-table geometry for the 16KB granule.
 * Levels 0-2 use the TTE valid/block encodings; level 3 (the leaf level)
 * uses the L3 PTE valid/block encodings instead.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size = ARM_16K_TT_L0_SIZE,
		.offmask = ARM_16K_TT_L0_OFFMASK,
		.shift = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_16K_TT_L1_SIZE,
		.offmask = ARM_16K_TT_L1_OFFMASK,
		.shift = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_16K_TT_L2_SIZE,
		.offmask = ARM_16K_TT_L2_OFFMASK,
		.shift = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: PTE (not TTE) valid/block encodings apply. */
		.size = ARM_16K_TT_L3_SIZE,
		.offmask = ARM_16K_TT_L3_OFFMASK,
		.shift = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
186
/*
 * Per-level translation-table geometry for the 4KB granule.
 * Levels 0-2 use the TTE valid/block encodings; level 3 (the leaf level)
 * uses the L3 PTE valid/block encodings instead.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: PTE (not TTE) valid/block encodings apply. */
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
226
/*
 * Per-level translation-table geometry for 4KB-granule stage-2 translation.
 * Level 0 is unused; level 1 may be concatenated, giving it a wider index
 * mask than a normal stage-1 L1 table.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: PTE (not TTE) valid/block encodings apply. */
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
270
/*
 * Page-table attributes for pmaps using the 4KB translation granule:
 * level geometry, access-permission/execute-never PTE encodings, and
 * page-size parameters.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root level derived from the boot-time VA size (T0SZ): 9 VA bits resolved per 4K level. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access permission encodings (kernel/user RO/RW combinations). */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	/* Execute-never encodings: ap_xn denies both EL1 and EL0 execution. */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
};
299
/*
 * Page-table attributes for pmaps using the 16KB translation granule.
 * Mirrors pmap_pt_attr_4k but with 16K geometry and a fixed L1 root level.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access permission encodings (kernel/user RO/RW combinations). */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	/* Execute-never encodings: ap_xn denies both EL1 and EL0 execution. */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
};
320
321 #if __ARM_16K_PG__
322 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
323 #else /* !__ARM_16K_PG__ */
324 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
325 #endif /* !__ARM_16K_PG__ */
326
327
328 #if DEVELOPMENT || DEBUG
329 int vm_footprint_suspend_allowed = 1;
330
331 extern int pmap_ledgers_panic;
332 extern int pmap_ledgers_panic_leeway;
333
334 #endif /* DEVELOPMENT || DEBUG */
335
336 #if DEVELOPMENT || DEBUG
337 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
338 (current_thread()->pmap_footprint_suspended)
339 #else /* DEVELOPMENT || DEBUG */
340 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
341 #endif /* DEVELOPMENT || DEBUG */
342
343
344 /*
345 * Represents a tlb range that will be flushed before exiting
346 * the ppl.
347 * Used by phys_attribute_clear_range to defer flushing pages in
348 * this range until the end of the operation.
349 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;               /* pmap owning the mappings in the range */
	vm_map_address_t ptfr_start;    /* start VA of the deferred flush range */
	vm_map_address_t ptfr_end;      /* end VA of the deferred flush range */
	bool ptfr_flush_needed;         /* set once at least one update in the range requires a TLB flush */
} pmap_tlb_flush_range_t;
356
357 #if XNU_MONITOR
358 /*
359 * PPL External References.
360 */
361 extern vm_offset_t segPPLDATAB;
362 extern unsigned long segSizePPLDATA;
363 extern vm_offset_t segPPLTEXTB;
364 extern unsigned long segSizePPLTEXT;
365 extern vm_offset_t segPPLDATACONSTB;
366 extern unsigned long segSizePPLDATACONST;
367
368
369 /*
370 * PPL Global Variables
371 */
372
373 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
374 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
375 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
376 #else
377 const boolean_t pmap_ppl_disable = FALSE;
378 #endif
379
380 /*
381 * Indicates if the PPL has started applying APRR.
382 * This variable is accessed from various assembly trampolines, so be sure to change
383 * those if you change the size or layout of this variable.
384 */
385 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
386
387 extern void *pmap_stacks_start;
388 extern void *pmap_stacks_end;
389
#endif /* XNU_MONITOR */
391
392
393
394 /* Virtual memory region for early allocation */
395 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
396 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
397 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
398
399 extern uint8_t bootstrap_pagetables[];
400
401 extern unsigned int not_in_kdp;
402
403 extern vm_offset_t first_avail;
404
405 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
406 extern vm_offset_t virtual_space_end; /* End of kernel address space */
407 extern vm_offset_t static_memory_end;
408
409 extern const vm_map_address_t physmap_base;
410 extern const vm_map_address_t physmap_end;
411
412 extern int maxproc, hard_maxproc;
413
414 /* The number of address bits one TTBR can cover. */
415 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
416
417 /*
418 * The bounds on our TTBRs. These are for sanity checking that
419 * an address is accessible by a TTBR before we attempt to map it.
420 */
421
422 /* The level of the root of a page table. */
423 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
424
425 /* The number of entries in the root TT of a page table. */
426 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
427
428 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
429 const pmap_t kernel_pmap = &kernel_pmap_store;
430
431 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
432
433 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
434 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
435 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
436
/*
 * Free-list link for cached translation-table allocations; presumably
 * overlaid on the (unused) table memory itself — see free_tt_list and
 * free_page_size_tt_list below.
 */
typedef struct tt_free_entry {
	struct tt_free_entry *next;     /* next free entry, or TT_FREE_ENTRY_NULL */
} tt_free_entry_t;
440
441 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
442
443 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
444 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
445 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
446 #define FREE_PAGE_SIZE_TT_MAX 4
447 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
448 unsigned int free_tt_count MARK_AS_PMAP_DATA;
449 unsigned int free_tt_max MARK_AS_PMAP_DATA;
450
451 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
452
453 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
454 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
455 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
456 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
457 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
458 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
459
460 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
462
463 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
464 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
465
466 /* Lock group used for all pmap object locks. */
467 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
468
469 #if DEVELOPMENT || DEBUG
470 int nx_enabled = 1; /* enable no-execute protection */
471 int allow_data_exec = 0; /* No apps may execute data */
472 int allow_stack_exec = 0; /* No apps may execute from the stack */
473 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
474 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
475 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
476 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
477 #else /* DEVELOPMENT || DEBUG */
478 const int nx_enabled = 1; /* enable no-execute protection */
479 const int allow_data_exec = 0; /* No apps may execute data */
480 const int allow_stack_exec = 0; /* No apps may execute from the stack */
481 #endif /* DEVELOPMENT || DEBUG */
482
483 /**
484 * This variable is set true during hibernation entry to protect pmap data structures
485 * during image copying, and reset false on hibernation exit.
486 */
487 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
488
489 #if MACH_ASSERT
490 static void pmap_check_ledgers(pmap_t pmap);
491 #else
/* No-op stub: ledger balance checking is only performed on MACH_ASSERT kernels. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
496 #endif /* MACH_ASSERT */
497
498 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
499
500 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
501 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
502
503 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
504
505 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
506 #if defined(__arm64__)
507 /* end of shared region + 512MB for various purposes */
508 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
509 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
510 "Minimum address space size outside allowable range");
511
512 // Max offset is 15.375GB for devices with "large" memory config
513 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
514 // Max offset is 11.375GB for devices with "small" memory config
515 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
516
517
518 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
519 "Large device address space size outside allowable range");
520 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
521 "Small device address space size outside allowable range");
522
523 # ifdef XNU_TARGET_OS_OSX
524 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
525 # else
526 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
527 # endif
528 #endif /* __arm64__ */
529
530 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
531 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
532 #else
533 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
534 #endif
535
536 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
537 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
538 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
539 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
540 #if !HAS_16BIT_ASID
541 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
542 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
543 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
544 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
545 #else
546 static uint16_t last_allocated_asid = 0;
547 #endif /* !HAS_16BIT_ASID */
548
549 #if HAS_SPECRES_DEBUGGING
550 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
551 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
552 #endif /* HAS_SPECRES_DEBUGGING */
553
554
555 #if __ARM_MIXED_PAGE_SIZE__
556 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
557 #endif
558 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
559 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
560 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
561
/* PTE Define Macros */

/*
 * Evaluates true iff PTE value (x) encodes a "compressed" (paged-out)
 * mapping: hardware-invalid, with the software ARM_PTE_COMPRESSED marker set
 * and no bits outside ARM_PTE_COMPRESSED_MASK. Panics if extra bits are set,
 * as that indicates PTE corruption. (p) is the PTE's address, used only in
 * the panic message.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */ \
	((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True iff the software "wired" bit is set in the PTE. */
#define pte_is_wired(pte) \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True iff the software "was writeable" bit is set (mapping was write-restricted, e.g. for fault tracking). */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit in (pte) in place. */
#define pte_set_was_writeable(pte, was_writeable) \
	do { \
	        if ((was_writeable)) { \
	                (pte) |= ARM_PTE_WRITEABLE; \
	        } else { \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        } \
	} while(0)
585
586 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)587 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
588 {
589 if (wired) {
590 *ptep |= ARM_PTE_WIRED;
591 } else {
592 *ptep &= ~ARM_PTE_WIRED;
593 }
594 /*
595 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
596 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
597 * never reclaimed.
598 */
599 if (pmap == kernel_pmap) {
600 return;
601 }
602 unsigned short *ptd_wiredcnt_ptr;
603 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
604 if (wired) {
605 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
606 } else {
607 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
608 if (__improbable(prev_wired == 0)) {
609 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
610 }
611 }
612 }
613
614 #if HAS_FEAT_XS
615
616 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)617 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
618 {
619 if (__improbable(pt_attr->stage2)) {
620 return false;
621 }
622 switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
623 case CACHE_ATTRINDX_DISABLE_XS:
624 case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
625 return true;
626 default:
627 return false;
628 }
629 }
630
631 #endif /* HAS_FEAT_XS */
632
633 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
634 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
635 arm64_sync_tlb(strong); \
636 }
637
638 /*
639 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
640 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
641 * will observe the updated PTE.
642 */
643 #define FLUSH_PTE() \
644 __builtin_arm_dmb(DMB_ISH);
645
646 /*
647 * Synchronize updates to PTEs that were previously valid and thus may be cached in
648 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
649 * TLBI. This should only require a store-store barrier, as subsequent accesses in
650 * program order will not issue until the DSB completes. Prior loads may be reordered
651 * after the barrier, but their behavior should not be materially affected by the
652 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
653 * matter for loads until the access is re-driven well after the TLB update is
654 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
655 * we should be in a position to handle access faults. For "voluntary" PTE access
656 * restriction due to unmapping or protection, the decision to restrict access should
657 * have a data dependency on prior loads in order to avoid a data race.
658 */
659 #define FLUSH_PTE_STRONG() \
660 __builtin_arm_dsb(DSB_ISHST);
661
662 /**
663 * Write enough page table entries to map a single VM page. On systems where the
664 * VM page size does not match the hardware page size, multiple page table
665 * entries will need to be written.
666 *
667 * @note This function does not emit a barrier to ensure these page table writes
668 * have completed before continuing. This is commonly needed. In the case
669 * where a DMB or DSB barrier is needed, then use the write_pte() and
670 * write_pte_strong() functions respectively instead of this one.
671 *
672 * @param ptep Pointer to the first page table entry to update.
673 * @param pte The value to write into each page table entry. In the case that
674 * multiple PTEs are updated to a non-empty value, then the address
675 * in this value will automatically be incremented for each PTE
676 * write.
677 */
678 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)679 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
680 {
681 /**
682 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
683 * systems, which is why it's checked at runtime instead of compile time.
684 * The "unreachable" warning needs to be suppressed because it still is a
685 * compile time constant on some systems.
686 */
687 __unreachable_ok_push
688 if (TEST_PAGE_RATIO_4) {
689 if (((uintptr_t)ptep) & 0x1f) {
690 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
691 __func__, ptep, (void*)pte);
692 }
693
694 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
695 /**
696 * If we're writing an empty/compressed PTE value, then don't
697 * auto-increment the address for each PTE write.
698 */
699 *ptep = pte;
700 *(ptep + 1) = pte;
701 *(ptep + 2) = pte;
702 *(ptep + 3) = pte;
703 } else {
704 *ptep = pte;
705 *(ptep + 1) = pte | 0x1000;
706 *(ptep + 2) = pte | 0x2000;
707 *(ptep + 3) = pte | 0x3000;
708 }
709 } else {
710 *ptep = pte;
711 }
712 __unreachable_ok_pop
713 }
714
715 /**
716 * Writes enough page table entries to map a single VM page and then ensures
717 * those writes complete by executing a Data Memory Barrier.
718 *
719 * @note The DMB issued by this function is not strong enough to protect against
720 * TLB invalidates from being reordered above the PTE writes. If a TLBI
721 * instruction is going to immediately be called after this write, it's
722 * recommended to call write_pte_strong() instead of this function.
723 *
724 * See the function header for write_pte_fast() for more details on the
725 * parameters.
726 */
727 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)728 write_pte(pt_entry_t *ptep, pt_entry_t pte)
729 {
730 write_pte_fast(ptep, pte);
731 FLUSH_PTE();
732 }
733
734 /**
735 * Writes enough page table entries to map a single VM page and then ensures
736 * those writes complete by executing a Data Synchronization Barrier. This
737 * barrier provides stronger guarantees than the DMB executed by write_pte().
738 *
739 * @note This function is useful if you're going to immediately flush the TLB
740 * after making the PTE write. A DSB is required to protect against the
741 * TLB invalidate being reordered before the PTE write.
742 *
743 * See the function header for write_pte_fast() for more details on the
744 * parameters.
745 */
746 static void
write_pte_strong(pt_entry_t * ptep,pt_entry_t pte)747 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
748 {
749 write_pte_fast(ptep, pte);
750 FLUSH_PTE_STRONG();
751 }
752
753 /**
754 * Retrieve the pmap structure for the thread running on the current CPU.
755 */
756 pmap_t
current_pmap()757 current_pmap()
758 {
759 const pmap_t current = vm_map_pmap(current_thread()->map);
760
761 assert(current != NULL);
762
763 #if XNU_MONITOR
764 /**
765 * On PPL-enabled systems, it's important that PPL policy decisions aren't
766 * decided by kernel-writable memory. This function is used in various parts
767 * of the PPL, and besides validating that the pointer returned by this
768 * function is indeed a pmap structure, it's also important to ensure that
769 * it's actually the current thread's pmap. This is because different pmaps
770 * will have access to different entitlements based on the code signature of
771 * their loaded process. So if a different user pmap is set in the current
772 * thread structure (in an effort to bypass code signing restrictions), even
773 * though the structure would validate correctly as it is a real pmap
774 * structure, it should fail here.
775 *
776 * This only needs to occur for user pmaps because the kernel pmap's root
777 * page table is always the same as TTBR1 (it's set during bootstrap and not
778 * changed so it'd be redundant to check), and its code signing fields are
779 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
780 * it shouldn't be possible to set those fields. Due to that, an attacker
781 * setting the current thread's pmap to the kernel pmap as a way to bypass
782 * this check won't accomplish anything as it doesn't provide any extra code
783 * signing entitlements.
784 */
785 if ((current != kernel_pmap) &&
786 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
787 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
788 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
789 }
790 #endif /* XNU_MONITOR */
791
792 return current;
793 }
794
795 #if DEVELOPMENT || DEBUG
796
797 /*
798 * Trace levels are controlled by a bitmask in which each
799 * level can be enabled/disabled by the (1<<level) position
800 * in the boot arg
801 * Level 0: PPL extension functionality
802 * Level 1: pmap lifecycle (create/destroy/switch)
803 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
804 * Level 3: internal state management (attributes/fast-fault)
805 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
806 */
807
808 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
809
/*
 * Emit a kdebug trace event when the given trace level is enabled in
 * pmap_trace_mask. Wrapped in do { } while (0) so the macro behaves as a
 * single statement and cannot mis-bind a following `else` (dangling-else
 * hazard of a bare if-block macro body).
 */
#define PMAP_TRACE(level, ...)                                          \
	do {                                                            \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) {   \
	                KDBG_RELEASE(__VA_ARGS__);                      \
	        }                                                       \
	} while (0)
814 #else /* DEVELOPMENT || DEBUG */
815
816 #define PMAP_TRACE(level, ...)
817
818 #endif /* DEVELOPMENT || DEBUG */
819
820
821 /*
822 * Internal function prototypes (forward declarations).
823 */
824
825 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
826
827 static void pmap_set_reference(ppnum_t pn);
828
829 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
830
831 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
832
833 static kern_return_t pmap_expand(
834 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
835
836 static int pmap_remove_range(
837 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
838
839 static tt_entry_t *pmap_tt1_allocate(
840 pmap_t, vm_size_t, unsigned int);
841
842 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
843
844 static void pmap_tt1_deallocate(
845 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
846
847 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
848
849 static kern_return_t pmap_tt_allocate(
850 pmap_t, tt_entry_t **, unsigned int, unsigned int);
851
852 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
853
854 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
855 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
856 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
857
858 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
859
860
861 static void pmap_unmap_commpage(
862 pmap_t pmap);
863
864 static boolean_t
865 pmap_is_64bit(pmap_t);
866
867
868 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
869
870 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
871
872 static bool pmap_update_cache_attributes_locked(
873 ppnum_t, unsigned, bool);
874
875 static boolean_t arm_clear_fast_fault(
876 ppnum_t ppnum,
877 vm_prot_t fault_type,
878 pt_entry_t *pte_p);
879
880 static void pmap_trim_self(pmap_t pmap);
881 static void pmap_trim_subord(pmap_t subord);
882
883
884 /*
885 * Temporary prototypes, while we wait for pmap_enter to move to taking an
886 * address instead of a page number.
887 */
888 static kern_return_t
889 pmap_enter_addr(
890 pmap_t pmap,
891 vm_map_address_t v,
892 pmap_paddr_t pa,
893 vm_prot_t prot,
894 vm_prot_t fault_type,
895 unsigned int flags,
896 boolean_t wired);
897
898 kern_return_t
899 pmap_enter_options_addr(
900 pmap_t pmap,
901 vm_map_address_t v,
902 pmap_paddr_t pa,
903 vm_prot_t prot,
904 vm_prot_t fault_type,
905 unsigned int flags,
906 boolean_t wired,
907 unsigned int options,
908 __unused void *arg,
909 __unused pmap_mapping_type_t mapping_type);
910
911 #ifdef CONFIG_XNUPOST
912 kern_return_t pmap_test(void);
913 #endif /* CONFIG_XNUPOST */
914
915 PMAP_SUPPORT_PROTOTYPES(
916 kern_return_t,
917 arm_fast_fault, (pmap_t pmap,
918 vm_map_address_t va,
919 vm_prot_t fault_type,
920 bool was_af_fault,
921 bool from_user), ARM_FAST_FAULT_INDEX);
922
923 PMAP_SUPPORT_PROTOTYPES(
924 boolean_t,
925 arm_force_fast_fault, (ppnum_t ppnum,
926 vm_prot_t allow_mode,
927 int options), ARM_FORCE_FAST_FAULT_INDEX);
928
929 MARK_AS_PMAP_TEXT static boolean_t
930 arm_force_fast_fault_with_flush_range(
931 ppnum_t ppnum,
932 vm_prot_t allow_mode,
933 int options,
934 pmap_tlb_flush_range_t *flush_range);
935
936 /**
937 * Definition of the states driving the batch cache attributes update
938 * state machine.
939 */
940 typedef struct {
941 uint64_t page_index : 32, /* The page index to be operated on */
942 state : 8, /* The current state of the update machine */
943 tlb_flush_pass_needed : 1, /* Tracking whether the tlb flush pass is necessary */
944 rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
945 :0;
946 } batch_set_cache_attr_state_t;
947
948 /* Possible values of the "state" field. */
949 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS 1
950 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS 2
951 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
952 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE 4
953
954 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
955
956 PMAP_SUPPORT_PROTOTYPES(
957 batch_set_cache_attr_state_t,
958 pmap_batch_set_cache_attributes, (
959 #if XNU_MONITOR
960 volatile upl_page_info_t *user_page_list,
961 #else /* !XNU_MONITOR */
962 upl_page_info_array_t user_page_list,
963 #endif /* XNU_MONITOR */
964 batch_set_cache_attr_state_t state,
965 unsigned int page_cnt,
966 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
967
968 PMAP_SUPPORT_PROTOTYPES(
969 kern_return_t,
970 pmap_change_wiring, (pmap_t pmap,
971 vm_map_address_t v,
972 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
973
974 PMAP_SUPPORT_PROTOTYPES(
975 pmap_t,
976 pmap_create_options, (ledger_t ledger,
977 vm_map_size_t size,
978 unsigned int flags,
979 kern_return_t * kr), PMAP_CREATE_INDEX);
980
981 PMAP_SUPPORT_PROTOTYPES(
982 void,
983 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
984
985 PMAP_SUPPORT_PROTOTYPES(
986 kern_return_t,
987 pmap_enter_options, (pmap_t pmap,
988 vm_map_address_t v,
989 pmap_paddr_t pa,
990 vm_prot_t prot,
991 vm_prot_t fault_type,
992 unsigned int flags,
993 boolean_t wired,
994 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
995
996 PMAP_SUPPORT_PROTOTYPES(
997 pmap_paddr_t,
998 pmap_find_pa, (pmap_t pmap,
999 addr64_t va), PMAP_FIND_PA_INDEX);
1000
1001 PMAP_SUPPORT_PROTOTYPES(
1002 kern_return_t,
1003 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
1004
1005
1006 PMAP_SUPPORT_PROTOTYPES(
1007 boolean_t,
1008 pmap_is_empty, (pmap_t pmap,
1009 vm_map_offset_t va_start,
1010 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1011
1012
1013 PMAP_SUPPORT_PROTOTYPES(
1014 unsigned int,
1015 pmap_map_cpu_windows_copy, (ppnum_t pn,
1016 vm_prot_t prot,
1017 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1018
1019 PMAP_SUPPORT_PROTOTYPES(
1020 void,
1021 pmap_ro_zone_memcpy, (zone_id_t zid,
1022 vm_offset_t va,
1023 vm_offset_t offset,
1024 const vm_offset_t new_data,
1025 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1026
1027 PMAP_SUPPORT_PROTOTYPES(
1028 uint64_t,
1029 pmap_ro_zone_atomic_op, (zone_id_t zid,
1030 vm_offset_t va,
1031 vm_offset_t offset,
1032 zro_atomic_op_t op,
1033 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1034
1035 PMAP_SUPPORT_PROTOTYPES(
1036 void,
1037 pmap_ro_zone_bzero, (zone_id_t zid,
1038 vm_offset_t va,
1039 vm_offset_t offset,
1040 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1041
1042 PMAP_SUPPORT_PROTOTYPES(
1043 kern_return_t,
1044 pmap_set_shared_region, (pmap_t grand,
1045 pmap_t subord,
1046 addr64_t vstart,
1047 uint64_t size), PMAP_SET_SHARED_REGION_INDEX);
1048
1049 PMAP_SUPPORT_PROTOTYPES(
1050 vm_map_offset_t,
1051 pmap_nest, (pmap_t grand,
1052 pmap_t subord,
1053 addr64_t vstart,
1054 uint64_t size,
1055 vm_map_offset_t vrestart,
1056 kern_return_t * krp), PMAP_NEST_INDEX);
1057
1058 PMAP_SUPPORT_PROTOTYPES(
1059 void,
1060 pmap_page_protect_options, (ppnum_t ppnum,
1061 vm_prot_t prot,
1062 unsigned int options,
1063 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1064
1065 PMAP_SUPPORT_PROTOTYPES(
1066 vm_map_address_t,
1067 pmap_protect_options, (pmap_t pmap,
1068 vm_map_address_t start,
1069 vm_map_address_t end,
1070 vm_prot_t prot,
1071 unsigned int options,
1072 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1073
1074 PMAP_SUPPORT_PROTOTYPES(
1075 kern_return_t,
1076 pmap_query_page_info, (pmap_t pmap,
1077 vm_map_offset_t va,
1078 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 mach_vm_size_t,
1082 pmap_query_resident, (pmap_t pmap,
1083 vm_map_address_t start,
1084 vm_map_address_t end,
1085 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1086
1087 PMAP_SUPPORT_PROTOTYPES(
1088 void,
1089 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1090
1091 PMAP_SUPPORT_PROTOTYPES(
1092 vm_map_address_t,
1093 pmap_remove_options, (pmap_t pmap,
1094 vm_map_address_t start,
1095 vm_map_address_t end,
1096 int options), PMAP_REMOVE_OPTIONS_INDEX);
1097
1098
1099 PMAP_SUPPORT_PROTOTYPES(
1100 void,
1101 pmap_set_cache_attributes, (ppnum_t pn,
1102 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1103
1104 PMAP_SUPPORT_PROTOTYPES(
1105 void,
1106 pmap_update_compressor_page, (ppnum_t pn,
1107 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1108
1109 PMAP_SUPPORT_PROTOTYPES(
1110 void,
1111 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1112
1113 #if MACH_ASSERT || XNU_MONITOR
1114 PMAP_SUPPORT_PROTOTYPES(
1115 void,
1116 pmap_set_process, (pmap_t pmap,
1117 int pid,
1118 char *procname), PMAP_SET_PROCESS_INDEX);
1119 #endif
1120
1121 PMAP_SUPPORT_PROTOTYPES(
1122 void,
1123 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1124
1125 PMAP_SUPPORT_PROTOTYPES(
1126 vm_map_offset_t,
1127 pmap_unnest_options, (pmap_t grand,
1128 addr64_t vaddr,
1129 uint64_t size,
1130 vm_map_offset_t vrestart,
1131 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1132
1133 PMAP_SUPPORT_PROTOTYPES(
1134 void,
1135 phys_attribute_set, (ppnum_t pn,
1136 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1137
1138 PMAP_SUPPORT_PROTOTYPES(
1139 void,
1140 phys_attribute_clear, (ppnum_t pn,
1141 unsigned int bits,
1142 int options,
1143 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1144
1145 #if __ARM_RANGE_TLBI__
1146 PMAP_SUPPORT_PROTOTYPES(
1147 vm_map_address_t,
1148 phys_attribute_clear_range, (pmap_t pmap,
1149 vm_map_address_t start,
1150 vm_map_address_t end,
1151 unsigned int bits,
1152 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1153 #endif /* __ARM_RANGE_TLBI__ */
1154
1155
1156 PMAP_SUPPORT_PROTOTYPES(
1157 void,
1158 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1159
1160 PMAP_SUPPORT_PROTOTYPES(
1161 void,
1162 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1163
1164 PMAP_SUPPORT_PROTOTYPES(
1165 void,
1166 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1167
1168 PMAP_SUPPORT_PROTOTYPES(
1169 void,
1170 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1171
1172 PMAP_SUPPORT_PROTOTYPES(
1173 void,
1174 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1175
1176 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1177 PMAP_SUPPORT_PROTOTYPES(
1178 void,
1179 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1180 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1181
/* Definition of the states used by pmap_trim(). */
/*
 * NOTE(review): the explicit state enumeration presumably allows pmap_trim to
 * run as a sequence of bounded, resumable steps (the state is both parameter
 * and return value of pmap_trim below) — confirm against pmap_trim_internal.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1202
1203 PMAP_SUPPORT_PROTOTYPES(
1204 pmap_trim_state_t,
1205 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1206
1207 #if HAS_APPLE_PAC
1208 PMAP_SUPPORT_PROTOTYPES(
1209 void *,
1210 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1211 PMAP_SUPPORT_PROTOTYPES(
1212 void *,
1213 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1214 #endif /* HAS_APPLE_PAC */
1215
1216
1217
1218
1219 PMAP_SUPPORT_PROTOTYPES(
1220 kern_return_t,
1221 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1222 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1223
1224 PMAP_SUPPORT_PROTOTYPES(
1225 kern_return_t,
1226 pmap_load_trust_cache_with_type, (TCType_t type,
1227 const vm_address_t pmap_img4_payload,
1228 const vm_size_t pmap_img4_payload_len,
1229 const vm_address_t img4_manifest,
1230 const vm_size_t img4_manifest_len,
1231 const vm_address_t img4_aux_manifest,
1232 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1233
1234 PMAP_SUPPORT_PROTOTYPES(
1235 void,
1236 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1237
1238 PMAP_SUPPORT_PROTOTYPES(
1239 kern_return_t,
1240 pmap_query_trust_cache, (TCQueryType_t query_type,
1241 const uint8_t cdhash[kTCEntryHashSize],
1242 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1243
1244 PMAP_SUPPORT_PROTOTYPES(
1245 errno_t,
1246 pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1247 const void *input_data,
1248 size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1249
1250 #if PMAP_CS_INCLUDE_CODE_SIGNING
1251
1252 PMAP_SUPPORT_PROTOTYPES(
1253 kern_return_t,
1254 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1255 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1256
1257 PMAP_SUPPORT_PROTOTYPES(
1258 kern_return_t,
1259 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1260 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1261
1262 PMAP_SUPPORT_PROTOTYPES(
1263 kern_return_t,
1264 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1265 pmap_cs_profile_t * profile_obj),
1266 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1267
1268 PMAP_SUPPORT_PROTOTYPES(
1269 kern_return_t,
1270 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1271 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1272
1273 PMAP_SUPPORT_PROTOTYPES(
1274 kern_return_t,
1275 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1276 const void *kernel_entitlements),
1277 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1278
1279 PMAP_SUPPORT_PROTOTYPES(
1280 kern_return_t,
1281 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1282 const void **kernel_entitlements),
1283 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1284
1285 PMAP_SUPPORT_PROTOTYPES(
1286 kern_return_t,
1287 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1288 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1289
1290 PMAP_SUPPORT_PROTOTYPES(
1291 kern_return_t,
1292 pmap_cs_allow_invalid, (pmap_t pmap),
1293 PMAP_CS_ALLOW_INVALID_INDEX);
1294
1295 PMAP_SUPPORT_PROTOTYPES(
1296 void,
1297 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1298 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1299
1300 PMAP_SUPPORT_PROTOTYPES(
1301 bool,
1302 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1303 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1304
1305 PMAP_SUPPORT_PROTOTYPES(
1306 void,
1307 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1308 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1309
1310 PMAP_SUPPORT_PROTOTYPES(
1311 void,
1312 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1313 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1314
1315 #endif
1316
1317 PMAP_SUPPORT_PROTOTYPES(
1318 uint32_t,
1319 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1320
1321 PMAP_SUPPORT_PROTOTYPES(
1322 bool,
1323 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1324
1325 PMAP_SUPPORT_PROTOTYPES(
1326 void,
1327 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1328
1329 void pmap_footprint_suspend(vm_map_t map,
1330 boolean_t suspend);
1331 PMAP_SUPPORT_PROTOTYPES(
1332 void,
1333 pmap_footprint_suspend, (vm_map_t map,
1334 boolean_t suspend),
1335 PMAP_FOOTPRINT_SUSPEND_INDEX);
1336
1337
1338
1339
1340 #if DEVELOPMENT || DEBUG
1341 PMAP_SUPPORT_PROTOTYPES(
1342 kern_return_t,
1343 pmap_test_text_corruption, (pmap_paddr_t),
1344 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1345 #endif /* DEVELOPMENT || DEBUG */
1346
1347 /*
1348 * The low global vector page is mapped at a fixed alias.
1349 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1350 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1351 * to check both addresses anyway for backward compatibility. So for now
1352 * we leave H6 and H7 where they were.
1353 */
1354 #if (ARM_PGSHIFT == 14)
1355 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1356 #else
1357 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1358 #endif
1359
1360
/*
 * Translation-table allocation counters (root tables, TTE pages, PTE pages).
 * 8-byte aligned, presumably so 64-bit updates remain well-formed on all
 * configurations; incremented by allocation paths elsewhere in this file —
 * not visible in this chunk, confirm before relying on exact semantics.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1364
1365 #if XNU_MONITOR
1366
1367 #if __has_feature(ptrauth_calls)
1368 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1369 #else
1370 #define __ptrauth_ppl_handler
1371 #endif
1372
1373 /*
1374 * Table of function pointers used for PPL dispatch.
1375 */
1376 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1377 [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1378 [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1379 [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1380 [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1381 [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1382 [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1383 [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1384 [PMAP_CREATE_INDEX] = pmap_create_options_internal,
1385 [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1386 [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1387 [PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1388 [PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
1389 [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1390 [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1391 [PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1392 [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1393 [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1394 [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1395 [PMAP_SET_SHARED_REGION_INDEX] = pmap_set_shared_region_internal,
1396 [PMAP_NEST_INDEX] = pmap_nest_internal,
1397 [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1398 [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1399 [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1400 [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1401 [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1402 [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1403 [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1404 [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1405 [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1406 [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1407 [PMAP_SWITCH_INDEX] = pmap_switch_internal,
1408 [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1409 [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1410 [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1411 [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1412 [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1413 [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1414 [PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1415 [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1416 [PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
1417 [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1418 [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1419 [PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
1420 [PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
1421 [PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
1422 [PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
1423 [PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
1424 #if PMAP_CS_INCLUDE_CODE_SIGNING
1425 [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1426 [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1427 [PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1428 [PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1429 [PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
1430 [PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
1431 [PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
1432 [PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
1433 [PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
1434 [PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
1435 [PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
1436 #endif
1437 [PMAP_TRIM_INDEX] = pmap_trim_internal,
1438 [PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1439 [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1440 [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1441 #if HAS_APPLE_PAC
1442 [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1443 [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1444 #endif /* HAS_APPLE_PAC */
1445 #if __ARM_RANGE_TLBI__
1446 [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1447 #endif /* __ARM_RANGE_TLBI__ */
1448 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1449 [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1450 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1451 [PMAP_NOP_INDEX] = pmap_nop_internal,
1452
1453 #if DEVELOPMENT || DEBUG
1454 [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1455 #endif /* DEVELOPMENT || DEBUG */
1456
1457 };
1458 #endif
1459
1460 #if XNU_MONITOR
1461 /**
1462 * A convenience function for setting protections on a single physical
1463 * aperture or static region mapping without invalidating the TLB.
1464 *
1465 * @note This function does not perform any TLB invalidations. That must be done
1466 * separately to be able to safely use the updated mapping.
1467 *
1468 * @note This function understands the difference between the VM page size and
1469 * the kernel page size and will update multiple PTEs if the sizes differ.
1470 * In other words, enough PTEs will always get updated to change the
1471 * permissions on a PAGE_SIZE amount of memory.
1472 *
1473 * @note The PVH lock for the physical page represented by this mapping must
1474 * already be locked.
1475 *
1476 * @note This function assumes the caller has already verified that the PTE
1477 * pointer does indeed point to a physical aperture or static region page
1478 * table. Please validate your inputs before passing it along to this
1479 * function.
1480 *
1481 * @param ptep Pointer to the physical aperture or static region page table to
1482 * update with a new XPRR index.
1483 * @param expected_perm The XPRR index that is expected to already exist at the
1484 * current mapping. If the current index doesn't match this
1485 * then the system will panic.
1486 * @param new_perm The new XPRR index to update the mapping with.
1487 */
1488 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1489 pmap_set_pte_xprr_perm(
1490 pt_entry_t * const ptep,
1491 unsigned int expected_perm,
1492 unsigned int new_perm)
1493 {
1494 assert(ptep != NULL);
1495
1496 pt_entry_t spte = *ptep;
1497 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1498
1499 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1500 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1501 __func__, ptep, new_perm, expected_perm);
1502 }
1503
1504 /**
1505 * The PTE involved should be valid, should not have the hint bit set, and
1506 * should have the expected XPRR index.
1507 */
1508 if (__improbable(!pte_is_valid(spte))) {
1509 panic_plain("%s: physical aperture or static region PTE is invalid, "
1510 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1511 __func__, ptep, spte, new_perm, expected_perm);
1512 }
1513
1514 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1515 panic_plain("%s: physical aperture or static region PTE has hint bit "
1516 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1517 __func__, ptep, spte, new_perm, expected_perm);
1518 }
1519
1520 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1521 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1522 "ptep=%p, new_perm=%u, expected_perm=%u",
1523 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1524 }
1525
1526 pt_entry_t template = spte;
1527 template &= ~ARM_PTE_XPRR_MASK;
1528 template |= xprr_perm_to_pte(new_perm);
1529
1530 write_pte_strong(ptep, template);
1531 }
1532
1533 /**
1534 * Update the protections on a single physical aperture mapping and invalidate
1535 * the TLB so the mapping can be used.
1536 *
1537 * @note The PVH lock for the physical page must already be locked.
1538 *
1539 * @param pai The physical address index of the page whose physical aperture
1540 * mapping will be updated with new permissions.
1541 * @param expected_perm The XPRR index that is expected to already exist at the
1542 * current mapping. If the current index doesn't match this
1543 * then the system will panic.
1544 * @param new_perm The new XPRR index to update the mapping with.
1545 */
1546 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1547 pmap_set_xprr_perm(
1548 unsigned int pai,
1549 unsigned int expected_perm,
1550 unsigned int new_perm)
1551 {
1552 pvh_assert_locked(pai);
1553
1554 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1555 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1556
1557 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1558
1559 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1560 sync_tlb_flush();
1561 }
1562
1563 /**
1564 * Update the protections on a range of physical aperture or static region
1565 * mappings and invalidate the TLB so the mappings can be used.
1566 *
1567 * @note Static region mappings can only be updated before machine_lockdown().
1568 * Physical aperture mappings can be updated at any time.
1569 *
1570 * @param start The starting virtual address of the static region or physical
1571 * aperture range whose permissions will be updated.
1572 * @param end The final (inclusive) virtual address of the static region or
1573 * physical aperture range whose permissions will be updated.
1574 * @param expected_perm The XPRR index that is expected to already exist at the
1575 * current mappings. If the current indices don't match
1576 * this then the system will panic.
1577 * @param new_perm The new XPRR index to update the mappings with.
1578 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physical aperture or entirely within the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the last iteration to the caller's end address. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		/* Both regions are expected to be fully mapped; a hole is fatal. */
		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* The twig must point at a next-level table (not a block/invalid entry). */
		if (!tte_is_valid_table(tte)) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the PVH lock of the page being remapped. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One TLB maintenance pass for the whole range, rather than per page. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1674
1675 #endif /* XNU_MONITOR */
1676
1677 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1678 PMAP_ZINFO_PALLOC(
1679 pmap_t pmap, int bytes)
1680 {
1681 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1682 }
1683
1684 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1685 PMAP_ZINFO_PFREE(
1686 pmap_t pmap,
1687 int bytes)
1688 {
1689 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1690 }
1691
1692 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1693 pmap_tt_ledger_credit(
1694 pmap_t pmap,
1695 vm_size_t size)
1696 {
1697 if (pmap != kernel_pmap) {
1698 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1699 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1700 }
1701 }
1702
1703 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1704 pmap_tt_ledger_debit(
1705 pmap_t pmap,
1706 vm_size_t size)
1707 {
1708 if (pmap != kernel_pmap) {
1709 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1710 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1711 }
1712 }
1713
/*
 * Record a use of `asid_index` in the pseudo-LRU ASID tracking state.
 *
 * Clearing the ASID's bit in asid_plru_bitmap marks it recently used. When a
 * 64-entry chunk becomes fully used (the atomic and-not leaves it zero), the
 * chunk's generation is bumped and its bitmap reset, aging out the whole chunk
 * at once. On HAS_16BIT_ASID targets the pLRU allocator is not used (see
 * alloc_asid), so this is a no-op there.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/* The final chunk keeps its top bit clear — presumably that ASID slot is reserved/invalid; confirm. */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1727
/*
 * Allocate a virtual ASID (and the hardware ASID it maps to) for `pmap`.
 *
 * On non-16-bit-ASID targets the virtual ASID space is carved into chunks of
 * `asid_chunk_size`; the chunk index becomes pmap->sw_asid and the offset
 * within the chunk becomes the hardware ASID.  A pseudo-LRU scheme is used to
 * bias selection toward ASIDs whose TLB entries are least likely to be live.
 *
 * Returns true on success (pmap->hw_asid and pmap->sw_asid set), false if the
 * virtual ASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the pLRU word with the oldest generation (least recently drained). */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Walk the free-ASID bitmap, one chunk-stride at a time, looking for a
		 * free virtual ASID whose hardware ASID is also marked available in the
		 * selected pLRU word.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Virtual ASID space exhausted. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into (sw_asid chunk, hw_asid offset); see free_asid(). */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1812
/*
 * Release the ASID held by `pmap`, reversing the transformations performed in
 * alloc_asid() to recover the virtual ASID, and return it (and, where
 * applicable, the hardware ASID's pLRU slot) to the free pools.
 * Safe to call on a pmap that never had an ASID.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically detach the hardware ASID; zero means none was held. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1; /* undo the EL0/EL1 pairing applied in alloc_asid() */
#endif
	hw_asid -= 1; /* undo the +1 that reserves ASID 0 for the kernel */

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	/*
	 * Reconstruct the virtual ASID from the (sw_asid, hw_asid) pair, mirroring
	 * the arithmetic in alloc_asid().  sw_asid == UINT8_MAX marks the reserved
	 * VASIDs handed out when a pLRU miss produced hw_asid == MAX_HW_ASIDS.
	 */
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Return the hardware ASID to the pLRU availability pool. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1845
1846
1847 boolean_t
pmap_valid_address(pmap_paddr_t addr)1848 pmap_valid_address(
1849 pmap_paddr_t addr)
1850 {
1851 return pa_valid(addr);
1852 }
1853
1854
1855
1856
1857
1858
1859 /*
1860 * Map memory at initialization. The physical addresses being
1861 * mapped are not managed and are never unmapped.
1862 *
1863 * For now, VM is already on, we only need to map the
1864 * specified memory.
1865 */
1866 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1867 pmap_map(
1868 vm_map_address_t virt,
1869 vm_offset_t start,
1870 vm_offset_t end,
1871 vm_prot_t prot,
1872 unsigned int flags)
1873 {
1874 kern_return_t kr;
1875 vm_size_t ps;
1876
1877 ps = PAGE_SIZE;
1878 while (start < end) {
1879 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1880 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1881
1882 if (kr != KERN_SUCCESS) {
1883 panic("%s: failed pmap_enter, "
1884 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1885 __FUNCTION__,
1886 (void *) virt, (void *) start, (void *) end, prot, flags);
1887 }
1888
1889 virt += ps;
1890 start += ps;
1891 }
1892 return virt;
1893 }
1894
1895 #if XNU_MONITOR
1896 /**
1897 * Remove kernel writeablity from an IO PTE value if the page is owned by
1898 * guarded mode software.
1899 *
1900 * @param paddr The physical address of the page which has to be non-DRAM.
1901 * @param tmplate The PTE value to be evaluated.
1902 *
1903 * @return A new PTE value with permission bits modified.
1904 */
static inline
pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
{
	/* Only non-DRAM (I/O) pages may be handed to this routine. */
	assert(!pa_valid(paddr));

	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));

	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
		/* PPL to own the page by converting KERN_RW to PPL_RW. */
		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
		switch (xprr_perm) {
		case XPRR_KERN_RO_PERM:
			/* Read-only mappings are safe as-is; leave untouched. */
			break;
		case XPRR_KERN_RW_PERM:
			/* Strip kernel-writable permission and grant it to the PPL instead. */
			tmplate &= ~ARM_PTE_XPRR_MASK;
			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
			break;
		default:
			/* Any other permission on a PPL-owned I/O page is unexpected. */
			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
		}
	}

	return tmplate;
}
1930 #endif /* XNU_MONITOR */
1931
/*
 * Establish "back-door" kernel mappings for the physical range [start, end)
 * at `virt`, using the cacheability/posting mode requested in `options`.
 * The range must be page aligned and must lie entirely inside or entirely
 * outside DRAM.  Returns the virtual address just past the last mapped page.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t mem_attr;

	if (__improbable(start & PAGE_MASK)) {
		panic("%s: start 0x%lx is not page aligned", __func__, start);
	}

	if (__improbable(end & PAGE_MASK)) {
		panic("%s: end 0x%lx is not page aligned", __func__, end);
	}

	if (__improbable(!gDramBase || !gDramSize)) {
		panic("%s: gDramBase/gDramSize not initialized", __func__);
	}

	/*
	 * The memory attributes below are chosen once based on the first page,
	 * so reject ranges that straddle the DRAM boundary.
	 */
	const bool first_page_is_dram = is_dram_addr(start);
	for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
		if (first_page_is_dram != is_dram_addr(pa)) {
			panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
			    __func__, pa, first_page_is_dram ? "is not" : "is");
		}
	}

	/* Translate the requested mapping mode into MAIR attribute-index bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		if (is_dram_addr(start)) {
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
#if HAS_FEAT_XS
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* PPL-owned I/O pages must not stay kernel-writable. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale TLB entries covering the rewritten range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
2041
2042 /*
2043 * Back-door routine for mapping kernel VM at initialization.
2044 * Useful for mapping memory outside the range
2045 * [vm_first_phys, vm_last_phys] (i.e., devices).
2046 * Otherwise like pmap_map.
2047 */
2048 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2049 pmap_map_bd(
2050 vm_map_address_t virt,
2051 vm_offset_t start,
2052 vm_offset_t end,
2053 vm_prot_t prot)
2054 {
2055 return pmap_map_bd_with_options(virt, start, end, prot, 0);
2056 }
2057
2058 /*
2059 * Back-door routine for mapping kernel VM at initialization.
2060 * Useful for mapping memory specific physical addresses in early
2061 * boot (i.e., before kernel_map is initialized).
2062 *
2063 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
2064 */
2065
2066 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2067 pmap_map_high_window_bd(
2068 vm_offset_t pa_start,
2069 vm_size_t len,
2070 vm_prot_t prot)
2071 {
2072 pt_entry_t *ptep, pte;
2073 vm_map_address_t va_start = VREGION1_START;
2074 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
2075 vm_map_address_t va_end;
2076 vm_map_address_t va;
2077 vm_size_t offset;
2078
2079 offset = pa_start & PAGE_MASK;
2080 pa_start -= offset;
2081 len += offset;
2082
2083 if (len > (va_max - va_start)) {
2084 panic("%s: area too large, "
2085 "pa_start=%p, len=%p, prot=0x%x",
2086 __FUNCTION__,
2087 (void*)pa_start, (void*)len, prot);
2088 }
2089
2090 scan:
2091 for (; va_start < va_max; va_start += PAGE_SIZE) {
2092 ptep = pmap_pte(kernel_pmap, va_start);
2093 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2094 if (!pte_is_valid(*ptep)) {
2095 break;
2096 }
2097 }
2098 if (va_start > va_max) {
2099 panic("%s: insufficient pages, "
2100 "pa_start=%p, len=%p, prot=0x%x",
2101 __FUNCTION__,
2102 (void*)pa_start, (void*)len, prot);
2103 }
2104
2105 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2106 ptep = pmap_pte(kernel_pmap, va_end);
2107 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2108 if (pte_is_valid(*ptep)) {
2109 va_start = va_end + PAGE_SIZE;
2110 goto scan;
2111 }
2112 }
2113
2114 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2115 ptep = pmap_pte(kernel_pmap, va);
2116 pte = pa_to_pte(pa_start)
2117 | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2118 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2119 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2120 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2121 #if __ARM_KERNEL_PROTECT__
2122 pte |= ARM_PTE_NG;
2123 #endif /* __ARM_KERNEL_PROTECT__ */
2124 write_pte_strong(ptep, pte);
2125 }
2126 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2127 #if KASAN
2128 kasan_notify_address(va_start, len);
2129 #endif
2130 return va_start;
2131 }
2132
/*
 * Determine how many virtual ASIDs to support, from the "pmap-max-asids"
 * device-tree property under /defaults if present, falling back to MAX_ASIDS.
 * Panics on malformed or out-of-range values.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
#if HAS_16BIT_ASID
	/* 16-bit targets map virtual ASIDs 1:1 onto hardware ASIDs. */
	if (max_asids > MAX_HW_ASIDS) {
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#else
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#endif /* HAS_16BIT_ASID */
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2174
2175 #if __arm64__
2176 /*
2177 * pmap_get_arm64_prot
2178 *
2179 * return effective armv8 VMSA block protections including
2180 * table AP/PXN/XN overrides of a pmap entry
2181 *
2182 */
2183
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/*
	 * Walk from the root level to the leaf, accumulating the hierarchical
	 * AP/XN/PXN overrides carried by each intermediate table descriptor.
	 */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* No translation at this address: no effective protections. */
			return 0;
		}

		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_is_table(tte)) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2228 #endif /* __arm64__ */
2229
2230 /**
2231 * Helper macros for accessing the "unnested" and "in-progress" bits in
2232 * pmap->nested_region_unnested_table_bitmap.
2233 */
2234 #define UNNEST_BIT(index) ((index) * 2)
2235 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2236
2237
2238 /*
2239 * Bootstrap the system enough to run with virtual memory.
2240 *
2241 * The early VM initialization code has already allocated
2242 * the first CPU's translation table and made entries for
2243 * all the one-to-one mappings to be found there.
2244 *
2245 * We must set up the kernel pmap structures, the
2246 * physical-to-virtual translation lookup tables for the
2247 * physical memory to be managed (between avail_start and
2248 * avail_end).
2249 *
2250 * Map the kernel's code and data, and allocate the system page table.
2251 * Page_size must already be set.
2252 *
2253 * Parameters:
2254 * first_avail first available physical page -
2255 * after kernel page tables
2256 * avail_start PA of first managed physical page
2257 * avail_end PA of last managed physical page
2258 */
2259
2260 void
pmap_bootstrap(vm_offset_t vstart)2261 pmap_bootstrap(
2262 vm_offset_t vstart)
2263 {
2264 vm_map_offset_t maxoffset;
2265
2266 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2267
2268 #if XNU_MONITOR
2269 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
2270 pmap_ppl_disable = ml_unsafe_kernel_text();
2271 #endif
2272
2273 #endif /* XNU_MONITOR */
2274
2275 #if DEVELOPMENT || DEBUG
2276 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2277 kprintf("Kernel traces for pmap operations enabled\n");
2278 }
2279 #endif
2280
2281 /*
2282 * Initialize the kernel pmap.
2283 */
2284 #if ARM_PARAMETERIZED_PMAP
2285 kernel_pmap->pmap_pt_attr = native_pt_attr;
2286 #endif /* ARM_PARAMETERIZED_PMAP */
2287 #if HAS_APPLE_PAC
2288 kernel_pmap->disable_jop = 0;
2289 #endif /* HAS_APPLE_PAC */
2290 kernel_pmap->tte = cpu_tte;
2291 kernel_pmap->ttep = cpu_ttep;
2292 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2293 kernel_pmap->max = UINTPTR_MAX;
2294 os_atomic_init(&kernel_pmap->ref_count, 1);
2295 #if XNU_MONITOR
2296 os_atomic_init(&kernel_pmap->nested_count, 0);
2297 #endif
2298 kernel_pmap->nx_enabled = TRUE;
2299 #ifdef __arm64__
2300 kernel_pmap->is_64bit = TRUE;
2301 #else
2302 kernel_pmap->is_64bit = FALSE;
2303 #endif
2304 #if CONFIG_ROSETTA
2305 kernel_pmap->is_rosetta = FALSE;
2306 #endif
2307
2308 kernel_pmap->nested_region_addr = 0x0ULL;
2309 kernel_pmap->nested_region_size = 0x0ULL;
2310 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2311 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2312 kernel_pmap->type = PMAP_TYPE_KERNEL;
2313
2314 kernel_pmap->hw_asid = 0;
2315 kernel_pmap->sw_asid = 0;
2316
2317 pmap_lock_init(kernel_pmap);
2318
2319 pmap_max_asids = pmap_compute_max_asids();
2320 #if HAS_16BIT_ASID
2321 asid_chunk_size = MAX_HW_ASIDS;
2322 #else
2323 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2324 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2325 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2326 * masking used by the PLRU scheme. This means we must handle the case in which
2327 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2328 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2329 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2330 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2331 #endif /* HAS_16BIT_ASIDS */
2332
2333 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2334
2335 #if HAS_SPECRES_DEBUGGING
2336 PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2337
2338 if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2339 panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2340 }
2341 #endif /* HAS_SPECRES_DEBUGGING */
2342
2343 /**
2344 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2345 * pp_attr_table, etc). This function will use `avail_start` to allocate
2346 * space for these data structures.
2347 */
2348 pmap_data_bootstrap();
2349
2350 /**
2351 * Bootstrap any necessary SART data structures and values needed from the device tree.
2352 */
2353 sart_bootstrap();
2354
2355 /**
2356 * Don't make any assumptions about the alignment of avail_start before this
2357 * point (i.e., pmap_data_bootstrap() performs allocations).
2358 */
2359 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2360
2361 const pmap_paddr_t pmap_struct_start = avail_start;
2362
2363 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2364 avail_start = round_page(avail_start + asid_table_size);
2365
2366 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2367
2368 vm_first_phys = gPhysBase;
2369 vm_last_phys = trunc_page(avail_end);
2370
2371 queue_init(&map_pmap_list);
2372 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2373 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2374 free_page_size_tt_count = 0;
2375 free_page_size_tt_max = 0;
2376 free_tt_list = TT_FREE_ENTRY_NULL;
2377 free_tt_count = 0;
2378 free_tt_max = 0;
2379
2380 virtual_space_start = vstart;
2381 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2382
2383 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2384 #if !HAS_16BIT_ASID
2385 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2386 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2387 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2388 #endif /* !HAS_16BIT_ASID */
2389
2390
2391
2392 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2393 maxoffset = trunc_page(maxoffset);
2394 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2395 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2396 arm_pmap_max_offset_default = maxoffset;
2397 }
2398 }
2399 #if defined(__arm64__)
2400 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2401 maxoffset = trunc_page(maxoffset);
2402 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2403 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2404 arm64_pmap_max_offset_default = maxoffset;
2405 }
2406 }
2407 #endif
2408
2409 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2410
2411
2412 #if PMAP_CS_PPL_MONITOR
2413 /* Initialize the PPL trust cache read-write lock */
2414 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2415 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2416 #endif
2417
2418 #if DEVELOPMENT || DEBUG
2419 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2420 &vm_footprint_suspend_allowed,
2421 sizeof(vm_footprint_suspend_allowed));
2422 #endif /* DEVELOPMENT || DEBUG */
2423
2424 #if KASAN
2425 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2426 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2427 #endif /* KASAN */
2428
2429 /**
2430 * Ensure that avail_start is always left on a page boundary. The calling
2431 * code might not perform any alignment before allocating page tables so
2432 * this is important.
2433 */
2434 avail_start = round_page(avail_start);
2435 }
2436
2437 #if XNU_MONITOR
2438
2439 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2440 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2441 {
2442 pmap_paddr_t cur_pa;
2443 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2444 assert(pa_valid(cur_pa));
2445 ppattr_pa_set_monitor(cur_pa);
2446 }
2447 }
2448
2449 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2450 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2451 pmap_paddr_t end_pa,
2452 unsigned int expected_perm,
2453 unsigned int new_perm)
2454 {
2455 vm_offset_t start_va = phystokv(start_pa);
2456 vm_offset_t end_va = start_va + (end_pa - start_pa);
2457
2458 pa_set_range_monitor(start_pa, end_pa);
2459 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2460 }
2461
/*
 * Mark every physical page backing the kernelcache as locked down, preventing
 * it from being remapped.  Pages whose physical-to-virtual translation is
 * non-linear w.r.t. the kernelcache are skipped, as they will be reclaimed.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		/* Record the lockdown in this page's pv_head flags. */
		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable; undo their lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2507
/*
 * Transfer ownership/permissions of the pmap's statically allocated regions
 * (bootstrap page tables, PPL text/data, PPL stacks) to their final
 * protection state, and lock down the kernelcache against remapping.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		/* Already RO; just marks the pages PPL-owned (no permission change). */
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2595
2596
/*
 * Finalize PPL lockdown: lock down the commpage mappings so they can no
 * longer be remapped, then write-protect the kernel RO commpage.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error is unconditional as seen here, which would fail
	 * any build of this configuration — it looks like the surrounding #if/#else
	 * that selects a valid xPRR configuration was stripped from this copy.
	 * Verify against the original source before building.
	 */
#error "XPRR configuration error"
}
2613 #endif /* XNU_MONITOR */
2614
2615 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2616 pmap_virtual_space(
2617 vm_offset_t *startp,
2618 vm_offset_t *endp
2619 )
2620 {
2621 *startp = virtual_space_start;
2622 *endp = virtual_space_end;
2623 }
2624
2625
/*
 * Enumerate the kernel virtual regions the VM may manage.  The VM calls this
 * with increasing region_select values until it returns FALSE.
 *
 * @param region_select index of the region being queried (0-based).
 * @param startp        filled with the region's start address on success.
 * @param size          filled with the region's size on success.
 *
 * @return TRUE if region_select names a valid region for this configuration,
 *         FALSE otherwise.  Which indices are valid depends on the
 *         KTRR/CTRR and ARM_LARGE_MEMORY build configuration below.
 */
__mockable boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K pages: round to a 32MB (L2 block) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K pages: round to an 8MB boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region if it doesn't already overlap region 0. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
	return ret;
}
2719
2720 /*
2721 * Routines to track and allocate physical pages during early boot.
2722 * On most systems that memory runs from first_avail through to avail_end
2723 * with no gaps.
2724 *
2725 * If the system supports ECC and ecc_bad_pages_count > 0, we
2726 * need to skip those pages.
2727 */
2728
/* Pages remaining in [first_avail, avail_end); decremented by pmap_next_page(). */
static unsigned int avail_page_count = 0;
/* One-shot flag: initialize_ram_ranges() runs on first allocator use. */
static bool need_ram_ranges_init = true;
2731
2732
2733 /**
2734 * Checks to see if a given page is in
2735 * the array of known bad pages
2736 *
2737 * @param ppn page number to check
2738 */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/* No ECC bad-page tracking in this configuration; no page is ever bad. */
	return false;
}
2744
2745 /**
2746 * Prepare bad ram pages to be skipped.
2747 */
2748
2749 /*
2750 * Initialize the count of available pages. No lock needed here,
2751 * as this code is called while kernel boot up is single threaded.
2752 */
2753 static void
initialize_ram_ranges(void)2754 initialize_ram_ranges(void)
2755 {
2756 pmap_paddr_t first = first_avail;
2757 pmap_paddr_t end = avail_end;
2758
2759 assert(first <= end);
2760 assert(first == (first & ~PAGE_MASK));
2761 assert(end == (end & ~PAGE_MASK));
2762 avail_page_count = atop(end - first);
2763
2764 need_ram_ranges_init = false;
2765 }
2766
/*
 * Return the number of physical pages still available to the boot-time
 * allocator (i.e. not yet handed out by pmap_next_page()).
 */
unsigned int
pmap_free_pages(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return avail_page_count;
}
2776
/*
 * Return the size, in pages, of the span from the lowest still-available
 * physical address to the end of available memory.  With no bad-page gaps
 * this equals pmap_free_pages().
 */
unsigned int
pmap_free_pages_span(
	void)
{
	if (need_ram_ranges_init) {
		initialize_ram_ranges();
	}
	return (unsigned int)atop(avail_end - first_avail);
}
2786
2787
/*
 * Allocate the next boot-time page for "high priority" use.  This platform
 * keeps no separate high region, so simply defer to pmap_next_page();
 * might_free is ignored.
 */
boolean_t
pmap_next_page_hi(
	ppnum_t * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2795
2796
2797 boolean_t
pmap_next_page(ppnum_t * pnum)2798 pmap_next_page(
2799 ppnum_t *pnum)
2800 {
2801 if (need_ram_ranges_init) {
2802 initialize_ram_ranges();
2803 }
2804
2805
2806 if (first_avail != avail_end) {
2807 *pnum = (ppnum_t)atop(first_avail);
2808 first_avail += PAGE_SIZE;
2809 assert(avail_page_count > 0);
2810 --avail_page_count;
2811 return TRUE;
2812 }
2813 assert(avail_page_count == 0);
2814 return FALSE;
2815 }
2816
2817
2818 /**
2819 * Helper function to check wheter the given physical
2820 * page number is a restricted page.
2821 *
2822 * @param pn the physical page number to query.
2823 */
bool
pmap_is_page_restricted(__unused ppnum_t pn)
{
	/* Restricted pages are not supported in this configuration. */
	return false;
}
2829
2830 /*
2831 * Initialize the pmap module.
2832 * Called by vm_init, to initialize any structures that the pmap
2833 * system needs to map virtual memory.
2834 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent translation
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 * ZC_ZFREE_CLEARMEM zeroes pmap structs on free so stale ASIDs and
	 * table pointers never leak into a new allocation.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids, since
	 * each user process needs its own ASID.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2875
2876 /**
2877 * Verify that a given physical page contains no mappings (outside of the
2878 * default physical aperture mapping).
2879 *
2880 * @param ppnum Physical page number to check there are no mappings to.
2881 *
2882 * @return True if there are no mappings, false otherwise or if the page is not
2883 * kernel-managed.
2884 */
2885 bool
pmap_verify_free(ppnum_t ppnum)2886 pmap_verify_free(ppnum_t ppnum)
2887 {
2888 const pmap_paddr_t pa = ptoa(ppnum);
2889
2890 assert(pa != vm_page_fictitious_addr);
2891
2892 /* Only mappings to kernel-managed physical memory are tracked. */
2893 if (!pa_valid(pa)) {
2894 return false;
2895 }
2896
2897 const unsigned int pai = pa_index(pa);
2898 pv_entry_t **pvh = pai_to_pvh(pai);
2899
2900 return pvh_test_type(pvh, PVH_TYPE_NULL);
2901 }
2902
2903 #if MACH_ASSERT
2904 /**
2905 * Verify that a given physical page contains no mappings (outside of the
2906 * default physical aperture mapping) and if it does, then panic.
2907 *
2908 * @note It's recommended to use pmap_verify_free() directly when operating in
2909 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2910 * normally being called from outside of the PPL, and the pv_head_table
2911 * can't be modified outside of the PPL).
2912 *
2913 * @param ppnum Physical page number to check there are no mappings to.
2914 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		/* Not tracked, or genuinely free: nothing to report. */
		return;
	}

	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2987 #endif /* MACH_ASSERT */
2988
2989 inline void
pmap_recycle_page(ppnum_t pn)2990 pmap_recycle_page(ppnum_t pn)
2991 {
2992 const bool is_freed = pmap_verify_free(pn);
2993
2994 if (__improbable(!is_freed)) {
2995 /*
2996 * There is a redundancy here, but we are going to panic anyways,
2997 * and ASSERT_PMAP_FREE traces useful information. So, we keep this
2998 * behavior.
2999 */
3000 #if MACH_ASSERT
3001 pmap_assert_free(pn);
3002 #endif /* MACH_ASSERT */
3003 panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn));
3004 }
3005 }
3006
3007
3008 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)3009 pmap_root_alloc_size(pmap_t pmap)
3010 {
3011 #pragma unused(pmap)
3012 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3013 unsigned int root_level = pt_attr_root_level(pt_attr);
3014 const uint64_t index = pt_attr_va_valid_mask(pt_attr);
3015 return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
3016 }
3017
3018
3019 /*
3020 * Create and return a physical map.
3021 *
3022 * If the size specified for the map
3023 * is zero, the map is an actual physical
3024 * map, and may be referenced by the
3025 * hardware.
3026 *
3027 * If the size specified is non-zero,
3028 * the map will be used in software only, and
3029 * is bounded by that size.
3030 */
/*
 * Internal (PPL-side on monitor configs) implementation of pmap creation.
 *
 * @param ledger ledger to attach to the new pmap (may be NULL on PPL configs).
 * @param size   must be 0; a non-zero size is only meaningful for stage-2
 *               pmaps, which this configuration does not support.
 * @param flags  PMAP_CREATE_* option bits; unknown bits are rejected.
 * @param kr     out-parameter receiving the failure reason on PMAP_NULL return.
 *
 * @return the new pmap with ref_count 1, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any option bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* The ledger pointer came from outside the PPL: validate before use. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone.  Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* max depends on pmap_pt_attr, so it must be set after the above. */
	p->max = pmap_user_va_size(p);

	/* Claim an ASID (hardware address space ID) for this pmap. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	/* Cannot block inside the PPL: the caller retries on shortage. */
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Ensure the cleared TTEs are visible before the table can be walked. */
	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the pmap on the global list once fully initialized. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it so the PPL may safely write it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3221
3222 __mockable pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3223 pmap_create_options(
3224 ledger_t ledger,
3225 vm_map_size_t size,
3226 unsigned int flags)
3227 {
3228 pmap_t pmap;
3229 kern_return_t kr = KERN_SUCCESS;
3230
3231 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3232
3233 ledger_reference(ledger);
3234
3235 #if XNU_MONITOR
3236 for (;;) {
3237 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3238 if (kr != KERN_RESOURCE_SHORTAGE) {
3239 break;
3240 }
3241 assert(pmap == PMAP_NULL);
3242 pmap_alloc_page_for_ppl(0);
3243 kr = KERN_SUCCESS;
3244 }
3245 #else
3246 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3247 #endif
3248
3249 if (pmap == PMAP_NULL) {
3250 ledger_dereference(ledger);
3251 }
3252
3253 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3254
3255 return pmap;
3256 }
3257
3258 #if XNU_MONITOR
3259 /*
3260 * This symbol remains in place when the PPL is enabled so that the dispatch
3261 * table does not change from development to release configurations.
3262 */
3263 #endif
3264 #if MACH_ASSERT || XNU_MONITOR
/*
 * Record the owning process's pid and name on the pmap, for debugging.
 * A pmap_pid of -1 marks the pmap as not accepting process labels.
 * No-op unless built with MACH_ASSERT.
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3282 #endif /* MACH_ASSERT || XNU_MONITOR */
3283
3284 #if MACH_ASSERT
/*
 * Kernel-side wrapper: label a pmap with its owning process's pid and name,
 * dispatching into the PPL on monitor configurations.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3297 #endif /* MACH_ASSERT */
3298
3299 /*
3300 * pmap_deallocate_all_leaf_tts:
3301 *
3302 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3303 * removing and deallocating all TTEs.
3304 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf tables hold PTEs, not TTEs; recursion must stop above them. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry in a table at this level (VA ~0 maxes the index). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block (large page) mappings should never remain at destroy time. */
		if (tte_is_block(tte)) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock here — pmap_tte_deallocate()
		 * appears to drop the pmap lock internally; confirm its contract.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3343
3344 /*
3345 * We maintain stats and ledgers so that a task's physical footprint is:
3346 * phys_footprint = ((internal - alternate_accounting)
3347 * + (internal_compressed - alternate_accounting_compressed)
3348 * + iokit_mapped
3349 * + purgeable_nonvolatile
3350 * + purgeable_nonvolatile_compressed
3351 * + page_table)
3352 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3353 */
3354
3355 /*
3356 * Retire the given physical map from service.
3357 * Should only be called if the map contains
3358 * no valid mappings.
3359 */
/*
 * Drop a reference on the pmap; on the final release, tear it down:
 * unmap the commpage, free all page tables and the root table, flush the
 * TLB, return the ASID, release the nested pmap reference, and free the
 * pmap structure itself.  Safe to call with PMAP_NULL (no-op).
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		/* Other references remain; nothing to tear down yet. */
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* The pmap must not be live (active or in-flight) on any CPU. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global pmap list before freeing anything. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	/* All table pages should have been returned by the teardown above. */
	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps share an ASID; flush by region instead of by ASID. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3473
/*
 * Kernel-side wrapper for pmap destruction: drops a reference (destroying
 * the pmap on the last release) and drops the ledger reference taken at
 * creation time.
 *
 * NOTE(review): pmap is dereferenced unconditionally here (pmap->ledger,
 * trace arguments), so callers must pass a non-NULL pmap even though
 * pmap_destroy_internal() itself tolerates PMAP_NULL — confirm callers.
 */
__mockable void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger before the pmap structure may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3494
3495
3496 /*
3497 * Add a reference to the specified pmap.
3498 */
/*
 * Add a reference to the specified pmap.  No-op on PMAP_NULL.
 */
MARK_AS_PMAP_TEXT void
pmap_reference_internal(
	pmap_t pmap)
{
	if (pmap != PMAP_NULL) {
		/* Validate before mutating: the pointer may come from outside the PPL. */
		validate_pmap_mutable(pmap);
		os_atomic_inc(&pmap->ref_count, acquire);
	}
}
3508
/*
 * Kernel-side wrapper: take a reference on the pmap, dispatching into the
 * PPL on monitor configurations.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3519
/*
 * Allocate a root (TT1) translation table for a pmap.
 *
 * Sub-page root tables (PMAP_ROOT_ALLOC_SIZE < PAGE_SIZE) are carved out of
 * whole pages; the unused remainder is threaded onto a free list for later
 * allocations.  Whole-page tables are served from their own free list.
 *
 * @param pmap   pmap to charge the allocation against.
 * @param size   requested table size in bytes; normalized below.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than wait for memory.
 *
 * @return the kernel virtual address of the zeroed table, or NULL on
 *         resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Only the canonical sub-page root size is carved; others round up to a page. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: satisfy the request from a free list if possible. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate a fresh (zeroed) physical page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Carve the remainder of the page into size-sized chunks and build
		 * a local chain, then splice it onto the global free list under the
		 * lock.  The first chunk (at phystokv(pa)) is the one we return.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3600
/*
 * Return a root (TT1) translation table to the appropriate free list, then
 * (unless PMAP_TT_DEALLOCATE_NOBLOCK) trim the whole-page free list back
 * down to FREE_PAGE_SIZE_TT_MAX by freeing surplus pages to the system.
 *
 * @param pmap   pmap the table was charged against.
 * @param tt     table being released.
 * @param size   table size in bytes; normalized as in pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the (possibly blocking)
 *               trim of the page free list.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size normalization done at allocation time. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller cannot block: skip trimming, which may free pages. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the whole-page free list.  The lock is dropped around each
	 * pmap_pages_free() call (which may block) and reacquired to re-check
	 * the count.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3659
/**
 * Allocate a translation table (page table) for the given pmap.
 *
 * When the pmap's page size is smaller than the kernel PAGE_SIZE, a freshly
 * allocated VM page is carved into multiple tables; the leftovers are pushed
 * onto pmap->tt_entry_free and consumed by later calls before any new VM
 * page is allocated.
 *
 * @param pmap The pmap the new table will belong to.
 * @param ttp Out parameter: set to the KVA of the allocated table on success
 *            (set to NULL on entry).
 * @param level Page-table level the table will serve; used only to pick the
 *              TTE-vs-PTE accounting counters.
 * @param options PMAP_TT_ALLOCATE_NOWAIT to fail rather than block for memory.
 *
 * @return KERN_SUCCESS on success, KERN_ABORTED if the preemptible lock
 *         acquisition was interrupted, or KERN_RESOURCE_SHORTAGE when an
 *         allocation fails in no-wait mode.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the per-pmap free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 * Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			/*
			 * NOTE(review): alloc_flags above is derived from
			 * PMAP_TT_ALLOCATE_NOWAIT, but this bail-out tests
			 * PMAP_OPTIONS_NOWAIT — confirm the two flag namespaces are
			 * intended to be interchangeable for callers of this function.
			 */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Account the page against the right counter for its table level. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Mark the physical page's PV head as holding a page table descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/*
			 * The kernel page is larger than the pmap's page: chain the unused
			 * sub-page tables (all but the first) onto the pmap's free list.
			 */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3760
3761
/**
 * Return a page table previously handed out by pmap_tt_allocate().
 *
 * The table is normally pushed onto the pmap's tt_entry_free list. If every
 * sub-page table that shares the containing kernel VM page is now free (all
 * refcounts zero and all siblings present on the free list), the siblings are
 * unlinked from the free list and the whole page is returned to the VM.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being freed; its refcount must be zero (or the
 *            PT_DESC_REFCOUNT sentinel for non-leaf tables).
 * @param level The level the table was used at (accounting only).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Poison the descriptor's VA slot for this table. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; normalize it to zero. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum refcounts across every sub-page table sharing this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Count how many of this page's sibling tables are already on the
		 * free list (ttp itself counts as the first).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/* Whole page is now free: unlink all siblings from the free list. */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Some siblings still missing: just push ttp onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables on this page are still in use: push ttp onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the descriptor and hand the fully-free page back to the VM. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3850
3851 /**
3852 * Safely clear out a translation table entry.
3853 *
3854 * @note If the TTE to clear out points to a leaf table, then that leaf table
3855 * must have a refcnt of zero before the TTE can be removed.
3856 * @note This function expects to be called with pmap locked exclusive, and will
3857 * return with pmap unlocked.
3858 *
3859 * @param pmap The pmap containing the page table whose TTE is being removed.
3860 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3861 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3862 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3863 * @param ttep Pointer to the TTE that should be cleared out.
3864 * @param level The level of the page table that contains the TTE to be removed.
3865 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the entry and force it out to memory before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			/*
			 * Sweep all CPUs still marked in the bitmap; clear each CPU's bit
			 * once it is seen with no disconnect in flight. The load-exclusive
			 * plus WFE lets us sleep until the remote CPU writes the flag.
			 */
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
4040
4041 /**
4042 * Given a pointer to an entry within a `level` page table, delete the
4043 * page table at `level` + 1 that is represented by that entry. For instance,
4044 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
4045 * contains the PA of the L3 table, and `level` would be "2".
4046 *
4047 * @note If the table getting deallocated is a leaf table, then that leaf table
4048 * must have a refcnt of zero before getting deallocated. All other levels
4049 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
4050 * @note This function expects to be called with pmap locked exclusive and will
4051 * return with pmap unlocked.
4052 *
4053 * @param pmap The pmap that owns the page table to be deallocated.
4054 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
4055 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
4056 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
4057 * @param ttep Pointer to the `level` TTE to remove.
4058 * @param level The level of the table that contains an entry pointing to the
4059 * table to be removed. The deallocated page table will be a
4060 * `level` + 1 table (so if `level` is 2, then an L3 table will be
4061 * deleted).
4062 */
4063 void
4064 pmap_tte_deallocate(
4065 pmap_t pmap,
4066 vm_offset_t va_start,
4067 vm_offset_t va_end,
4068 bool need_strong_sync,
4069 tt_entry_t *ttep,
4070 unsigned int level)
4071 {
4072 tt_entry_t tte;
4073
4074 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4075
4076 tte = *ttep;
4077
4078 if (tte_get_ptd(tte)->pmap != pmap) {
4079 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4080 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4081 }
4082
4083 assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
4084 (unsigned long long)tte);
4085
4086 /* pmap_tte_remove() will drop the pmap lock */
4087 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4088
4089 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4090 }
4091
4092 /*
4093 * Remove a range of hardware page-table entries.
4094 * The entries given are the first (inclusive)
4095 * and last (exclusive) entries for the VM pages.
4096 * The virtual address is the va for the first pte.
4097 *
4098 * The pmap must be locked.
4099 * If the pmap is not the kernel pmap, the range must lie
4100 * entirely within one pte-page. This is NOT checked.
4101 * Assumes that the pte-page exists.
4102 *
4103 * Returns the number of PTE changed
4104 */
4105 MARK_AS_PMAP_TEXT static int
4106 pmap_remove_range(
4107 pmap_t pmap,
4108 vm_map_address_t va,
4109 pt_entry_t *bpte,
4110 pt_entry_t *epte)
4111 {
4112 bool need_strong_sync = false;
4113 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4114 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4115 if (num_changed > 0) {
4116 PMAP_UPDATE_TLBS(pmap, va,
4117 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4118 }
4119 return num_changed;
4120 }
4121
4122
4123 #ifdef PVH_FLAG_EXEC
4124
4125 /*
4126 * Update the access protection bits of the physical aperture mapping for a page.
4127 * This is useful, for example, in guranteeing that a verified executable page
4128 * has no writable mappings anywhere in the system, including the physical
4129 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4130 * synchronization overhead in cases where the call to this function is
4131 * guaranteed to be followed by other TLB operations.
4132 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* KVA of this physical page's mapping in the physical aperture. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint PTE can't be safely rewritten one page at a time. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Start the TLB invalidation; only wait for completion if the caller asked. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4157 #endif /* defined(PVH_FLAG_EXEC) */
4158
4159
4160
/**
 * Remove the PTEs in [bpte, epte) from one page table of `pmap`, starting at
 * VA `va`, updating PV lists, refcounts, and ledgers as it goes.
 *
 * @param pmap Pmap owning the PTEs; must be locked exclusive by the caller.
 * @param va VA mapped by the first PTE in the range.
 * @param bpte First PTE to remove (inclusive).
 * @param epte Last PTE (exclusive); the range must not cross a page table.
 * @param eva If non-NULL, the call is preemptible: on pending preemption it
 *            stops early and stores the first un-processed VA here.
 * @param need_strong_sync Set to true if TLB maintenance for this range
 *            requires a strong DSB (HAS_FEAT_XS configs only).
 * @param options PMAP_OPTIONS_REMOVE enables compressed-marker accounting.
 *
 * @return Number of PTEs actually modified; the caller is responsible for
 *         the TLB invalidation covering them.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	/* Net change to the leaf table's refcount; applied once at the end. */
	int16_t refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 * (npages++ guarantees at least one PTE is processed before bailing.)
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/* Loop until we have a stable PTE snapshot with its PVH lock held. */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_EMPTY);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock and confirm it didn't change pages. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_EMPTY) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf(pte_is_valid(*cpte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_EMPTY);
			num_pte_changed++;
		}

		/* User mappings contribute to the leaf table's refcount. */
		if ((spte != ARM_PTE_EMPTY) && (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf(pte_is_valid(spte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for ledger accounting. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta to the leaf table's PTD. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4408
4409
4410 /*
4411 * Remove the given range of addresses
4412 * from the specified map.
4413 *
4414 * It is assumed that the start and end are properly
4415 * rounded to the hardware page size.
4416 */
4417 void
4418 pmap_remove(
4419 pmap_t pmap,
4420 vm_map_address_t start,
4421 vm_map_address_t end)
4422 {
4423 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4424 }
4425
/**
 * Remove mappings in [start, end) that all lie under a single twig-level TTE,
 * deallocating the leaf page table if the removal drains its last reference.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start Start VA; must be leaf-page aligned.
 * @param end Exclusive end VA; must be leaf-page aligned and >= start.
 * @param options Passed through to pmap_remove_range_options().
 *
 * @return The VA actually reached (may be < end if the removal was cut short
 *         by pending preemption); callers restart from this value.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry at all: nothing is mapped in this range. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty in a user pmap, free it entirely. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4492
/**
 * Remove all mappings in [start, end) from `pmap`, batching the work so each
 * chunk stays within one twig-level (L2) table and preemption can be serviced
 * between chunks.
 *
 * @param pmap The pmap to remove mappings from (no-op if PMAP_NULL).
 * @param start Start VA of the range; assumed page-size rounded.
 * @param end Exclusive end VA of the range; assumed page-size rounded.
 * @param options Removal options, passed through to the per-chunk call.
 */
__mockable void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance at most to the next twig (L2) boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL configs trampoline into the monitor for each chunk. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4559
4560
4561 /*
4562 * Remove phys addr if mapped in specified map
4563 */
/* Intentionally a no-op stub on this architecture; see comment below. */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
}
4571
4572 /*
4573 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4574 * switch a thread onto a new vm_map.
4575 */
4576 void
4577 pmap_switch_user(thread_t thread, vm_map_t new_map)
4578 {
4579 pmap_t new_pmap = new_map->pmap;
4580
4581
4582 thread->map = new_map;
4583 pmap_set_pmap(new_pmap, thread);
4584
4585 }
4586
/* Activate `pmap` for `thread` on the current CPU. */
void
pmap_set_pmap(
	pmap_t pmap,
	thread_t thread)
{
	pmap_switch(pmap, thread);
#if __ARM_USER_PROTECT__
	/*
	 * Mirror the pmap's TTBR value and ASID into the thread's machine
	 * state — presumably consumed on exception return / context switch;
	 * NOTE(review): verify against the locore user-protect path.
	 */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4598
/*
 * Asynchronously invalidate all local-core TLB entries tagged with this
 * pmap's hardware ASID.  Caller must later synchronize (e.g. via
 * sync_tlb_flush_local()) before relying on the invalidation.
 */
static void
pmap_flush_core_tlb_asid_async(pmap_t pmap)
{
	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
}
4604
#if HAS_SPECRES
/*
 * Issue a CFP RCTX (restrict control-flow prediction to context) for the
 * pmap's hardware ASID at EL0.  Part of the FEAT_SPECRES speculation
 * restriction sequence performed on context switch.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
}

#if REQUIRES_DVP_RCTX
/*
 * Issue a DVP RCTX (restrict data value prediction to context) for the
 * pmap's hardware ASID at EL0, on cores that require it in addition to
 * CFP RCTX.
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4622
4623 static inline bool
4624 pmap_user_ttb_is_clear(void)
4625 {
4626 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4627 }
4628
/*
 * Core of the address-space switch path: activate 'pmap' on the current
 * CPU.  Decides which (if any) TLB maintenance is required -- per-ASID
 * flush on software-ASID aliasing, global flush of a stale nested
 * (shared region) pmap's entries, commpage flush on page-size change --
 * performs any required speculation-restriction barriers, and finally
 * programs the user TTB via pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* Only the kernel pmap is allowed to carry hardware ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* Even/odd ASID pairs are reserved per address space; index by the pair. */
	asid_index >>= 1;
#endif

	pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* Switching to a pmap nested on a different shared region requires flushing the old region's global entries. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		/* hw_asid is biased by 1 when software ASIDs are in use; undo that for array indexing. */
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing. We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* NOTE(review): references do_speculation_restriction, which is only declared
	 * under HAS_SPECRES -- assumes HAS_SPECRES_DEBUGGING implies HAS_SPECRES; confirm. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page size / translation config) requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes. Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Break-before-make: park the user TTB on the invalid table before flushing/reprogramming. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform a CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region. Those mappings are global
	 * and will not be protected by the ASID. It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
		 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range TLBI support: a full local flush subsumes the per-ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize all async invalidations issued above before installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4806
/*
 * Public entry point for switching the current CPU to 'pmap'.  Emits
 * trace events and dispatches either into the PPL (XNU_MONITOR) or
 * directly to pmap_switch_internal().  The thread argument is unused.
 */
void
pmap_switch(
	pmap_t pmap,
	thread_t thread __unused)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4820
/*
 * Lower the protection on all mappings of physical page 'ppnum' to
 * 'prot'.  Convenience wrapper around pmap_page_protect_options() with
 * no options and no flush argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4828
4829 /*
4830 * Routine: pmap_page_protect_options
4831 *
4832 * Function:
4833 * Lower the permission for all mappings to a given
4834 * page.
4835 */
4836 MARK_AS_PMAP_TEXT static void
4837 pmap_page_protect_options_with_flush_range(
4838 ppnum_t ppnum,
4839 vm_prot_t prot,
4840 unsigned int options,
4841 pmap_tlb_flush_range_t *flush_range)
4842 {
4843 pmap_paddr_t phys = ptoa(ppnum);
4844 pv_entry_t **pv_h;
4845 pv_entry_t *pve_p, *orig_pve_p;
4846 pv_entry_t *pveh_p;
4847 pv_entry_t *pvet_p;
4848 pt_entry_t *pte_p, *orig_pte_p;
4849 pv_entry_t *new_pve_p;
4850 pt_entry_t *new_pte_p;
4851 vm_offset_t pvh_flags;
4852 unsigned int pai;
4853 bool remove;
4854 bool set_NX;
4855 unsigned int pvh_cnt = 0;
4856 unsigned int pass1_updated = 0;
4857 unsigned int pass2_updated = 0;
4858
4859 assert(ppnum != vm_page_fictitious_addr);
4860
4861 /* Only work with managed pages. */
4862 if (!pa_valid(phys)) {
4863 return;
4864 }
4865
4866 /*
4867 * Determine the new protection.
4868 */
4869 switch (prot) {
4870 case VM_PROT_ALL:
4871 return; /* nothing to do */
4872 case VM_PROT_READ:
4873 case VM_PROT_READ | VM_PROT_EXECUTE:
4874 remove = false;
4875 break;
4876 default:
4877 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4878 options = options & ~PMAP_OPTIONS_NOFLUSH;
4879 remove = true;
4880 break;
4881 }
4882
4883 pmap_cpu_data_t *pmap_cpu_data = NULL;
4884 if (remove) {
4885 #if !XNU_MONITOR
4886 mp_disable_preemption();
4887 #endif
4888 pmap_cpu_data = pmap_get_cpu_data();
4889 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4890 /*
4891 * Ensure the store to inflight_disconnect will be observed before any of the
4892 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4893 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4894 * another CPU, in between this function's clearing a PTE and dropping the
4895 * corresponding pagetable refcount. That can lead to a panic if the
4896 * destroying thread observes a non-zero refcount. For this we need a store-
4897 * store barrier; a store-release operation would not be sufficient.
4898 */
4899 os_atomic_thread_fence(release);
4900 }
4901
4902 pai = pa_index(phys);
4903 pvh_lock(pai);
4904 pv_h = pai_to_pvh(pai);
4905 pvh_flags = pvh_get_flags(pv_h);
4906
4907 #if XNU_MONITOR
4908 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4909 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4910 }
4911 if (__improbable(ppattr_pa_test_monitor(phys))) {
4912 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4913 }
4914 if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4915 panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4916 }
4917 #endif
4918
4919
4920 orig_pte_p = pte_p = PT_ENTRY_NULL;
4921 orig_pve_p = pve_p = PV_ENTRY_NULL;
4922 pveh_p = PV_ENTRY_NULL;
4923 pvet_p = PV_ENTRY_NULL;
4924 new_pve_p = PV_ENTRY_NULL;
4925 new_pte_p = PT_ENTRY_NULL;
4926
4927
4928 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4929 orig_pte_p = pte_p = pvh_ptep(pv_h);
4930 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4931 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4932 pveh_p = pve_p;
4933 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4934 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4935 }
4936
4937 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4938 int pve_ptep_idx = 0;
4939
4940 /*
4941 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4942 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4943 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4944 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4945 * operation, TLB invalidation may be handled by the caller so it's possible for
4946 * tlb_flush_needed to be true while issue_tlbi is false.
4947 */
4948 bool issue_tlbi = false;
4949 bool tlb_flush_needed = false;
4950 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4951 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4952 pt_entry_t tmplate = ARM_PTE_EMPTY;
4953 bool update = false;
4954
4955 if (pve_p != PV_ENTRY_NULL) {
4956 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4957 if (pte_p == PT_ENTRY_NULL) {
4958 goto protect_skip_pve_pass1;
4959 }
4960 }
4961
4962 #ifdef PVH_FLAG_IOMMU
4963 if (pvh_ptep_is_iommu(pte_p)) {
4964 #if XNU_MONITOR
4965 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4966 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4967 ppnum, ptep_get_iommu(pte_p), pve_p);
4968 }
4969 #endif
4970 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4971 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4972 ppnum, ptep_get_iommu(pte_p), pve_p);
4973 }
4974 goto protect_skip_pve_pass1;
4975 }
4976 #endif
4977 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4978 const pmap_t pmap = ptdp->pmap;
4979 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4980
4981 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4982 #if MACH_ASSERT
4983 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4984 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4985 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4986 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4987
4988 pv_entry_t *check_pvep = pve_p;
4989
4990 do {
4991 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4992 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4993 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4994 }
4995 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4996
4997 /* Restore previous PTEP value. */
4998 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4999 }
5000 #endif
5001 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
5002 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5003 }
5004
5005 #if DEVELOPMENT || DEBUG
5006 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5007 #else
5008 if ((prot & VM_PROT_EXECUTE))
5009 #endif
5010 {
5011 set_NX = false;
5012 } else {
5013 set_NX = true;
5014 }
5015
5016 #if HAS_FEAT_XS
5017 /**
5018 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
5019 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
5020 */
5021 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
5022 #endif /* HAS_FEAT_XS */
5023
5024 /* Remove the mapping if new protection is NONE */
5025 if (remove) {
5026 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
5027 panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
5028 __func__, pmap, ppnum);
5029 }
5030
5031 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5032 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5033 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5034 pt_entry_t spte = *pte_p;
5035
5036 if (pte_is_wired(spte)) {
5037 pte_set_wired(pmap, pte_p, 0);
5038 spte = *pte_p;
5039 if (pmap != kernel_pmap) {
5040 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5041 }
5042 }
5043
5044 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
5045 (uint64_t)spte, pte_p, ppnum);
5046
5047 if (compress && is_internal && (pmap != kernel_pmap)) {
5048 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
5049 /* mark this PTE as having been "compressed" */
5050 tmplate = ARM_PTE_COMPRESSED;
5051 if (is_altacct) {
5052 tmplate |= ARM_PTE_COMPRESSED_ALT;
5053 }
5054 } else {
5055 tmplate = ARM_PTE_EMPTY;
5056 }
5057
5058 assert(spte != tmplate);
5059 write_pte_fast(pte_p, tmplate);
5060 update = true;
5061 ++pass1_updated;
5062
5063 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5064
5065 if (pmap != kernel_pmap) {
5066 if (ppattr_test_reusable(pai) &&
5067 is_internal &&
5068 !is_altacct) {
5069 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5070 } else if (!is_internal) {
5071 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5072 }
5073
5074 if (is_altacct) {
5075 assert(is_internal);
5076 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5077 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5078 if (options & PMAP_OPTIONS_COMPRESSOR) {
5079 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5080 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5081 }
5082 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5083 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5084 } else if (ppattr_test_reusable(pai)) {
5085 assert(is_internal);
5086 if (options & PMAP_OPTIONS_COMPRESSOR) {
5087 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5088 /* was not in footprint, but is now */
5089 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5090 }
5091 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5092 } else if (is_internal) {
5093 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5094
5095 /*
5096 * Update all stats related to physical footprint, which only
5097 * deals with internal pages.
5098 */
5099 if (options & PMAP_OPTIONS_COMPRESSOR) {
5100 /*
5101 * This removal is only being done so we can send this page to
5102 * the compressor; therefore it mustn't affect total task footprint.
5103 */
5104 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5105 } else {
5106 /*
5107 * This internal page isn't going to the compressor, so adjust stats to keep
5108 * phys_footprint up to date.
5109 */
5110 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5111 }
5112 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5113 } else {
5114 /* external page: no impact on ledgers */
5115 }
5116 }
5117 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5118 } else {
5119 pt_entry_t spte = *pte_p;
5120 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5121
5122 if (pmap == kernel_pmap) {
5123 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5124 } else {
5125 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5126 }
5127
5128 /*
5129 * While the naive implementation of this would serve to add execute
5130 * permission, this is not how the VM uses this interface, or how
5131 * x86_64 implements it. So ignore requests to add execute permissions.
5132 */
5133 if (set_NX) {
5134 tmplate |= pt_attr_leaf_xn(pt_attr);
5135 }
5136
5137
5138 assert(spte != ARM_PTE_EMPTY);
5139 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5140
5141 if (spte != tmplate) {
5142 /*
5143 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5144 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5145 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
5146 * should always be cleared by this function.
5147 */
5148 pte_set_was_writeable(tmplate, true);
5149 write_pte_fast(pte_p, tmplate);
5150 update = true;
5151 ++pass1_updated;
5152 } else if (pte_was_writeable(tmplate)) {
5153 /*
5154 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5155 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
5156 * write access to a page, this function should always at least clear that flag for
5157 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5158 * these mappings go through vm_fault(). We therefore don't want those accesses to
5159 * be handled through arm_fast_fault().
5160 */
5161 pte_set_was_writeable(tmplate, false);
5162 write_pte_fast(pte_p, tmplate);
5163 }
5164 }
5165
5166 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5167 tlb_flush_needed = true;
5168 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5169 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5170 issue_tlbi = true;
5171 }
5172 }
5173 protect_skip_pve_pass1:
5174 pte_p = PT_ENTRY_NULL;
5175 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5176 pve_ptep_idx = 0;
5177 pve_p = pve_next(pve_p);
5178 }
5179 }
5180
5181 if (tlb_flush_needed) {
5182 FLUSH_PTE_STRONG();
5183 }
5184
5185 if (!remove && !issue_tlbi) {
5186 goto protect_finish;
5187 }
5188
5189 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5190 pv_entry_t **pve_pp = pv_h;
5191 pve_p = orig_pve_p;
5192 pte_p = orig_pte_p;
5193 pve_ptep_idx = 0;
5194
5195 /*
5196 * We need to keep track of whether a particular PVE list contains IOMMU
5197 * mappings when removing entries, because we should only remove CPU
5198 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5199 * it around.
5200 */
5201 bool iommu_mapping_in_pve = false;
5202 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5203 if (pve_p != PV_ENTRY_NULL) {
5204 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5205 if (pte_p == PT_ENTRY_NULL) {
5206 goto protect_skip_pve_pass2;
5207 }
5208 }
5209
5210 #ifdef PVH_FLAG_IOMMU
5211 if (pvh_ptep_is_iommu(pte_p)) {
5212 iommu_mapping_in_pve = true;
5213 if (remove && (pve_p == PV_ENTRY_NULL)) {
5214 /*
5215 * We've found an IOMMU entry and it's the only entry in the PV list.
5216 * We don't discard IOMMU entries, so simply set up the new PV list to
5217 * contain the single IOMMU PTE and exit the loop.
5218 */
5219 new_pte_p = pte_p;
5220 break;
5221 }
5222 goto protect_skip_pve_pass2;
5223 }
5224 #endif
5225 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5226 const pmap_t pmap = ptdp->pmap;
5227 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5228
5229 if (remove) {
5230 if (!compress && (pmap != kernel_pmap)) {
5231 /*
5232 * We must wait to decrement the refcount until we're completely finished using the PTE
5233 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5234 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5235 * under us.
5236 */
5237 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5238 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5239 }
5240 }
5241 /* Remove this CPU mapping from PVE list. */
5242 if (pve_p != PV_ENTRY_NULL) {
5243 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5244 }
5245 } else {
5246 pt_entry_t spte = *pte_p;
5247 if (pte_was_writeable(spte)) {
5248 pte_set_was_writeable(spte, false);
5249 write_pte_fast(pte_p, spte);
5250 } else {
5251 goto protect_skip_pve_pass2;
5252 }
5253 }
5254 ++pass2_updated;
5255 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5256 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5257 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5258 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5259 }
5260
5261 protect_skip_pve_pass2:
5262 pte_p = PT_ENTRY_NULL;
5263 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5264 pve_ptep_idx = 0;
5265
5266 if (remove) {
5267 /**
5268 * If there are any IOMMU mappings in the PVE list, preserve
5269 * those mappings in a new PVE list (new_pve_p) which will later
5270 * become the new PVH entry. Keep track of the CPU mappings in
5271 * pveh_p/pvet_p so they can be deallocated later.
5272 */
5273 if (iommu_mapping_in_pve) {
5274 iommu_mapping_in_pve = false;
5275 pv_entry_t *temp_pve_p = pve_next(pve_p);
5276 pve_remove(pv_h, pve_pp, pve_p);
5277 pveh_p = pvh_pve_list(pv_h);
5278 pve_p->pve_next = new_pve_p;
5279 new_pve_p = pve_p;
5280 pve_p = temp_pve_p;
5281 continue;
5282 } else {
5283 pvet_p = pve_p;
5284 pvh_cnt++;
5285 }
5286 }
5287
5288 pve_pp = pve_next_ptr(pve_p);
5289 pve_p = pve_next(pve_p);
5290 iommu_mapping_in_pve = false;
5291 }
5292 }
5293
5294 protect_finish:
5295
5296 #ifdef PVH_FLAG_EXEC
5297 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5298 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5299 }
5300 #endif
5301 if (__improbable(pass1_updated != pass2_updated)) {
5302 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5303 __func__, pass1_updated, pass2_updated);
5304 }
5305 /* if we removed a bunch of entries, take care of them now */
5306 if (remove) {
5307 if (new_pve_p != PV_ENTRY_NULL) {
5308 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5309 pvh_set_flags(pv_h, pvh_flags);
5310 } else if (new_pte_p != PT_ENTRY_NULL) {
5311 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5312 pvh_set_flags(pv_h, pvh_flags);
5313 } else {
5314 if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5315 pmap_flush_noncoherent_page(phys);
5316 }
5317 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5318 }
5319 }
5320
5321 if (flush_range && tlb_flush_needed) {
5322 if (!remove) {
5323 flush_range->ptfr_flush_needed = true;
5324 tlb_flush_needed = false;
5325 }
5326 }
5327
5328 /*
5329 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5330 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5331 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5332 * a page to be repurposed while it is still live in the TLBs.
5333 */
5334 if (remove && tlb_flush_needed) {
5335 sync_tlb_flush();
5336 }
5337
5338
5339 pvh_unlock(pai);
5340
5341 if (remove) {
5342 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5343 #if !XNU_MONITOR
5344 mp_enable_preemption();
5345 #endif
5346 }
5347
5348 if (!remove && tlb_flush_needed) {
5349 sync_tlb_flush();
5350 }
5351
5352 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5353 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5354 }
5355 }
5356
5357 MARK_AS_PMAP_TEXT void
5358 pmap_page_protect_options_internal(
5359 ppnum_t ppnum,
5360 vm_prot_t prot,
5361 unsigned int options,
5362 void *arg)
5363 {
5364 if (arg != NULL) {
5365 /*
5366 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5367 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5368 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5369 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5370 * In that case, force the flush to take place.
5371 */
5372 options &= ~PMAP_OPTIONS_NOFLUSH;
5373 }
5374 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5375 }
5376
/*
 * Public entry point: lower the protection for all mappings of physical
 * page 'ppnum' to 'prot'.  Filters out unmanaged pages and no-op
 * requests, then dispatches to the PPL (XNU_MONITOR) or the in-kernel
 * implementation.  'arg', when non-NULL, signals that the VM expects a
 * TLB flush (see pmap_page_protect_options_internal).
 */
void
pmap_page_protect_options(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pmap_paddr_t phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	if (prot == VM_PROT_ALL) {
		return;         /* nothing to do */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);

#if XNU_MONITOR
	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
#else
	pmap_page_protect_options_internal(ppnum, prot, options, arg);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
}
5410
5411
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Mark a user pmap as having pointer authentication (JOP) disabled.
 * Must not be called on the kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}

/*
 * Public wrapper: dispatch to the PPL (XNU_MONITOR) or directly to the
 * in-kernel implementation.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5433
5434 /*
5435 * Indicates if the pmap layer enforces some additional restrictions on the
5436 * given set of protections.
5437 */
5438 bool
5439 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5440 {
5441 return false;
5442 }
5443
/*
 * Whether this pmap may hold execute-only (XO) mappings.  Always true on
 * this configuration; kept as a predicate so callers (e.g.
 * pmap_protect_options_internal) have a single policy point.
 */
static inline bool
pmap_allows_xo(pmap_t pmap __unused)
{
	return true;
}
5449
5450 /*
5451 * Set the physical protection on the
5452 * specified range of this map as requested.
5453 * VERY IMPORTANT: Will not increase permissions.
5454 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5455 */
5456 void
5457 pmap_protect(
5458 pmap_t pmap,
5459 vm_map_address_t b,
5460 vm_map_address_t e,
5461 vm_prot_t prot)
5462 {
5463 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5464 }
5465
/*
 * Internal core of pmap_protect_options(): reduce the protections on the
 * mappings in the VA range [start, end) of [pmap].  The range must lie
 * within a single twig-level page table region (enforced by the range
 * check below); the caller iterates twig-by-twig.
 *
 * Returns the VA just past the last PTE actually processed.  This may be
 * short of [end] if the loop bailed out early on pending preemption, in
 * which case the caller is expected to retry from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject empty/backwards ranges and ranges crossing a twig boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end; /* nothing to do */
		case VM_PROT_EXECUTE:
			set_XO = true;
			if (pmap_allows_xo(pmap)) {
				break;
			}
			/* Fall through and panic if this pmap shouldn't be allowed to have XO mappings. */
			OS_FALLTHROUGH;
		default:
			should_have_removed = TRUE;
		}
	}

	/*
	 * Removals must go through pmap_remove_options(); reaching here with a
	 * removal-equivalent request is a caller bug.
	 */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* Decide whether the updated mappings will be non-executable. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && tte_is_valid_table(*tte_p)) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/*
			 * Periodically check for pending preemption and bail out early
			 * if detected; the caller restarts from the returned [va].
			 */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Nothing mapped (or compressed marker) here: skip. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/*
			 * Take the PVH lock for the page if it is managed memory.  The
			 * re-load/re-compare of the PTE after acquiring the lock closes
			 * the race against a concurrent remap of this slot.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check after the reload above: the slot may have been emptied. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			/* Build the new PTE value, starting from the access-permission bits. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (__improbable(set_XO)) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Make the PTE stores visible, then invalidate TLBs for the span processed. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No valid leaf table here: nothing is mapped, report the whole range done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5715
/*
 * Reduce the protections on the mappings in [b, e) of [pmap] to at most
 * [prot].  Never increases permissions; requests that would leave no
 * usable access are converted into a remove operation.  The range is
 * processed one twig-level (L2) region at a time, restarting from the
 * address returned by the per-twig helper (which may bail out early on
 * pending preemption).
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be page-aligned for this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return; /* nothing to do */
		case VM_PROT_EXECUTE:
			if (pmap_allows_xo(pmap)) {
				break;
			}
			/* Fall through and remove the mapping if XO is requested and [pmap] doesn't allow it. */
			OS_FALLTHROUGH;
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Walk the range in twig-sized chunks; the helper returns the next
	 * address to process (it may return early on pending preemption).
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5795
5796 /**
5797 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5798 *
5799 * @param pmap pmap to insert the pages into.
5800 * @param va virtual address to map the pages into.
5801 * @param pa page number of the first physical page to map.
5802 * @param size block size, in number of pages.
5803 * @param prot mapping protection attributes.
5804 * @param attr flags to pass to pmap_enter().
5805 *
5806 * @return KERN_SUCCESS.
5807 */
5808 kern_return_t
5809 pmap_map_block(
5810 pmap_t pmap,
5811 addr64_t va,
5812 ppnum_t pa,
5813 uint32_t size,
5814 vm_prot_t prot,
5815 int attr,
5816 unsigned int flags)
5817 {
5818 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5819 }
5820
5821 /**
5822 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5823 * As opposed to pmap_map_block(), this function takes
5824 * a physical address as an input and operates using the
5825 * page size associated with the input pmap.
5826 *
5827 * @param pmap pmap to insert the pages into.
5828 * @param va virtual address to map the pages into.
5829 * @param pa physical address of the first physical page to map.
5830 * @param size block size, in number of pages.
5831 * @param prot mapping protection attributes.
5832 * @param attr flags to pass to pmap_enter().
5833 *
5834 * @return KERN_SUCCESS.
5835 */
5836 kern_return_t
5837 pmap_map_block_addr(
5838 pmap_t pmap,
5839 addr64_t va,
5840 pmap_paddr_t pa,
5841 uint32_t size,
5842 vm_prot_t prot,
5843 int attr,
5844 unsigned int flags)
5845 {
5846 #if __ARM_MIXED_PAGE_SIZE__
5847 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5848 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5849 #else
5850 const uint64_t pmap_page_size = PAGE_SIZE;
5851 #endif
5852
5853 for (ppnum_t page = 0; page < size; page++) {
5854 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5855 panic("%s: failed pmap_enter_addr, "
5856 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5857 __FUNCTION__,
5858 pmap, va, (uint64_t)pa, size, prot, flags);
5859 }
5860
5861 va += pmap_page_size;
5862 pa += pmap_page_size;
5863 }
5864
5865 return KERN_SUCCESS;
5866 }
5867
/*
 * Physical-address variant of pmap_enter(): forwards to
 * pmap_enter_options_addr() with no options and an inferred mapping type.
 */
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5880
5881 /*
5882 * Insert the given physical page (p) at
5883 * the specified virtual address (v) in the
5884 * target physical map with the protection requested.
5885 *
5886 * If specified, the page will be wired down, meaning
5887 * that the related pte can not be reclaimed.
5888 *
5889 * NB: This is the only routine which MAY NOT lazy-evaluate
5890 * or lose information. That is, this routine must actually
5891 * insert this page into the given map eventually (must make
5892 * forward progress eventually.
5893 */
5894 kern_return_t
5895 pmap_enter(
5896 pmap_t pmap,
5897 vm_map_address_t v,
5898 ppnum_t pn,
5899 vm_prot_t prot,
5900 vm_prot_t fault_type,
5901 unsigned int flags,
5902 boolean_t wired,
5903 __unused pmap_mapping_type_t mapping_type)
5904 {
5905 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5906 }
5907
/*
 * Attempt to commit the pte.
 * Succeeds iff able to change *pte_p from old_pte to new_pte.
 * Performs no page table or accounting writes on failures.
 *
 * On success, *old_pte holds the previously-observed PTE value; on failure
 * it holds the value that caused the mismatch, so the caller can retry.
 * Side effects on a successful, value-changing commit: TLB invalidation
 * (or barrier-only publish for a previously-invalid slot), wired-count and
 * wired-memory ledger updates, and a PMAP_TRACE event.
 */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    pte_is_valid(*old_pte)) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/* Single-PTE case: commit with an atomic compare-exchange. */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	/* Only a commit that actually changed the PTE needs flush/accounting work. */
	if (success && *old_pte != new_pte) {
		if (pte_is_valid(*old_pte)) {
			/* Replacing a live mapping: the stale translation must be invalidated. */
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			/* Previously invalid: no TLB entry can exist, a publish barrier suffices. */
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/*
		 * A compressed marker carries no wired bit, so going from compressed
		 * to wired counts as a wiring change; otherwise compare old vs. new.
		 */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		if (pmap != kernel_pmap && changed_wiring) {
			SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5978
/*
 * Translate VM_WIMG_* cacheability flags into PTE memory-attribute bits:
 * the AttrIndx selector, shareability domain, and (for device/posted
 * mappings) NX/PNX.  The physical address [pa] is consulted because DRAM
 * and non-DRAM (device) addresses may warrant different attribute indices
 * for the same WIMG request.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			/* I/O ranges flagged for strong sync get the XS attribute index. */
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		/* Device/posted mappings are never executable. */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		/* Real-time attribute only applies to DRAM; fall back to posted for devices. */
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		} else {
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			/* Track how often WC/RT is requested for non-DRAM addresses. */
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		/* Write-combining only applies to DRAM; use posted attributes for devices. */
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized WIMG values fall back to the default (cached) attribute. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
6078

/*
 * Construct a PTE (and the physical page attributes) for the given virtual to
 * physical mapping.
 *
 * This function has no side effects and is safe to call so that it is safe to
 * call while attempting a pmap_enter transaction.
 *
 * @param pmap target pmap (determines kernel vs. user permission encodings).
 * @param va virtual address, used only for nested-region unnest checks.
 * @param pa physical address to encode into the PTE.
 * @param prot requested protections.
 * @param fault_type the access type triggering this entry; a write fault
 *        lets a writable mapping be entered RW immediately instead of
 *        RO + MODFAULT.
 * @param wired whether to set the software wired bit.
 * @param pt_attr page table attributes for [pmap].
 * @param options unused here.
 * @param pp_attr_bits OUTPUT: PP_ATTR_* bits the caller should set on the
 *        physical page once the PTE is committed.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	unsigned int options __unused,
	uint16_t *pp_attr_bits /* OUTPUT */
	)
{
	/* NOTE(review): set_TPRO is never set true in this configuration, so the TPRO branch below is inert here. */
	bool set_NX = false, set_XO = false, set_TPRO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/* Decide executability; on DEVELOPMENT/DEBUG the nx_enabled knobs can disable NX. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only mappings are only allowed when the pmap permits them. */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
		if (!pmap_allows_xo(pmap)) {
			panic("%s: attempted execute-only mapping", __func__);
		}
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable mappings must still be user-NX. */
			pte |= ARM_PTE_NX;
		} else {
			/* User-executable mappings must still be privileged-NX. */
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		if (pmap->type != PMAP_TYPE_NESTED) {
			/* Regular user mappings are per-ASID (non-global). */
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_unnested_table_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));

			/* Within an unnested sub-range of a nested pmap, mappings must also be non-global. */
			if ((pmap->nested_region_unnested_table_bitmap)
			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (set_TPRO) {
			pte |= pt_attr_leaf_rona(pt_attr);
			*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
		} else if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (__improbable(set_XO)) {
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Hardware access flag: all newly constructed mappings start out "accessed". */
	pte |= ARM_PTE_AF;
	return pte;
}
6196
6197 MARK_AS_PMAP_TEXT kern_return_t
6198 pmap_enter_options_internal(
6199 pmap_t pmap,
6200 vm_map_address_t v,
6201 pmap_paddr_t pa,
6202 vm_prot_t prot,
6203 vm_prot_t fault_type,
6204 unsigned int flags,
6205 boolean_t wired,
6206 unsigned int options)
6207 {
6208 ppnum_t pn = (ppnum_t)atop(pa);
6209 pt_entry_t pte;
6210 pt_entry_t spte;
6211 pt_entry_t *pte_p;
6212 bool refcnt_updated;
6213 bool wiredcnt_updated;
6214 bool ro_va = false;
6215 unsigned int wimg_bits;
6216 bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6217 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6218 kern_return_t kr = KERN_SUCCESS;
6219 uint16_t pp_attr_bits;
6220 volatile uint16_t *refcnt;
6221 volatile uint16_t *wiredcnt;
6222 pv_free_list_t *local_pv_free;
6223
6224 validate_pmap_mutable(pmap);
6225
6226 #if XNU_MONITOR
6227 if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6228 panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6229 }
6230 #endif
6231
6232 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6233
6234 if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6235 panic("%s: pmap %p v 0x%llx not page-aligned",
6236 __func__, pmap, (unsigned long long)v);
6237 }
6238
6239 if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6240 panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6241 }
6242
6243 /* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6244 if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6245 panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6246 pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6247 }
6248
6249 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6250 panic("pmap_enter_options() pmap %p pa 0x%llx",
6251 pmap, (uint64_t)pa);
6252 }
6253
6254 /* The PA should not extend beyond the architected physical address space */
6255 pa &= ARM_PTE_PAGE_MASK;
6256
6257 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6258 #if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
6259 extern vm_offset_t ctrr_test_page;
6260 if (__probable(v != ctrr_test_page))
6261 #endif
6262 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6263 }
6264 if (__improbable((prot == VM_PROT_EXECUTE) && !pmap_allows_xo(pmap))) {
6265 return KERN_PROTECTION_FAILURE;
6266 }
6267
6268 if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6269 if (__improbable(prot != VM_PROT_READ)) {
6270 panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6271 __func__, (unsigned long long)v, prot);
6272 }
6273 ro_va = true;
6274 }
6275 assert(pn != vm_page_fictitious_addr);
6276
6277 refcnt_updated = false;
6278 wiredcnt_updated = false;
6279
6280 if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6281 /*
6282 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6283 *
6284 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6285 */
6286 lock_mode = PMAP_LOCK_EXCLUSIVE;
6287 }
6288
6289 if (!pmap_lock_preempt(pmap, lock_mode)) {
6290 return KERN_ABORTED;
6291 }
6292
6293 /*
6294 * Expand pmap to include this pte. Assume that
6295 * pmap is always expanded to include enough hardware
6296 * pages to map one VM page.
6297 */
6298 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6299 /* Must unlock to expand the pmap. */
6300 pmap_unlock(pmap, lock_mode);
6301
6302 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6303
6304 if (kr != KERN_SUCCESS) {
6305 return kr;
6306 }
6307
6308 if (!pmap_lock_preempt(pmap, lock_mode)) {
6309 return KERN_ABORTED;
6310 }
6311 }
6312
6313 if (options & PMAP_OPTIONS_NOENTER) {
6314 pmap_unlock(pmap, lock_mode);
6315 return KERN_SUCCESS;
6316 }
6317
6318 /*
6319 * Since we may not hold the pmap lock exclusive, updating the pte is
6320 * done via a cmpxchg loop.
6321 * We need to be careful about modifying non-local data structures before commiting
6322 * the new pte since we may need to re-do the transaction.
6323 */
6324 spte = os_atomic_load(pte_p, relaxed);
6325 while (!committed) {
6326 refcnt = NULL;
6327 wiredcnt = NULL;
6328 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6329 had_valid_mapping = pte_is_valid(spte);
6330
6331 if (pmap != kernel_pmap) {
6332 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6333 refcnt = &ptd_info->refcnt;
6334 wiredcnt = &ptd_info->wiredcnt;
6335 /*
6336 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6337 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6338 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6339 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6340 * have PTDs, so we can't use the check there.
6341 */
6342 if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6343 panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6344 __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6345 }
6346 /*
6347 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
6348 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6349 * or acquire the pmap lock exclusive.
6350 */
6351 if (!wiredcnt_updated) {
6352 OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6353 wiredcnt_updated = true;
6354 }
6355 if (!refcnt_updated) {
6356 OSAddAtomic16(1, (volatile int16_t*)refcnt);
6357 refcnt_updated = true;
6358 drop_refcnt = true;
6359 }
6360 }
6361
6362 if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6363 /*
6364 * There is already a mapping here & it's for a different physical page.
6365 * First remove that mapping.
6366 *
6367 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6368 */
6369 if (lock_mode == PMAP_LOCK_SHARED) {
6370 if (pmap_lock_shared_to_exclusive(pmap)) {
6371 lock_mode = PMAP_LOCK_EXCLUSIVE;
6372 } else {
6373 /*
6374 * We failed to upgrade to an exclusive lock.
6375 * As a result we no longer hold the lock at all,
6376 * so we need to re-acquire it and restart the transaction.
6377 */
6378 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6379 lock_mode = PMAP_LOCK_EXCLUSIVE;
6380 /* pmap might have changed after we dropped the lock. Try again. */
6381 spte = os_atomic_load(pte_p, relaxed);
6382 continue;
6383 }
6384 }
6385 pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6386 spte = ARM_PTE_EMPTY;
6387 assert(os_atomic_load(pte_p, acquire) == ARM_PTE_EMPTY);
6388 }
6389
6390 pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, options, &pp_attr_bits);
6391
6392 if (pa_valid(pa)) {
6393 unsigned int pai;
6394 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6395
6396 is_internal = FALSE;
6397 is_altacct = FALSE;
6398
6399 pai = pa_index(pa);
6400
6401 pvh_lock(pai);
6402
6403 /*
6404 * Make sure that the current per-cpu PV free list has
6405 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6406 * if the transaction succeeds. We're either in the
6407 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6408 * Note that we can still be interrupted, but a primary
6409 * interrupt handler can never enter the pmap.
6410 */
6411 #if !XNU_MONITOR
6412 assert(get_preemption_level() > 0);
6413 #endif
6414 local_pv_free = &pmap_get_cpu_data()->pv_free;
6415 pv_entry_t **pv_h = pai_to_pvh(pai);
6416 const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6417 !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6418
6419 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6420 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6421 int new_allocated_pves = 0;
6422
6423 while (new_allocated_pves < 2) {
6424 local_pv_free = &pmap_get_cpu_data()->pv_free;
6425 pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6426 if (pv_status == PV_ALLOC_FAIL) {
6427 break;
6428 } else if (pv_status == PV_ALLOC_RETRY) {
6429 /*
6430 * In the case that pv_alloc() had to grab a new page of PVEs,
6431 * it will have dropped the pmap lock while doing so.
6432 * On non-PPL devices, dropping the lock re-enables preemption so we may
6433 * be on a different CPU now.
6434 */
6435 local_pv_free = &pmap_get_cpu_data()->pv_free;
6436 } else {
6437 /* If we've gotten this far then a node should've been allocated. */
6438 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6439
6440 new_allocated_pves++;
6441 }
6442 }
6443
6444 for (int i = 0; i < new_allocated_pves; i++) {
6445 pv_free(new_pve_p[i]);
6446 }
6447 }
6448
6449 if (pv_status == PV_ALLOC_FAIL) {
6450 pvh_unlock(pai);
6451 kr = KERN_RESOURCE_SHORTAGE;
6452 break;
6453 } else if (pv_status == PV_ALLOC_RETRY) {
6454 pvh_unlock(pai);
6455 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6456 spte = os_atomic_load(pte_p, relaxed);
6457 continue;
6458 }
6459
6460 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6461 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6462 } else {
6463 wimg_bits = pmap_cache_attributes(pn);
6464 }
6465
6466 /* We may be retrying this operation after dropping the PVH lock.
6467 * Cache attributes for the physical page may have changed while the lock
6468 * was dropped, so clear any cache attributes we may have previously set
6469 * in the PTE template. */
6470 pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6471 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6472
6473 #if XNU_MONITOR
6474 /* The regular old kernel is not allowed to remap PPL pages. */
6475 if (__improbable(ppattr_pa_test_monitor(pa))) {
6476 panic("%s: page belongs to PPL, "
6477 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6478 __FUNCTION__,
6479 pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6480 }
6481
6482 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6483 panic("%s: page locked down, "
6484 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6485 __FUNCTION__,
6486 pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6487 }
6488 #endif
6489
6490
6491
6492 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6493 if (!committed) {
6494 pvh_unlock(pai);
6495 continue;
6496 }
6497 had_valid_mapping = pte_is_valid(spte);
6498 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6499
6500 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6501 /*
6502 * If there was already a valid pte here then we reuse its reference
6503 * on the ptd and drop the one that we took above.
6504 */
6505 drop_refcnt = had_valid_mapping;
6506
6507 if (!had_valid_mapping) {
6508 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6509 int pve_ptep_idx = 0;
6510 pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6511 /* We did all the allocations up top. So this shouldn't be able to fail. */
6512 if (pv_status != PV_ALLOC_SUCCESS) {
6513 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6514 __func__, pv_status, new_pve_p, pmap);
6515 }
6516
6517 if (pmap != kernel_pmap) {
6518 if (options & PMAP_OPTIONS_INTERNAL) {
6519 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6520 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6521 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6522 /*
6523 * Make a note to ourselves that this
6524 * mapping is using alternative
6525 * accounting. We'll need this in order
6526 * to know which ledger to debit when
6527 * the mapping is removed.
6528 *
6529 * The altacct bit must be set while
6530 * the pv head is locked. Defer the
6531 * ledger accounting until after we've
6532 * dropped the lock.
6533 */
6534 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6535 is_altacct = TRUE;
6536 }
6537 }
6538 if (ppattr_test_reusable(pai) &&
6539 !is_altacct) {
6540 is_reusable = TRUE;
6541 } else if (options & PMAP_OPTIONS_INTERNAL) {
6542 is_internal = TRUE;
6543 } else {
6544 is_external = TRUE;
6545 }
6546 }
6547 }
6548
6549 pvh_unlock(pai);
6550
6551 if (pp_attr_bits != 0) {
6552 ppattr_pa_set_bits(pa, pp_attr_bits);
6553 }
6554
6555 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6556 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6557
6558 if (is_internal) {
6559 /*
6560 * Make corresponding adjustments to
6561 * phys_footprint statistics.
6562 */
6563 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6564 if (is_altacct) {
6565 /*
6566 * If this page is internal and
6567 * in an IOKit region, credit
6568 * the task's total count of
6569 * dirty, internal IOKit pages.
6570 * It should *not* count towards
6571 * the task's total physical
6572 * memory footprint, because
6573 * this entire region was
6574 * already billed to the task
6575 * at the time the mapping was
6576 * created.
6577 *
6578 * Put another way, this is
6579 * internal++ and
6580 * alternate_accounting++, so
6581 * net effect on phys_footprint
6582 * is 0. That means: don't
6583 * touch phys_footprint here.
6584 */
6585 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6586 } else {
6587 if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6588 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6589 skip_footprint_debit = true;
6590 } else {
6591 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6592 }
6593 }
6594 }
6595 if (is_reusable) {
6596 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6597 } else if (is_external) {
6598 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6599 }
6600 }
6601 } else {
6602 if (prot & VM_PROT_EXECUTE) {
6603 kr = KERN_FAILURE;
6604 break;
6605 }
6606
6607 wimg_bits = pmap_cache_attributes(pn);
6608 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6609 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6610 }
6611
6612 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6613
6614 #if XNU_MONITOR
6615 pte = pmap_construct_io_pte(pa, pte);
6616
6617 /**
6618 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6619 * but in support of hibernation we allow temporary read-only mappings of these pages to be
 * created and later removed. We must therefore prevent an attacker from downgrading
 * a writable mapping in order to allow it to be removed and remapped to something else.
6622 */
6623 if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6624 pte_is_valid(spte) &&
6625 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6626 (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6627 panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6628 __func__, (uint64_t)pte_to_pa(spte));
6629 }
6630 #endif
6631
6632 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6633 if (committed) {
6634 had_valid_mapping = pte_is_valid(spte);
6635 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6636
6637 /**
6638 * If there was already a valid pte here then we reuse its
6639 * reference on the ptd and drop the one that we took above.
6640 */
6641 drop_refcnt = had_valid_mapping;
6642 }
6643 }
6644 if (committed) {
6645 if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6646 assert(pmap != kernel_pmap);
6647
6648 /* One less "compressed" */
6649 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6650 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6651
6652 if (spte & ARM_PTE_COMPRESSED_ALT) {
6653 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6654 } else if (!skip_footprint_debit) {
6655 /* Was part of the footprint */
6656 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6657 }
6658 /* The old entry held a reference so drop the extra one that we took above. */
6659 drop_refcnt = true;
6660 }
6661 }
6662 }
6663
6664 if (drop_refcnt && refcnt != NULL) {
6665 assert(refcnt_updated);
6666 if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6667 panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6668 }
6669 }
6670
6671 if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6672 panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6673 }
6674
6675 pmap_unlock(pmap, lock_mode);
6676
6677 if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6678 pmap_phys_write_disable(v);
6679 }
6680
6681 return kr;
6682 }
6683
/*
 * Enter a mapping at VA 'v' for physical address 'pa', retrying the
 * underlying enter operation while it reports transient failure
 * (KERN_RESOURCE_SHORTAGE from allocation, or KERN_ABORTED from a
 * preemptible lock attempt).
 *
 * Returns the result of the final internal/PPL call.
 */
kern_return_t
pmap_enter_options_addr(
    pmap_t pmap,
    vm_map_address_t v,
    pmap_paddr_t pa,
    vm_prot_t prot,
    vm_prot_t fault_type,
    unsigned int flags,
    boolean_t wired,
    unsigned int options,
    __unused void *arg,
    __unused pmap_mapping_type_t mapping_type)
{
    kern_return_t kr = KERN_FAILURE;


    PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
        VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


    const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
    do {
#if XNU_MONITOR
        /*
         * Always pass PMAP_OPTIONS_NOWAIT into the PPL: the PPL cannot block
         * for page allocation, so replenishment is done below in the kernel.
         */
        kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
        kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

        if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
            /* Feed the PPL page free list, then retry (unless caller said don't wait). */
            pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
            if (nowait_requested) {
                break;
            }
        }
        /* KERN_ABORTED: a preemptible lock attempt was abandoned; retry. */
    } while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
    pmap_ledger_check_balance(pmap);
#endif

    PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

    return kr;
}
6730
6731 kern_return_t
6732 pmap_enter_options(
6733 pmap_t pmap,
6734 vm_map_address_t v,
6735 ppnum_t pn,
6736 vm_prot_t prot,
6737 vm_prot_t fault_type,
6738 unsigned int flags,
6739 boolean_t wired,
6740 unsigned int options,
6741 __unused void *arg,
6742 pmap_mapping_type_t mapping_type)
6743 {
6744 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6745 }
6746
6747 /*
6748 * Routine: pmap_change_wiring
6749 * Function: Change the wiring attribute for a map/virtual-address
6750 * pair.
6751 * In/out conditions:
6752 * The mapping must already exist in the pmap.
6753 */
6754 MARK_AS_PMAP_TEXT kern_return_t
6755 pmap_change_wiring_internal(
6756 pmap_t pmap,
6757 vm_map_address_t v,
6758 boolean_t wired)
6759 {
6760 pt_entry_t *pte_p;
6761 pmap_paddr_t pa;
6762
6763 validate_pmap_mutable(pmap);
6764
6765 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6766 return KERN_ABORTED;
6767 }
6768
6769 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6770
6771 pte_p = pmap_pte(pmap, v);
6772 if (pte_p == PT_ENTRY_NULL) {
6773 if (!wired) {
6774 /*
6775 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6776 * may have been freed by a remove operation.
6777 */
6778 goto pmap_change_wiring_return;
6779 } else {
6780 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6781 }
6782 }
6783 /*
6784 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6785 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6786 */
6787 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6788
6789 while (pa_valid(pa)) {
6790 pmap_paddr_t new_pa;
6791
6792 pvh_lock(pa_index(pa));
6793 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6794
6795 if (pa == new_pa) {
6796 break;
6797 }
6798
6799 pvh_unlock(pa_index(pa));
6800 pa = new_pa;
6801 }
6802
6803 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6804 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6805 if (!wired) {
6806 /* PTE cleared by prior remove/disconnect operation */
6807 goto pmap_change_wiring_cleanup;
6808 } else {
6809 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6810 __func__, pte_p, (uint64_t)*pte_p, pmap);
6811 }
6812 }
6813
6814 assertf(pte_is_valid(*pte_p), "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6815 if (wired != pte_is_wired(*pte_p)) {
6816 pte_set_wired(pmap, pte_p, wired);
6817 if (pmap != kernel_pmap) {
6818 if (wired) {
6819 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6820 } else if (!wired) {
6821 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6822 }
6823 }
6824 }
6825
6826 pmap_change_wiring_cleanup:
6827 if (pa_valid(pa)) {
6828 pvh_unlock(pa_index(pa));
6829 }
6830
6831 pmap_change_wiring_return:
6832 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6833
6834 return KERN_SUCCESS;
6835 }
6836
/*
 * Public entry point to change the wiring attribute of the mapping at 'v'.
 * The mapping must already exist. Panics on any failure from the helper, so
 * this routine itself cannot fail to the caller.
 */
void
pmap_change_wiring(
    pmap_t pmap,
    vm_map_address_t v,
    boolean_t wired)
{
    /* This function is going to lock the pmap lock, so it'd better be preemptible. */
    pmap_verify_preemptible();

    kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
    /* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
    do {
        kr = pmap_change_wiring_ppl(pmap, v, wired);
    } while (kr == KERN_ABORTED);

    pmap_ledger_check_balance(pmap);
#else
    /* Since we verified preemptibility, call the helper only once. */
    kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

    if (kr != KERN_SUCCESS) {
        panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
            __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
    }
}
6864
6865 MARK_AS_PMAP_TEXT pmap_paddr_t
6866 pmap_find_pa_internal(
6867 pmap_t pmap,
6868 addr64_t va)
6869 {
6870 pmap_paddr_t pa = 0;
6871
6872 validate_pmap(pmap);
6873
6874 if (pmap != kernel_pmap) {
6875 pmap_lock(pmap, PMAP_LOCK_SHARED);
6876 }
6877
6878 pa = pmap_vtophys(pmap, va);
6879
6880 if (pmap != kernel_pmap) {
6881 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6882 }
6883
6884 return pa;
6885 }
6886
6887 pmap_paddr_t
6888 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6889 {
6890 pmap_paddr_t pa = 0;
6891
6892 if (pmap == kernel_pmap) {
6893 pa = mmu_kvtop(va);
6894 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6895 /*
6896 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6897 * translation even if PAN would prevent kernel access through the translation.
6898 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6899 */
6900 pa = mmu_uvtop(va);
6901 }
6902 return pa;
6903 }
6904
/*
 * Translate 'va' to a physical address: first via the MMU (fast, faultless),
 * then by walking the page tables. Outside the kernel debugger the walk is
 * done with proper locking via the internal/PPL helper; inside kdp no locks
 * may be taken, so walk directly. Returns 0 if no valid mapping exists.
 */
pmap_paddr_t
pmap_find_pa(
    pmap_t pmap,
    addr64_t va)
{
    pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);

    if (pa != 0) {
        return pa;
    }

    if (not_in_kdp) {
#if XNU_MONITOR
        return pmap_find_pa_ppl(pmap, va);
#else
        return pmap_find_pa_internal(pmap, va);
#endif
    } else {
        return pmap_vtophys(pmap, va);
    }
}
6926
6927 ppnum_t
6928 pmap_find_phys_nofault(
6929 pmap_t pmap,
6930 addr64_t va)
6931 {
6932 ppnum_t ppn;
6933 ppn = atop(pmap_find_pa_nofault(pmap, va));
6934 return ppn;
6935 }
6936
6937 ppnum_t
6938 pmap_find_phys(
6939 pmap_t pmap,
6940 addr64_t va)
6941 {
6942 ppnum_t ppn;
6943 ppn = atop(pmap_find_pa(pmap, va));
6944 return ppn;
6945 }
6946
6947 /**
6948 * Translate a kernel virtual address into a physical address.
6949 *
6950 * @param va The kernel virtual address to translate. Does not work on user
6951 * virtual addresses.
6952 *
6953 * @return The physical address if the translation was successful, or zero if
6954 * no valid mappings were found for the given virtual address.
6955 */
6956 pmap_paddr_t
6957 kvtophys(vm_offset_t va)
6958 {
6959 /**
6960 * Attempt to do the translation first in hardware using the AT (address
6961 * translation) instruction. This will attempt to use the MMU to do the
6962 * translation for us.
6963 */
6964 pmap_paddr_t pa = mmu_kvtop(va);
6965
6966 if (pa) {
6967 return pa;
6968 }
6969
6970 /* If the MMU can't find the mapping, then manually walk the page tables. */
6971 return pmap_vtophys(kernel_pmap, va);
6972 }
6973
6974 /**
6975 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6976 * points to a non-kernel-managed physical page, then this call will panic().
6977 *
6978 * @note The output of this function is guaranteed to be a kernel-managed
6979 * physical page, which means it's safe to pass the output directly to
6980 * pa_index() to create a physical address index for various pmap data
6981 * structures.
6982 *
6983 * @param va The kernel virtual address to translate. Does not work on user
6984 * virtual addresses.
6985 *
6986 * @return The translated physical address for the given virtual address.
6987 */
6988 pmap_paddr_t
6989 kvtophys_nofail(vm_offset_t va)
6990 {
6991 pmap_paddr_t pa = kvtophys(va);
6992
6993 if (!pa_valid(pa)) {
6994 panic("%s: Invalid or non-kernel-managed physical page returned, "
6995 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6996 }
6997
6998 return pa;
6999 }
7000
7001 pmap_paddr_t
7002 pmap_vtophys(
7003 pmap_t pmap,
7004 addr64_t va)
7005 {
7006 if ((va < pmap->min) || (va >= pmap->max)) {
7007 return 0;
7008 }
7009
7010 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7011
7012 tt_entry_t * ttp = NULL;
7013 tt_entry_t * ttep = NULL;
7014 tt_entry_t tte = ARM_TTE_EMPTY;
7015 pmap_paddr_t pa = 0;
7016 unsigned int cur_level;
7017
7018 ttp = pmap->tte;
7019
7020 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
7021 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
7022
7023 tte = *ttep;
7024
7025 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
7026 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
7027 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
7028 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
7029
7030 if ((tte & valid_mask) != valid_mask) {
7031 return (pmap_paddr_t) 0;
7032 }
7033
7034 /* This detects both leaf entries and intermediate block mappings. */
7035 if ((tte & type_mask) == type_block) {
7036 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
7037 break;
7038 }
7039
7040 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
7041 }
7042
7043 return pa;
7044 }
7045
/*
 * pmap_init_pte_page - Initialize a page table page.
 *
 * Associates the physical page backing 'pte_p' with a page table descriptor
 * (PTD) — either allocating a fresh one (alloc_ptd, boot-time path) or
 * reusing the one already recorded in the page's PV head — and records the
 * owning pmap, mapped VA, and table level in that descriptor.
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
    pmap_t pmap,
    pt_entry_t *pte_p,
    vm_offset_t va,
    unsigned int ttlevel,
    boolean_t alloc_ptd)
{
    pt_desc_t *ptdp = NULL;
    /* PV head entry for the physical page backing this page table. */
    pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

    if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
        if (alloc_ptd) {
            /*
             * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
             * on 4KB hardware, we may already have allocated a page table descriptor for a
             * bootstrap request, so we check for an existing PTD here.
             */
            ptdp = ptd_alloc(pmap);
            if (ptdp == NULL) {
                panic("%s: unable to allocate PTD", __func__);
            }
            pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
            /* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
            pvh_set_flags(pvh, 0);
        } else {
            panic("pmap_init_pte_page(): pte_p %p", pte_p);
        }
    } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
        /* The page already serves as a page table; reuse its descriptor. */
        ptdp = pvh_ptd(pvh);
    } else {
        panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
    }

    // below barrier ensures previous updates to the page are visible to PTW before
    // it is linked to the PTE of previous level
    __builtin_arm_dmb(DMB_ISHST);
    ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
7088
/*
 * Routine: pmap_expand
 *
 * Expands a pmap to be able to map the specified virtual address.
 *
 * Allocates new memory for the default (COARSE) translation table
 * entry, initializes all the pte entries to ARM_PTE_EMPTY and
 * also allocates space for the corresponding pv entries.
 *
 * Nothing should be locked.
 *
 * Returns KERN_SUCCESS once tables exist down to 'level' for 'v';
 * KERN_INVALID_ADDRESS if 'v' is outside the pmap's range; KERN_ABORTED if a
 * preemptible lock attempt was abandoned; or an allocation error when
 * PMAP_OPTIONS_NOWAIT is set.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
    pmap_t pmap,
    vm_map_address_t v,
    unsigned int options,
    unsigned int level)
{
    __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

    if (__improbable((v < pmap->min) || (v >= pmap->max))) {
        return KERN_INVALID_ADDRESS;
    }
    pmap_paddr_t pa;
    unsigned int ttlevel = pt_attr_root_level(pt_attr);
    tt_entry_t *tte_p;
    tt_entry_t *tt_p;

    pa = 0x0ULL;
    tt_p = (tt_entry_t *)NULL;

    /* Descend one table level per iteration until tables exist at 'level'. */
    for (; ttlevel < level; ttlevel++) {
        if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
            return KERN_ABORTED;
        }

        if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
            /* Drop the lock while allocating; another thread may race us to install a table. */
            pmap_unlock(pmap, PMAP_LOCK_SHARED);
            kern_return_t ret;
            while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
                if (options & PMAP_OPTIONS_NOWAIT) {
                    /* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
                    return ret;
                }
#if XNU_MONITOR
                panic("%s: failed to allocate tt, "
                    "pmap=%p, v=%p, options=0x%x, level=%u",
                    __FUNCTION__,
                    pmap, (void *)v, options, level);
#else
                VM_PAGE_WAIT();
#endif
            }

            if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
                pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
                return KERN_ABORTED;
            }

            /* Re-check under the exclusive lock: install only if still missing. */
            if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
                pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
                pa = kvtophys_nofail((vm_offset_t)tt_p);
                tte_p = pmap_ttne(pmap, ttlevel, v);
                *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
                PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
                    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
                /* Ownership of tt_p transferred to the table hierarchy. */
                pa = 0x0ULL;
                tt_p = (tt_entry_t *)NULL;
            }
            pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
        } else {
            pmap_unlock(pmap, PMAP_LOCK_SHARED);
        }

        /* If we lost the race, free the table we allocated but did not install. */
        if (tt_p != (tt_entry_t *)NULL) {
            pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
            tt_p = (tt_entry_t *)NULL;
        }
    }

    return KERN_SUCCESS;
}
7171
/*
 * Routine: pmap_gc
 * Function:
 *	Pmap garbage collection
 *	Called by the pageout daemon when pages are scarce.
 */
void
pmap_gc(void)
{
    /*
     * TODO: as far as I can tell this has never been implemented to do anything meaningful.
     * We can't just destroy any old pmap on the chance that it may be active on a CPU
     * or may contain wired mappings. However, with the relatively recent change to
     * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
     * page, it may make sense to call that function here.
     */
}
7190
/*
 * By default, don't attempt pmap GC more frequently
 * than once per minute.
 */
7195
7196 void
7197 compute_pmap_gc_throttle(
7198 void *arg __unused)
7199 {
7200 }
7201
/*
 * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
 *
 * Invalidates all of the instruction cache on a physical page and
 * pushes any dirty data from the data cache for the same physical page
 */
7208
7209 kern_return_t
7210 pmap_attribute_cache_sync(
7211 ppnum_t pp,
7212 vm_size_t size,
7213 __unused vm_machine_attribute_t attribute,
7214 __unused vm_machine_attribute_val_t * value)
7215 {
7216 if (size > PAGE_SIZE) {
7217 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7218 } else {
7219 cache_sync_page(pp);
7220 }
7221
7222 return KERN_SUCCESS;
7223 }
7224
7225 /*
7226 * pmap_sync_page_data_phys(ppnum_t pp)
7227 *
7228 * Invalidates all of the instruction cache on a physical page and
7229 * pushes any dirty data from the data cache for the same physical page
7230 */
7231 void
7232 pmap_sync_page_data_phys(
7233 ppnum_t pp)
7234 {
7235 cache_sync_page(pp);
7236 }
7237
7238 /*
7239 * pmap_sync_page_attributes_phys(ppnum_t pp)
7240 *
7241 * Write back and invalidate all cachelines on a physical page.
7242 */
7243 void
7244 pmap_sync_page_attributes_phys(
7245 ppnum_t pp)
7246 {
7247 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7248 }
7249
7250 #if CONFIG_COREDUMP
7251 /* temporary workaround */
7252 boolean_t
7253 coredumpok(
7254 vm_map_t map,
7255 mach_vm_offset_t va)
7256 {
7257 pt_entry_t *pte_p;
7258 pt_entry_t spte;
7259
7260 pte_p = pmap_pte(map->pmap, va);
7261 if (0 == pte_p) {
7262 return FALSE;
7263 }
7264 if (vm_map_entry_has_device_pager(map, va)) {
7265 return FALSE;
7266 }
7267 spte = *pte_p;
7268 return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
7269 }
7270 #endif
7271
7272 void
7273 fillPage(
7274 ppnum_t pn,
7275 unsigned int fill)
7276 {
7277 unsigned int *addr;
7278 int count;
7279
7280 addr = (unsigned int *) phystokv(ptoa(pn));
7281 count = PAGE_SIZE / sizeof(unsigned int);
7282 while (count--) {
7283 *addr++ = fill;
7284 }
7285 }
7286
7287 extern void mapping_set_mod(ppnum_t pn);
7288
7289 void
7290 mapping_set_mod(
7291 ppnum_t pn)
7292 {
7293 pmap_set_modify(pn);
7294 }
7295
7296 extern void mapping_set_ref(ppnum_t pn);
7297
7298 void
7299 mapping_set_ref(
7300 ppnum_t pn)
7301 {
7302 pmap_set_reference(pn);
7303 }
7304
/*
 * Clear specified attribute bits.
 *
 * Try to force an arm_fast_fault() for all mappings of
 * the page - to force attributes to be set again at fault time.
 * If the forcing succeeds, clear the cached bits at the head.
 * Otherwise, something must have been wired, so leave the cached
 * attributes alone.
 *
 * pn           physical page number whose attributes are to be cleared
 * bits         PP_ATTR_* bits to clear
 * options      PMAP_OPTIONS_* flags controlling flush/fault behavior
 * arg          non-NULL means the caller will handle TLB flushing itself
 * flush_range  optional batched-TLB-flush state shared with the caller
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
    ppnum_t pn,
    unsigned int bits,
    int options,
    void *arg,
    pmap_tlb_flush_range_t *flush_range)
{
    pmap_paddr_t pa = ptoa(pn);
    vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
    /* The kernel proper may never clear PPL-owned attribute bits. */
    if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
        panic("%s: illegal request, "
            "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
            __FUNCTION__,
            pn, bits, options, arg, flush_range);
    }
#endif
    /* A non-NULL arg or flush_range means the caller takes care of TLB flushing. */
    if ((arg != NULL) || (flush_range != NULL)) {
        options = options & ~PMAP_OPTIONS_NOFLUSH;
    }

    if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
        panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
            "invalid options",
            pn, bits, options, arg, flush_range);
    }

    if (__improbable((bits & PP_ATTR_MODIFIED) &&
        (options & PMAP_OPTIONS_NOFLUSH))) {
        panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
            "should not clear 'modified' without flushing TLBs",
            pn, bits, options, arg, flush_range);
    }

    assert(pn != vm_page_fictitious_addr);

    if (options & PMAP_OPTIONS_CLEAR_WRITE) {
        assert(bits == PP_ATTR_MODIFIED);

        pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
        /*
         * We short circuit this case; it should not need to
         * invoke arm_force_fast_fault, so just clear the modified bit.
         * pmap_page_protect has taken care of resetting
         * the state so that we'll see the next write as a fault to
         * the VM (i.e. we don't want a fast fault).
         */
        ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
        return;
    }
    /* Revoke the access modes whose use would need to re-set the bits being cleared. */
    if (bits & PP_ATTR_REFERENCED) {
        allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
    }
    if (bits & PP_ATTR_MODIFIED) {
        allow_mode &= ~VM_PROT_WRITE;
    }

    if (bits == PP_ATTR_NOENCRYPT) {
        /*
         * We short circuit this case; it should not need to
         * invoke arm_force_fast_fault, so just clear and
         * return. On ARM, this bit is just a debugging aid.
         */
        ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
        return;
    }

    /* Clear the cached bits only if every mapping could be demoted to fault. */
    if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
        ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
    }
}
7387
7388 MARK_AS_PMAP_TEXT void
7389 phys_attribute_clear_internal(
7390 ppnum_t pn,
7391 unsigned int bits,
7392 int options,
7393 void *arg)
7394 {
7395 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7396 }
7397
7398 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped by a single twig-level
 * (next-to-leaf) page table within [start, end).
 *
 * Returns the VA at which processing stopped: 'end' on completion, or an
 * earlier address if preemption became pending mid-scan, so the caller can
 * resume from there.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
    pmap_t pmap,
    vm_map_address_t start,
    vm_map_address_t end,
    unsigned int bits,
    unsigned int options,
    pmap_tlb_flush_range_t *flush_range)
{
    pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
    const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
    assert(end >= start);
    /* The caller must not hand us a range spanning more than one twig table. */
    assert((end - start) <= pt_attr_twig_size(pt_attr));
    const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
    vm_map_address_t va = start;
    pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
    tt_entry_t *tte_p;
    tte_p = pmap_tte(pmap, start);
    unsigned int npages = 0;

    if (tte_p == (tt_entry_t *) NULL) {
        return end;
    }

    if (tte_is_valid_table(*tte_p)) {
        pte_p = (pt_entry_t *) ttetokv(*tte_p);

        start_pte_p = &pte_p[pte_index(pt_attr, start)];
        end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
        assert(end_pte_p >= start_pte_p);
        for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
            /* Yield to pending preemption, but always process at least one page. */
            if (__improbable(npages++ && pmap_pending_preemption())) {
                return va;
            }
            /* Volatile load: PTEs may be modified concurrently under the shared lock. */
            pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
            if (pa_valid(pa)) {
                ppnum_t pn = (ppnum_t) atop(pa);
                phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
            }
        }
    }
    return end;
}
7442
/*
 * Clear attribute bits for all managed pages mapped in [start, end) of
 * 'pmap', batching TLB invalidations into one ranged flush at the end.
 *
 * Returns the VA at which processing stopped; the caller loops until it
 * reaches 'end' (the walk yields early on pending preemption or when the
 * preemptible lock attempt is abandoned).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
    pmap_t pmap,
    vm_map_address_t start,
    vm_map_address_t end,
    unsigned int bits,
    unsigned int options)
{
    if (__improbable(end < start)) {
        panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
    }
    validate_pmap_mutable(pmap);

    vm_map_address_t va = start;
    pmap_tlb_flush_range_t flush_range = {
        .ptfr_pmap = pmap,
        .ptfr_start = start,
        .ptfr_end = end,
        .ptfr_flush_needed = false
    };

    if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
        /* Lock attempt aborted by pending preemption; caller retries from 'va'. */
        return va;
    }

    const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

    /* Process one twig-table-aligned chunk per iteration. */
    while (va < end) {
        vm_map_address_t curr_end;

        curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
        if (curr_end > end) {
            curr_end = end;
        }

        va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
        if ((va < curr_end) || pmap_pending_preemption()) {
            break;
        }
    }
    pmap_unlock(pmap, PMAP_LOCK_SHARED);
    /* Issue the single batched TLB flush for everything modified above. */
    if (flush_range.ptfr_flush_needed) {
        pmap_get_pt_ops(pmap)->flush_tlb_region_async(
            flush_range.ptfr_start,
            flush_range.ptfr_end - flush_range.ptfr_start,
            flush_range.ptfr_pmap,
            true,
            false);
        sync_tlb_flush();
    }
    return va;
}
7495
/*
 * Ranged attribute clear: repeatedly invokes the internal/PPL helper until
 * the whole [start, end) range has been processed (the helper may return
 * early when preemption is pending).
 */
static void
phys_attribute_clear_range(
    pmap_t pmap,
    vm_map_address_t start,
    vm_map_address_t end,
    unsigned int bits,
    unsigned int options)
{
    /*
     * We allow single-page requests to execute non-preemptibly,
     * as it doesn't make sense to sample AST_URGENT for a single-page
     * operation, and there are a couple of special use cases that
     * require a non-preemptible single-page operation.
     */
    if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
        pmap_verify_preemptible();
    }

    PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

    while (start < end) {
#if XNU_MONITOR
        start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
        start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
    }

    PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7526 #endif /* __ARM_RANGE_TLBI__ */
7527
/*
 * Clear the given PP_ATTR_* bits for a single physical page, dispatching to
 * the PPL on monitor-enabled systems.
 */
static void
phys_attribute_clear(
    ppnum_t pn,
    unsigned int bits,
    int options,
    void *arg)
{
    /*
     * Do we really want this tracepoint? It will be extremely chatty.
     * Also, should we have a corresponding trace point for the set path?
     */
    PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
    phys_attribute_clear_ppl(pn, bits, options, arg);
#else
    phys_attribute_clear_internal(pn, bits, options, arg);
#endif

    PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7549
7550 /*
7551 * Set specified attribute bits.
7552 *
7553 * Set cached value in the pv head because we have
7554 * no per-mapping hardware support for referenced and
7555 * modify bits.
7556 */
7557 MARK_AS_PMAP_TEXT void
7558 phys_attribute_set_internal(
7559 ppnum_t pn,
7560 unsigned int bits)
7561 {
7562 pmap_paddr_t pa = ptoa(pn);
7563 assert(pn != vm_page_fictitious_addr);
7564
7565 #if XNU_MONITOR
7566 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7567 panic("%s: illegal request, "
7568 "pn=%u, bits=%#x",
7569 __FUNCTION__,
7570 pn, bits);
7571 }
7572 #endif
7573
7574 ppattr_pa_set_bits(pa, (uint16_t)bits);
7575
7576 return;
7577 }
7578
/*
 * Set the given PP_ATTR_* bits for a physical page, dispatching to the PPL
 * on monitor-enabled systems.
 */
static void
phys_attribute_set(
    ppnum_t pn,
    unsigned int bits)
{
#if XNU_MONITOR
    phys_attribute_set_ppl(pn, bits);
#else
    phys_attribute_set_internal(pn, bits);
#endif
}
7590
7591
7592 /*
7593 * Check specified attribute bits.
7594 *
7595 * use the software cached bits (since no hw support).
7596 */
7597 static boolean_t
7598 phys_attribute_test(
7599 ppnum_t pn,
7600 unsigned int bits)
7601 {
7602 pmap_paddr_t pa = ptoa(pn);
7603 assert(pn != vm_page_fictitious_addr);
7604 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7605 }
7606
7607
7608 /*
7609 * Set the modify/reference bits on the specified physical page.
7610 */
void
pmap_set_modify(ppnum_t pn)
{
	/* "Modified" state is tracked in software via the pp_attr table. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7616
7617
7618 /*
7619 * Clear the modify bits on the specified physical page.
7620 */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Simple path: no PMAP_OPTIONS_* flags and no per-call argument. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7627
7628
7629 /*
7630 * pmap_is_modified:
7631 *
7632 * Return whether or not the specified physical page is modified
7633 * by any physical maps.
7634 */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Consult the software-cached attribute bits (no per-mapping HW support). */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7641
7642
7643 /*
7644 * Set the reference bit on the specified physical page.
7645 */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* "Referenced" state is tracked in software via the pp_attr table. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7652
7653 /*
7654 * Clear the reference bits on the specified physical page.
7655 */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Simple path: no PMAP_OPTIONS_* flags and no per-call argument. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7662
7663
7664 /*
7665 * pmap_is_referenced:
7666 *
7667 * Return whether or not the specified physical page is referenced
7668 * by any physical maps.
7669 */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Consult the software-cached attribute bits (no per-mapping HW support). */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7676
7677 /*
7678 * pmap_get_refmod(phys)
7679 * returns the referenced and modified bits of the specified
7680 * physical page.
7681 */
7682 unsigned int
7683 pmap_get_refmod(
7684 ppnum_t pn)
7685 {
7686 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7687 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7688 }
7689
7690 static inline unsigned int
7691 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7692 {
7693 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7694 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7695 }
7696
7697 /*
7698 * pmap_clear_refmod(phys, mask)
7699 * clears the referenced and modified bits as specified by the mask
7700 * of the specified physical page.
7701 */
7702 void
7703 pmap_clear_refmod_options(
7704 ppnum_t pn,
7705 unsigned int mask,
7706 unsigned int options,
7707 void *arg)
7708 {
7709 unsigned int bits;
7710
7711 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7712 phys_attribute_clear(pn, bits, options, arg);
7713 }
7714
7715 /*
7716 * Perform pmap_clear_refmod_options on a virtual address range.
7717 * The operation will be performed in bulk & tlb flushes will be coalesced
7718 * if possible.
7719 *
7720 * Returns true if the operation is supported on this platform.
7721 * If this function returns false, the operation is not supported and
7722 * nothing has been modified in the pmap.
7723 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is a large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7751
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Default path: no PMAP_OPTIONS_* flags and no per-call argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7759
7760 unsigned int
7761 pmap_disconnect_options(
7762 ppnum_t pn,
7763 unsigned int options,
7764 void *arg)
7765 {
7766 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7767 /*
7768 * On ARM, the "modified" bit is managed by software, so
7769 * we know up-front if the physical page is "modified",
7770 * without having to scan all the PTEs pointing to it.
7771 * The caller should have made the VM page "busy" so noone
7772 * should be able to establish any new mapping and "modify"
7773 * the page behind us.
7774 */
7775 if (pmap_is_modified(pn)) {
7776 /*
7777 * The page has been modified and will be sent to
7778 * the VM compressor.
7779 */
7780 options |= PMAP_OPTIONS_COMPRESSOR;
7781 } else {
7782 /*
7783 * The page hasn't been modified and will be freed
7784 * instead of compressed.
7785 */
7786 }
7787 }
7788
7789 /* disconnect the page */
7790 pmap_page_protect_options(pn, 0, options, arg);
7791
7792 /* return ref/chg status */
7793 return pmap_get_refmod(pn);
7794 }
7795
7796 /*
7797 * Routine:
7798 * pmap_disconnect
7799 *
7800 * Function:
7801 * Disconnect all mappings for this page and return reference and change status
7802 * in generic format.
7803 *
7804 */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	/* Remove every mapping of the page, then report its accumulated ref/mod state. */
	pmap_page_protect(pn, 0); /* disconnect the page */
	return pmap_get_refmod(pn); /* return ref/chg status */
}
7812
7813 boolean_t
7814 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7815 {
7816 if (ptoa(first) >= vm_last_phys) {
7817 return FALSE;
7818 }
7819 if (ptoa(last) < vm_first_phys) {
7820 return FALSE;
7821 }
7822
7823 return TRUE;
7824 }
7825
7826 /*
7827 * The state maintained by the noencrypt functions is used as a
7828 * debugging aid on ARM. This incurs some overhead on the part
7829 * of the caller. A special case check in phys_attribute_clear
7830 * (the most expensive path) currently minimizes this overhead,
7831 * but stubbing these functions out on RELEASE kernels yields
7832 * further wins.
7833 */
7834 boolean_t
7835 pmap_is_noencrypt(
7836 ppnum_t pn)
7837 {
7838 #if DEVELOPMENT || DEBUG
7839 boolean_t result = FALSE;
7840
7841 if (!pa_valid(ptoa(pn))) {
7842 return FALSE;
7843 }
7844
7845 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7846
7847 return result;
7848 #else
7849 #pragma unused(pn)
7850 return FALSE;
7851 #endif
7852 }
7853
7854 void
7855 pmap_set_noencrypt(
7856 ppnum_t pn)
7857 {
7858 #if DEVELOPMENT || DEBUG
7859 if (!pa_valid(ptoa(pn))) {
7860 return;
7861 }
7862
7863 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7864 #else
7865 #pragma unused(pn)
7866 #endif
7867 }
7868
7869 void
7870 pmap_clear_noencrypt(
7871 ppnum_t pn)
7872 {
7873 #if DEVELOPMENT || DEBUG
7874 if (!pa_valid(ptoa(pn))) {
7875 return;
7876 }
7877
7878 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7879 #else
7880 #pragma unused(pn)
7881 #endif
7882 }
7883
#if XNU_MONITOR
/*
 * Return whether managed page [pn] is owned by the PPL (monitor).
 * The page must be managed; unmanaged pages trip the assert.
 */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7892
/*
 * Take an exclusive per-page lock for physical page [pn].
 *
 * Managed pages are locked via their PV head entry lock; unmanaged pages
 * (and all pages on XNU_MONITOR builds, where the PVH-lock path is compiled
 * out) fall back to the single global phys_backup_lock.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: shared by the dangling "else" above and the PPL build. */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7909
7910
/*
 * Release the per-page lock taken by pmap_lock_phys_page(); the lock chosen
 * here (PVH lock vs. global phys_backup_lock) mirrors the lock selection in
 * the lock path.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: shared by the dangling "else" above and the PPL build. */
	{ simple_unlock(&phys_backup_lock);}
}
7927
/*
 * Install the user translation table base (and related per-CPU state) for
 * [pmap] on the current CPU.
 *
 * For user pmaps: caches the nested (shared-region) pmap pointer, attributes,
 * and region bounds in the per-CPU data, reprograms TCR if this pmap's page
 * size differs from the active configuration (__ARM_MIXED_PAGE_SIZE__), then
 * programs the TTB with the pmap's table root and hardware ASID.
 * For the kernel pmap: clears the user TTB instead (if not already clear).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		pmap_t nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap = nested_pmap;
		if (nested_pmap != NULL) {
			cpu_data_ptr->cpu_nested_pmap_attr = pmap_get_pt_attr(nested_pmap);
			/**
			 * Obtain the full shared region bounds from the nested pmap. If the top-level pmap
			 * hasn't been fully nested yet, its bounds may not yet be configured, or may be in the
			 * process of being configured on another core.
			 */
			cpu_data_ptr->cpu_nested_region_addr = nested_pmap->nested_region_addr;
			cpu_data_ptr->cpu_nested_region_size = nested_pmap->nested_region_size;
		}
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Program the table root and ASID together in a single TTBR write. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7963
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	/* Switch the user translation table base to the canonical invalid table. */
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7969
/*
 * Clear the user translation table base on the current CPU, dispatching to
 * the PPL on PPL-enabled (XNU_MONITOR) systems.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7981
7982
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* The marker must not collide with the other SW-managed PTE bits it can coexist with. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7996
7997
/*
 * Downgrade all mappings of [ppnum] so that accesses NOT permitted by
 * [allow_mode] will take a fault, allowing ref/mod state to be regathered
 * later (see arm_fast_fault()).
 *
 * @param ppnum The physical page whose mappings should be downgraded.
 * @param allow_mode Mask of VM_PROT_* accesses that remain allowed without
 *        faulting (read clears AF; write drops write permission).
 * @param options PMAP_OPTIONS_* flags: reusable accounting transitions,
 *        wired-mapping handling, external PVH locking, flush suppression.
 * @param flush_range If non-NULL, TLB invalidation for mappings inside the
 *        range is deferred to the caller (ptfr_flush_needed is set instead).
 *
 * @return FALSE if the page is unmanaged or a wired mapping was skipped;
 *         TRUE otherwise.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0); /* caller may already hold the PVH lock */

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single inline PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon. Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless the caller explicitly opts in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Only mappings outside the caller's flush range need a TLBI from us. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that faults were armed, so arm_fast_fault() knows they were induced. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8311
/*
 * Entry point for arm_force_fast_fault() into the common implementation.
 * Rejects options that callers of this path may not pass before deferring
 * to arm_force_fast_fault_with_flush_range() with no flush range.
 */
MARK_AS_PMAP_TEXT boolean_t
arm_force_fast_fault_internal(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options)
{
	/* These options are not valid for callers entering through this path. */
	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
	}
	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
}
8323
8324 /*
8325 * Routine: arm_force_fast_fault
8326 *
8327 * Function:
8328 * Force all mappings for this page to fault according
8329 * to the access modes allowed, so we can gather ref/modify
8330 * bits again.
8331 */
8332
/*
 * Exported wrapper: reject unmanaged pages cheaply, then dispatch to the PPL
 * (XNU_MONITOR) or in-kernel implementation.
 *
 * @return FALSE if the page is unmanaged or the operation could not be
 *         applied to all mappings; TRUE otherwise.
 */
boolean_t
arm_force_fast_fault(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	__unused void *arg)
{
	pmap_paddr_t phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

#if XNU_MONITOR
	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
#else
	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
#endif
}
8354
8355 /*
8356 * Routine: arm_clear_fast_fault
8357 *
8358 * Function:
8359 * Clear pending force fault for all mappings for this page based on
8360 * the observed fault type, update ref/modify bits.
8361 */
/*
 * Restore permissions on mappings of [ppnum] that were previously downgraded
 * for fault-based ref/mod tracking, updating the page's referenced/modified
 * attributes according to the observed [fault_type].
 *
 * @param ppnum The physical page whose mappings should be restored.
 * @param fault_type The observed access (read and/or write).
 * @param pte_p If non-NULL, operate on this single mapping only instead of
 *        walking the page's full PV list.
 *
 * @return TRUE if at least one PTE was updated, FALSE otherwise.
 *
 * The caller must hold the page's PVH lock (pvh_assert_locked below).
 * PV-list traversal is bounded by PMAP_MAX_PV_LIST_CHUNK_SIZE per call, so a
 * single invocation may not process every mapping (see the targeted retry in
 * arm_fast_fault_internal()).
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no single PTE was supplied, walk the page's PV list instead. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to restore. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore write permission and record the page as referenced+modified. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Restore the access flag and record the page as referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_EMPTY);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the work done with the PVH lock held; the caller may retry. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Mirror the pass-1 chunk bound so both passes visit the same PTEs. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8563
8564 /*
8565 * Determine if the fault was induced by software tracking of
8566 * modify/reference bits. If so, re-enable the mapping (and set
8567 * the appropriate bits).
8568 *
8569 * Returns KERN_SUCCESS if the fault was induced and was
8570 * successfully handled.
8571 *
8572 * Returns KERN_FAILURE if the fault was not induced and
8573 * the function was unable to deal with it.
8574 *
8575 * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8576 * disallows this type of access.
8577 *
8578 * Returns KERN_ABORTED if the pmap lock is taken and a
8579 * preemption is pending.
8580 *
8581 */
/*
 * Attempt to handle a fault at [va] in [pmap] that may have been induced by
 * software ref/mod tracking (see arm_force_fast_fault_with_flush_range()).
 * If the fault was induced, restore the mapping's permissions via
 * arm_clear_fast_fault() and record the observed ref/mod state.
 *
 * @return KERN_SUCCESS if the fault was handled (or the PTE already grants
 *         sufficient access); KERN_FAILURE if it was not induced and cannot
 *         be handled here; KERN_PROTECTION_FAILURE for PPL-owned memory on
 *         XNU_MONITOR systems; KERN_ABORTED if the pmap lock could not be
 *         taken because preemption is pending (caller should retry).
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_EMPTY;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/* Loop until we can take the PVH lock against a stable PTE snapshot. */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_EMPTY) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; drop the lock and re-snapshot. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type. arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access. If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE? The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8721
8722 kern_return_t
8723 arm_fast_fault(
8724 pmap_t pmap,
8725 vm_map_address_t va,
8726 vm_prot_t fault_type,
8727 bool was_af_fault,
8728 __unused bool from_user)
8729 {
8730 kern_return_t result = KERN_FAILURE;
8731
8732 if (va < pmap->min || va >= pmap->max) {
8733 return result;
8734 }
8735
8736 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8737 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8738 from_user);
8739
8740 do {
8741 #if XNU_MONITOR
8742 result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8743 #else
8744 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8745 #endif
8746 } while (result == KERN_ABORTED);
8747
8748 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8749
8750 return result;
8751 }
8752
8753 void
8754 pmap_copy_page(
8755 ppnum_t psrc,
8756 ppnum_t pdst,
8757 int options)
8758 {
8759 bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8760 (addr64_t) (ptoa(pdst)),
8761 PAGE_SIZE,
8762 options);
8763 }
8764
8765
8766 /*
8767 * pmap_copy_page copies the specified (machine independent) pages.
8768 */
8769 void
8770 pmap_copy_part_page(
8771 ppnum_t psrc,
8772 vm_offset_t src_offset,
8773 ppnum_t pdst,
8774 vm_offset_t dst_offset,
8775 vm_size_t len)
8776 {
8777 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8778 (addr64_t) (ptoa(pdst) + dst_offset),
8779 len);
8780 }
8781
8782
8783 /*
8784 * pmap_zero_page zeros the specified (machine independent) page.
8785 */
8786 void
8787 pmap_zero_page(
8788 ppnum_t pn)
8789 {
8790 assert(pn != vm_page_fictitious_addr);
8791 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8792 }
8793
8794 void
8795 pmap_zero_page_with_options(
8796 ppnum_t pn,
8797 int options)
8798 {
8799 assert(pn != vm_page_fictitious_addr);
8800 bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8801 }
8802
8803 /*
8804 * pmap_zero_part_page
8805 * zeros the specified (machine independent) part of a page.
8806 */
8807 void
8808 pmap_zero_part_page(
8809 ppnum_t pn,
8810 vm_offset_t offset,
8811 vm_size_t len)
8812 {
8813 assert(pn != vm_page_fictitious_addr);
8814 assert(offset + len <= PAGE_SIZE);
8815 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8816 }
8817
/*
 * Map the kernel low-globals page (lowGlo) at its fixed alias address
 * LOWGLOBAL_ALIAS as a read-only, non-executable, writeback-cached
 * kernel mapping.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	/* The alias slot must not already be mapped. */
	assert(*ptep == ARM_PTE_EMPTY);

	/* Kernel read-only (AP_RONA), no execute for either user or kernel (NX | PNX). */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
#if __ARM_KERNEL_PROTECT__
	/* Mark non-global under __ARM_KERNEL_PROTECT__ configurations. */
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE before issuing TLB maintenance for the alias VA. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Let KASAN know the alias is now backed so accesses aren't flagged. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8842
8843 vm_offset_t
8844 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8845 {
8846 if (__improbable(index >= CPUWINDOWS_MAX)) {
8847 panic("%s: invalid index %u", __func__, index);
8848 }
8849 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8850 }
8851
/**
 * Map physical page 'pn' into one of the current CPU's dedicated copy
 * windows with the requested protection and memory attributes.
 *
 * @param pn physical page number to map.
 * @param prot VM_PROT_WRITE selects a kernel-writable mapping; anything
 *        else yields a kernel read-only mapping.
 * @param wimg_bits cacheability/memory-type attributes for the mapping.
 *
 * @return the index of the copy window used; pass this index to
 *         pmap_unmap_cpu_windows_copy() to tear the mapping down.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	/* Non-managed (I/O) pages carry per-range attributes; managed pages don't need them here. */
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	/* The PPL forbids mapping managed pages or writable PPL-protected I/O through copy windows. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (invalid) window slot belonging to this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (!pte_is_valid(*ptep)) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Valid, accessed, never executable at any privilege level. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Note: the TLBI for this slot uses the sync mode recorded by the slot's PREVIOUS user. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8926
8927 unsigned int
8928 pmap_map_cpu_windows_copy(
8929 ppnum_t pn,
8930 vm_prot_t prot,
8931 unsigned int wimg_bits)
8932 {
8933 #if XNU_MONITOR
8934 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8935 #else
8936 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8937 #endif
8938 }
8939
/**
 * Tear down the current CPU's copy window at the given index: clear its
 * PTE and invalidate the TLB entry for the window's virtual address.
 *
 * @param index window index previously returned by
 *        pmap_map_cpu_windows_copy().
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t *ptep;
	unsigned int cpu_num;
	vm_offset_t cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE with a strong flush, then invalidate using the sync mode recorded at map time. */
	write_pte_strong(ptep, ARM_PTE_EMPTY);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8960
/*
 * Tear down the per-CPU copy window at the given index, dispatching into
 * the PPL on monitor-enabled systems.
 *
 * Note: the previous code used `return <void expression>;`, which is a
 * C constraint violation (C11 6.8.6.4) accepted only as an extension;
 * call and fall off the end instead.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8971
8972 #if XNU_MONITOR
8973
8974 MARK_AS_PMAP_TEXT void
8975 pmap_invoke_with_page(
8976 ppnum_t page_number,
8977 void *ctx,
8978 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8979 {
8980 #pragma unused(page_number, ctx, callback)
8981 }
8982
8983 /*
8984 * Loop over every pmap_io_range (I/O ranges marked as owned by
8985 * the PPL in the device tree) and conditionally call callback() on each range
8986 * that needs to be included in the hibernation image.
8987 *
8988 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8989 * context is needed in the callback.
8990 * @param callback Callback function invoked on each range (gated by flag).
8991 */
8992 MARK_AS_PMAP_TEXT void
8993 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8994 {
8995 extern const pmap_io_range_t* io_attr_table;
8996 extern const unsigned int num_io_rgns;
8997 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8998 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8999 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
9000 }
9001 }
9002 }
9003
9004 /**
9005 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
9006 * PPL-owned page. Otherwise, do nothing.
9007 *
9008 * @param addr Physical address of the page to set the HASHED flag on.
9009 */
9010 MARK_AS_PMAP_TEXT void
9011 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
9012 {
9013 /* Ignore non-managed kernel memory. */
9014 if (!pa_valid(addr)) {
9015 return;
9016 }
9017
9018 const unsigned int pai = pa_index(addr);
9019 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
9020 pv_entry_t **pv_h = pai_to_pvh(pai);
9021
9022 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
9023 pvh_lock(pai);
9024 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
9025 pvh_unlock(pai);
9026 }
9027 }
9028
9029 /**
9030 * Loop through every physical page in the system and clear out the HASHED flag
9031 * on every PPL-owned page. That flag is used to keep track of which pages have
9032 * been hashed into the hibernation image during the hibernation entry process.
9033 *
9034 * The HASHED flag needs to be cleared out between hibernation cycles because the
9035 * pv_head_table and pp_attr_table's might have been copied into the hibernation
9036 * image with the HASHED flag set on certain pages. It's important to clear the
9037 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
9038 * into the hibernation image can't be compromised across hibernation cycles.
9039 */
9040 MARK_AS_PMAP_TEXT void
9041 pmap_clear_ppl_hashed_flag_all(void)
9042 {
9043 const unsigned int last_index = pa_index(vm_last_phys);
9044 pv_entry_t **pv_h = NULL;
9045
9046 for (int pai = 0; pai < last_index; ++pai) {
9047 pv_h = pai_to_pvh(pai);
9048
9049 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
9050 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
9051 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
9052 pvh_lock(pai);
9053 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
9054 pvh_unlock(pai);
9055 }
9056 }
9057 }
9058
9059 /**
9060 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9061 * ppl_hib driver will call this after all wired pages have been copied into the
9062 * hibernation image.
9063 */
9064 MARK_AS_PMAP_TEXT void
9065 pmap_check_ppl_hashed_flag_all(void)
9066 {
9067 const unsigned int last_index = pa_index(vm_last_phys);
9068 pv_entry_t **pv_h = NULL;
9069
9070 for (int pai = 0; pai < last_index; ++pai) {
9071 pv_h = pai_to_pvh(pai);
9072
9073 /**
9074 * The PMAP stacks are explicitly not saved into the image so skip checking
9075 * the pages that contain the PMAP stacks.
9076 */
9077 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9078 (pai < pa_index(pmap_stacks_end_pa));
9079
9080 if (!is_pmap_stack &&
9081 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9082 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9083 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9084 }
9085 }
9086 }
9087
9088 #endif /* XNU_MONITOR */
9089
9090 /*
9091 * Indicate that a pmap is intended to be used as a nested pmap
9092 * within one or more larger address spaces. This must be set
9093 * before pmap_nest() is called with this pmap as the 'subordinate'.
9094 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Atomically transition USER -> NESTED; any other starting type is a caller error. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_set_shared_region() hasn't tried
	 * to give this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Release the pmap's address-space ID — presumably because a nested pmap is never directly activated; confirm. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9141
/*
 * Public entry point for marking a pmap as nestable; dispatches into the
 * PPL on monitor-enabled systems.
 */
__mockable void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
9152
9153 bool
9154 pmap_is_nested(
9155 pmap_t pmap)
9156 {
9157 return pmap->type == PMAP_TYPE_NESTED;
9158 }
9159
9160 /*
9161 * pmap_trim_range(pmap, start, end)
9162 *
9163 * pmap = pmap to operate on
9164 * start = start of the range
9165 * end = end of the range
9166 *
9167 * Attempts to deallocate TTEs for the given range in the nested range.
9168 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The requested range must lie entirely within the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to remove at this twig; the *_deallocate paths above dropped the lock for us. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the whole L2 table; it may only be freed if every entry is invalid. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9284
9285 /**
9286 * State machine for multi-step pmap trimming. Trimming is the action of
9287 * deallocating the TTEs of the shared region of pmaps down to a given range.
9288 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9289 * disabling preemption for too long. These steps include computing the bounds
9290 * of the shared region, trimming the head of the "grand", trimming the tail of
9291 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9292 * different conditions.
9293 *
9294 * @param grand the pmap in which the pages are nested
9295 * @param subord the pmap from which the pages are shared, or nested
9296 * @param vstart start of the used range in "grand"
9297 * @param size size of the used range
9298 * @param state the current state of the state machine
9299 *
9300 * @return the next state of the state machine, to be used in the next call
9301 * into this function.
9302 */
9303 MARK_AS_PMAP_TEXT pmap_trim_state_t
9304 pmap_trim_internal(
9305 pmap_t grand,
9306 pmap_t subord,
9307 addr64_t vstart,
9308 uint64_t size,
9309 pmap_trim_state_t state)
9310 {
9311 /* Validation needs to be done regardless of state. */
9312 addr64_t vend;
9313
9314 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9315 panic("%s: grand addr wraps around, "
9316 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9317 __func__, grand, subord, (void*)vstart, size, state);
9318 }
9319
9320 validate_pmap_mutable(grand);
9321 validate_pmap(subord);
9322
9323 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9324 panic("%s: subord is of non-nestable type 0x%hhx, "
9325 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9326 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9327 }
9328
9329 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9330 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9331 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9332 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9333 }
9334
9335 if (__improbable(grand->nested_pmap != subord)) {
9336 panic("%s: grand->nested != subord, "
9337 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9338 __func__, grand, subord, (void*)vstart, size, state);
9339 }
9340
9341 if (__improbable((size != 0) &&
9342 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9343 panic("%s: grand range not in nested region, "
9344 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9345 __func__, grand, subord, (void*)vstart, size, state);
9346 }
9347
9348
9349 /* Trimming starts with figuring out the bounds for the grand. */
9350 if (state == PMAP_TRIM_STATE_START) {
9351 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9352
9353 /**
9354 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9355 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9356 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9357 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9358 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9359 * PMAP_TRIM_STATE_DONE.
9360 */
9361 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9362 assert(subord->nested_bounds_set);
9363
9364 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9365 if (!grand->nested_bounds_set) {
9366 /* Inherit the bounds from subord. */
9367 grand->nested_region_true_start = subord->nested_region_true_start;
9368 grand->nested_region_true_end = subord->nested_region_true_end;
9369 grand->nested_bounds_set = true;
9370 }
9371
9372 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9373
9374 /* Now that the grand has bounds, we are done. */
9375 return PMAP_TRIM_STATE_DONE;
9376 }
9377
9378 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9379 if ((!subord->nested_bounds_set) && size) {
9380 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9381 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9382
9383 subord->nested_region_true_start = vstart;
9384 subord->nested_region_true_end = vend;
9385 subord->nested_region_true_start &= ~adjust_offmask;
9386
9387 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9388 panic("%s: padded true end wraps around, "
9389 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9390 __func__, grand, subord, (void*)vstart, size, state);
9391 }
9392
9393 subord->nested_region_true_end &= ~adjust_offmask;
9394 subord->nested_bounds_set = true;
9395 }
9396
9397 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9398 if (subord->nested_bounds_set) {
9399 /* Inherit the bounds from subord. */
9400 grand->nested_region_true_start = subord->nested_region_true_start;
9401 grand->nested_region_true_end = subord->nested_region_true_end;
9402 grand->nested_bounds_set = true;
9403
9404 /* If we know the bounds, we can trim the pmap. */
9405 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9406
9407 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9408 } else {
9409 /* Don't trim if we don't know the bounds. */
9410 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9411
9412 return PMAP_TRIM_STATE_DONE;
9413 }
9414 }
9415
9416 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9417 if (!grand->nested_bounds_set) {
9418 panic("%s: !grand->nested_bounds_set, "
9419 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9420 __func__, grand, subord, (void*)vstart, size, state);
9421 }
9422
9423 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9424 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9425 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9426 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9427 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9428 (unsigned int)grand->nested_no_bounds_ref_state);
9429 }
9430
9431 #if XNU_MONITOR
9432 if (pmap_pending_preemption()) {
9433 return PMAP_TRIM_STATE_GRAND_AFTER;
9434 }
9435 #endif
9436
9437 state = PMAP_TRIM_STATE_GRAND_AFTER;
9438 }
9439
9440 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9441 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9442 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9443 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9444 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9445 (unsigned int)grand->nested_no_bounds_ref_state);
9446 }
9447
9448 #if XNU_MONITOR
9449 if (pmap_pending_preemption()) {
9450 return PMAP_TRIM_STATE_SUBORD;
9451 }
9452 #endif
9453
9454 state = PMAP_TRIM_STATE_SUBORD;
9455 }
9456
9457 /* START state is guaranteed to compute the bounds for the subord. */
9458 if (!subord->nested_bounds_set) {
9459 panic("%s: !subord->nested_bounds_set, "
9460 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9461 __func__, grand, subord, (void*)vstart, size, state);
9462 }
9463
9464 if (state == PMAP_TRIM_STATE_SUBORD) {
9465 /**
9466 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9467 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9468 * called once grand's nested tables have been fully trimmed, and can only be called once
9469 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9470 * the state update is visible only once the preceding trim operation is complete. An
9471 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9472 * but locking within pmap_trim_range() should make that harmless (and all but one will
9473 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9474 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9475 * of the state CAS.
9476 */
9477 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9478 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9479 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9480 (unsigned int)grand->nested_no_bounds_ref_state);
9481 }
9482 pmap_trim_subord(subord);
9483 }
9484
9485 return PMAP_TRIM_STATE_DONE;
9486 }
9487
/*
 * Drop this pmap's no-bounds reference on its nested pmap (if any),
 * trimming this pmap's copy of the nested region down to the known
 * bounds and then attempting to trim the nested pmap itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's bounds under its lock before releasing it. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim everything outside the true bounds from this pmap. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9511
9512 /*
9513 * pmap_trim_subord(grand, subord)
9514 *
9515 * grand = pmap that we have nested subord in
9516 * subord = nested pmap we are attempting to trim
9517 *
9518 * Trims subord if possible
9519 */
9520 MARK_AS_PMAP_TEXT static void
9521 pmap_trim_subord(pmap_t subord)
9522 {
9523 bool contract_subord = false;
9524
9525 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9526
9527 subord->nested_no_bounds_refcnt--;
9528
9529 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9530 /* If this was the last no bounds reference, trim subord. */
9531 contract_subord = true;
9532 }
9533
9534 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9535
9536 if (contract_subord) {
9537 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9538 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9539 }
9540 }
9541
9542 /**
9543 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9544 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9545 * disabling preemption for too long.
9546 *
9547 * @note When we load the shared region we always create pages tables for the
9548 * entire region. In practice, the shared cache may use just a portion
9549 * of that. Before we know the bounds of the shared region, it can
9550 * already be mapped into processes. Therefore, once the bounds are
9551 * known, "trimming" comes in handy to remove the unnecessary page
9552 * tables in the processes the shared region is mapped in, and eventually
9553 * those in the shared region itself. Note that the shared region must
9554 * be trimmed after the user processes because it has the L3 entries
9555 * everyone else is pointing to.
9556 *
9557 * @param grand the pmap in which the pages are nested
9558 * @param subord the pmap from which the pages are shared, or nested
9559 * @param vstart start of the used range in "grand"
9560 * @param size size of the used range
9561 */
void
pmap_trim(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
	/* On PPL systems, drives the state machine until its done. */
	while (state != PMAP_TRIM_STATE_DONE) {
		__assert_only pmap_trim_state_t old_state = state;
		state = pmap_trim_ppl(grand, subord, vstart, size, state);

		/* Are we making progress? */
		assert(old_state != state);
	}

	/* Verify neither ledger was corrupted by the trim. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	state = pmap_trim_internal(grand, subord, vstart, size, state);

	/* On non-PPL systems, we expect the implementation to finish in one call. */
	assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9590
9591 #if HAS_APPLE_PAC
9592 void *
9593 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9594 {
9595 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9596 panic("attempt to sign user pointer without process independent key");
9597 }
9598
9599 void *res = NULL;
9600 uint64_t current_intr_state = pmap_interrupts_disable();
9601
9602 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9603
9604 __compiler_materialize_and_prevent_reordering_on(value);
9605 switch (key) {
9606 case ptrauth_key_asia:
9607 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9608 break;
9609 case ptrauth_key_asda:
9610 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9611 break;
9612 default:
9613 __builtin_unreachable();
9614 }
9615 __compiler_materialize_and_prevent_reordering_on(res);
9616
9617 ml_disable_user_jop_key(jop_key, saved_jop_state);
9618
9619 pmap_interrupts_restore(current_intr_state);
9620
9621 return res;
9622 }
9623
9624 void *
9625 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9626 {
9627 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9628 }
9629
/**
 * Authenticate a signed userspace pointer while the supplied user JOP key
 * is temporarily installed.
 *
 * @param value the signed pointer to authenticate
 * @param key the ptrauth key the pointer was signed with; must be a
 *        process-independent key (IA or DA) — enforced by the panic below
 * @param discriminator the ptrauth discriminator used at signing time
 * @param jop_key the user JOP key to install for the duration of the auth
 *
 * @return the authenticated pointer
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent A-keys may be used for user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Same discipline as pmap_sign_user_ptr_internal(): interrupts stay off
	 * across the window in which the user JOP key is installed.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Barriers pin the auth inside the user-JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9650
9651 void *
9652 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9653 {
9654 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9655 }
9656 #endif /* HAS_APPLE_PAC */
9657
9658 /*
9659 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9660 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9661 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9662 * return value, to indicate where a preempted [un]nest operation should resume.
9663 * When the return value contains the ending address of the nested region with
9664 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9665 */
9666 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9667
9668 /**
9669 * Establishes the pmap associated with a shared region as the nested pmap
9670 * for a top-level user pmap.
9671 *
9672 * @param grand The top-level user pmap
9673 * @param subord The pmap to be set as [grand]'s nested pmap
9674 * @param vstart The base VA of the region to be nested.
9675 * @param size The size (in bytes) of the region to be nested.
9676 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_set_shared_region_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	addr64_t vend;
	/* Size, in unsigned ints, of the per-twig unnesting-state bitmap. */
	uint64_t nested_region_unnested_table_bitmap_size;
	unsigned int* nested_region_unnested_table_bitmap = NULL;
	kern_return_t kr = KERN_SUCCESS;

	validate_pmap_mutable(grand);
	validate_pmap(subord);

#if XNU_MONITOR
	/*
	 * Ordering is important here. validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed. If destruction is already committed,
	 * then the check of ref_count below will cover us. If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/*
	 * Take a reference on subord for the nesting association. A prior value
	 * of zero or less means subord was already destroyed (or being destroyed).
	 */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* Both base and size must be aligned to leaf-table boundaries. */
	if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("%s: pmap %p unaligned set_shared_region request 0x%llx, 0x%llx",
		    __func__, grand, vstart, size);
	}
	/* grand and subord must share the same page-table geometry. */
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

	/*
	 * First caller through here allocates and publishes subord's unnesting
	 * bitmap. The acquire load pairs with the release store further below.
	 */
	if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
		/* One bit per twig-sized chunk, packed into unsigned ints; +1 rounds up. */
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		/**
		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
		 */
		nested_region_unnested_table_bitmap_size <<= 1;

		/* Guard the cast to unsigned int when the size is stored below. */
		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		/* The PPL allocates whole pages, so the bitmap must fit in one page. */
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		/* NOWAIT: on failure the caller donates a page to the PPL and retries. */
		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto done;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		/* A busy lock aborts the operation; the caller may retry. */
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto done;
		}

		/* Re-check under the lock: another thread may have won the install race. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Use a store-release operation to ensure that the rest of the subord->nested_region_*
			 * fields are initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
			/* Ownership transferred to subord; don't free it in the cleanup below. */
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
	}

	/* Atomically claim grand's single nested-pmap slot. */
	if (__improbable(!os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst))) {
		panic("%s: attempt to nest pmap %p into pmap %p which already has a nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}
	/**
	 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
	 * into a nested pmap, which would then produce multiple levels of nesting.
	 */
	if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
		panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
	}

done:
	/* Free a bitmap that lost the install race or was orphaned by an error. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
		nested_region_unnested_table_bitmap = NULL;
	}

	/* On failure, drop the references taken at the top before returning. */
	if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}

	return kr;
}
9820
9821 __mockable void
9822 pmap_set_shared_region(
9823 pmap_t grand,
9824 pmap_t subord,
9825 addr64_t vstart,
9826 uint64_t size)
9827 {
9828 kern_return_t kr = KERN_SUCCESS;
9829
9830 PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
9831 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);
9832
9833 pmap_verify_preemptible();
9834 #if XNU_MONITOR
9835 do {
9836 kr = pmap_set_shared_region_ppl(grand, subord, vstart, size);
9837 if (kr == KERN_RESOURCE_SHORTAGE) {
9838 pmap_alloc_page_for_ppl(0);
9839 } else if ((kr != KERN_SUCCESS) && (kr != KERN_ABORTED)) {
9840 panic("%s: unexpected return code 0x%x from pmap_set_shared_region_ppl",
9841 __func__, kr);
9842 }
9843 } while (kr != KERN_SUCCESS);
9844
9845 pmap_ledger_check_balance(grand);
9846 pmap_ledger_check_balance(subord);
9847 #else
9848 /**
9849 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
9850 * we have verified preemptibility. Therefore, pmap_set_shared_region_internal()
9851 * will wait for a page or a lock instead of bailing out as in the PPL flavor.
9852 */
9853 kr = pmap_set_shared_region_internal(grand, subord, vstart, size);
9854 assert3u(kr, ==, KERN_SUCCESS);
9855 #endif
9856
9857 PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
9858 }
9859
9860 /**
9861 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9862 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9863 * This function operates in 3 main phases:
9864 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9865 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9866 * the mapping range are present in subord.
9867 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9868 * contains pointers to subord's leaf-level pagetable pages for the specified
9869 * VA range.
9870 *
9871 * This function may return early due to pending AST_URGENT preemption; if so
9872 * it will indicate the need to be re-entered.
9873 *
9874 * @note This function requires that [subord] has already been associated with
9875 * [grand] through a call to pmap_set_shared_region().
9876 *
9877 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9878 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9879 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9880 * @param size twig-aligned size of the nesting range
9881 * @param vrestart the twig-aligned starting address of the current call. May contain
9882 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9883 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9884 * KERN_RESOURCE_SHORTAGE on allocation failure.
9885 *
9886 * @return the virtual address at which to restart the operation, possibly including
9887 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9888 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9889 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t *stte_p;
	tt_entry_t *gtte_p;
	int expand_options = 0;
	/* Tracks whether grand's lock is held at the 'done' label. */
	bool grand_locked = false;

	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The resume cursor (phase bit stripped) must lie within [vstart, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* pmap_set_shared_region() must already have associated subord with grand. */
	if (__improbable(subord != grand->nested_pmap)) {
		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}

#if XNU_MONITOR
	/* Inside the PPL, never block for pages; fail and let the caller refill. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Base, size, and cursor must all be leaf-table aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	if (__improbable((subord->nested_region_addr + subord->nested_region_size) < vend) ||
	    (subord->nested_region_addr > vstart)) {
		panic("%s: attempt to nest [0x%llx, 0x%llx) in pmap %p outside nested pmap %p bounds [0x%llx, 0x%llx)\n",
		    __func__, vstart, vend, grand, subord, subord->nested_region_addr, subord->nested_region_addr + subord->nested_region_size);
	}

	/* Phase 1: bookkeeping — record the nested range on grand (first call only). */
	if (grand->nested_region_size == 0) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known. We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		/**
		 * Ensure that we won't exceed the nested_region_unnested_table bitmap bounds established
		 * in pmap_set_shared_region_internal().
		 */
		if (__improbable((vstart < subord->nested_region_addr) ||
		    (vend > (subord->nested_region_addr + subord->nested_region_size)))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			/* Grow grand's recorded nested region to cover this request. */
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor to subord's trimmed ("true") bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* PMAP_NEST_GRAND in the cursor means phase 2 already completed. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 2: expand subord so leaf tables exist for the whole range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* pmap_expand() may block/allocate, so drop the lock around it. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/* Periodically yield to urgent preemption; caller will re-enter at vrestart. */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	/* Phase 3 begins: restart the cursor at the top of the (clamped) range. */
	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 3: copy subord's twig TTEs into grand. */
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			/* Need a twig table in grand; expand with the lock dropped. */
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Don't let a lock failure mask a real expand error. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page. Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate. In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached the clamped end: report the canonical completion cursor. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible to the table walker before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	/* krp lives in kernel (non-PPL) memory; pin it around the write. */
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	/* Only failures are reported through krp; success leaves it untouched. */
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	return vrestart;
}
10117
/**
 * Nest [subord]'s mappings into [grand] over [vstart, vstart + size), driving
 * pmap_nest_internal()/pmap_nest_ppl() to completion across preemption and
 * (on PPL systems) page shortages.
 *
 * @param grand pmap to insert the shared TTEs into. Must be a user pmap.
 * @param subord pmap supplying the TTEs. Must be a nested pmap.
 * @param vstart twig-aligned base VA of the nesting range
 * @param size twig-aligned size of the nesting range
 *
 * @return KERN_SUCCESS, or the first hard failure reported by the
 *         implementation.
 */
__mockable kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	/* Resume cursor; PMAP_NEST_GRAND in bit 0 encodes the restart phase. */
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/* (vend | PMAP_NEST_GRAND) is the cursor value that signals completion. */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* PPL ran out of pages: donate one and retry from the cursor. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call that didn't advance the cursor would spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10176
10177 /*
10178 * kern_return_t pmap_unnest(grand, vaddr)
10179 *
10180 * grand = the pmap that will have the virtual range unnested
10181 * vaddr = start of range in pmap to be unnested
10182 * size = size of range in pmap to be unnested
10183 *
10184 */
10185
10186 kern_return_t
10187 pmap_unnest(
10188 pmap_t grand,
10189 addr64_t vaddr,
10190 uint64_t size)
10191 {
10192 return pmap_unnest_options(grand, vaddr, size, 0);
10193 }
10194
10195 /**
10196 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10197 * from a top-level pmap ('grand'). The corresponding mappings in the nested
10198 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10199 * still have the region nested. The mappings in 'grand' will be left empty
10200 * with the assumption that they will be demand-filled by subsequent access faults.
10201 *
10202 * This function operates in 2 main phases:
10203 * 1. Iteration over the nested pmap's mappings for the specified range to mark
10204 * them non-global.
10205 * 2. Clearing of the twig-level TTEs for the address range in grand.
10206 *
10207 * This function may return early due to pending AST_URGENT preemption; if so
10208 * it will indicate the need to be re-entered.
10209 *
10210 * @param grand pmap from which to unnest mappings
10211 * @param vaddr twig-aligned virtual address for the beginning of the nested range
10212 * @param size twig-aligned size of the nested range
10213 * @param vrestart the page-aligned starting address of the current call. May contain
10214 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10215 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10216 * grand is being torn down and step 1) above is not needed.
10217 *
10218 * @return the virtual address at which to restart the operation, possibly including
10219 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
10220 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10221 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t *tte_p;
	unsigned int current_index;
	unsigned int start_index;
	unsigned int max_index;
	/* Counts work items between preemption checks. */
	unsigned int entry_count = 0;

	addr64_t vend;
	addr64_t true_end;
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The resume cursor (phase bit stripped) must lie within [vaddr, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* Base and size must be twig (L2 entry) aligned. */
	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the working range to the nested pmap's trimmed ("true") end. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/* Phase 1: mark the nested pmap's PTEs non-global (skipped for teardown
	 * via PMAP_UNNEST_CLEAN, or when resuming directly into phase 2). */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			/* Lock busy: let the caller re-invoke us with the same cursor. */
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		/* Twig indices into the unnesting bitmap for [start, true_end). */
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		/* Walk one twig-sized (leaf-table) chunk per outer iteration. */
		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t *bpte, *cpte;

			/* End of the current twig chunk. */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs and the adjacent "in-progress" bit will be set.
			 */
			if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
				/*
				 * Mark the 'twig' region as being unnested. Every mapping entered within
				 * the nested pmap in this region will now be marked non-global. Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t pa;
					unsigned int pai = 0;
					boolean_t managed = FALSE;
					pt_entry_t spte;

					if (pte_is_valid(*cpte) && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						spte = *((volatile pt_entry_t*)cpte);
						/*
						 * Lock the page's PVH entry, re-reading the PTE until the
						 * observed physical page is stable under the lock.
						 */
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							/* Set the not-global bit; the TLB flush is deferred to the end. */
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						/* Preempted mid-chunk: the "in progress" bit stays set so
						 * a re-entry revisits this chunk. */
						goto unnest_subord_done;
					}
				}
				/* Chunk fully marked non-global; clear the "in progress" marker. */
				clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		if (current_index < max_index) {
			/* Preempted before phase 1 finished; resume here next call. */
			return vrestart;
		}
	}

	/*
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		/*
		 * NOTE(review): __improbable() wraps only the bitwise AND here, so the
		 * branch hint's parenthesization looks misplaced; the comparison result
		 * is unaffected since __builtin_expect returns its first argument.
		 */
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straight into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	/* Clamp to the trimmed start of the nested region. */
	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	/* Phase 2: clear grand's twig TTEs over the range. */
	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Fully done: report the canonical completion cursor. */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10432
10433 __mockable kern_return_t
10434 pmap_unnest_options(
10435 pmap_t grand,
10436 addr64_t vaddr,
10437 uint64_t size,
10438 unsigned int option)
10439 {
10440 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10441 vm_map_offset_t vend = vaddr + size;
10442
10443 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10444 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10445
10446 pmap_verify_preemptible();
10447 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10448 #if XNU_MONITOR
10449 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10450 #else
10451 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10452 #endif
10453 }
10454
10455 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10456
10457 return KERN_SUCCESS;
10458 }
10459
10460 boolean_t
10461 pmap_adjust_unnest_parameters(
10462 __unused pmap_t p,
10463 __unused vm_map_offset_t *s,
10464 __unused vm_map_offset_t *e)
10465 {
10466 return TRUE; /* to get to log_unnest_badness()... */
10467 }
10468
10469 /**
10470 * Perform any necessary pre-nesting of the parent's shared region at fork()
10471 * time.
10472 *
10473 * @note This should only be called from vm_map_fork().
10474 *
10475 * @param old_pmap The pmap of the parent task.
10476 * @param new_pmap The pmap of the child task.
10477 *
10478 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10479 * KERN_INVALID_ARGUMENT if the arguments were not valid.
10480 */
10481 kern_return_t
10482 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
10483 {
10484 if (old_pmap == NULL || new_pmap == NULL) {
10485 return KERN_INVALID_ARGUMENT;
10486 }
10487 if (old_pmap->nested_pmap == NULL) {
10488 return KERN_SUCCESS;
10489 }
10490 /**
10491 * Obtain the full shared region bounds from the nested pmap. If old_pmap
10492 * hasn't been fully nested yet, its bounds may not yet be configured.
10493 */
10494 pmap_set_shared_region(new_pmap,
10495 old_pmap->nested_pmap,
10496 old_pmap->nested_pmap->nested_region_addr,
10497 old_pmap->nested_pmap->nested_region_size);
10498 return KERN_SUCCESS;
10499 }
10500
10501 /*
10502 * disable no-execute capability on
10503 * the specified pmap
10504 */
10505 #if DEVELOPMENT || DEBUG
10506 void
10507 pmap_disable_NX(
10508 pmap_t pmap)
10509 {
10510 pmap->nx_enabled = FALSE;
10511 }
10512 #else
10513 void
10514 pmap_disable_NX(
10515 __unused pmap_t pmap)
10516 {
10517 }
10518 #endif
10519
10520 /*
10521 * flush a range of hardware TLB entries.
10522 * NOTE: assumes the smallest TLB entry in use will be for
10523 * an ARM small page (4K).
10524 */
10525
10526 #if __ARM_RANGE_TLBI__
10527 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10528 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10529 #else
10530 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10531 #endif // __ARM_RANGE_TLBI__
10532 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10533 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10534 "of npages to 32 bits below may truncate.");
10535
/**
 * Enqueue TLB maintenance for the virtual range [va, va + length) in the given
 * pmap, choosing between a full flush, a per-ASID flush, a range TLBI, or
 * per-entry TLBIs based on how many pages the range covers. Asynchronous: the
 * caller is responsible for issuing the synchronizing barrier afterwards.
 *
 * @param va Start of the virtual range to invalidate.
 * @param length Length of the range in bytes.
 * @param pmap The pmap whose mappings are to be invalidated.
 * @param last_level_only If true, only last-level (leaf) translations need
 *        invalidating (passed through to the TLBI helpers).
 * @param strong Request "strong" invalidation semantics from the helpers
 *        (NOTE(review): presumably tied to FEAT_XS memory types — confirm).
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Huge ranges: cheaper to drop the whole ASID (or everything) than iterate. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * ASID 0 and nested pmaps (whose translations are shared across
		 * address spaces) cannot be targeted by a per-ASID flush.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-size ranges: use the hardware range-TLBI instruction. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			/* Nested mappings require the all-ASID variant. */
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small ranges: invalidate page by page. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10588
10589 MARK_AS_PMAP_TEXT static void
10590 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10591 {
10592 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10593 }
10594
10595 void
10596 flush_mmu_tlb_region(
10597 vm_offset_t va,
10598 unsigned length)
10599 {
10600 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
10601 sync_tlb_flush();
10602 }
10603
10604 unsigned int
10605 pmap_cache_attributes(
10606 ppnum_t pn)
10607 {
10608 pmap_paddr_t paddr;
10609 unsigned int pai;
10610 unsigned int result;
10611 pp_attr_t pp_attr_current;
10612
10613 paddr = ptoa(pn);
10614
10615 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10616
10617 if (!pa_valid(paddr)) {
10618 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10619 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10620 }
10621
10622 result = VM_WIMG_DEFAULT;
10623
10624 pai = pa_index(paddr);
10625
10626 pp_attr_current = pp_attr_table[pai];
10627 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10628 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10629 }
10630 return result;
10631 }
10632
/**
 * Perform any cache maintenance required by a WIMG (cacheability) transition
 * on the page numbered pn.
 *
 * @param pn The page whose cache attributes changed.
 * @param wimg_bits_prev The page's previous WIMG attribute.
 * @param wimg_bits_new The page's new WIMG attribute.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Leaving a write-capable cacheable type requires flushing so no dirty
	 * lines remain behind the now-differently-attributed mapping.
	 *
	 * NOTE(review): the final clause `(new != VM_WIMG_COPYBACK) ||
	 * (new != VM_WIMG_INNERWBACK)` is always true, so the WTHRU arm reduces
	 * to `prev == VM_WIMG_WTHRU`. Possibly `&&` was intended — confirm
	 * before changing, since `&&` would skip flushes that currently happen.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/*
	 * Transitioning into the RT memory type requires a forced dcache clean
	 * (RT is uncacheable; see the DSB discussion in the TLB-flush pass below).
	 */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10649
/**
 * Change the cache attributes of a single page on behalf of the compressor
 * map/unmap paths, performing the required TLB and cache maintenance.
 *
 * @param pn The page number to update. Must be a managed page.
 * @param prev_cacheattr The page's current cache attributes.
 * @param new_cacheattr The cache attributes to install.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock guards the page's PV list and attribute updates. */
	pvh_lock(pai);

#if XNU_MONITOR
	/* Pages handed to the compressor must never be PPL-owned. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* Rewrite all mappings of the page; TLB flush is performed internally. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Any cache flushing required by the transition happens outside the lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10674
10675 void *
10676 pmap_map_compressor_page(ppnum_t pn)
10677 {
10678 #if __ARM_PTE_PHYSMAP__
10679 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10680 if (cacheattr != VM_WIMG_DEFAULT) {
10681 #if XNU_MONITOR
10682 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10683 #else
10684 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10685 #endif
10686 }
10687 #endif
10688 return (void*)phystokv(ptoa(pn));
10689 }
10690
10691 void
10692 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10693 {
10694 #if __ARM_PTE_PHYSMAP__
10695 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10696 if (cacheattr != VM_WIMG_DEFAULT) {
10697 #if XNU_MONITOR
10698 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10699 #else
10700 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10701 #endif
10702 }
10703 #endif
10704 }
10705
10706 /**
10707 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10708 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10709 *
10710 * @param page_list List of pages to be updated.
10711 * @param cacheattr The new cache attribute.
10712 */
10713 void
10714 pmap_batch_set_cache_attributes(
10715 const unified_page_list_t *page_list,
10716 unsigned int cacheattr)
10717 {
10718 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10719
10720 if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
10721 /**
10722 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
10723 * In an ideal world we would just use these iterator functions within
10724 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
10725 * that means we'll need to take special care to handle pending preemption and
10726 * if necessary return the iterator position out to this function and then re-enter
10727 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
10728 * secure manner. Not impossible, but also not trivial, so unless someone asks for
10729 * this perf improvement on the PPL I'm going to take the lazy approach here.
10730 */
10731 unified_page_list_iterator_t iter;
10732
10733 for (unified_page_list_iterator_init(page_list, &iter);
10734 !unified_page_list_iterator_end(&iter);
10735 unified_page_list_iterator_next(&iter)) {
10736 bool is_fictitious = false;
10737 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10738 if (__probable(!is_fictitious)) {
10739 #if XNU_MONITOR
10740 pmap_set_cache_attributes_ppl(pn, cacheattr);
10741 #else /* !XNU_MONITOR */
10742 pmap_set_cache_attributes_internal(pn, cacheattr);
10743 #endif /* XNU_MONITOR */
10744 }
10745 }
10746 return;
10747 }
10748
10749 if (page_list->upl.upl_size == 0) {
10750 return;
10751 }
10752
10753 batch_set_cache_attr_state_t states;
10754 states.page_index = 0;
10755 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10756 states.tlb_flush_pass_needed = false;
10757 states.rt_cache_flush_pass_needed = false;
10758
10759 /* Verify we are being called from a preemptible context. */
10760 pmap_verify_preemptible();
10761
10762 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10763 #if XNU_MONITOR
10764 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
10765 states, page_list->upl.upl_size, cacheattr);
10766 #else /* !XNU_MONITOR */
10767 states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
10768 states, page_list->upl.upl_size, cacheattr);
10769 #endif /* XNU_MONITOR */
10770 }
10771
10772 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10773 }
10774
/**
 * Flushes TLB entries associated with the page specified by paddr, but do not
 * issue barriers yet.
 *
 * @note The caller must hold the PVH lock for the page.
 *
 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
 */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	pvh_assert_locked(pai);

	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page and queue an async TLB flush for it. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry; advance to the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not covered by the CPU TLBs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Move to the next PTE slot, advancing the PV entry when exhausted. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10834
10835 /**
10836 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10837 *
10838 * @param pai The Physical Address Index of the entry.
10839 * @param cacheattr The new cache attribute.
10840 */
10841 MARK_AS_PMAP_TEXT static void
10842 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10843 {
10844 pvh_assert_locked(pai);
10845
10846 pp_attr_t pp_attr_current, pp_attr_template;
10847 do {
10848 pp_attr_current = pp_attr_table[pai];
10849 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10850
10851 /**
10852 * WIMG bits should only be updated under the PVH lock, but we should do
10853 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10854 */
10855 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10856 }
10857
/**
 * Batch updates the cache attributes of a list of pages in three passes.
 *
 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
 * In pass two, TLB entries are flushed for each page in the list if necessary.
 * In pass three, caches are cleaned for each page in the list if necessary.
 *
 * When running in PPL, this function may decide to return to the caller in response
 * to AST_URGENT.
 *
 * @param user_page_list List of pages to be updated.
 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
 * @param page_cnt Number of pages in total in user_page_list.
 * @param cacheattr The new cache attributes.
 *
 * @return The new state of the state machine.
 */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the caller-provided resumable-state cookie. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		/* Pass 1: update the pp_attr table and PTEs for each page. */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* Absent WIMG bits mean the page is in the default state. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* A transition into RT requires a cache-clean pass (pass 3). */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Bail for preemption; DC ops must be disabled before leaving. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the (possibly partial) progress for the caller to resume from. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
11111
/**
 * Update the cache attributes of a single managed page and perform the
 * required TLB/cache maintenance.
 *
 * @param pn The page number to update. Silently ignored if not managed.
 * @param cacheattr The new cache attributes (VM_WIMG_USE_DEFAULT is
 *        normalized to VM_WIMG_DEFAULT).
 * @param external On PPL systems, true when called on behalf of the kernel
 *        (page must not be PPL-owned), false for PPL-internal callers (page
 *        must be PPL-owned). Unused on non-PPL systems.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* Enforce the PPL-ownership contract described above. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Absent WIMG bits mean the page is in the default state. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Rewrite the mappings (and flush TLBs) only on an actual transition. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Any cache flushing required by the transition happens outside the lock. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11174
11175 MARK_AS_PMAP_TEXT void
11176 pmap_set_cache_attributes_internal(
11177 ppnum_t pn,
11178 unsigned int cacheattr)
11179 {
11180 pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
11181 }
11182
11183 void
11184 pmap_set_cache_attributes(
11185 ppnum_t pn,
11186 unsigned int cacheattr)
11187 {
11188 #if XNU_MONITOR
11189 pmap_set_cache_attributes_ppl(pn, cacheattr);
11190 #else
11191 pmap_set_cache_attributes_internal(pn, cacheattr);
11192 #endif
11193 }
11194
/**
 * Updates the page numbered ppnum to have attribute specified by attributes.
 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
 * The necessity of the TLB flush is returned in case this function is called
 * in a batched manner and the TLB flush is intended to be done at a different
 * timing.
 *
 * @note The caller must hold the PVH lock for the page.
 *
 * @param ppnum Page Number of the page to be updated.
 * @param attributes The new cache attributes.
 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
 *                     immediately.
 *
 * @return Returns true if a TLB flush is needed for this update regardless of
 *         whether a flush has occurred already.
 */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	pv_entry_t **pv_h;
	pt_entry_t tmplate;
	unsigned int pai;
	boolean_t tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally reject device-type memory attributes on managed pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
			/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First rewrite the kernel's physical-aperture mapping of the page. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the existing XPRR permission bits on PPL systems. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Batched caller will issue FLUSH_PTE_STRONG() and the TLBI itself. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* Then walk the PV list and rewrite every other mapping of the page. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry; advance to the next slot. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU page tables; leave them alone. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11351
/**
 * Mark a pmap as being dedicated to use for a commpage mapping.
 * The pmap itself will never be activated on a CPU; its mappings will
 * only be embedded in userspace pmaps at a fixed virtual address.
 *
 * @param pmap the pmap to mark as belonging to a commpage.
 */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps are set up during boot, before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain, never-used user pmap may be converted. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID. This pmap should not ever be directly
	 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap. Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11378
/**
 * Rewrite the attribute bits of an existing L3 (leaf) translation entry,
 * preserving only the output physical address of the mapping.
 *
 * @param pmap pmap whose translation tables are modified.
 * @param address virtual address whose L3 entry is rewritten; a leaf entry
 *        must already exist for this address or this routine panics.
 * @param template attribute bits (valid/AP/XN/shareability/etc.) OR'd onto
 *        the existing output physical address.
 */
static void
pmap_update_tt3e(
	pmap_t pmap,
	vm_address_t address,
	tt_entry_t template)
{
	tt_entry_t *ptep, pte;

	ptep = pmap_tt3e(pmap, address);
	if (ptep == NULL) {
		panic("%s: no ptep?", __FUNCTION__);
	}

	pte = *ptep;
	/* Keep the physical page; replace every attribute bit with the template. */
	pte = tte_to_pa(pte) | template;
	write_pte_strong(ptep, pte);
}
11396
/*
 * Leaf-entry attribute template for commpage data mappings.
 * Note absence of non-global bit: commpage entries are deliberately global
 * so they are not discarded on ASID-tagged TLB flushes (see the comment at
 * the text-page insertion site below).
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	| ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/* Note absence of non-global bit and no-execute bit (text page must be user-executable). */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	| ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11408
/**
 * Allocate the commpage backing pages and build the dedicated commpage
 * pmap(s) whose translation tables are later nested into every user pmap.
 *
 * @param kernel_data_addr out: kernel VA of the commpage data page.
 * @param kernel_text_addr out: kernel VA of the commpage text page, or 0
 *        when no text page was allocated.
 * @param kernel_ro_data_addr out: kernel VA of the kernel read-only data
 *        page (aliases the data page on non-PPL devices).
 * @param user_text_addr out: randomized user VA chosen for the 64-bit
 *        commpage text page (0 unless CONFIG_ARM_PFZ).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* PPL devices: allocate and zero each page from the PPL page free list. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the leaf entry: commpage PTEs are global (no nG bit). */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index of the text page within the data page's L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11559
11560
11561 /*
11562 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11563 * with user controlled TTEs for regions that aren't explicitly reserved by the
11564 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11565 */
#if (ARM_PGSHIFT == 14)
/**
 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
 * commpage completely above the maximum 32-bit userspace VA.
 */
static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);

/**
 * Normally there'd be an assert to check that 64-bit devices with 64-bit
 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
 * commpage lives within the userspace VA range, but is protected by the VM as
 * a reserved region (see vm_reserved_regions[] definition for more info).
 */

#elif (ARM_PGSHIFT == 12)
/**
 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
 * above the maximum userspace VA.
 */
static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
#else
#error Nested shared page mapping is unsupported on this config
#endif
11590
/**
 * Nest the commpage pmap's translation tables into a user pmap at the fixed
 * commpage virtual address, expanding the target pmap's table hierarchy down
 * to the commpage nesting level if necessary.
 *
 * @param pmap the user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (and, under XNU_MONITOR,
 *         KERN_RESOURCE_SHORTAGE) when pmap_expand() failed in a way the
 *         caller is expected to handle/retry. Any other expansion failure
 *         panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* The PPL cannot block on allocation; let the kernel-side caller retry. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* The lock must be dropped across pmap_expand(), which may allocate. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Share the commpage pmap's twig table by copying its table pointer. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11693
/**
 * Remove the nested commpage translation table pointer from a pmap, if one
 * is present, and flush the TLB for the commpage virtual address.
 *
 * @param pmap the pmap from which to unhook the commpage tables.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	if (ttep == NULL) {
		/* Nothing was ever nested at the commpage level; nothing to undo. */
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11750
/**
 * Kernel-facing entry point that maps the commpage into a user pmap,
 * retrying the underlying PPL/internal call on transient failures and
 * panicking if insertion ultimately fails.
 *
 * @param pmap the pmap to receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL cannot allocate; feed it a page and retry. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11779
11780 static boolean_t
11781 pmap_is_64bit(
11782 pmap_t pmap)
11783 {
11784 return pmap->is_64bit;
11785 }
11786
/* No "exotic" (non-standard address-space layout) pmaps exist on this configuration. */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11793
11794
11795 /* ARMTODO -- an implementation that accounts for
11796 * holes in the physical map, if any.
11797 */
11798 boolean_t
11799 pmap_valid_page(
11800 ppnum_t pn)
11801 {
11802 return pa_valid(ptoa(pn));
11803 }
11804
11805 boolean_t
11806 pmap_bootloader_page(
11807 ppnum_t pn)
11808 {
11809 pmap_paddr_t paddr = ptoa(pn);
11810
11811 if (pa_valid(paddr)) {
11812 return FALSE;
11813 }
11814 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11815 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11816 }
11817
/**
 * Scan a virtual address range of a pmap and report whether any leaf
 * mapping exists within it.
 *
 * @param pmap pmap to scan; NULL is treated as trivially empty.
 * @param va_start start of the virtual address range.
 * @param va_end end of the virtual address range.
 *
 * @return TRUE if no valid PTE was found in the range, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample once so lock and unlock decisions stay consistent; locking is skipped in the debugger (kdp) context. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		/* Clamp each iteration to the span of a single twig-level table. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a mapping: drop the lock (if held) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11877
/**
 * Public wrapper for pmap_is_empty_internal(); dispatches through the PPL
 * on XNU_MONITOR configurations.
 *
 * @return TRUE if [va_start, va_end) contains no mappings in pmap.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11890
11891 vm_map_offset_t
11892 pmap_max_offset(
11893 boolean_t is64,
11894 unsigned int option)
11895 {
11896 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11897 }
11898
/**
 * Compute the maximum 64-bit user virtual address offset for a given
 * ARM_PMAP_MAX_OFFSET_* option, honoring the arm64_pmap_max_offset_default
 * boot-arg override where applicable. Panics on an unknown option.
 *
 * @param option one of the ARM_PMAP_MAX_OFFSET_* selectors.
 * @return the chosen maximum VA offset (<= MACH_VM_MAX_ADDRESS).
 */
vm_map_offset_t
pmap_max_64bit_offset(
	__unused unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

#if defined(__arm64__)
	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm64_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = min_max_offset;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		/* Boot-arg override wins; otherwise scale the VA limit with device memory. */
		if (arm64_pmap_max_offset_default) {
			max_offset_ret = arm64_pmap_max_offset_default;
		} else if (max_mem > 0xC0000000) {
			// devices with > 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
		} else if (max_mem > 0x40000000) {
			// devices with > 1GB and <= 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
		} else {
			// devices with <= 1 GB of memory
			max_offset_ret = min_max_offset;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		if (arm64_pmap_max_offset_default) {
			// Allow the boot-arg to override jumbo size
			max_offset_ret = arm64_pmap_max_offset_default;
		} else {
			max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
		}
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	} else {
		panic("pmap_max_64bit_offset illegal option 0x%x", option);
	}

	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	/* DEFAULT may legitimately be 0 (boot-arg unset); all other options have a floor. */
	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
		assert(max_offset_ret >= min_max_offset);
	}
#else
	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
#endif

	return max_offset_ret;
}
11951
11952 vm_map_offset_t
11953 pmap_max_32bit_offset(
11954 unsigned int option)
11955 {
11956 vm_map_offset_t max_offset_ret = 0;
11957
11958 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11959 max_offset_ret = arm_pmap_max_offset_default;
11960 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11961 max_offset_ret = VM_MAX_ADDRESS;
11962 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11963 max_offset_ret = VM_MAX_ADDRESS;
11964 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11965 if (arm_pmap_max_offset_default) {
11966 max_offset_ret = arm_pmap_max_offset_default;
11967 } else if (max_mem > 0x20000000) {
11968 max_offset_ret = VM_MAX_ADDRESS;
11969 } else {
11970 max_offset_ret = VM_MAX_ADDRESS;
11971 }
11972 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11973 max_offset_ret = VM_MAX_ADDRESS;
11974 } else {
11975 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11976 }
11977
11978 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11979 return max_offset_ret;
11980 }
11981
11982 #if CONFIG_DTRACE
11983 /*
11984 * Constrain DTrace copyin/copyout actions
11985 */
11986 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11987 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11988
11989 kern_return_t
11990 dtrace_copyio_preflight(
11991 __unused addr64_t va)
11992 {
11993 if (current_map() == kernel_map) {
11994 return KERN_FAILURE;
11995 } else {
11996 return KERN_SUCCESS;
11997 }
11998 }
11999
/* No post-copyio cleanup is required on this architecture; always succeeds. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
12006 #endif /* CONFIG_DTRACE */
12007
12008
/* Deferred-flush contexts carry no state on this architecture; nothing to initialize. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
12013
12014
12015 void
12016 pmap_flush(
12017 __unused pmap_flush_context *cpus_to_flush)
12018 {
12019 /* not implemented yet */
12020 return;
12021 }
12022
12023 #if XNU_MONITOR
12024
12025 /*
12026 * Enforce that the address range described by kva and nbytes is not currently
12027 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
12028 * unintentionally writing to PPL-owned memory.
12029 */
12030 void
12031 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
12032 {
12033 vm_offset_t end;
12034 if (os_add_overflow(kva, nbytes, &end)) {
12035 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12036 }
12037 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12038 pmap_paddr_t pa = kvtophys_nofail(ckva);
12039 unsigned int pai = pa_index(pa);
12040 pp_attr_t attr;
12041 if (__improbable(!pa_valid(pa))) {
12042 panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12043 }
12044 pvh_lock(pai);
12045 if (__improbable(ckva == phystokv(pa))) {
12046 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12047 }
12048 do {
12049 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12050 if (__improbable(attr & PP_ATTR_MONITOR)) {
12051 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12052 }
12053 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12054 pvh_unlock(pai);
12055 if (__improbable(kvtophys_nofail(ckva) != pa)) {
12056 panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12057 }
12058 }
12059 }
12060
/**
 * Undo pmap_pin_kernel_pages() for the range [kva, kva + nbytes): clear
 * PP_ATTR_NO_MONITOR on each backing physical page. Panics if a page in
 * the range was not pinned.
 *
 * @param kva kernel virtual address of the start of the range.
 * @param nbytes length of the range in bytes.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* Pinning prevents PPL ownership, so MONITOR must not be set here. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
12078
12079 /**
12080 * Lock down a page, making all mappings read-only, and preventing further
12081 * mappings or removal of this particular kva's mapping. Effectively, it makes
12082 * the physical page at kva immutable (see the ppl_writable parameter for an
12083 * exception to this).
12084 *
12085 * @param kva Valid address to any mapping of the physical page to lockdown.
12086 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12087 * @param ppl_writable True if the PPL should still be able to write to the page
12088 * using the physical aperture mapping. False will make the
12089 * page read-only for both the kernel and PPL in the
12090 * physical aperture.
12091 */
12092
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown: demote all existing alias mappings to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12098
12099 /**
12100 * Lock down a page, giving all mappings the specified maximum permissions, and
12101 * preventing further mappings or removal of this particular kva's mapping.
12102 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12103 * parameter for an exception to this).
12104 *
12105 * @param kva Valid address to any mapping of the physical page to lockdown.
12106 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12107 * @param ppl_writable True if the PPL should still be able to write to the page
12108 * using the physical aperture mapping. False will make the
12109 * page read-only for both the kernel and PPL in the
12110 * physical aperture.
12111 * @param prot Maximum permissions to allow in existing alias mappings
12112 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must pass a recognized lockdown reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot additionally be locked down from this path. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Double-lockdown and lockdown of executable pages are both forbidden. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Restrict all existing alias mappings of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12160
12161 /**
12162 * Helper for releasing a page from being locked down to the PPL, making it writable to the
12163 * kernel once again.
12164 *
12165 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12166 * to unlockdown a page that was never locked down, will panic.
12167 *
12168 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
12169 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12170 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12171 * passed to the paired pmap_ppl_lockdown_page() call. Any
12172 * deviation will result in a panic.
12173 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* The page must have been locked down with this exact reason flag. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12194
12195 /**
12196 * Release a page from being locked down to the PPL, making it writable to the
12197 * kernel once again.
12198 *
12199 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12200 * to unlockdown a page that was never locked down, will panic.
12201 *
12202 * @param kva Valid address to any mapping of the physical page to unlockdown.
12203 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12204 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12205 * passed to the paired pmap_ppl_lockdown_page() call. Any
12206 * deviation will result in a panic.
12207 */
12208 MARK_AS_PMAP_TEXT static void
12209 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
12210 {
12211 const pmap_paddr_t pa = kvtophys_nofail(kva);
12212 const unsigned int pai = pa_index(pa);
12213
12214 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12215 pvh_lock(pai);
12216 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
12217 pvh_unlock(pai);
12218 }
12219
12220 #else /* XNU_MONITOR */
12221
/* Page pinning guards against PPL ownership; without XNU_MONITOR it is a no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12226
/* Counterpart to the pin stub above; no-op without XNU_MONITOR. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12231
12232 #endif /* !XNU_MONITOR */
12233
12234
/**
 * Lock down a range of pages for code-signing purposes, tagging them with
 * the CS lockdown reason on PPL (XNU_MONITOR) systems.
 *
 * @param kva kernel virtual address of the start of the range.
 * @param size length of the range in bytes.
 * @param ppl_writable true if the PPL keeps write access via the physical
 *        aperture; false makes the pages fully read-only.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12244
/**
 * Release a range of pages from code-signing lockdown; the flag argument
 * must mirror the one used by pmap_cs_lockdown_pages().
 *
 * @param kva kernel virtual address of the start of the range.
 * @param size length of the range in bytes.
 * @param ppl_writable must match the value passed at lockdown time.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12254
12255 /**
12256 * Perform basic validation checks on the destination only and
12257 * corresponding offset/sizes prior to writing to a read only allocation.
12258 *
12259 * @note Should be called before writing to an allocation from the read
12260 * only allocator.
12261 *
12262 * @param zid The ID of the zone the allocation belongs to.
12263 * @param va VA of element being modified (destination).
12264 * @param offset Offset being written to, in the element.
12265 * @param new_data_size Size of modification.
12266 *
12267 */
12268
12269 MARK_AS_PMAP_TEXT static void
12270 pmap_ro_zone_validate_element_dst(
12271 zone_id_t zid,
12272 vm_offset_t va,
12273 vm_offset_t offset,
12274 vm_size_t new_data_size)
12275 {
12276 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12277 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12278 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12279 }
12280
12281 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12282
12283 /* Check element is from correct zone and properly aligned */
12284 zone_require_ro(zid, elem_size, (void*)va);
12285
12286 if (__improbable(new_data_size > (elem_size - offset))) {
12287 panic("%s: New data size %lu too large for elem size %lu at addr %p",
12288 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12289 }
12290 if (__improbable(offset >= elem_size)) {
12291 panic("%s: Offset %lu too large for elem size %lu at addr %p",
12292 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12293 }
12294 }
12295
12296
12297 /**
12298 * Perform basic validation checks on the source, destination and
12299 * corresponding offset/sizes prior to writing to a read only allocation.
12300 *
12301 * @note Should be called before writing to an allocation from the read
12302 * only allocator.
12303 *
12304 * @param zid The ID of the zone the allocation belongs to.
12305 * @param va VA of element being modified (destination).
12306 * @param offset Offset being written to, in the element.
12307 * @param new_data Pointer to new data (source).
12308 * @param new_data_size Size of modification.
12309 *
12310 */
12311
12312 MARK_AS_PMAP_TEXT static void
12313 pmap_ro_zone_validate_element(
12314 zone_id_t zid,
12315 vm_offset_t va,
12316 vm_offset_t offset,
12317 const vm_offset_t new_data,
12318 vm_size_t new_data_size)
12319 {
12320 vm_offset_t sum = 0;
12321
12322 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12323 panic("%s: Integer addition overflow %p + %lu = %lu",
12324 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12325 }
12326
12327 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12328 }
12329
12330 /**
12331 * Ensure that physical page is locked down before writing to it.
12332 *
12333 * @note Should be called before writing to an allocation from the read
12334 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12335 * ensure that it is called after the modification.
12336 *
12337 *
12338 * @param pa Physical address of the element being modified.
12339 * @param va Virtual address of element being modified.
12340 * @param size Size of the modification.
12341 *
12342 */
12343
12344 MARK_AS_PMAP_TEXT static void
12345 pmap_ro_zone_lock_phy_page(
12346 const pmap_paddr_t pa,
12347 vm_offset_t va,
12348 vm_size_t size)
12349 {
12350 if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
12351 panic("%s: va 0x%llx size 0x%llx crosses page boundary",
12352 __func__, (unsigned long long)va, (unsigned long long)size);
12353 }
12354 const unsigned int pai = pa_index(pa);
12355 pvh_lock(pai);
12356
12357 /* Ensure that the physical page is locked down */
12358 #if XNU_MONITOR
12359 pv_entry_t **pvh = pai_to_pvh(pai);
12360 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
12361 panic("%s: Physical page not locked down %llx", __func__, pa);
12362 }
12363 #endif /* XNU_MONITOR */
12364 }
12365
12366 /**
12367 * Unlock physical page after writing to it.
12368 *
12369 * @note Should be called after writing to an allocation from the read
12370 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12371 * ensure that it has been called prior to the modification.
12372 *
12373 * @param pa Physical address of the element that was modified.
12374 * @param va Virtual address of element that was modified.
12375 * @param size Size of the modification.
12376 *
12377 */
12378
12379 MARK_AS_PMAP_TEXT static void
12380 pmap_ro_zone_unlock_phy_page(
12381 const pmap_paddr_t pa,
12382 vm_offset_t va __unused,
12383 vm_size_t size __unused)
12384 {
12385 const unsigned int pai = pa_index(pa);
12386 pvh_unlock(pai);
12387 }
12388
12389 /**
12390 * Function to copy kauth_cred from new_data to kv.
12391 * Function defined in "kern_prot.c"
12392 *
12393 * @note Will be removed upon completion of
12394 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12395 *
12396 * @param kv Address to copy new data to.
12397 * @param new_data Pointer to new data.
12398 *
12399 */
12400
12401 extern void
12402 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12403
12404 /**
12405 * Zalloc-specific memcpy that writes through the physical aperture
12406 * and ensures the element being modified is from a read-only zone.
12407 *
12408 * @note Designed to work only with the zone allocator's read-only submap.
12409 *
12410 * @param zid The ID of the zone to allocate from.
12411 * @param va VA of element to be modified.
12412 * @param offset Offset from element.
12413 * @param new_data Pointer to new data.
12414 * @param new_data_size Size of modification.
12415 *
12416 */
12417
12418 void
12419 pmap_ro_zone_memcpy(
12420 zone_id_t zid,
12421 vm_offset_t va,
12422 vm_offset_t offset,
12423 const vm_offset_t new_data,
12424 vm_size_t new_data_size)
12425 {
12426 #if XNU_MONITOR
12427 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
12428 #else /* XNU_MONITOR */
12429 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
12430 #endif /* XNU_MONITOR */
12431 }
12432
12433 MARK_AS_PMAP_TEXT void
12434 pmap_ro_zone_memcpy_internal(
12435 zone_id_t zid,
12436 vm_offset_t va,
12437 vm_offset_t offset,
12438 const vm_offset_t new_data,
12439 vm_size_t new_data_size)
12440 {
12441 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12442
12443 if (!new_data || new_data_size == 0) {
12444 return;
12445 }
12446
12447 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12448 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12449 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12450 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12451 }
12452
12453 /**
12454 * Zalloc-specific function to atomically mutate fields of an element that
12455 * belongs to a read-only zone, via the physcial aperture.
12456 *
12457 * @note Designed to work only with the zone allocator's read-only submap.
12458 *
12459 * @param zid The ID of the zone the element belongs to.
12460 * @param va VA of element to be modified.
12461 * @param offset Offset in element.
12462 * @param op Atomic operation to perform.
12463 * @param value Mutation value.
12464 *
12465 */
12466
12467 uint64_t
12468 pmap_ro_zone_atomic_op(
12469 zone_id_t zid,
12470 vm_offset_t va,
12471 vm_offset_t offset,
12472 zro_atomic_op_t op,
12473 uint64_t value)
12474 {
12475 #if XNU_MONITOR
12476 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
12477 #else /* XNU_MONITOR */
12478 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
12479 #endif /* XNU_MONITOR */
12480 }
12481
/*
 * Implementation of pmap_ro_zone_atomic_op(): validate the destination,
 * then apply the atomic mutation through the physical aperture while the
 * page's PV lock is held.  Returns the value produced by
 * __zalloc_ro_mut_atomic().
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low nibble of `op` is used as the operand width in bytes here — assumes
	 * zro_atomic_op_t encodes size in bits [3:0]; confirm against zalloc. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12500
12501 /**
12502 * bzero for allocations from read only zones, that writes through the
12503 * physical aperture.
12504 *
12505 * @note This is called by the zfree path of all allocations from read
12506 * only zones.
12507 *
12508 * @param zid The ID of the zone the allocation belongs to.
12509 * @param va VA of element to be zeroed.
12510 * @param offset Offset in the element.
12511 * @param size Size of allocation.
12512 *
12513 */
12514
12515 void
12516 pmap_ro_zone_bzero(
12517 zone_id_t zid,
12518 vm_offset_t va,
12519 vm_offset_t offset,
12520 vm_size_t size)
12521 {
12522 #if XNU_MONITOR
12523 pmap_ro_zone_bzero_ppl(zid, va, offset, size);
12524 #else /* XNU_MONITOR */
12525 pmap_ro_zone_bzero_internal(zid, va, offset, size);
12526 #endif /* XNU_MONITOR */
12527 }
12528
/*
 * Implementation of pmap_ro_zone_bzero(): validate the element (a zero
 * `new_data` skips the source-overflow check), then zero it through the
 * physical aperture while the page's PV lock is held.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12542
12543 /**
12544 * Removes write access from the Physical Aperture.
12545 *
12546 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12547 * @note Designed to work only with the zone allocator's read-only submap.
12548 *
12549 * @param va VA of the page to restore write access to.
12550 *
12551 */
12552 MARK_AS_PMAP_TEXT static void
12553 pmap_phys_write_disable(vm_address_t va)
12554 {
12555 #if XNU_MONITOR
12556 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
12557 #else /* XNU_MONITOR */
12558 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
12559 #endif /* XNU_MONITOR */
12560 }
12561
12562 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12563
/*
 * Count resident and compressed bytes within [start, end) of a pmap.
 * The range must be page-aligned and must not extend past the end of the
 * twig (next-level table boundary) containing `start`.
 *
 * If `compressed_bytes_p` is non-NULL, the compressed byte count found is
 * ADDED to the value it points to (the caller is expected to initialize it).
 *
 * Returns the number of resident bytes, or PMAP_RESIDENT_INVALID if the
 * pmap is NULL or no page table covers the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if (tte_is_valid_table(*tte_p)) {
		/* Walk each leaf PTE in the range and classify the page it maps. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the caller-provided buffer while writing through it. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12625
12626 mach_vm_size_t
12627 pmap_query_resident(
12628 pmap_t pmap,
12629 vm_map_address_t start,
12630 vm_map_address_t end,
12631 mach_vm_size_t *compressed_bytes_p)
12632 {
12633 mach_vm_size_t total_resident_bytes;
12634 mach_vm_size_t compressed_bytes;
12635 vm_map_address_t va;
12636
12637
12638 if (pmap == PMAP_NULL) {
12639 if (compressed_bytes_p) {
12640 *compressed_bytes_p = 0;
12641 }
12642 return 0;
12643 }
12644
12645 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12646
12647 total_resident_bytes = 0;
12648 compressed_bytes = 0;
12649
12650 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12651 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12652 VM_KERNEL_ADDRHIDE(end));
12653
12654 va = start;
12655 while (va < end) {
12656 vm_map_address_t l;
12657 mach_vm_size_t resident_bytes;
12658
12659 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12660
12661 if (l > end) {
12662 l = end;
12663 }
12664 #if XNU_MONITOR
12665 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12666 #else
12667 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12668 #endif
12669 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12670 break;
12671 }
12672
12673 total_resident_bytes += resident_bytes;
12674
12675 va = l;
12676 }
12677
12678 if (compressed_bytes_p) {
12679 *compressed_bytes_p = compressed_bytes;
12680 }
12681
12682 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12683 total_resident_bytes);
12684
12685 return total_resident_bytes;
12686 }
12687
12688 #if MACH_ASSERT
12689 static void
12690 pmap_check_ledgers(
12691 pmap_t pmap)
12692 {
12693 int pid;
12694 char *procname;
12695
12696 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12697 /*
12698 * This pmap was not or is no longer fully associated
12699 * with a task (e.g. the old pmap after a fork()/exec() or
12700 * spawn()). Its "ledger" still points at a task that is
12701 * now using a different (and active) address space, so
12702 * we can't check that all the pmap ledgers are balanced here.
12703 *
12704 * If the "pid" is set, that means that we went through
12705 * pmap_set_process() in task_terminate_internal(), so
12706 * this task's ledger should not have been re-used and
12707 * all the pmap ledgers should be back to 0.
12708 */
12709 return;
12710 }
12711
12712 pid = pmap->pmap_pid;
12713 procname = pmap->pmap_procname;
12714
12715 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12716 }
12717 #endif /* MACH_ASSERT */
12718
/* No-op on this architecture; the pagezero advisory is not used here. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12723
12724 /**
12725 * The minimum shared region nesting size is used by the VM to determine when to
12726 * break up large mappings to nested regions. The smallest size that these
12727 * mappings can be broken into is determined by what page table level those
12728 * regions are being nested in at and the size of the page tables.
12729 *
12730 * For instance, if a nested region is nesting at L2 for a process utilizing
12731 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12732 * block entry).
12733 *
12734 * @param pmap The target pmap to determine the block size based on whether it's
12735 * using 16KB or 4KB page tables.
12736 */
12737 uint64_t
12738 pmap_shared_region_size_min(__unused pmap_t pmap)
12739 {
12740 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12741
12742 /**
12743 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12744 * 4KB pages). This means that a target pmap will contain L2 entries that
12745 * point to shared L3 page tables in the shared region pmap.
12746 */
12747 return pt_attr_twig_size(pt_attr);
12748 }
12749
12750 boolean_t
12751 pmap_enforces_execute_only(
12752 pmap_t pmap)
12753 {
12754 return pmap != kernel_pmap;
12755 }
12756
/* Record the VM map's code-signing enforcement state on the pmap. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	/* Mutation requires the mutable-pmap validation, not just validate_pmap(). */
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12765
/*
 * Public entry point for pmap_set_vm_map_cs_enforced_internal(); dispatches
 * into the PPL on XNU_MONITOR configurations.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12777
12778 extern int cs_process_enforcement_enable;
12779 bool
12780 pmap_get_vm_map_cs_enforced(
12781 pmap_t pmap)
12782 {
12783 if (cs_process_enforcement_enable) {
12784 return true;
12785 }
12786 return pmap->pmap_vm_map_cs_enforced;
12787 }
12788
/* No-op on this configuration: no per-pmap JIT-entitlement state is kept. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12795
/*
 * Public entry point for pmap_set_jit_entitled_internal(); dispatches into
 * the PPL on XNU_MONITOR configurations.
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12806
/* No JIT-entitlement state is tracked here, so always report false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12813
/* No-op on this configuration: no per-pmap TPRO state is kept. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12820
/*
 * Public entry point for pmap_set_tpro_internal(); dispatches into the PPL
 * on XNU_MONITOR configurations.
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12831
/* No TPRO state is tracked here, so always report false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12838
12839 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12840
/*
 * Determine the disposition of the page mapped at `va` in `pmap` and store a
 * PMAP_QUERY_PAGE_* bitmask through `disp_p`.
 *
 * Returns KERN_INVALID_ARGUMENT for a NULL or kernel pmap (with *disp_p set
 * to 0), otherwise KERN_SUCCESS.  The PTE is re-read under the PV lock and
 * the lookup retried if it changed concurrently (counted in
 * pmap_query_page_info_retries).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE can change under the shared pmap lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping to read its accounting bits. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the caller-provided buffer while writing the result through it. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12919
/*
 * Public entry point for pmap_query_page_info_internal(); dispatches into
 * the PPL on XNU_MONITOR configurations.
 */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12932
12933
12934
/*
 * Number of valid user (TTBR0) virtual-address bits for this pmap, derived
 * from the T0SZ field of the TCR value used by its page-table attributes.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	/* VA width = 64 - T0SZ (ARMv8 TCR encoding). */
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12945
/* Number of valid kernel (TTBR1) virtual-address bits: 64 - boot-time T1SZ. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12951
12952 static vm_map_size_t
12953 pmap_user_va_size(pmap_t pmap)
12954 {
12955 return 1ULL << pmap_user_va_bits(pmap);
12956 }
12957
12958 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
/*
 * Canonicalize a user (TTBR0) pointer by clearing any tag/metadata bits
 * above the pmap's maximum user address.  Pointers that select TTBR1 are
 * returned unchanged.
 */
static vm_map_address_t
pmap_strip_user_addr(pmap_t pmap, vm_map_address_t ptr)
{
	assert(pmap && pmap != kernel_pmap);

	/*
	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR0 address.
	 * Ignore the strip request.
	 */
	if ((ptr & TTBR_SELECTOR) != 0) {
		return ptr;
	}

	/* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
	return ptr & (pmap->max - 1);
}
12975
/*
 * Canonicalize a kernel (TTBR1) pointer by forcing the high bits at and
 * above the pmap's minimum kernel address.  Pointers that select TTBR0 are
 * returned unchanged.
 */
static vm_map_address_t
pmap_strip_kernel_addr(pmap_t pmap, vm_map_address_t ptr)
{
	assert(pmap && pmap == kernel_pmap);

	/*
	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR1 address.
	 * Ignore the strip request.
	 */
	if ((ptr & TTBR_SELECTOR) == 0) {
		return ptr;
	}

	/* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
	return ptr | pmap->min;
}
12992
12993 vm_map_address_t
12994 pmap_strip_addr(pmap_t pmap, vm_map_address_t ptr)
12995 {
12996 assert(pmap);
12997
12998 return pmap == kernel_pmap ? pmap_strip_kernel_addr(pmap, ptr) :
12999 pmap_strip_user_addr(pmap, ptr);
13000 }
13001 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
13002
13003
13004
/* This configuration has no PPL, so execution is never inside one. */
bool
pmap_in_ppl(void)
{
	return false;
}
13011
/* Unsupported here: protected I/O-filter writes require a PPL; always panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
13018
/* No PPL on this configuration, so there is no reserved page to hand out. */
void *
pmap_claim_reserved_ppl_page(void)
{
	return NULL;
}
13025
/* No PPL on this configuration; freeing a reserved page is a no-op. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
13031
13032
13033 #if PMAP_CS_PPL_MONITOR
13034
13035 /* Immutable part of the trust cache runtime */
13036 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
13037
13038 /* Mutable part of the trust cache runtime */
13039 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
13040
13041 /* Lock for the trust cache runtime */
13042 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
13043
13044 MARK_AS_PMAP_TEXT kern_return_t
13045 pmap_check_trust_cache_runtime_for_uuid_internal(
13046 const uint8_t check_uuid[kUUIDSize])
13047 {
13048 kern_return_t ret = KERN_DENIED;
13049
13050 /* Lock the runtime as shared */
13051 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
13052
13053 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
13054 &ppl_trust_cache_rt,
13055 check_uuid,
13056 NULL);
13057
13058 /* Unlock the runtime */
13059 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
13060
13061 if (tc_ret.error == kTCReturnSuccess) {
13062 ret = KERN_SUCCESS;
13063 } else if (tc_ret.error == kTCReturnNotFound) {
13064 ret = KERN_NOT_FOUND;
13065 } else {
13066 ret = KERN_FAILURE;
13067 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
13068 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
13069 }
13070
13071 return ret;
13072 }
13073
/* Public entry point: the check always runs inside the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
13080
/*
 * Load a trust cache of the given type into the PPL trust-cache runtime.
 *
 * The payload and manifest ranges are locked down for the duration of the
 * load; the payload stays locked down (and owned by the runtime) on success,
 * and is unlocked again on failure.  Returns KERN_SUCCESS, KERN_ALREADY_IN_SET
 * for duplicates, KERN_RESOURCE_SHORTAGE when the caller must donate a page
 * and retry, or KERN_FAILURE/KERN_DENIED on other errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Types that are only loadable at boot must not arrive through this path. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* KERN_RESOURCE_SHORTAGE is expected; the caller donates a page and retries. */
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13200
13201 kern_return_t
13202 pmap_load_trust_cache_with_type(
13203 TCType_t type,
13204 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
13205 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
13206 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
13207 {
13208 kern_return_t ret = KERN_DENIED;
13209
13210 ret = pmap_load_trust_cache_with_type_ppl(
13211 type,
13212 pmap_img4_payload, pmap_img4_payload_len,
13213 img4_manifest, img4_manifest_len,
13214 img4_aux_manifest, img4_aux_manifest_len);
13215
13216 while (ret == KERN_RESOURCE_SHORTAGE) {
13217 /* Allocate a page from the free list */
13218 pmap_alloc_page_for_ppl(0);
13219
13220 /* Attempt the call again */
13221 ret = pmap_load_trust_cache_with_type_ppl(
13222 type,
13223 pmap_img4_payload, pmap_img4_payload_len,
13224 img4_manifest, img4_manifest_len,
13225 img4_aux_manifest, img4_aux_manifest_len);
13226 }
13227
13228 return ret;
13229 }
13230
/*
 * Query the PPL trust-cache runtime for a CDHash.  "Safe" means all inputs
 * already live in PPL-controlled storage; callers coming from the kernel
 * must go through pmap_query_trust_cache_internal() instead.
 *
 * Returns KERN_SUCCESS when found, KERN_NOT_FOUND when absent,
 * KERN_INVALID_ARGUMENT for a bad query type, and KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13269
/*
 * PPL-side trust-cache query for kernel callers: copies the CDHash into
 * PPL-local storage before querying, then copies the result token back out
 * through the (pinned) caller-provided buffer.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the caller's buffer while writing the token back through it. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13294
13295 kern_return_t
13296 pmap_query_trust_cache(
13297 TCQueryType_t query_type,
13298 const uint8_t cdhash[kTCEntryHashSize],
13299 TrustCacheQueryToken_t *query_token)
13300 {
13301 kern_return_t ret = KERN_NOT_FOUND;
13302
13303 ret = pmap_query_trust_cache_ppl(
13304 query_type,
13305 cdhash,
13306 query_token);
13307
13308 return ret;
13309 }
13310
/* Epoch counter: number of times the PPL has been asked to resolve/toggle developer mode */
MARK_AS_PMAP_DATA uint8_t ppl_developer_mode_set = 0;
/* Current developer mode state, as resolved within the PPL */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13313
/*
 * PPL entry point for resolving the developer mode state of the system.
 *
 * Enabling developer mode is only permitted within the first epoch_enable
 * calls into this function; a later attempt to enable it panics, which guards
 * against runtime attempts to flip developer mode on after boot. Disabling
 * (state == false) is allowed at any time.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
#if PMAP_CS_INCLUDE_INTERNAL_CODE
	/*
	 * On internal builds, we may call into the PPL twice in order to enable developer
	 * mode during early boot and during data migration. The latter does not happen for
	 * non-internal builds, and thus those only need to support a single transition to
	 * enabling developer mode.
	 */
	const uint8_t epoch_enable = 2;
#else
	const uint8_t epoch_enable = 1;
#endif

	/*
	 * We don't really care if the state is false -- in that case, the transition can
	 * happen as many times as needed. However, we still need to increment whenever we
	 * set the state as such. This is partly because we need to track whether we have
	 * actually resolved the state or not, and also because we expect developer mode
	 * to only be enabled during the first or second (internal-only) call into this
	 * function.
	 */
	uint8_t epoch = os_atomic_inc_orig(&ppl_developer_mode_set, relaxed);

	/* No state change required when the request matches the current state */
	if (state == os_atomic_load(&ppl_developer_mode_storage, relaxed)) {
		return;
	} else if ((state == true) && (epoch >= epoch_enable)) {
		/* An enable arriving after the allowed epoch is a security violation */
		panic("PMAP_CS: enabling developer mode incorrectly [%u]", epoch);
	}

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13349
/*
 * Kernel entry point for toggling developer mode; trampolines into the PPL
 * implementation (pmap_toggle_developer_mode_internal).
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13356
/* Whether lockdown mode is enabled; read-only after early boot */
SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
/* Whether lockdown mode should apply its JIT restrictions; read-only after early boot */
SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13359
13360 #pragma mark Image4 - New
13361
/*
 * Pairs an image4 CS trap selector with the AppleImage4 handler resolved for
 * it, so the per-selector trap implementations can dispatch without having to
 * re-resolve the handler.
 */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;
	image4_cs_trap_handler_t handler;
} pmap_image4_dispatch_t;
13366
13367 MARK_AS_PMAP_TEXT static errno_t
13368 _pmap_image4_monitor_trap_set_release_type(
13369 const pmap_image4_dispatch_t *dispatch,
13370 const void *input_data)
13371 {
13372 /*
13373 * csmx_release_type --> __cs_copy
13374 */
13375 image4_cs_trap_argv_kmod_set_release_type_t input = {0};
13376
13377 /* Copy the input data to prevent ToCToU */
13378 memcpy(&input, input_data, sizeof(input));
13379
13380 /* Dispatch to AppleImage4 */
13381 return dispatch->handler(
13382 dispatch->selector,
13383 &input, sizeof(input),
13384 NULL, NULL);
13385 }
13386
13387
13388
13389 MARK_AS_PMAP_TEXT static errno_t
13390 _pmap_image4_monitor_trap_nonce_set(
13391 const pmap_image4_dispatch_t *dispatch,
13392 const void *input_data)
13393 {
13394 /*
13395 * csmx_clear --> __cs_copy
13396 * csmx_cipher --> __cs_copy
13397 */
13398 image4_cs_trap_argv_nonce_set_t input = {0};
13399
13400 /* Copy the input data to prevent ToCToU */
13401 memcpy(&input, input_data, sizeof(input));
13402
13403 /* Dispatch to AppleImage4 */
13404 return dispatch->handler(
13405 dispatch->selector,
13406 &input, sizeof(input),
13407 NULL, NULL);
13408 }
13409
13410 MARK_AS_PMAP_TEXT static errno_t
13411 _pmap_image4_monitor_trap_nonce_roll(
13412 const pmap_image4_dispatch_t *dispatch,
13413 const void *input_data)
13414 {
13415 image4_cs_trap_argv_nonce_roll_t input = {0};
13416
13417 /* Copy the input data to prevent ToCToU */
13418 memcpy(&input, input_data, sizeof(input));
13419
13420 /* Dispatch to AppleImage4 */
13421 return dispatch->handler(
13422 dispatch->selector,
13423 &input, sizeof(input),
13424 NULL, NULL);
13425 }
13426
/*
 * Handle IMAGE4_CS_TRAP_IMAGE_ACTIVATE: lock down the payload and manifest
 * regions, then hand them to AppleImage4 for evaluation.
 *
 * Ownership on return: the manifest is always handed back to the kernel once
 * evaluation completes; the payload remains locked down (monitor-owned) only
 * when activation succeeds.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13482
13483 MARK_AS_PMAP_TEXT static errno_t
13484 _pmap_image4_monitor_trap_passthrough(
13485 __unused const pmap_image4_dispatch_t *dispatch,
13486 __unused const void *input_data,
13487 __unused size_t input_size)
13488 {
13489 #if DEVELOPMENT || DEBUG || KASAN
13490 return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13491 #else
13492 pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13493 return ENOSYS;
13494 #endif
13495 }
13496
/*
 * PPL-side dispatcher for image4 code-signing monitor traps.
 *
 * Resolves the AppleImage4 handler for the selector, verifies the caller's
 * input size against the selector's expected vector size, reserves a PPL page
 * for CoreCrypto, and routes the call to the per-selector trap handler.
 *
 * Returns 0 on success; EINVAL for a bad selector or input size; ENOMEM when
 * the caller must donate a page and retry (see pmap_image4_monitor_trap);
 * otherwise an errno_t from the handler.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			/* ENOMEM tells the kernel-side wrapper to donate a page and retry */
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13562
13563 errno_t
13564 pmap_image4_monitor_trap(
13565 image4_cs_trap_t selector,
13566 const void *input_data,
13567 size_t input_size)
13568 {
13569 errno_t err = EPERM;
13570
13571 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13572 while (err == ENOMEM) {
13573 /* Allocate a page from the free list */
13574 pmap_alloc_page_for_ppl(0);
13575
13576 /* Call the monitor dispatch again */
13577 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13578 }
13579
13580 return err;
13581 }
13582
13583 #endif /* PMAP_CS_PPL_MONITOR */
13584
13585 #if PMAP_CS_INCLUDE_CODE_SIGNING
13586
/*
 * Three-way comparator for the registered-profiles red-black tree. Profiles
 * are keyed purely by their object address.
 *
 * Compare through uintptr_t: relational comparison of pointers into different
 * objects is undefined behavior in standard C, while integer comparison of
 * the converted addresses is well-defined and yields the same ordering.
 *
 * @return -1 when profile0 sorts before profile1, 1 when after, 0 when equal.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
13599
/* Red-black tree for managing provisioning profiles (keyed by object address) */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Emit the prototypes and implementation for the profile tree operations */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13609
13610 void
13611 pmap_initialize_provisioning_profiles(void)
13612 {
13613 /* Initialize the profiles red-black tree lock */
13614 lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13615 pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13616
13617 /* Initialize the red-black tree itself */
13618 RB_INIT(&pmap_cs_registered_profiles);
13619
13620 printf("initialized PPL provisioning profile data\n");
13621 }
13622
13623 static bool
13624 pmap_is_testflight_profile(
13625 pmap_cs_profile_t *profile_obj)
13626 {
13627 const char *entitlement_name = "beta-reports-active";
13628 const size_t entitlement_length = strlen(entitlement_name);
13629 CEQueryOperation_t query[2] = {0};
13630
13631 /* If the profile provisions no entitlements, then it isn't a test flight one */
13632 if (profile_obj->entitlements_ctx == NULL) {
13633 return false;
13634 }
13635
13636 /* Build our CoreEntitlements query */
13637 query[0].opcode = kCEOpSelectKey;
13638 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13639 query[0].parameters.stringParameter.length = entitlement_length;
13640 query[1] = CEMatchBool(true);
13641
13642 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13643 profile_obj->entitlements_ctx,
13644 query, 2);
13645
13646 if (ce_err == amfi->CoreEntitlements.kNoError) {
13647 return true;
13648 }
13649
13650 return false;
13651 }
13652
13653 static bool
13654 pmap_is_development_profile(
13655 pmap_cs_profile_t *profile_obj)
13656 {
13657 /* Check for UPP */
13658 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13659 *profile_obj->profile_ctx,
13660 CESelectDictValue("ProvisionsAllDevices"));
13661 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13662 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13663 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13664 return false;
13665 }
13666 }
13667
13668 /* Check for TestFlight profile */
13669 if (pmap_is_testflight_profile(profile_obj) == true) {
13670 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13671 return false;
13672 }
13673
13674 pmap_cs_log_info("%p: development profile", profile_obj);
13675 return true;
13676 }
13677
/*
 * Extract, validate, and cache the "Entitlements" dictionary from a profile's
 * DER context into the profile object's entitlements query context.
 *
 * @return
 *	KERN_SUCCESS when the entitlements context is set up; KERN_NOT_FOUND
 *	when the profile provisions no entitlements (entitlements_ctx is left
 *	NULL); KERN_ABORTED when CoreEntitlements validation or context
 *	acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Locate the "Entitlements" value within the profile's DER dictionary */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* Leave a clean, explicit "no entitlements" state in the object */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the raw DER before building a query context over it */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13728
/*
 * PPL entry point for registering a provisioning profile.
 *
 * The payload (a pmap_profile_payload_t header followed by the raw profile
 * blob) is locked down, validated through CoreTrust, wrapped in a
 * CoreEntitlements context, and inserted into the registered-profiles tree.
 * Most validation failures panic, since a malformed payload reaching this
 * point indicates a kernel-side invariant violation.
 *
 * @param payload_addr
 *	Address of the payload; asserted to be valid kernel memory.
 * @param payload_size
 *	Size of the payload region.
 *
 * @return
 *	KERN_SUCCESS when registered; KERN_RESOURCE_SHORTAGE when the caller
 *	must donate a page and retry (see pmap_register_provisioning_profile).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Verify the profile's signature chain through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Wrap the CoreTrust-verified content in a CoreEntitlements DER context */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13832
13833 kern_return_t
13834 pmap_register_provisioning_profile(
13835 const vm_address_t payload_addr,
13836 const vm_size_t payload_size)
13837 {
13838 kern_return_t ret = KERN_DENIED;
13839
13840 ret = pmap_register_provisioning_profile_ppl(
13841 payload_addr,
13842 payload_size);
13843
13844 while (ret == KERN_RESOURCE_SHORTAGE) {
13845 /* Allocate a page from the free list */
13846 pmap_alloc_page_for_ppl(0);
13847
13848 /* Attempt the call again */
13849 ret = pmap_register_provisioning_profile_ppl(
13850 payload_addr,
13851 payload_size);
13852 }
13853
13854 return ret;
13855 }
13856
/*
 * PPL entry point for unregistering a provisioning profile.
 *
 * Removes the profile from the registered-profiles tree and unlocks its
 * backing pages, returning them to the kernel. Panics when asked to
 * unregister a profile which was never registered.
 *
 * @return
 *	KERN_SUCCESS on unregistration; KERN_FAILURE when the profile still
 *	has outstanding references (associated code signatures).
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* Profiles still referenced by a signature cannot be removed */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile memory now belongs to the kernel again -- don't touch it */
		profile_obj = NULL;
	}
	return ret;
}
13904
/*
 * Kernel entry point for unregistering a provisioning profile; trampolines
 * into the PPL implementation (pmap_unregister_provisioning_profile_internal).
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13911
/*
 * PPL entry point for associating a registered profile with a code signature.
 *
 * Association is only permitted while the signature is still untrusted (not
 * yet verified) and only once per signature. Takes a reference on the profile
 * so it cannot be unregistered while associated.
 *
 * Lock order: code directory lock (exclusive) first, then the profile tree
 * lock (shared) nested inside it.
 *
 * @return
 *	KERN_SUCCESS on association; KERN_DENIED when the signature is already
 *	verified or already has a profile; panics on an unknown or unverified
 *	profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13960
/*
 * Kernel entry point for associating a profile with a code signature;
 * trampolines into the PPL (pmap_associate_provisioning_profile_internal).
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13968
/*
 * PPL entry point for breaking the association between a code signature and
 * its provisioning profile. Drops the reference taken at association time
 * (after releasing the code directory lock).
 *
 * @return
 *	KERN_SUCCESS on disassociation; KERN_NOT_FOUND when the signature has
 *	no associated profile.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
14004
/*
 * Kernel entry point for disassociating a profile from a code signature;
 * trampolines into the PPL (pmap_disassociate_provisioning_profile_internal).
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
14011
14012 kern_return_t
14013 pmap_associate_kernel_entitlements_internal(
14014 pmap_cs_code_directory_t *cd_entry,
14015 const void *kernel_entitlements)
14016 {
14017 kern_return_t ret = KERN_DENIED;
14018
14019 if (kernel_entitlements == NULL) {
14020 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
14021 }
14022
14023 /* Acquire the lock on the code directory */
14024 pmap_cs_lock_code_directory(cd_entry);
14025
14026 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
14027 ret = KERN_DENIED;
14028 goto out;
14029 } else if (cd_entry->kernel_entitlements != NULL) {
14030 ret = KERN_DENIED;
14031 goto out;
14032 }
14033 cd_entry->kernel_entitlements = kernel_entitlements;
14034
14035 /* Association was a success */
14036 ret = KERN_SUCCESS;
14037
14038 out:
14039 lck_rw_unlock_exclusive(&cd_entry->rwlock);
14040 return ret;
14041 }
14042
/*
 * Kernel entry point for associating kernel entitlements with a signature;
 * trampolines into the PPL (pmap_associate_kernel_entitlements_internal).
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
14050
/*
 * PPL entry point for resolving the kernel entitlements object associated
 * with a pmap's main-region code signature. The resolved pointer is written
 * out through pinned kernel pages when kernel_entitlements is non-NULL.
 *
 * @return
 *	KERN_SUCCESS when resolved; KERN_NOT_FOUND for the kernel pmap, a pmap
 *	with no signed main region, or a signature with no kernel entitlements;
 *	KERN_ABORTED when the pmap lock could not be taken -- callers retry
 *	(see pmap_resolve_kernel_entitlements).
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14113
14114 kern_return_t
14115 pmap_resolve_kernel_entitlements(
14116 pmap_t pmap,
14117 const void **kernel_entitlements)
14118 {
14119 kern_return_t ret = KERN_DENIED;
14120
14121 do {
14122 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14123 } while (ret == KERN_ABORTED);
14124
14125 return ret;
14126 }
14127
14128 kern_return_t
14129 pmap_accelerate_entitlements_internal(
14130 pmap_cs_code_directory_t *cd_entry)
14131 {
14132 const coreentitlements_t *CoreEntitlements = NULL;
14133 const CS_SuperBlob *superblob = NULL;
14134 pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
14135 size_t signature_length = 0;
14136 size_t acceleration_length = 0;
14137 size_t required_length = 0;
14138 kern_return_t ret = KERN_DENIED;
14139
14140 /* Setup the CoreEntitlements interface */
14141 CoreEntitlements = &amfi->CoreEntitlements;
14142
14143 CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;
14144
14145 /* Acquire the lock on the code directory */
14146 pmap_cs_lock_code_directory(cd_entry);
14147
14148 /*
14149 * Only reconstituted code signatures can be accelerated. This is only a policy
14150 * decision we make since this allows us to re-use any unused space within the
14151 * locked down code signature region. There is also a decent bit of validation
14152 * within the reconstitution function to ensure blobs are ordered and do not
14153 * contain any padding around them which can cause issues here.
14154 *
14155 * This also serves as a check to ensure the signature is trusted.
14156 */
14157 if (cd_entry->unneeded_code_signature_unlocked == false) {
14158 ret = KERN_DENIED;
14159 goto out;
14160 }
14161
14162 if (cd_entry->ce_ctx == NULL) {
14163 ret = KERN_SUCCESS;
14164 goto out;
14165 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
14166 ret = KERN_SUCCESS;
14167 goto out;
14168 }
14169
14170 /* We only support accelerating when size <= PAGE_SIZE */
14171 ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
14172 if (ce_err != CoreEntitlements->kNoError) {
14173 if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
14174 /* Small entitlement blobs aren't eligible */
14175 ret = KERN_SUCCESS;
14176 goto out;
14177 }
14178 panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
14179 cd_entry, CoreEntitlements->GetErrorString(ce_err));
14180 } else if (acceleration_length > PAGE_SIZE) {
14181 ret = KERN_ABORTED;
14182 goto out;
14183 }
14184 assert(acceleration_length > 0);
14185
14186 superblob = cd_entry->superblob;
14187 signature_length = ntohl(superblob->length);
14188
14189 /* Adjust the required length for the overhead structure -- can't overflow */
14190 required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
14191 if (required_length > PAGE_SIZE) {
14192 ret = KERN_ABORTED;
14193 goto out;
14194 }
14195
14196 /*
14197 * First we'll check if the code signature has enough space within the locked down
14198 * region of memory to hold the buffer. If not, then we'll see if we can bucket
14199 * allocate the buffer, and if not, we'll just allocate an entire page from the
14200 * free list.
14201 *
14202 * When we're storing the buffer within the code signature, we also need to make
14203 * sure we account for alignment of the buffer.
14204 */
14205 const vm_address_t align_mask = sizeof(void*) - 1;
14206 size_t required_length_within_sig = required_length + align_mask;
14207
14208 if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
14209 vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
14210 aligned_buf = (aligned_buf + align_mask) & ~align_mask;
14211
14212 /* We need to resolve to the physical aperture */
14213 pmap_paddr_t phys_addr = kvtophys(aligned_buf);
14214 acceleration_buf = (void*)phystokv(phys_addr);
14215
14216 /* Ensure the offset within the page wasn't lost */
14217 assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));
14218
14219 acceleration_buf->allocated = false;
14220 pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
14221 } else {
14222 if (required_length <= pmap_cs_blob_limit) {
14223 struct pmap_cs_blob *bucket = NULL;
14224 size_t bucket_size = 0;
14225
14226 /* Allocate a buffer from the blob allocator */
14227 ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
14228 if (ret != KERN_SUCCESS) {
14229 goto out;
14230 }
14231 acceleration_buf = (void*)bucket->blob;
14232 pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
14233 } else {
14234 pmap_paddr_t phys_addr = 0;
14235 ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
14236 if (ret != KERN_SUCCESS) {
14237 goto out;
14238 }
14239 acceleration_buf = (void*)phystokv(phys_addr);
14240 pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
14241 }
14242 acceleration_buf->allocated = true;
14243 }
14244 acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
14245 acceleration_buf->length = acceleration_length;
14246
14247 /* Take the acceleration buffer lock */
14248 pmap_simple_lock(&pmap_cs_acceleration_buf_lock);
14249
14250 /* Setup the global acceleration buffer state */
14251 pmap_cs_acceleration_buf = acceleration_buf;
14252
14253 /* Accelerate the entitlements */
14254 ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
14255 if (ce_err != CoreEntitlements->kNoError) {
14256 panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
14257 cd_entry, CoreEntitlements->GetErrorString(ce_err));
14258 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
14259 panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
14260 }
14261
14262 /*
14263 * The global acceleration buffer lock is unlocked by the allocation function itself
14264 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
14265 * an assert that the lock is unlocked here since another thread could have acquired
14266 * it by now.
14267 */
14268 ret = KERN_SUCCESS;
14269
14270 out:
14271 lck_rw_unlock_exclusive(&cd_entry->rwlock);
14272 return ret;
14273 }
14274
14275 kern_return_t
14276 pmap_accelerate_entitlements(
14277 pmap_cs_code_directory_t *cd_entry)
14278 {
14279 kern_return_t ret = KERN_DENIED;
14280
14281 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14282 while (ret == KERN_RESOURCE_SHORTAGE) {
14283 /* Allocate a page for the PPL */
14284 pmap_alloc_page_for_ppl(0);
14285
14286 /* Try again */
14287 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14288 }
14289
14290 return ret;
14291 }
14292
14293 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14294
14295 MARK_AS_PMAP_TEXT bool
14296 pmap_lookup_in_loaded_trust_caches_internal(
14297 const uint8_t cdhash[CS_CDHASH_LEN])
14298 {
14299 kern_return_t kr = KERN_NOT_FOUND;
14300
14301 #if PMAP_CS_PPL_MONITOR
14302 /*
14303 * If we have the PPL monitor, then this function can only be called from
14304 * within the PPL. Calling it directly would've caused a panic, so we can
14305 * assume that we're in the PPL here.
14306 */
14307 uint8_t cdhash_safe[CS_CDHASH_LEN];
14308 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14309
14310 kr = pmap_query_trust_cache_safe(
14311 kTCQueryTypeLoadable,
14312 cdhash_safe,
14313 NULL);
14314 #else
14315 kr = query_trust_cache(
14316 kTCQueryTypeLoadable,
14317 cdhash,
14318 NULL);
14319 #endif
14320
14321 if (kr == KERN_SUCCESS) {
14322 return true;
14323 }
14324 return false;
14325 }
14326
14327 bool
14328 pmap_lookup_in_loaded_trust_caches(
14329 const uint8_t cdhash[CS_CDHASH_LEN])
14330 {
14331 #if XNU_MONITOR
14332 return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
14333 #else
14334 return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
14335 #endif
14336 }
14337
14338 MARK_AS_PMAP_TEXT uint32_t
14339 pmap_lookup_in_static_trust_cache_internal(
14340 const uint8_t cdhash[CS_CDHASH_LEN])
14341 {
14342 TrustCacheQueryToken_t query_token = {0};
14343 kern_return_t kr = KERN_NOT_FOUND;
14344 uint64_t flags = 0;
14345 uint8_t hash_type = 0;
14346
14347 #if PMAP_CS_PPL_MONITOR
14348 /*
14349 * If we have the PPL monitor, then this function can only be called from
14350 * within the PPL. Calling it directly would've caused a panic, so we can
14351 * assume that we're in the PPL here.
14352 */
14353 uint8_t cdhash_safe[CS_CDHASH_LEN];
14354 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14355
14356 kr = pmap_query_trust_cache_safe(
14357 kTCQueryTypeStatic,
14358 cdhash_safe,
14359 &query_token);
14360 #else
14361 kr = query_trust_cache(
14362 kTCQueryTypeStatic,
14363 cdhash,
14364 &query_token);
14365 #endif
14366
14367 if (kr == KERN_SUCCESS) {
14368 amfi->TrustCache.queryGetFlags(&query_token, &flags);
14369 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
14370
14371 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
14372 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
14373 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
14374 }
14375
14376 return 0;
14377 }
14378
14379 uint32_t
14380 pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
14381 {
14382 #if XNU_MONITOR
14383 return pmap_lookup_in_static_trust_cache_ppl(cdhash);
14384 #else
14385 return pmap_lookup_in_static_trust_cache_internal(cdhash);
14386 #endif
14387 }
14388
14389 #if PMAP_CS_INCLUDE_CODE_SIGNING
14390
/* Protects pmap_compilation_service_cdhash against concurrent set/match. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash of the registered compilation service; all-zero until set. */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14393
14394 MARK_AS_PMAP_TEXT void
14395 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14396 {
14397
14398 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14399 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14400 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14401
14402 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14403 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14404 }
14405
14406 MARK_AS_PMAP_TEXT bool
14407 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14408 {
14409 bool match = false;
14410
14411 /* Lockdown mode disallows compilation service */
14412 if (ppl_lockdown_mode_enabled == true) {
14413 return false;
14414 }
14415
14416 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14417 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14418 match = true;
14419 }
14420 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14421
14422 if (match) {
14423 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14424 }
14425
14426 return match;
14427 }
14428
/* Dispatch the compilation-service CDHash registration into the PPL when built with the monitor. */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14438
14439 bool
14440 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
14441 {
14442 #if XNU_MONITOR
14443 return pmap_match_compilation_service_cdhash_ppl(cdhash);
14444 #else
14445 return pmap_match_compilation_service_cdhash_internal(cdhash);
14446 #endif
14447 }
14448
14449 /*
14450 * As part of supporting local signing on the device, we need the PMAP layer
14451 * to store the local signing key so that PMAP_CS can validate with it. We
14452 * store it at the PMAP layer such that it is accessible to both AMFI and
14453 * PMAP_CS should they need it.
14454 */
/* One-shot latch: true once the local signing public key has been installed. */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* The local signing public key; only meaningful once the latch above is set. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14457
14458 MARK_AS_PMAP_TEXT void
14459 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14460 {
14461 bool key_set = false;
14462
14463 /*
14464 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14465 * a successful exchange means that the local signing public key has _not_ been
14466 * set. In case the key has been set, we panic as we would never expect the
14467 * kernel to attempt to set the key more than once.
14468 */
14469 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14470
14471 if (key_set) {
14472 panic("attempted to set the local signing public key multiple times");
14473 }
14474
14475 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14476 pmap_cs_log_info("set local signing public key");
14477 }
14478
14479 void
14480 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14481 {
14482 #if XNU_MONITOR
14483 return pmap_set_local_signing_public_key_ppl(public_key);
14484 #else
14485 return pmap_set_local_signing_public_key_internal(public_key);
14486 #endif
14487 }
14488
14489 uint8_t*
14490 pmap_get_local_signing_public_key(void)
14491 {
14492 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14493
14494 if (key_set) {
14495 return pmap_local_signing_public_key;
14496 }
14497
14498 return NULL;
14499 }
14500
14501 /*
14502 * Locally signed applications need to be explicitly authorized by an entitled application
14503 * before we allow them to run.
14504 */
/* CDHash of the single locally signed app currently authorized to run (all-zero when none). */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash against concurrent update/compare. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14507
14508 MARK_AS_PMAP_TEXT void
14509 pmap_unrestrict_local_signing_internal(
14510 const uint8_t cdhash[CS_CDHASH_LEN])
14511 {
14512
14513 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14514 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14515 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14516
14517 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
14518 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
14519 }
14520
14521 void
14522 pmap_unrestrict_local_signing(
14523 const uint8_t cdhash[CS_CDHASH_LEN])
14524 {
14525 #if XNU_MONITOR
14526 return pmap_unrestrict_local_signing_ppl(cdhash);
14527 #else
14528 return pmap_unrestrict_local_signing_internal(cdhash);
14529 #endif
14530 }
14531
14532 #if PMAP_CS
14533 MARK_AS_PMAP_TEXT static void
14534 pmap_restrict_local_signing(void)
14535 {
14536 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14537 memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
14538 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14539 }
14540
14541 MARK_AS_PMAP_TEXT static bool
14542 pmap_local_signing_restricted(
14543 const uint8_t cdhash[CS_CDHASH_LEN])
14544 {
14545 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14546 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14547 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14548
14549 return ret != 0;
14550 }
14551
14552 #endif
14553 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14554
14555 MARK_AS_PMAP_TEXT void
14556 pmap_footprint_suspend_internal(
14557 vm_map_t map,
14558 boolean_t suspend)
14559 {
14560 #if DEVELOPMENT || DEBUG
14561 if (suspend) {
14562 current_thread()->pmap_footprint_suspended = TRUE;
14563 map->pmap->footprint_was_suspended = TRUE;
14564 } else {
14565 current_thread()->pmap_footprint_suspended = FALSE;
14566 }
14567 #else /* DEVELOPMENT || DEBUG */
14568 (void) map;
14569 (void) suspend;
14570 #endif /* DEVELOPMENT || DEBUG */
14571 }
14572
/* Dispatch footprint suspension into the PPL when the monitor is built in. */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14584
/*
 * PPL no-op entry point. Its only observable effect is validation: it panics
 * if the supplied pointer is not a valid, mutable pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14590
/* No-op PPL round-trip; useful for exercising the PPL entry/exit path. */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14600
14601 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14602
/*
 * Header written before each translation table copied into the dump buffer
 * by pmap_dump_page_tables_recurse(); the table's entries follow immediately.
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the copied table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by this table */
	uint64_t end_va;      /* VA one past the range covered by this table */
};
14609
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer.
 *
 * For each table whose level is selected in level_mask, a
 * struct page_table_dump_header plus the raw table contents are appended at
 * *bytes_copied into buf_start. Table entries are then walked; valid
 * table-type entries are recursed into at the next level.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE if the buffer cannot hold the next
 * table, KERN_SUCCESS otherwise. Panics on a non-block, non-table entry at
 * the leaf level (corrupt page table).
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and entry-decoding masks. */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table may be smaller than a full page; use its real size. */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Ensure room for this table's header + contents before writing anything. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* This level was requested: emit header followed by the raw table. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no lower-level table to descend into. */
			continue;
		} else {
			/* Table-type entry: must not appear at (or below) the leaf level. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14684
14685 kern_return_t
14686 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
14687 {
14688 if (not_in_kdp) {
14689 panic("pmap_dump_page_tables must only be called from kernel debugger context");
14690 }
14691 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
14692 level_mask, pmap->min, bufp, buf_end, bytes_copied);
14693 }
14694
14695 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14696
/* Page-table dumping is only supported on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14703 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14704
14705
14706 #ifdef CONFIG_XNUPOST
14707 #ifdef __arm64__
/* Latched by pmap_test_fault_handler when an expected test fault is taken. */
static volatile bool pmap_test_took_fault = false;
14709
14710 static bool
14711 pmap_test_fault_handler(arm_saved_state_t * state)
14712 {
14713 bool retval = false;
14714 uint64_t esr = get_saved_state_esr(state);
14715 esr_exception_class_t class = ESR_EC(esr);
14716 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14717
14718 if ((class == ESR_EC_DABORT_EL1) &&
14719 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14720 pmap_test_took_fault = true;
14721 /* return to the instruction immediately after the call to NX page */
14722 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14723 retval = true;
14724 }
14725
14726 return retval;
14727 }
14728
14729 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14730 static NOKASAN bool
14731 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14732 {
14733 pmap_t old_pmap = NULL;
14734 thread_t thread = current_thread();
14735
14736 pmap_test_took_fault = false;
14737
14738 /*
14739 * We're potentially switching pmaps without using the normal thread
14740 * mechanism; disable interrupts and preemption to avoid any unexpected
14741 * memory accesses.
14742 */
14743 uint64_t old_int_state = pmap_interrupts_disable();
14744 mp_disable_preemption();
14745
14746 if (pmap != NULL) {
14747 old_pmap = current_pmap();
14748 pmap_switch(pmap, thread);
14749
14750 /* Disable PAN; pmap shouldn't be the kernel pmap. */
14751 #if __ARM_PAN_AVAILABLE__
14752 __builtin_arm_wsr("pan", 0);
14753 #endif /* __ARM_PAN_AVAILABLE__ */
14754 }
14755
14756 ml_expect_fault_begin(pmap_test_fault_handler, va);
14757
14758 if (is_write) {
14759 *((volatile uint64_t*)(va)) = 0xdec0de;
14760 } else {
14761 volatile uint64_t tmp = *((volatile uint64_t*)(va));
14762 (void)tmp;
14763 }
14764
14765 /* Save the fault bool, and undo the gross stuff we did. */
14766 bool took_fault = pmap_test_took_fault;
14767 ml_expect_fault_end();
14768
14769 if (pmap != NULL) {
14770 #if __ARM_PAN_AVAILABLE__
14771 __builtin_arm_wsr("pan", 1);
14772 #endif /* __ARM_PAN_AVAILABLE__ */
14773
14774 pmap_switch(old_pmap, thread);
14775 }
14776
14777 mp_enable_preemption();
14778 pmap_interrupts_restore(old_int_state);
14779 bool retval = (took_fault == should_fault);
14780 return retval;
14781 }
14782
14783 static bool
14784 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14785 {
14786 bool retval = pmap_test_access(pmap, va, should_fault, false);
14787
14788 if (!retval) {
14789 T_FAIL("%s: %s, "
14790 "pmap=%p, va=%p, should_fault=%u",
14791 __func__, should_fault ? "did not fault" : "faulted",
14792 pmap, (void*)va, (unsigned)should_fault);
14793 }
14794
14795 return retval;
14796 }
14797
14798 static bool
14799 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14800 {
14801 bool retval = pmap_test_access(pmap, va, should_fault, true);
14802
14803 if (!retval) {
14804 T_FAIL("%s: %s, "
14805 "pmap=%p, va=%p, should_fault=%u",
14806 __func__, should_fault ? "did not fault" : "faulted",
14807 pmap, (void*)va, (unsigned)should_fault);
14808 }
14809
14810 return retval;
14811 }
14812
14813 static bool
14814 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14815 {
14816 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14817 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14818
14819 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14820
14821 if (!retval) {
14822 T_FAIL("%s: bits=%u, "
14823 "pa=%p, should_be_set=%u",
14824 __func__, bits,
14825 (void*)pa, should_be_set);
14826 }
14827
14828 return retval;
14829 }
14830
14831 static __attribute__((noinline)) bool
14832 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14833 {
14834 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14835 return retval;
14836 }
14837
/*
 * End-to-end pmap functional test for one pmap configuration (flags).
 *
 * Creates a throwaway pmap, grabs two VM pages, and exercises: PTE VA/PA
 * lookups, read/write/execute permission enforcement, the ref/mod state
 * machine (via pmap_enter_addr and arm_fast_fault), pmap_protect /
 * pmap_page_protect semantics, and pmap_disconnect. Failures are reported
 * via T_FAIL; gross setup problems panic. Returns 0 on completion.
 *
 * The ordering of steps is significant: each stage depends on the mapping
 * and ref/mod state left behind by the previous one.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* page_ratio = how many pmap pages fit inside one native kernel page. */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Test XO mapping");
	kern_return_t kr = pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
	if (pmap_allows_xo(pmap)) {
		if (kr != KERN_SUCCESS) {
			T_FAIL("XO mapping returned 0x%x instead of KERN_SUCCESS", (unsigned int)kr);
		}
	} else if (kr != KERN_PROTECTION_FAILURE) {
		T_FAIL("XO mapping returned 0x%x instead of KERN_PROTECTION_FAILURE", (unsigned int)kr);
	}

	T_LOG("Make the first mapping RX");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE | VM_PROT_READ, VM_PROT_EXECUTE, 0, false);

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15111 #endif /* __arm64__ */
15112
/*
 * XNUPOST entry point for the pmap tests. On arm64, runs
 * pmap_test_test_config() for each supported page-size configuration
 * (forced-4K plus native when mixed page sizes are available); a no-op on
 * other architectures. Always returns KERN_SUCCESS; individual failures are
 * reported via T_FAIL inside the configs.
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
15134 #endif /* CONFIG_XNUPOST */
15135
15136 /*
15137 * The following function should never make it to RELEASE code, since
15138 * it provides a way to get the PPL to modify text pages.
15139 */
15140 #if DEVELOPMENT || DEBUG
15141
15142 #define ARM_UNDEFINED_INSN 0xe7f000f0
15143 #define ARM_UNDEFINED_INSN_THUMB 0xde00
15144
15145 /**
15146 * Forcibly overwrite executable text with an illegal instruction.
15147 *
15148 * @note Only used for xnu unit testing.
15149 *
15150 * @param pa The physical address to corrupt.
15151 *
15152 * @return KERN_SUCCESS on success.
15153 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/*
	 * PPL-enabled build: dispatch through the PPL entry point, since only
	 * the PPL may write to executable text pages.
	 */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	/* No PPL: the kernel can perform the write directly. */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
15163
15164 MARK_AS_PMAP_TEXT kern_return_t
15165 pmap_test_text_corruption_internal(pmap_paddr_t pa)
15166 {
15167 vm_offset_t va = phystokv(pa);
15168 unsigned int pai = pa_index(pa);
15169
15170 assert(pa_valid(pa));
15171
15172 pvh_lock(pai);
15173
15174 pv_entry_t **pv_h = pai_to_pvh(pai);
15175 assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
15176 #if defined(PVH_FLAG_EXEC)
15177 const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
15178
15179 if (need_ap_twiddle) {
15180 pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
15181 }
15182 #endif /* defined(PVH_FLAG_EXEC) */
15183
15184 /*
15185 * The low bit in an instruction address indicates a THUMB instruction
15186 */
15187 if (va & 1) {
15188 va &= ~(vm_offset_t)1;
15189 *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
15190 } else {
15191 *(uint32_t *)va = ARM_UNDEFINED_INSN;
15192 }
15193
15194 #if defined(PVH_FLAG_EXEC)
15195 if (need_ap_twiddle) {
15196 pmap_set_ptov_ap(pai, AP_RONA, FALSE);
15197 }
15198 #endif /* defined(PVH_FLAG_EXEC) */
15199
15200 InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
15201
15202 pvh_unlock(pai);
15203
15204 return KERN_SUCCESS;
15205 }
15206
15207 #endif /* DEVELOPMENT || DEBUG */
15208