1 /*
2 * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
/*
 * Native (stage-1) page table operations for pmaps using the regular VMSA
 * format: ASID allocation/free and ASID-tagged TLB maintenance.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140
/*
 * Per-level translation-table geometry and descriptor encodings for the
 * 16KB translation granule. Levels 0-2 use table/block TTE encodings;
 * level 3 uses the page (L3 "block") PTE encoding, which has a different
 * valid/type bit layout.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size = ARM_16K_TT_L0_SIZE,
		.offmask = ARM_16K_TT_L0_OFFMASK,
		.shift = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_16K_TT_L1_SIZE,
		.offmask = ARM_16K_TT_L1_OFFMASK,
		.shift = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_16K_TT_L2_SIZE,
		.offmask = ARM_16K_TT_L2_OFFMASK,
		.shift = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size = ARM_16K_TT_L3_SIZE,
		.offmask = ARM_16K_TT_L3_OFFMASK,
		.shift = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180
/*
 * Per-level translation-table geometry and descriptor encodings for the
 * 4KB translation granule. Same layout rules as the 16KB table: levels
 * 0-2 use table/block TTE encodings, level 3 uses the page PTE encoding.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220
/*
 * Per-level geometry for 4KB-granule stage-2 translation. Level 0 is not
 * used; the level-1 table may be concatenated, which widens its index
 * mask when the 40-bit concatenated layout is configured.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
264
/*
 * Page-table attribute bundle for 4KB-granule pmaps: per-level info,
 * root/commpage/max translation levels, the table ops vector, the AP/XN
 * PTE permission encodings, and basic page geometry.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root level derived from T0SZ: each 4K level resolves 9 VA bits. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
292
/*
 * Page-table attribute bundle for 16KB-granule pmaps; the 16K layout
 * always roots at L1 and places the commpage mapping at L2.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
312
313 #if __ARM_16K_PG__
314 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
315 #else /* !__ARM_16K_PG__ */
316 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
317 #endif /* !__ARM_16K_PG__ */
318
319
320 #if DEVELOPMENT || DEBUG
321 int vm_footprint_suspend_allowed = 1;
322
323 extern int pmap_ledgers_panic;
324 extern int pmap_ledgers_panic_leeway;
325
326 #endif /* DEVELOPMENT || DEBUG */
327
328 #if DEVELOPMENT || DEBUG
329 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
330 (current_thread()->pmap_footprint_suspended)
331 #else /* DEVELOPMENT || DEBUG */
332 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
333 #endif /* DEVELOPMENT || DEBUG */
334
335
/*
 * Represents a tlb range that will be flushed before exiting
 * the ppl.
 * Used by phys_attribute_clear_range to defer flushing pages in
 * this range until the end of the operation.
 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;             /* pmap whose mappings fall within the range */
	vm_map_address_t ptfr_start;  /* start VA of the deferred-flush range */
	vm_map_address_t ptfr_end;    /* end VA of the deferred-flush range */
	bool ptfr_flush_needed;       /* set when a deferred TLB flush is required */
} pmap_tlb_flush_range_t;
348
349 #if XNU_MONITOR
350 /*
351 * PPL External References.
352 */
353 extern vm_offset_t segPPLDATAB;
354 extern unsigned long segSizePPLDATA;
355 extern vm_offset_t segPPLTEXTB;
356 extern unsigned long segSizePPLTEXT;
357 extern vm_offset_t segPPLDATACONSTB;
358 extern unsigned long segSizePPLDATACONST;
359
360
361 /*
362 * PPL Global Variables
363 */
364
365 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
366 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
367 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
368 #else
369 const boolean_t pmap_ppl_disable = FALSE;
370 #endif
371
372 /*
373 * Indicates if the PPL has started applying APRR.
374 * This variable is accessed from various assembly trampolines, so be sure to change
375 * those if you change the size or layout of this variable.
376 */
377 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
378
379 extern void *pmap_stacks_start;
380 extern void *pmap_stacks_end;
381
382 #endif /* !XNU_MONITOR */
383
384
385
386 /* Virtual memory region for early allocation */
387 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
388 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
389 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
390
391 extern uint8_t bootstrap_pagetables[];
392
393 extern unsigned int not_in_kdp;
394
395 extern vm_offset_t first_avail;
396
397 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
398 extern vm_offset_t virtual_space_end; /* End of kernel address space */
399 extern vm_offset_t static_memory_end;
400
401 extern const vm_map_address_t physmap_base;
402 extern const vm_map_address_t physmap_end;
403
404 extern int maxproc, hard_maxproc;
405
406 /* The number of address bits one TTBR can cover. */
407 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
408
409 /*
410 * The bounds on our TTBRs. These are for sanity checking that
411 * an address is accessible by a TTBR before we attempt to map it.
412 */
413
414 /* The level of the root of a page table. */
415 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
416
417 /* The number of entries in the root TT of a page table. */
418 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
419
420 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
421 const pmap_t kernel_pmap = &kernel_pmap_store;
422
423 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
424
425 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
426 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
427 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
428
429 typedef struct tt_free_entry {
430 struct tt_free_entry *next;
431 } tt_free_entry_t;
432
433 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
434
435 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
436 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
437 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
438 #define FREE_PAGE_SIZE_TT_MAX 4
439 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
440 unsigned int free_tt_count MARK_AS_PMAP_DATA;
441 unsigned int free_tt_max MARK_AS_PMAP_DATA;
442
443 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
444
445 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
446 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
447 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
448 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
449 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
450 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
451
452 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
453 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
454
455 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
456 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
457
458 /* Lock group used for all pmap object locks. */
459 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
460
461 #if DEVELOPMENT || DEBUG
462 int nx_enabled = 1; /* enable no-execute protection */
463 int allow_data_exec = 0; /* No apps may execute data */
464 int allow_stack_exec = 0; /* No apps may execute from the stack */
465 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
466 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
467 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
468 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
469 #else /* DEVELOPMENT || DEBUG */
470 const int nx_enabled = 1; /* enable no-execute protection */
471 const int allow_data_exec = 0; /* No apps may execute data */
472 const int allow_stack_exec = 0; /* No apps may execute from the stack */
473 #endif /* DEVELOPMENT || DEBUG */
474
475 /**
476 * This variable is set true during hibernation entry to protect pmap data structures
477 * during image copying, and reset false on hibernation exit.
478 */
479 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
480
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger checking is compiled out when MACH_ASSERT is disabled: no-op stub. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
489
490 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
491
492 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
493 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
494
495 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
496
497 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
498 #if defined(__arm64__)
499 /* end of shared region + 512MB for various purposes */
500 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
501 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
502 "Minimum address space size outside allowable range");
503
504 // Max offset is 15.375GB for devices with "large" memory config
505 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
506 // Max offset is 11.375GB for devices with "small" memory config
507 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
508
509
510 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
511 "Large device address space size outside allowable range");
512 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
513 "Small device address space size outside allowable range");
514
515 # ifdef XNU_TARGET_OS_OSX
516 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
517 # else
518 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
519 # endif
520 #endif /* __arm64__ */
521
522 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
523 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
524 #else
525 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
526 #endif
527
528 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
529 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
530 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
531 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
532 #if !HAS_16BIT_ASID
533 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
534 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
535 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
536 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
537 #else
538 static uint16_t last_allocated_asid = 0;
539 #endif /* !HAS_16BIT_ASID */
540
541 #if HAS_SPECRES_DEBUGGING
542 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
543 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
544 #endif /* HAS_SPECRES_DEBUGGING */
545
546
547 #if __ARM_MIXED_PAGE_SIZE__
548 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
549 #endif
550 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
551 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
552 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
553
/* PTE Define Macros */

/*
 * True iff the (invalid) PTE (x) carries the VM "compressed page" marker and
 * nothing else; panics if extra bits are set alongside the marker, since that
 * indicates a corrupted entry. (p) is the PTE's address, used in the panic.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) &&          /* PTE is not valid... */ \
	((x) & ARM_PTE_COMPRESSED) &&   /* ...has "compressed" marker" */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True iff the PTE's software "wired" bit is set. */
#define pte_is_wired(pte) \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True iff the PTE's software "was writeable" bit is set. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit on a PTE lvalue. */
#define pte_set_was_writeable(pte, was_writeable) \
	do { \
	        if ((was_writeable)) { \
	                (pte) |= ARM_PTE_WRITEABLE; \
	        } else { \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        } \
	} while(0)
577
578 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)579 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
580 {
581 if (wired) {
582 *ptep |= ARM_PTE_WIRED;
583 } else {
584 *ptep &= ~ARM_PTE_WIRED;
585 }
586 /*
587 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
588 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
589 * never reclaimed.
590 */
591 if (pmap == kernel_pmap) {
592 return;
593 }
594 unsigned short *ptd_wiredcnt_ptr;
595 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
596 if (wired) {
597 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
598 } else {
599 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
600 if (__improbable(prev_wired == 0)) {
601 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
602 }
603 }
604 }
605
#if HAS_FEAT_XS

/**
 * Determine whether a stage-1 PTE uses one of the XS memory-attribute
 * index encodings. Stage-2 pmaps never do.
 */
static inline bool
pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
{
	if (__improbable(pt_attr->stage2)) {
		return false;
	}

	const uint64_t attridx = ARM_PTE_EXTRACT_ATTRINDX(pte);
	return (attridx == CACHE_ATTRINDX_DISABLE_XS) ||
	    (attridx == CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
}

#endif /* HAS_FEAT_XS */
624
625 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
626 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
627 arm64_sync_tlb(strong); \
628 }
629
630 /*
631 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
632 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
633 * will observe the updated PTE.
634 */
635 #define FLUSH_PTE() \
636 __builtin_arm_dmb(DMB_ISH);
637
638 /*
639 * Synchronize updates to PTEs that were previously valid and thus may be cached in
640 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
641 * TLBI. This should only require a store-store barrier, as subsequent accesses in
642 * program order will not issue until the DSB completes. Prior loads may be reordered
643 * after the barrier, but their behavior should not be materially affected by the
644 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
645 * matter for loads until the access is re-driven well after the TLB update is
646 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
647 * we should be in a position to handle access faults. For "voluntary" PTE access
648 * restriction due to unmapping or protection, the decision to restrict access should
649 * have a data dependency on prior loads in order to avoid a data race.
650 */
651 #define FLUSH_PTE_STRONG() \
652 __builtin_arm_dsb(DSB_ISHST);
653
654 /**
655 * Write enough page table entries to map a single VM page. On systems where the
656 * VM page size does not match the hardware page size, multiple page table
657 * entries will need to be written.
658 *
659 * @note This function does not emit a barrier to ensure these page table writes
660 * have completed before continuing. This is commonly needed. In the case
661 * where a DMB or DSB barrier is needed, then use the write_pte() and
662 * write_pte_strong() functions respectively instead of this one.
663 *
664 * @param ptep Pointer to the first page table entry to update.
665 * @param pte The value to write into each page table entry. In the case that
666 * multiple PTEs are updated to a non-empty value, then the address
667 * in this value will automatically be incremented for each PTE
668 * write.
669 */
670 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)671 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
672 {
673 /**
674 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
675 * systems, which is why it's checked at runtime instead of compile time.
676 * The "unreachable" warning needs to be suppressed because it still is a
677 * compile time constant on some systems.
678 */
679 __unreachable_ok_push
680 if (TEST_PAGE_RATIO_4) {
681 if (((uintptr_t)ptep) & 0x1f) {
682 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
683 __func__, ptep, (void*)pte);
684 }
685
686 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
687 /**
688 * If we're writing an empty/compressed PTE value, then don't
689 * auto-increment the address for each PTE write.
690 */
691 *ptep = pte;
692 *(ptep + 1) = pte;
693 *(ptep + 2) = pte;
694 *(ptep + 3) = pte;
695 } else {
696 *ptep = pte;
697 *(ptep + 1) = pte | 0x1000;
698 *(ptep + 2) = pte | 0x2000;
699 *(ptep + 3) = pte | 0x3000;
700 }
701 } else {
702 *ptep = pte;
703 }
704 __unreachable_ok_pop
705 }
706
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Memory Barrier.
 *
 * @note The DMB issued by this function is not strong enough to protect against
 *       TLB invalidates from being reordered above the PTE writes. If a TLBI
 *       instruction is going to immediately be called after this write, it's
 *       recommended to call write_pte_strong() instead of this function.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: make the PTE write(s) visible to subsequent loads. */
}
725
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Synchronization Barrier. This
 * barrier provides stronger guarantees than the DMB executed by write_pte().
 *
 * @note This function is useful if you're going to immediately flush the TLB
 *       after making the PTE write. A DSB is required to protect against the
 *       TLB invalidate being reordered before the PTE write.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: order the PTE write(s) ahead of a following TLBI. */
}
744
745 /**
746 * Retrieve the pmap structure for the thread running on the current CPU.
747 */
748 pmap_t
current_pmap()749 current_pmap()
750 {
751 const pmap_t current = vm_map_pmap(current_thread()->map);
752
753 assert(current != NULL);
754
755 #if XNU_MONITOR
756 /**
757 * On PPL-enabled systems, it's important that PPL policy decisions aren't
758 * decided by kernel-writable memory. This function is used in various parts
759 * of the PPL, and besides validating that the pointer returned by this
760 * function is indeed a pmap structure, it's also important to ensure that
761 * it's actually the current thread's pmap. This is because different pmaps
762 * will have access to different entitlements based on the code signature of
763 * their loaded process. So if a different user pmap is set in the current
764 * thread structure (in an effort to bypass code signing restrictions), even
765 * though the structure would validate correctly as it is a real pmap
766 * structure, it should fail here.
767 *
768 * This only needs to occur for user pmaps because the kernel pmap's root
769 * page table is always the same as TTBR1 (it's set during bootstrap and not
770 * changed so it'd be redundant to check), and its code signing fields are
771 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
772 * it shouldn't be possible to set those fields. Due to that, an attacker
773 * setting the current thread's pmap to the kernel pmap as a way to bypass
774 * this check won't accomplish anything as it doesn't provide any extra code
775 * signing entitlements.
776 */
777 if ((current != kernel_pmap) &&
778 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
779 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
780 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
781 }
782 #endif /* XNU_MONITOR */
783
784 return current;
785 }
786
787 #if DEVELOPMENT || DEBUG
788
789 /*
790 * Trace levels are controlled by a bitmask in which each
791 * level can be enabled/disabled by the (1<<level) position
792 * in the boot arg
793 * Level 0: PPL extension functionality
794 * Level 1: pmap lifecycle (create/destroy/switch)
795 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
796 * Level 3: internal state management (attributes/fast-fault)
797 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
798 */
799
800 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
801
802 #define PMAP_TRACE(level, ...) \
803 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
804 KDBG_RELEASE(__VA_ARGS__); \
805 }
806 #else /* DEVELOPMENT || DEBUG */
807
808 #define PMAP_TRACE(level, ...)
809
810 #endif /* DEVELOPMENT || DEBUG */
811
812
813 /*
814 * Internal function prototypes (forward declarations).
815 */
816
817 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
818
819 static void pmap_set_reference(ppnum_t pn);
820
821 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
822
823 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
824
825 static kern_return_t pmap_expand(
826 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
827
828 static int pmap_remove_range(
829 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
830
831 static tt_entry_t *pmap_tt1_allocate(
832 pmap_t, vm_size_t, unsigned int);
833
834 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
835
836 static void pmap_tt1_deallocate(
837 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
838
839 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
840
841 static kern_return_t pmap_tt_allocate(
842 pmap_t, tt_entry_t **, unsigned int, unsigned int);
843
844 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
845
846 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
847 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
848 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
849
850 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
851
852
853 static void pmap_unmap_commpage(
854 pmap_t pmap);
855
856 static boolean_t
857 pmap_is_64bit(pmap_t);
858
859
860 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
861
862 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
863
864 static bool pmap_update_cache_attributes_locked(
865 ppnum_t, unsigned, bool);
866
867 static boolean_t arm_clear_fast_fault(
868 ppnum_t ppnum,
869 vm_prot_t fault_type,
870 pt_entry_t *pte_p);
871
872 static void pmap_trim_self(pmap_t pmap);
873 static void pmap_trim_subord(pmap_t subord);
874
875
876 /*
877 * Temporary prototypes, while we wait for pmap_enter to move to taking an
878 * address instead of a page number.
879 */
880 static kern_return_t
881 pmap_enter_addr(
882 pmap_t pmap,
883 vm_map_address_t v,
884 pmap_paddr_t pa,
885 vm_prot_t prot,
886 vm_prot_t fault_type,
887 unsigned int flags,
888 boolean_t wired);
889
890 kern_return_t
891 pmap_enter_options_addr(
892 pmap_t pmap,
893 vm_map_address_t v,
894 pmap_paddr_t pa,
895 vm_prot_t prot,
896 vm_prot_t fault_type,
897 unsigned int flags,
898 boolean_t wired,
899 unsigned int options,
900 __unused void *arg,
901 __unused pmap_mapping_type_t mapping_type);
902
903 #ifdef CONFIG_XNUPOST
904 kern_return_t pmap_test(void);
905 #endif /* CONFIG_XNUPOST */
906
907 PMAP_SUPPORT_PROTOTYPES(
908 kern_return_t,
909 arm_fast_fault, (pmap_t pmap,
910 vm_map_address_t va,
911 vm_prot_t fault_type,
912 bool was_af_fault,
913 bool from_user), ARM_FAST_FAULT_INDEX);
914
915 PMAP_SUPPORT_PROTOTYPES(
916 boolean_t,
917 arm_force_fast_fault, (ppnum_t ppnum,
918 vm_prot_t allow_mode,
919 int options), ARM_FORCE_FAST_FAULT_INDEX);
920
921 MARK_AS_PMAP_TEXT static boolean_t
922 arm_force_fast_fault_with_flush_range(
923 ppnum_t ppnum,
924 vm_prot_t allow_mode,
925 int options,
926 pmap_tlb_flush_range_t *flush_range);
927
928 /**
929 * Definition of the states driving the batch cache attributes update
930 * state machine.
931 */
932 typedef struct {
933 uint64_t page_index : 32, /* The page index to be operated on */
934 state : 8, /* The current state of the update machine */
935 tlb_flush_pass_needed : 1, /* Tracking whether the tlb flush pass is necessary */
936 rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
937 :0;
938 } batch_set_cache_attr_state_t;
939
940 /* Possible values of the "state" field. */
941 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS 1
942 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS 2
943 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
944 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE 4
945
946 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
947
948 PMAP_SUPPORT_PROTOTYPES(
949 batch_set_cache_attr_state_t,
950 pmap_batch_set_cache_attributes, (
951 #if XNU_MONITOR
952 volatile upl_page_info_t *user_page_list,
953 #else /* !XNU_MONITOR */
954 upl_page_info_array_t user_page_list,
955 #endif /* XNU_MONITOR */
956 batch_set_cache_attr_state_t state,
957 unsigned int page_cnt,
958 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
959
960 PMAP_SUPPORT_PROTOTYPES(
961 kern_return_t,
962 pmap_change_wiring, (pmap_t pmap,
963 vm_map_address_t v,
964 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
965
966 PMAP_SUPPORT_PROTOTYPES(
967 pmap_t,
968 pmap_create_options, (ledger_t ledger,
969 vm_map_size_t size,
970 unsigned int flags,
971 kern_return_t * kr), PMAP_CREATE_INDEX);
972
973 PMAP_SUPPORT_PROTOTYPES(
974 void,
975 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
976
977 PMAP_SUPPORT_PROTOTYPES(
978 kern_return_t,
979 pmap_enter_options, (pmap_t pmap,
980 vm_map_address_t v,
981 pmap_paddr_t pa,
982 vm_prot_t prot,
983 vm_prot_t fault_type,
984 unsigned int flags,
985 boolean_t wired,
986 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
987
988 PMAP_SUPPORT_PROTOTYPES(
989 pmap_paddr_t,
990 pmap_find_pa, (pmap_t pmap,
991 addr64_t va), PMAP_FIND_PA_INDEX);
992
993 PMAP_SUPPORT_PROTOTYPES(
994 kern_return_t,
995 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
996
997
998 PMAP_SUPPORT_PROTOTYPES(
999 boolean_t,
1000 pmap_is_empty, (pmap_t pmap,
1001 vm_map_offset_t va_start,
1002 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1003
1004
1005 PMAP_SUPPORT_PROTOTYPES(
1006 unsigned int,
1007 pmap_map_cpu_windows_copy, (ppnum_t pn,
1008 vm_prot_t prot,
1009 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1010
1011 PMAP_SUPPORT_PROTOTYPES(
1012 void,
1013 pmap_ro_zone_memcpy, (zone_id_t zid,
1014 vm_offset_t va,
1015 vm_offset_t offset,
1016 const vm_offset_t new_data,
1017 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1018
1019 PMAP_SUPPORT_PROTOTYPES(
1020 uint64_t,
1021 pmap_ro_zone_atomic_op, (zone_id_t zid,
1022 vm_offset_t va,
1023 vm_offset_t offset,
1024 zro_atomic_op_t op,
1025 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1026
1027 PMAP_SUPPORT_PROTOTYPES(
1028 void,
1029 pmap_ro_zone_bzero, (zone_id_t zid,
1030 vm_offset_t va,
1031 vm_offset_t offset,
1032 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1033
1034 PMAP_SUPPORT_PROTOTYPES(
1035 vm_map_offset_t,
1036 pmap_nest, (pmap_t grand,
1037 pmap_t subord,
1038 addr64_t vstart,
1039 uint64_t size,
1040 vm_map_offset_t vrestart,
1041 kern_return_t * krp), PMAP_NEST_INDEX);
1042
1043 PMAP_SUPPORT_PROTOTYPES(
1044 void,
1045 pmap_page_protect_options, (ppnum_t ppnum,
1046 vm_prot_t prot,
1047 unsigned int options,
1048 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1049
1050 PMAP_SUPPORT_PROTOTYPES(
1051 vm_map_address_t,
1052 pmap_protect_options, (pmap_t pmap,
1053 vm_map_address_t start,
1054 vm_map_address_t end,
1055 vm_prot_t prot,
1056 unsigned int options,
1057 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1058
1059 PMAP_SUPPORT_PROTOTYPES(
1060 kern_return_t,
1061 pmap_query_page_info, (pmap_t pmap,
1062 vm_map_offset_t va,
1063 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1064
1065 PMAP_SUPPORT_PROTOTYPES(
1066 mach_vm_size_t,
1067 pmap_query_resident, (pmap_t pmap,
1068 vm_map_address_t start,
1069 vm_map_address_t end,
1070 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1071
1072 PMAP_SUPPORT_PROTOTYPES(
1073 void,
1074 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1075
1076 PMAP_SUPPORT_PROTOTYPES(
1077 vm_map_address_t,
1078 pmap_remove_options, (pmap_t pmap,
1079 vm_map_address_t start,
1080 vm_map_address_t end,
1081 int options), PMAP_REMOVE_OPTIONS_INDEX);
1082
1083
1084 PMAP_SUPPORT_PROTOTYPES(
1085 void,
1086 pmap_set_cache_attributes, (ppnum_t pn,
1087 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1088
1089 PMAP_SUPPORT_PROTOTYPES(
1090 void,
1091 pmap_update_compressor_page, (ppnum_t pn,
1092 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1093
1094 PMAP_SUPPORT_PROTOTYPES(
1095 void,
1096 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1097
1098 #if MACH_ASSERT || XNU_MONITOR
1099 PMAP_SUPPORT_PROTOTYPES(
1100 void,
1101 pmap_set_process, (pmap_t pmap,
1102 int pid,
1103 char *procname), PMAP_SET_PROCESS_INDEX);
1104 #endif
1105
1106 PMAP_SUPPORT_PROTOTYPES(
1107 void,
1108 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1109
1110 PMAP_SUPPORT_PROTOTYPES(
1111 vm_map_offset_t,
1112 pmap_unnest_options, (pmap_t grand,
1113 addr64_t vaddr,
1114 uint64_t size,
1115 vm_map_offset_t vrestart,
1116 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1117
1118 PMAP_SUPPORT_PROTOTYPES(
1119 void,
1120 phys_attribute_set, (ppnum_t pn,
1121 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1122
1123 PMAP_SUPPORT_PROTOTYPES(
1124 void,
1125 phys_attribute_clear, (ppnum_t pn,
1126 unsigned int bits,
1127 int options,
1128 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1129
1130 #if __ARM_RANGE_TLBI__
1131 PMAP_SUPPORT_PROTOTYPES(
1132 vm_map_address_t,
1133 phys_attribute_clear_range, (pmap_t pmap,
1134 vm_map_address_t start,
1135 vm_map_address_t end,
1136 unsigned int bits,
1137 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1138 #endif /* __ARM_RANGE_TLBI__ */
1139
1140
1141 PMAP_SUPPORT_PROTOTYPES(
1142 void,
1143 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1144
1145 PMAP_SUPPORT_PROTOTYPES(
1146 void,
1147 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1148
1149 PMAP_SUPPORT_PROTOTYPES(
1150 void,
1151 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1152
1153 PMAP_SUPPORT_PROTOTYPES(
1154 void,
1155 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1156
1157 PMAP_SUPPORT_PROTOTYPES(
1158 void,
1159 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1160
1161 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1162 PMAP_SUPPORT_PROTOTYPES(
1163 void,
1164 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1165 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1166
1167 /* Definition of the states used by pmap_trim(). */
1168 typedef enum {
1169 /* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
1170 PMAP_TRIM_STATE_START = 0,
1171
1172 /* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
1173 PMAP_TRIM_STATE_GRAND_BEFORE,
1174
1175 /* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
1176 PMAP_TRIM_STATE_GRAND_AFTER,
1177
1178 /* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
1179 PMAP_TRIM_STATE_SUBORD,
1180
1181 /* Marks that trimming is finished. */
1182 PMAP_TRIM_STATE_DONE,
1183
1184 /* Sentry enum for sanity checks. */
1185 PMAP_TRIM_STATE_COUNT,
1186 } pmap_trim_state_t;
1187
1188 PMAP_SUPPORT_PROTOTYPES(
1189 pmap_trim_state_t,
1190 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1191
1192 #if HAS_APPLE_PAC
1193 PMAP_SUPPORT_PROTOTYPES(
1194 void *,
1195 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1196 PMAP_SUPPORT_PROTOTYPES(
1197 void *,
1198 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1199 #endif /* HAS_APPLE_PAC */
1200
1201
1202
1203
1204 PMAP_SUPPORT_PROTOTYPES(
1205 kern_return_t,
1206 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1207 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1208
1209 PMAP_SUPPORT_PROTOTYPES(
1210 kern_return_t,
1211 pmap_load_trust_cache_with_type, (TCType_t type,
1212 const vm_address_t pmap_img4_payload,
1213 const vm_size_t pmap_img4_payload_len,
1214 const vm_address_t img4_manifest,
1215 const vm_size_t img4_manifest_len,
1216 const vm_address_t img4_aux_manifest,
1217 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1218
1219 PMAP_SUPPORT_PROTOTYPES(
1220 void,
1221 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1222
1223 PMAP_SUPPORT_PROTOTYPES(
1224 kern_return_t,
1225 pmap_query_trust_cache, (TCQueryType_t query_type,
1226 const uint8_t cdhash[kTCEntryHashSize],
1227 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1228
1229 PMAP_SUPPORT_PROTOTYPES(
1230 errno_t,
1231 pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1232 const void *input_data,
1233 size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1234
1235 #if PMAP_CS_INCLUDE_CODE_SIGNING
1236
1237 PMAP_SUPPORT_PROTOTYPES(
1238 kern_return_t,
1239 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1240 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1241
1242 PMAP_SUPPORT_PROTOTYPES(
1243 kern_return_t,
1244 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1245 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1246
1247 PMAP_SUPPORT_PROTOTYPES(
1248 kern_return_t,
1249 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1250 pmap_cs_profile_t * profile_obj),
1251 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1252
1253 PMAP_SUPPORT_PROTOTYPES(
1254 kern_return_t,
1255 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1256 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1257
1258 PMAP_SUPPORT_PROTOTYPES(
1259 kern_return_t,
1260 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1261 const void *kernel_entitlements),
1262 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1263
1264 PMAP_SUPPORT_PROTOTYPES(
1265 kern_return_t,
1266 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1267 const void **kernel_entitlements),
1268 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1269
1270 PMAP_SUPPORT_PROTOTYPES(
1271 kern_return_t,
1272 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1273 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1274
1275 PMAP_SUPPORT_PROTOTYPES(
1276 kern_return_t,
1277 pmap_cs_allow_invalid, (pmap_t pmap),
1278 PMAP_CS_ALLOW_INVALID_INDEX);
1279
1280 PMAP_SUPPORT_PROTOTYPES(
1281 void,
1282 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1283 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1284
1285 PMAP_SUPPORT_PROTOTYPES(
1286 bool,
1287 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1288 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1289
1290 PMAP_SUPPORT_PROTOTYPES(
1291 void,
1292 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1293 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1294
1295 PMAP_SUPPORT_PROTOTYPES(
1296 void,
1297 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1298 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1299
1300 #endif
1301
1302 PMAP_SUPPORT_PROTOTYPES(
1303 uint32_t,
1304 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1305
1306 PMAP_SUPPORT_PROTOTYPES(
1307 bool,
1308 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1309
1310 PMAP_SUPPORT_PROTOTYPES(
1311 void,
1312 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1313
1314 void pmap_footprint_suspend(vm_map_t map,
1315 boolean_t suspend);
1316 PMAP_SUPPORT_PROTOTYPES(
1317 void,
1318 pmap_footprint_suspend, (vm_map_t map,
1319 boolean_t suspend),
1320 PMAP_FOOTPRINT_SUSPEND_INDEX);
1321
1322
1323
1324
1325
1326 #if DEVELOPMENT || DEBUG
1327 PMAP_SUPPORT_PROTOTYPES(
1328 kern_return_t,
1329 pmap_test_text_corruption, (pmap_paddr_t),
1330 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1331 #endif /* DEVELOPMENT || DEBUG */
1332
1333 /*
1334 * The low global vector page is mapped at a fixed alias.
1335 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1336 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1337 * to check both addresses anyway for backward compatibility. So for now
1338 * we leave H6 and H7 where they were.
1339 */
1340 #if (ARM_PGSHIFT == 14)
1341 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1342 #else
1343 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1344 #endif
1345
1346
1347 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1348 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1349 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1350
1351 #if XNU_MONITOR
1352
1353 #if __has_feature(ptrauth_calls)
1354 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1355 #else
1356 #define __ptrauth_ppl_handler
1357 #endif
1358
1359 /*
1360 * Table of function pointers used for PPL dispatch.
1361 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	/* Fault handling and physical-attribute maintenance. */
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	/* Core pmap lifecycle and mapping operations. */
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	/* Trust-cache and code-signing entry points. */
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */


};
1444 #endif
1445
1446 #if XNU_MONITOR
1447 /**
1448 * A convenience function for setting protections on a single physical
1449 * aperture or static region mapping without invalidating the TLB.
1450 *
1451 * @note This function does not perform any TLB invalidations. That must be done
1452 * separately to be able to safely use the updated mapping.
1453 *
1454 * @note This function understands the difference between the VM page size and
1455 * the kernel page size and will update multiple PTEs if the sizes differ.
1456 * In other words, enough PTEs will always get updated to change the
1457 * permissions on a PAGE_SIZE amount of memory.
1458 *
1459 * @note The PVH lock for the physical page represented by this mapping must
1460 * already be locked.
1461 *
1462 * @note This function assumes the caller has already verified that the PTE
1463 * pointer does indeed point to a physical aperture or static region page
1464 * table. Please validate your inputs before passing it along to this
1465 * function.
1466 *
1467 * @param ptep Pointer to the physical aperture or static region page table to
1468 * update with a new XPRR index.
1469 * @param expected_perm The XPRR index that is expected to already exist at the
1470 * current mapping. If the current index doesn't match this
1471 * then the system will panic.
1472 * @param new_perm The new XPRR index to update the mapping with.
1473 */
1474 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1475 pmap_set_pte_xprr_perm(
1476 pt_entry_t * const ptep,
1477 unsigned int expected_perm,
1478 unsigned int new_perm)
1479 {
1480 assert(ptep != NULL);
1481
1482 pt_entry_t spte = *ptep;
1483 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1484
1485 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1486 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1487 __func__, ptep, new_perm, expected_perm);
1488 }
1489
1490 /**
1491 * The PTE involved should be valid, should not have the hint bit set, and
1492 * should have the expected XPRR index.
1493 */
1494 if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1495 panic_plain("%s: physical aperture or static region PTE is invalid, "
1496 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1497 __func__, ptep, spte, new_perm, expected_perm);
1498 }
1499
1500 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1501 panic_plain("%s: physical aperture or static region PTE has hint bit "
1502 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1503 __func__, ptep, spte, new_perm, expected_perm);
1504 }
1505
1506 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1507 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1508 "ptep=%p, new_perm=%u, expected_perm=%u",
1509 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1510 }
1511
1512 pt_entry_t template = spte;
1513 template &= ~ARM_PTE_XPRR_MASK;
1514 template |= xprr_perm_to_pte(new_perm);
1515
1516 write_pte_strong(ptep, template);
1517 }
1518
1519 /**
1520 * Update the protections on a single physical aperture mapping and invalidate
1521 * the TLB so the mapping can be used.
1522 *
1523 * @note The PVH lock for the physical page must already be locked.
1524 *
1525 * @param pai The physical address index of the page whose physical aperture
1526 * mapping will be updated with new permissions.
1527 * @param expected_perm The XPRR index that is expected to already exist at the
1528 * current mapping. If the current index doesn't match this
1529 * then the system will panic.
1530 * @param new_perm The new XPRR index to update the mapping with.
1531 */
1532 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1533 pmap_set_xprr_perm(
1534 unsigned int pai,
1535 unsigned int expected_perm,
1536 unsigned int new_perm)
1537 {
1538 pvh_assert_locked(pai);
1539
1540 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1541 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1542
1543 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1544
1545 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1546 sync_tlb_flush();
1547 }
1548
1549 /**
1550 * Update the protections on a range of physical aperture or static region
1551 * mappings and invalidate the TLB so the mappings can be used.
1552 *
1553 * @note Static region mappings can only be updated before machine_lockdown().
1554 * Physical aperture mappings can be updated at any time.
1555 *
1556 * @param start The starting virtual address of the static region or physical
1557 * aperture range whose permissions will be updated.
 * @param end The ending virtual address (exclusive) of the static region or
 *            physical aperture range whose permissions will be updated.
1560 * @param expected_perm The XPRR index that is expected to already exist at the
1561 * current mappings. If the current indices don't match
1562 * this then the system will panic.
1563 * @param new_perm The new XPRR index to update the mappings with.
1564 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * (Adding one twig size and masking off the twig offset rounds "va"
		 * up to the next twig boundary.)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type twig entries are expected here; anything else panics. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			/* pmap_set_pte_xprr_perm() requires the PVH lock for the mapped page. */
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One batched TLB invalidation for the whole range, after all PTEs are updated. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1660
1661 #endif /* XNU_MONITOR */
1662
1663 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1664 PMAP_ZINFO_PALLOC(
1665 pmap_t pmap, int bytes)
1666 {
1667 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1668 }
1669
1670 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1671 PMAP_ZINFO_PFREE(
1672 pmap_t pmap,
1673 int bytes)
1674 {
1675 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1676 }
1677
1678 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1679 pmap_tt_ledger_credit(
1680 pmap_t pmap,
1681 vm_size_t size)
1682 {
1683 if (pmap != kernel_pmap) {
1684 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1685 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1686 }
1687 }
1688
1689 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1690 pmap_tt_ledger_debit(
1691 pmap_t pmap,
1692 vm_size_t size)
1693 {
1694 if (pmap != kernel_pmap) {
1695 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1696 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1697 }
1698 }
1699
1700 static inline void
pmap_update_plru(uint16_t asid_index __unused)1701 pmap_update_plru(uint16_t asid_index __unused)
1702 {
1703 #if !HAS_16BIT_ASID
1704 if (__probable(pmap_asid_plru)) {
1705 unsigned plru_index = asid_index >> 6;
1706 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1707 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1708 asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1709 }
1710 }
1711 #endif /* !HAS_16BIT_ASID */
1712 }
1713
/*
 * Allocate a virtual ASID (vasid) for `pmap` and derive from it the
 * hardware ASID (pmap->hw_asid) and software epoch (pmap->sw_asid).
 *
 * Returns false if the virtual ASID space is exhausted.  On success,
 * pmap->hw_asid is non-zero (0 is reserved for the kernel).
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Pick the PLRU word with the oldest generation, i.e. the chunk whose
		 * hardware ASIDs were least recently refilled. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/* Stride through the free-vasid bitmap one chunk at a time, looking for
		 * a vasid that is both free and not-recently-used per the chosen word.
		 * A set bit in asid_bitmap means "free". */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No free vasid at all: caller must fail the pmap creation. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Clearing the bit marks the vasid as in-use. */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the vasid into a hardware ASID and a software epoch. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1798
/*
 * Release the pmap's ASID back to the allocator.
 *
 * Atomically clears pmap->hw_asid (0 means "none"; safe to call twice),
 * reconstructs the virtual ASID from hw_asid and the sw_asid epoch by
 * inverting the arithmetic in alloc_asid(), marks the hardware ASID
 * available again in the PLRU state, and sets the vasid free bit.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		/* Nothing allocated, or already freed. */
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1; /* Undo the EL0/EL1 pairing shift applied at allocation. */
#endif
	hw_asid -= 1; /* Undo the +1 that skipped kernel ASID 0. */

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved vasid: invert the high-end remapping done in alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID as available for PLRU selection again. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1831
1832
/*
 * Return TRUE if the physical address lies within memory managed by the
 * pmap layer (i.e. pa_valid() considers it a managed DRAM page).
 */
boolean_t
pmap_valid_address(
	pmap_paddr_t addr)
{
	return pa_valid(addr);
}
1839
1840
1841
1842
1843
1844
1845 /*
1846 * Map memory at initialization. The physical addresses being
1847 * mapped are not managed and are never unmapped.
1848 *
1849 * For now, VM is already on, we only need to map the
1850 * specified memory.
1851 */
1852 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1853 pmap_map(
1854 vm_map_address_t virt,
1855 vm_offset_t start,
1856 vm_offset_t end,
1857 vm_prot_t prot,
1858 unsigned int flags)
1859 {
1860 kern_return_t kr;
1861 vm_size_t ps;
1862
1863 ps = PAGE_SIZE;
1864 while (start < end) {
1865 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1866 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1867
1868 if (kr != KERN_SUCCESS) {
1869 panic("%s: failed pmap_enter, "
1870 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1871 __FUNCTION__,
1872 (void *) virt, (void *) start, (void *) end, prot, flags);
1873 }
1874
1875 virt += ps;
1876 start += ps;
1877 }
1878 return virt;
1879 }
1880
1881 #if XNU_MONITOR
1882 /**
1883 * Remove kernel writeablity from an IO PTE value if the page is owned by
1884 * guarded mode software.
1885 *
1886 * @param paddr The physical address of the page which has to be non-DRAM.
1887 * @param tmplate The PTE value to be evaluated.
1888 *
1889 * @return A new PTE value with permission bits modified.
1890 */
1891 static inline
1892 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1893 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1894 {
1895 assert(!pa_valid(paddr));
1896
1897 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1898
1899 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1900 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1901 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1902 switch (xprr_perm) {
1903 case XPRR_KERN_RO_PERM:
1904 break;
1905 case XPRR_KERN_RW_PERM:
1906 tmplate &= ~ARM_PTE_XPRR_MASK;
1907 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1908 break;
1909 default:
1910 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1911 }
1912 }
1913
1914 return tmplate;
1915 }
1916 #endif /* XNU_MONITOR */
1917
/*
 * Map the physical range [start, end) at `virt` with device/backdoor PTEs.
 * `options` selects the memory attributes (write-combined, posted, etc.;
 * the default is uncached). Mappings are kernel-only and never executable.
 * Returns the first virtual address past the mapped range.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t mem_attr;

	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			/* The kernel page tables for this VA must already exist. */
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* Non-DRAM (IO) pages may be PPL-owned; strip kernel writability if so. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1996
1997 /*
1998 * Back-door routine for mapping kernel VM at initialization.
1999 * Useful for mapping memory outside the range
2000 * [vm_first_phys, vm_last_phys] (i.e., devices).
2001 * Otherwise like pmap_map.
2002 */
vm_map_address_t
pmap_map_bd(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot)
{
	/* Same as pmap_map_bd_with_options() with default (uncached) attributes. */
	return pmap_map_bd_with_options(virt, start, end, prot, 0);
}
2012
2013 /*
2014 * Back-door routine for mapping kernel VM at initialization.
2015 * Useful for mapping memory specific physical addresses in early
2016 * boot (i.e., before kernel_map is initialized).
2017 *
2018 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
2019 */
2020
2021 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2022 pmap_map_high_window_bd(
2023 vm_offset_t pa_start,
2024 vm_size_t len,
2025 vm_prot_t prot)
2026 {
2027 pt_entry_t *ptep, pte;
2028 vm_map_address_t va_start = VREGION1_START;
2029 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
2030 vm_map_address_t va_end;
2031 vm_map_address_t va;
2032 vm_size_t offset;
2033
2034 offset = pa_start & PAGE_MASK;
2035 pa_start -= offset;
2036 len += offset;
2037
2038 if (len > (va_max - va_start)) {
2039 panic("%s: area too large, "
2040 "pa_start=%p, len=%p, prot=0x%x",
2041 __FUNCTION__,
2042 (void*)pa_start, (void*)len, prot);
2043 }
2044
2045 scan:
2046 for (; va_start < va_max; va_start += PAGE_SIZE) {
2047 ptep = pmap_pte(kernel_pmap, va_start);
2048 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2049 if (*ptep == ARM_PTE_TYPE_FAULT) {
2050 break;
2051 }
2052 }
2053 if (va_start > va_max) {
2054 panic("%s: insufficient pages, "
2055 "pa_start=%p, len=%p, prot=0x%x",
2056 __FUNCTION__,
2057 (void*)pa_start, (void*)len, prot);
2058 }
2059
2060 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2061 ptep = pmap_pte(kernel_pmap, va_end);
2062 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2063 if (*ptep != ARM_PTE_TYPE_FAULT) {
2064 va_start = va_end + PAGE_SIZE;
2065 goto scan;
2066 }
2067 }
2068
2069 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2070 ptep = pmap_pte(kernel_pmap, va);
2071 pte = pa_to_pte(pa_start)
2072 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2073 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2074 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2075 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2076 #if __ARM_KERNEL_PROTECT__
2077 pte |= ARM_PTE_NG;
2078 #endif /* __ARM_KERNEL_PROTECT__ */
2079 write_pte_strong(ptep, pte);
2080 }
2081 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2082 #if KASAN
2083 kasan_notify_address(va_start, len);
2084 #endif
2085 return va_start;
2086 }
2087
/*
 * Compute the maximum number of virtual ASIDs to support, taken from the
 * "pmap-max-asids" property of /defaults in the device tree when present,
 * otherwise MAX_ASIDS. Panics on a malformed or out-of-range value.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
#if HAS_16BIT_ASID
	if (max_asids > MAX_HW_ASIDS) {
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#else
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#endif /* HAS_16BIT_ASID */
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2129
2130 #if __arm64__
2131 /*
2132 * pmap_get_arm64_prot
2133 *
2134 * return effective armv8 VMSA block protections including
2135 * table AP/PXN/XN overrides of a pmap entry
2136 *
2137 */
2138
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level down, collecting table-entry overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* No translation for this address: no effective protection. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated hierarchical AP/XN/PXN overrides. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2187 #endif /* __arm64__ */
2188
2189 /**
2190 * Helper macros for accessing the "unnested" and "in-progress" bits in
2191 * pmap->nested_region_unnested_table_bitmap.
2192 */
/* Even bit for region slot `index`: the region has been unnested. */
#define UNNEST_BIT(index) ((index) * 2)
/* Odd bit for region slot `index`: an unnest of the region is in progress. */
#define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2195
2196
2197 /*
2198 * Bootstrap the system enough to run with virtual memory.
2199 *
2200 * The early VM initialization code has already allocated
2201 * the first CPU's translation table and made entries for
2202 * all the one-to-one mappings to be found there.
2203 *
2204 * We must set up the kernel pmap structures, the
2205 * physical-to-virtual translation lookup tables for the
2206 * physical memory to be managed (between avail_start and
2207 * avail_end).
2208 *
2209 * Map the kernel's code and data, and allocate the system page table.
2210 * Page_size must already be set.
2211 *
2212 * Parameters:
2213 * first_avail first available physical page -
2214 * after kernel page tables
2215 * avail_start PA of first managed physical page
2216 * avail_end PA of last managed physical page
2217 */
2218
2219 void
pmap_bootstrap(vm_offset_t vstart)2220 pmap_bootstrap(
2221 vm_offset_t vstart)
2222 {
2223 vm_map_offset_t maxoffset;
2224
2225 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2226
2227 #if XNU_MONITOR
2228
2229 #if DEVELOPMENT || DEBUG
2230 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2231 #endif
2232
2233 #if CONFIG_CSR_FROM_DT
2234 if (csr_unsafe_kernel_text) {
2235 pmap_ppl_disable = true;
2236 }
2237 #endif /* CONFIG_CSR_FROM_DT */
2238
2239 #endif /* XNU_MONITOR */
2240
2241 #if DEVELOPMENT || DEBUG
2242 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2243 kprintf("Kernel traces for pmap operations enabled\n");
2244 }
2245 #endif
2246
2247 /*
2248 * Initialize the kernel pmap.
2249 */
2250 #if ARM_PARAMETERIZED_PMAP
2251 kernel_pmap->pmap_pt_attr = native_pt_attr;
2252 #endif /* ARM_PARAMETERIZED_PMAP */
2253 #if HAS_APPLE_PAC
2254 kernel_pmap->disable_jop = 0;
2255 #endif /* HAS_APPLE_PAC */
2256 kernel_pmap->tte = cpu_tte;
2257 kernel_pmap->ttep = cpu_ttep;
2258 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2259 kernel_pmap->max = UINTPTR_MAX;
2260 os_atomic_init(&kernel_pmap->ref_count, 1);
2261 #if XNU_MONITOR
2262 os_atomic_init(&kernel_pmap->nested_count, 0);
2263 #endif
2264 kernel_pmap->nx_enabled = TRUE;
2265 #ifdef __arm64__
2266 kernel_pmap->is_64bit = TRUE;
2267 #else
2268 kernel_pmap->is_64bit = FALSE;
2269 #endif
2270 #if CONFIG_ROSETTA
2271 kernel_pmap->is_rosetta = FALSE;
2272 #endif
2273
2274 #if ARM_PARAMETERIZED_PMAP
2275 kernel_pmap->pmap_pt_attr = native_pt_attr;
2276 #endif /* ARM_PARAMETERIZED_PMAP */
2277
2278 kernel_pmap->nested_region_addr = 0x0ULL;
2279 kernel_pmap->nested_region_size = 0x0ULL;
2280 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2281 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2282 kernel_pmap->type = PMAP_TYPE_KERNEL;
2283
2284 kernel_pmap->hw_asid = 0;
2285 kernel_pmap->sw_asid = 0;
2286
2287 pmap_lock_init(kernel_pmap);
2288
2289 pmap_max_asids = pmap_compute_max_asids();
2290 #if HAS_16BIT_ASID
2291 asid_chunk_size = MAX_HW_ASIDS;
2292 #else
2293 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2294 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2295 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2296 * masking used by the PLRU scheme. This means we must handle the case in which
2297 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2298 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2299 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2300 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2301 #endif /* HAS_16BIT_ASIDS */
2302
2303 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2304
2305 #if HAS_SPECRES_DEBUGGING
2306 PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2307
2308 if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2309 panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2310 }
2311 #endif /* HAS_SPECRES_DEBUGGING */
2312
2313 /**
2314 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2315 * pp_attr_table, etc). This function will use `avail_start` to allocate
2316 * space for these data structures.
2317 */
2318 pmap_data_bootstrap();
2319
2320 /**
2321 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2322 */
2323 uat_bootstrap();
2324
2325
2326 /**
2327 * Bootstrap any necessary SART data structures and values needed from the device tree.
2328 */
2329 sart_bootstrap();
2330
2331 /**
2332 * Don't make any assumptions about the alignment of avail_start before this
2333 * point (i.e., pmap_data_bootstrap() performs allocations).
2334 */
2335 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2336
2337 const pmap_paddr_t pmap_struct_start = avail_start;
2338
2339 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2340 avail_start = round_page(avail_start + asid_table_size);
2341
2342 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2343
2344 vm_first_phys = gPhysBase;
2345 vm_last_phys = trunc_page(avail_end);
2346
2347 queue_init(&map_pmap_list);
2348 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2349 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2350 free_page_size_tt_count = 0;
2351 free_page_size_tt_max = 0;
2352 free_tt_list = TT_FREE_ENTRY_NULL;
2353 free_tt_count = 0;
2354 free_tt_max = 0;
2355
2356 virtual_space_start = vstart;
2357 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2358
2359 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2360 #if !HAS_16BIT_ASID
2361 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2362 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2363 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2364 #endif /* !HAS_16BIT_ASID */
2365
2366
2367
2368 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2369 maxoffset = trunc_page(maxoffset);
2370 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2371 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2372 arm_pmap_max_offset_default = maxoffset;
2373 }
2374 }
2375 #if defined(__arm64__)
2376 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2377 maxoffset = trunc_page(maxoffset);
2378 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2379 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2380 arm64_pmap_max_offset_default = maxoffset;
2381 }
2382 }
2383 #endif
2384
2385 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2386
2387
2388 #if PMAP_CS_PPL_MONITOR
2389 /* Initialize the PPL trust cache read-write lock */
2390 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2391 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2392 #endif
2393
2394 #if DEVELOPMENT || DEBUG
2395 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2396 &vm_footprint_suspend_allowed,
2397 sizeof(vm_footprint_suspend_allowed));
2398 #endif /* DEVELOPMENT || DEBUG */
2399
2400 #if KASAN
2401 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2402 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2403 #endif /* KASAN */
2404
2405 /**
2406 * Ensure that avail_start is always left on a page boundary. The calling
2407 * code might not perform any alignment before allocating page tables so
2408 * this is important.
2409 */
2410 avail_start = round_page(avail_start);
2411 }
2412
2413 #if XNU_MONITOR
2414
2415 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2416 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2417 {
2418 pmap_paddr_t cur_pa;
2419 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2420 assert(pa_valid(cur_pa));
2421 ppattr_pa_set_monitor(cur_pa);
2422 }
2423 }
2424
2425 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2426 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2427 pmap_paddr_t end_pa,
2428 unsigned int expected_perm,
2429 unsigned int new_perm)
2430 {
2431 vm_offset_t start_va = phystokv(start_pa);
2432 vm_offset_t end_va = start_va + (end_pa - start_pa);
2433
2434 pa_set_range_monitor(start_pa, end_pa);
2435 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2436 }
2437
/*
 * Lock down every physical page backing the kernelcache so its mappings can
 * no longer be remapped or handed out for other uses. Pages whose physical
 * address maps back to a non-linear VA (memory the OS will reclaim) are
 * skipped.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must stay remappable for the XNUPOST tests to run. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2483
/*
 * Called once bootstrap-time static allocations are complete: transfers the
 * bootstrap page tables, PPL text/data, and PPL stacks to their final xPRR
 * protection states and marks the backing pages PPL-owned, then locks down
 * the kernelcache against remapping.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permission is unchanged (RO->RO); this call marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2571
2572
/*
 * Final step of PPL bring-up: lock down the commpage mappings so they can no
 * longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error appears here without a surrounding
	 * conditional; it looks like the #if/#else chain selecting an xPRR
	 * configuration was lost from this copy. As written, any build of this
	 * path fails to compile -- confirm against the original source.
	 */
#error "XPRR configuration error"
}
2589 #endif /* XNU_MONITOR */
2590
/*
 * Report the kernel virtual address range still available to the VM layer,
 * as recorded by pmap_bootstrap().
 */
void
pmap_virtual_space(
	vm_offset_t *startp,
	vm_offset_t *endp
	)
{
	*startp = virtual_space_start;
	*endp = virtual_space_end;
}
2600
2601
/*
 * Report the region_select'th candidate kernel VA region for the VM layer to
 * manage. Returns TRUE and fills *startp/*size when the index names a valid
 * region for this configuration, FALSE otherwise. Which regions exist
 * depends on KTRR/CTRR and ARM_LARGE_MEMORY configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2695
2696 /*
2697 * Routines to track and allocate physical pages during early boot.
2698 * On most systems that memory runs from first_avail through to avail_end
2699 * with no gaps.
2700 *
2701 * If the system supports ECC and ecc_bad_pages_count > 0, we
2702 * need to skip those pages.
2703 */
2704
/* Count of pages still available in [first_avail, avail_end) for early boot. */
static unsigned int avail_page_count = 0;
/* Lazily initialize the early-boot RAM range bookkeeping on first use. */
static bool need_ram_ranges_init = true;
2707
2708
2709 /**
2710 * Checks to see if a given page is in
2711 * the array of known bad pages
2712 *
2713 * @param ppn page number to check
2714 */
2715 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2716 pmap_is_bad_ram(__unused ppnum_t ppn)
2717 {
2718 return false;
2719 }
2720
2721 /**
2722 * Prepare bad ram pages to be skipped.
2723 */
2724
2725
2726 /*
2727 * Initialize the count of available pages. No lock needed here,
2728 * as this code is called while kernel boot up is single threaded.
2729 */
2730 static void
initialize_ram_ranges(void)2731 initialize_ram_ranges(void)
2732 {
2733 pmap_paddr_t first = first_avail;
2734 pmap_paddr_t end = avail_end;
2735
2736 assert(first <= end);
2737 assert(first == (first & ~PAGE_MASK));
2738 assert(end == (end & ~PAGE_MASK));
2739 avail_page_count = atop(end - first);
2740
2741 need_ram_ranges_init = false;
2742
2743 }
2744
2745 unsigned int
pmap_free_pages(void)2746 pmap_free_pages(
2747 void)
2748 {
2749 if (need_ram_ranges_init) {
2750 initialize_ram_ranges();
2751 }
2752 return avail_page_count;
2753 }
2754
2755 unsigned int
pmap_free_pages_span(void)2756 pmap_free_pages_span(
2757 void)
2758 {
2759 if (need_ram_ranges_init) {
2760 initialize_ram_ranges();
2761 }
2762 return (unsigned int)atop(avail_end - first_avail);
2763 }
2764
2765
2766 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2767 pmap_next_page_hi(
2768 ppnum_t * pnum,
2769 __unused boolean_t might_free)
2770 {
2771 return pmap_next_page(pnum);
2772 }
2773
2774
2775 boolean_t
pmap_next_page(ppnum_t * pnum)2776 pmap_next_page(
2777 ppnum_t *pnum)
2778 {
2779 if (need_ram_ranges_init) {
2780 initialize_ram_ranges();
2781 }
2782
2783
2784 if (first_avail != avail_end) {
2785 *pnum = (ppnum_t)atop(first_avail);
2786 first_avail += PAGE_SIZE;
2787 assert(avail_page_count > 0);
2788 --avail_page_count;
2789 return TRUE;
2790 }
2791 assert(avail_page_count == 0);
2792 return FALSE;
2793 }
2794
2795
2796 /*
2797 * Initialize the pmap module.
2798 * Called by vm_init, to initialize any structures that the pmap
2799 * system needs to map virtual memory.
2800 */
2801 void
pmap_init(void)2802 pmap_init(
2803 void)
2804 {
2805 /*
2806 * Protect page zero in the kernel map.
2807 * (can be overruled by permanent transltion
2808 * table entries at page zero - see arm_vm_init).
2809 */
2810 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2811
2812 pmap_initialized = TRUE;
2813
2814 /*
2815 * Create the zone of physical maps
2816 * and the physical-to-virtual entries.
2817 */
2818 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2819 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2820
2821
2822 /*
2823 * Initialize the pmap object (for tracking the vm_page_t
2824 * structures for pages we allocate to be page tables in
2825 * pmap_expand().
2826 */
2827 _vm_object_allocate(mem_size, pmap_object);
2828 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2829
2830 /*
2831 * The values of [hard_]maxproc may have been scaled, make sure
2832 * they are still less than the value of pmap_max_asids.
2833 */
2834 if ((uint32_t)maxproc > pmap_max_asids) {
2835 maxproc = pmap_max_asids;
2836 }
2837 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2838 hard_maxproc = pmap_max_asids;
2839 }
2840 }
2841
2842 /**
2843 * Verify that a given physical page contains no mappings (outside of the
2844 * default physical aperture mapping).
2845 *
2846 * @param ppnum Physical page number to check there are no mappings to.
2847 *
2848 * @return True if there are no mappings, false otherwise or if the page is not
2849 * kernel-managed.
2850 */
2851 bool
pmap_verify_free(ppnum_t ppnum)2852 pmap_verify_free(ppnum_t ppnum)
2853 {
2854 const pmap_paddr_t pa = ptoa(ppnum);
2855
2856 assert(pa != vm_page_fictitious_addr);
2857
2858 /* Only mappings to kernel-managed physical memory are tracked. */
2859 if (!pa_valid(pa)) {
2860 return false;
2861 }
2862
2863 const unsigned int pai = pa_index(pa);
2864 pv_entry_t **pvh = pai_to_pvh(pai);
2865
2866 return pvh_test_type(pvh, PVH_TYPE_NULL);
2867 }
2868
2869 #if MACH_ASSERT
2870 /**
2871 * Verify that a given physical page contains no mappings (outside of the
2872 * default physical aperture mapping) and if it does, then panic.
2873 *
2874 * @note It's recommended to use pmap_verify_free() directly when operating in
2875 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2876 * normally being called from outside of the PPL, and the pv_head_table
2877 * can't be modified outside of the PPL).
2878 *
2879 * @param ppnum Physical page number to check there are no mappings to.
2880 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping was found: gather details for the panic string below. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2953 #endif
2954
2955
2956 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2957 pmap_root_alloc_size(pmap_t pmap)
2958 {
2959 #pragma unused(pmap)
2960 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2961 unsigned int root_level = pt_attr_root_level(pt_attr);
2962 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2963 }
2964
2965
2966 /*
2967 * Create and return a physical map.
2968 *
2969 * If the size specified for the map
2970 * is zero, the map is an actual physical
2971 * map, and may be referenced by the
2972 * hardware.
2973 *
2974 * If the size specified is non-zero,
2975 * the map will be used in software only, and
2976 * is bounded by that size.
2977 */
/*
 * Internal worker for pmap_create_options(): allocates and initializes a new
 * user pmap, including its root translation table and ASID.
 *
 * On failure, returns PMAP_NULL and stores the failure reason through kr
 * (KERN_RESOURCE_SHORTAGE, KERN_NO_SPACE, ...); all partially-acquired
 * resources are released on the unwind path at the bottom.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation doesn't understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone. Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds established yet; cover the full range. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* Depends on pmap_pt_attr, so must come after it is chosen above. */
	p->max = pmap_user_va_size(p);

	/* Reserve an address-space ID for this pmap. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Failure unwind: release resources in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3168
3169 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3170 pmap_create_options(
3171 ledger_t ledger,
3172 vm_map_size_t size,
3173 unsigned int flags)
3174 {
3175 pmap_t pmap;
3176 kern_return_t kr = KERN_SUCCESS;
3177
3178 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3179
3180 ledger_reference(ledger);
3181
3182 #if XNU_MONITOR
3183 for (;;) {
3184 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3185 if (kr != KERN_RESOURCE_SHORTAGE) {
3186 break;
3187 }
3188 assert(pmap == PMAP_NULL);
3189 pmap_alloc_page_for_ppl(0);
3190 kr = KERN_SUCCESS;
3191 }
3192 #else
3193 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3194 #endif
3195
3196 if (pmap == PMAP_NULL) {
3197 ledger_dereference(ledger);
3198 }
3199
3200 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3201
3202 return pmap;
3203 }
3204
3205 #if XNU_MONITOR
3206 /*
3207 * This symbol remains in place when the PPL is enabled so that the dispatch
3208 * table does not change from development to release configurations.
3209 */
3210 #endif
3211 #if MACH_ASSERT || XNU_MONITOR
3212 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3213 pmap_set_process_internal(
3214 __unused pmap_t pmap,
3215 __unused int pid,
3216 __unused char *procname)
3217 {
3218 #if MACH_ASSERT
3219 if (pmap == NULL || pmap->pmap_pid == -1) {
3220 return;
3221 }
3222
3223 validate_pmap_mutable(pmap);
3224
3225 pmap->pmap_pid = pid;
3226 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3227 #endif /* MACH_ASSERT */
3228 }
3229 #endif /* MACH_ASSERT || XNU_MONITOR */
3230
3231 #if MACH_ASSERT
/*
 * Public wrapper for pmap_set_process_internal(): tags a pmap with its
 * owning process's pid/name. Dispatches into the PPL when XNU_MONITOR is
 * enabled, otherwise calls the internal routine directly.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3244 #endif /* MACH_ASSERT */
3245
3246 /*
3247 * pmap_deallocate_all_leaf_tts:
3248 *
3249 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3250 * removing and deallocating all TTEs.
3251 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf tables hold PTEs, not TTEs; recursion must stop above them. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry in a table at this level (va = all ones). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* User pmaps being destroyed should never contain block mappings. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/*
		 * Remove the TTE.
		 * NOTE(review): no matching pmap_unlock() appears here; presumably
		 * pmap_tte_deallocate() drops the pmap lock before returning — confirm.
		 */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3290
3291 /*
3292 * We maintain stats and ledgers so that a task's physical footprint is:
3293 * phys_footprint = ((internal - alternate_accounting)
3294 * + (internal_compressed - alternate_accounting_compressed)
3295 * + iokit_mapped
3296 * + purgeable_nonvolatile
3297 * + purgeable_nonvolatile_compressed
3298 * + page_table)
3299 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3300 */
3301
3302 /*
3303 * Retire the given physical map from service.
3304 * Should only be called if the map contains
3305 * no valid mappings.
3306 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop our reference; only the last reference proceeds with teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	/* It is an error to destroy a pmap still nested or active on any CPU. */
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global pmap list before tearing down state. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	/* Flush stale TLB entries before the ASID can be reused. */
	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3420
/*
 * Public entry point for destroying a pmap. Drops one reference; actual
 * teardown happens in pmap_destroy_internal() when the last reference goes.
 * NOTE(review): pmap is dereferenced unconditionally below (trace args and
 * ledger), so callers presumably never pass PMAP_NULL even though the
 * internal routine tolerates it — confirm.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger now: the pmap struct may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Drop the reference taken on behalf of the pmap at creation. */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3441
3442
3443 /*
3444 * Add a reference to the specified pmap.
3445 */
3446 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3447 pmap_reference_internal(
3448 pmap_t pmap)
3449 {
3450 if (pmap != PMAP_NULL) {
3451 validate_pmap_mutable(pmap);
3452 os_atomic_inc(&pmap->ref_count, acquire);
3453 }
3454 }
3455
/*
 * Public wrapper for taking a pmap reference. Dispatches into the PPL when
 * XNU_MONITOR is enabled, otherwise calls the internal routine directly.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3466
/*
 * Allocate a root (TT1) translation table of the given size for pmap.
 *
 * Satisfies the request from the per-size free lists when possible;
 * otherwise allocates a fresh physical page. For sub-page root tables the
 * remainder of the page is carved up and pushed onto the free list.
 * Returns NULL on resource shortage (option PMAP_TT_ALLOCATE_NOWAIT makes
 * the underlying page allocation non-blocking).
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Only the exact root-table size is sub-page; round anything else up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: take a table off the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate (and zero) a whole physical page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Carve the rest of the page into size-sized chunks. The first
		 * chunk (at phystokv(pa)) is returned to the caller; the chunks
		 * after it are chained locally, then spliced onto the shared
		 * free list under the lock.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3547
/*
 * Return a root (TT1) translation table to the appropriate free list, and
 * opportunistically trim the page-sized free list back down to
 * FREE_PAGE_SIZE_TT_MAX (skipped when PMAP_TT_DEALLOCATE_NOBLOCK is set,
 * since freeing pages may block).
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the rounding done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller can't block: skip the trimming loop below. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list. The lock is dropped around
	 * pmap_pages_free() (which may block) and reacquired each iteration.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3606
/*
 * Allocate a non-root translation table for pmap at the given level.
 *
 * First tries the pmap's private tt_entry_free list; otherwise allocates a
 * fresh page and a page-table descriptor (PTD) for it. When the pmap's page
 * size is smaller than the kernel PAGE_SIZE, the unused remainder of the
 * page is chained onto tt_entry_free for later requests.
 *
 * Returns KERN_SUCCESS with *ttp set, KERN_RESOURCE_SHORTAGE when
 * PMAP_OPTIONS_NOWAIT precludes waiting, or KERN_ABORTED when the pmap
 * lock could not be taken without risking a preemption-latency violation.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the pmap's private free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 * Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Update page-table page counters by level (twig and above vs leaf). */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			/*
			 * The pmap's tables are smaller than the kernel page just
			 * allocated: donate the unused tail of the page to the pmap's
			 * private free list.
			 */
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3707
3708
/**
 * Return a page table to its pmap's free list and, if every pmap-page-sized
 * table sharing the containing VM page is now free, release the whole VM page.
 *
 * @note When the pmap's page size is smaller than the VM page size (e.g. a 4K
 *       pmap on a 16K kernel), one VM page holds several page tables, so the
 *       backing page can only be freed once all of them are unused.
 *
 * @param pmap The pmap that owns the page table being deallocated.
 * @param ttp Kernel virtual address of the page table to deallocate.
 * @param level The page table level of the table being deallocated.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-page-sized tables that fit in one VM page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/*
	 * Non-leaf tables carry the sentinel refcount rather than a live count;
	 * reset it so the zero check below treats them as unreferenced.
	 */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of all tables that share this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No table in this VM page is in use.  Walk the pmap's free list
		 * and count how many of the page's sibling tables are already on
		 * it (the table being freed counts as the first).
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every sibling table is on the free list: unlink them all
			 * so the whole VM page can be released below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Siblings not all free yet; just push this table onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables in this VM page are still in use; push this one onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Whole VM page reclaimed: drop descriptor, PVH entry, counters, and the page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3797
/**
 * Safely clear out a translation table entry.
 *
 * @note If the TTE to clear out points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 * @note This function expects to be called with pmap locked exclusive, and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the TTE that should be cleared out.
 * @param level The level of the page table that contains the TTE to be removed.
 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and make the clear visible before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU still suspected of running pmap_disconnect(). */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/*
				 * Load-exclusive arms the event monitor so the WFE below can
				 * wake when the remote CPU updates the flag.
				 */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 *   1. A bug in the refcnt management logic.
		 *   2. A memory stomper or hardware failure.
		 *   3. The VM forgetting to unmap all of the valid mappings in an address
		 *      space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3987
/**
 * Given a pointer to an entry within a `level` page table, delete the
 * page table at `level` + 1 that is represented by that entry. For instance,
 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
 * contains the PA of the L3 table, and `level` would be "2".
 *
 * @note If the table getting deallocated is a leaf table, then that leaf table
 *       must have a refcnt of zero before getting deallocated. All other levels
 *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
 * @note This function expects to be called with pmap locked exclusive and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap that owns the page table to be deallocated.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the `level` TTE to remove.
 * @param level The level of the table that contains an entry pointing to the
 *        table to be removed. The deallocated page table will be a
 *        `level` + 1 table (so if `level` is 2, then an L3 table will be
 *        deleted).
 */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	tte = *ttep;

	/* Ownership sanity check: the table's descriptor must point back at this pmap. */
	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
	    __func__, ttep, (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* The TTE is now clear; return the orphaned `level` + 1 table's memory. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
4038
4039 /*
4040 * Remove a range of hardware page-table entries.
4041 * The entries given are the first (inclusive)
4042 * and last (exclusive) entries for the VM pages.
4043 * The virtual address is the va for the first pte.
4044 *
4045 * The pmap must be locked.
4046 * If the pmap is not the kernel pmap, the range must lie
4047 * entirely within one pte-page. This is NOT checked.
4048 * Assumes that the pte-page exists.
4049 *
4050 * Returns the number of PTE changed
4051 */
4052 MARK_AS_PMAP_TEXT static int
4053 pmap_remove_range(
4054 pmap_t pmap,
4055 vm_map_address_t va,
4056 pt_entry_t *bpte,
4057 pt_entry_t *epte)
4058 {
4059 bool need_strong_sync = false;
4060 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4061 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4062 if (num_changed > 0) {
4063 PMAP_UPDATE_TLBS(pmap, va,
4064 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4065 }
4066 return num_changed;
4067 }
4068
4069
#ifdef PVH_FLAG_EXEC

/*
 * Update the access protection bits of the physical aperture mapping for a page.
 * This is useful, for example, in guaranteeing that a verified executable page
 * has no writable mappings anywhere in the system, including the physical
 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
 * synchronization overhead in cases where the call to this function is
 * guaranteed to be followed by other TLB operations.
 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Compute the physical-aperture KVA backing this physical page index. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP setting. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint PTE is never expected in the physical aperture here. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Issue the TLB invalidate asynchronously; synchronize only if requested. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
#endif /* defined(PVH_FLAG_EXEC) */
4105
4106
4107
/**
 * Remove the mappings described by the PTE range [bpte, epte), updating page
 * attribute/PV state and the pmap's ledgers as entries are torn down.
 *
 * @note Expects to be called with the pmap locked exclusive. TLB maintenance
 *       is left to the caller; only the PTE writes are flushed here.
 *
 * @param pmap The pmap whose mappings are being removed.
 * @param va The VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE in the range; the range must not cross a
 *        page table boundary.
 * @param epte Last (exclusive) PTE in the range.
 * @param eva If non-NULL, the removal may stop early when preemption is
 *        pending; *eva is then set to the first VA not processed.
 * @param need_strong_sync Output flag: set to true when the caller's TLB
 *        maintenance requires a strong DSB (HAS_FEAT_XS configurations).
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_REMOVE enables clearing
 *        of VM-compressor "compressed" markers.
 *
 * @return The number of PTEs actually changed.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	int16_t refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we've either determined the PTE is unmanaged, or we've
		 * taken the PVH lock for its physical page and revalidated the PTE
		 * under that lock.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock; retry if the page changed underneath us. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger adjustments below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Fold the accumulated refcount delta into the leaf table's descriptor. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4356
4357
/*
 * Remove the given range of addresses
 * from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the hardware page size.
 */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* Equivalent to pmap_remove_options() with compressed-marker removal enabled. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4373
/**
 * Internal (per-batch) implementation of pmap_remove_options(). Removes the
 * mappings in [start, end) and, for user pmaps, frees the leaf page table if
 * it becomes empty.
 *
 * @note A single twig-level TTE is looked up for `start`, so callers are
 *       expected to pass ranges that don't span a twig-table boundary.
 *
 * @return The end of the VA range actually processed; may be less than `end`
 *         if the removal stopped early because preemption was pending.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both bounds must be leaf-page aligned and properly ordered. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the user leaf table has no remaining references, free it outright. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4440
/*
 * Remove the given range of addresses from the specified pmap, applying the
 * given PMAP_OPTIONS_* flags. The range is processed in batches bounded by
 * twig-level (L2) table boundaries so each internal call stays bounded.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance at most to the next twig-table boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		va = pmap_remove_options_ppl(pmap, va, l, options);

		/* Sanity-check the pmap's ledgers after each PPL call. */
		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4507
4508
/*
 * Remove phys addr if mapped in specified map.
 *
 * Currently a no-op on this architecture; would need an implementation to
 * support the VM working-set code.
 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
}
4519
4520 /*
4521 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4522 * switch a thread onto a new vm_map.
4523 */
4524 void
4525 pmap_switch_user(thread_t thread, vm_map_t new_map)
4526 {
4527 pmap_t new_pmap = new_map->pmap;
4528
4529
4530 thread->map = new_map;
4531 pmap_set_pmap(new_pmap, thread);
4532
4533 }
4534
/*
 * Activate the given pmap for the given thread via pmap_switch().
 * On __ARM_USER_PROTECT__ configurations, also cache the user TTB and ASID
 * on the thread's machine state.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4549
4550 static void
4551 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4552 {
4553 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4554 }
4555
#if HAS_SPECRES
/*
 * Issue a CFP RCTX instruction to restrict speculative (control-flow
 * prediction) state associated with this pmap's ASID at EL0.
 * Asynchronous: no completion barrier is issued here.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
}

#if REQUIRES_DVP_RCTX
/*
 * Issue a DVP RCTX instruction to restrict speculative (data-value
 * prediction) state associated with this pmap's ASID at EL0.
 * Asynchronous: no completion barrier is issued here.
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4573
4574 static inline bool
4575 pmap_user_ttb_is_clear(void)
4576 {
4577 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4578 }
4579
/*
 * Make 'pmap' the active user address space on the calling CPU.
 *
 * Beyond programming the user TTB (via pmap_switch_user_ttb() at the end),
 * this determines which local TLB maintenance the switch requires:
 *  - an ASID-scoped flush when software-ASID aliasing is detected for the
 *    incoming hardware ASID (!HAS_16BIT_ASID configurations),
 *  - a full or ranged local flush of the previous shared-region (nested
 *    pmap) global mappings when switching between shared regions,
 *  - a commpage range flush when switching between address spaces of
 *    different page sizes (__ARM_MIXED_PAGE_SIZE__ only),
 *  - CFP/DVP speculation restrictions (HAS_SPECRES only).
 * When any break-before-make condition holds, the user TTB is first pointed
 * at the invalid translation table before the flushes are issued.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* NOTE(review): hardware ASIDs appear to be consumed in pairs under
	 * __ARM_KERNEL_PROTECT__, so collapse to the pair index — confirm. */
	asid_index >>= 1;
#endif

	pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides: unconditionally force speculation restriction on/off. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page size/translation regime) also requires
	 * break-before-make of the user TTB. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform a CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range-TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize completion of any async invalidations issued above before
	 * installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4757
/*
 * Switch the current CPU's user address space to the given pmap.
 * Thin tracing wrapper that routes through the PPL when XNU_MONITOR is
 * configured, and calls pmap_switch_internal() directly otherwise.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4770
/*
 * Lower the permission of all mappings of physical page 'ppnum' to at
 * most 'prot'.  Convenience wrapper around pmap_page_protect_options()
 * with no options and no argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4778
4779 /*
4780 * Routine: pmap_page_protect_options
4781 *
4782 * Function:
4783 * Lower the permission for all mappings to a given
4784 * page.
4785 */
/*
 * Lower the permissions of (or remove) every CPU mapping of the physical
 * page 'ppnum'.
 *
 * The page's PV list is walked in two passes:
 *   Pass 1 rewrites each PTE (to read-only or to a fault/compressed
 *   marker) and updates the owning pmap's ledgers.
 *   Pass 2 issues the required TLB invalidations and, on removal, unlinks
 *   the CPU mappings from the PV list.
 * IOMMU mappings are never removed; if any exist they are preserved on a
 * replacement PV list.
 *
 * ppnum:       physical page whose mappings are to be altered; must not be
 *              the fictitious page, and unmanaged pages are ignored.
 * prot:        new maximum protection.  VM_PROT_ALL is a no-op; read /
 *              read-execute downgrade in place; anything else removes the
 *              mappings entirely.
 * options:     PMAP_OPTIONS_* flags (COMPRESSOR marks removed internal
 *              mappings as compressed; NOFLUSH defers TLB flushing, but is
 *              forcibly cleared on the remove path for PPL security).
 * flush_range: if non-NULL, TLB invalidation for mappings of
 *              flush_range->ptfr_pmap falling inside the range is deferred
 *              to the caller.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t **pv_h;
	pv_entry_t *pve_p, *orig_pve_p;
	pv_entry_t *pveh_p;
	pv_entry_t *pvet_p;
	pt_entry_t *pte_p, *orig_pte_p;
	pv_entry_t *new_pve_p;
	pt_entry_t *new_pte_p;
	vm_offset_t pvh_flags;
	unsigned int pai;
	bool remove;
	bool set_NX;
	unsigned int pvh_cnt = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* Decode the PV head: a single PTE pointer, a PV-entry list, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are neither downgraded nor removed in pass 1. */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
				    __func__, pmap, ppnum);
			}

			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/* Ledger accounting: classify the removed mapping as reusable,
			 * external, alternate-accounting, or plain internal. */
			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize the pass-1 PTE stores before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings.  If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			pt_entry_t spte = *pte_p;
			/* Only mappings flagged in pass 1 need invalidation; clear the flag. */
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry.  Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	/* Paranoia: both passes must have visited the same set of mappings. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
				pmap_flush_noncoherent_page(phys);
			}
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/* On the non-remove path the caller may own the flush; hand it off. */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}


	pvh_unlock(pai);

	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* Non-remove flushes can complete after the PVH lock is dropped. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	/* Free the CPU-mapping PV entries unlinked in pass 2. */
	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5306
5307 MARK_AS_PMAP_TEXT void
5308 pmap_page_protect_options_internal(
5309 ppnum_t ppnum,
5310 vm_prot_t prot,
5311 unsigned int options,
5312 void *arg)
5313 {
5314 if (arg != NULL) {
5315 /*
5316 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5317 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5318 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5319 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5320 * In that case, force the flush to take place.
5321 */
5322 options &= ~PMAP_OPTIONS_NOFLUSH;
5323 }
5324 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5325 }
5326
5327 void
5328 pmap_page_protect_options(
5329 ppnum_t ppnum,
5330 vm_prot_t prot,
5331 unsigned int options,
5332 void *arg)
5333 {
5334 pmap_paddr_t phys = ptoa(ppnum);
5335
5336 assert(ppnum != vm_page_fictitious_addr);
5337
5338 /* Only work with managed pages. */
5339 if (!pa_valid(phys)) {
5340 return;
5341 }
5342
5343 /*
5344 * Determine the new protection.
5345 */
5346 if (prot == VM_PROT_ALL) {
5347 return; /* nothing to do */
5348 }
5349
5350 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5351
5352 #if XNU_MONITOR
5353 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5354 #else
5355 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5356 #endif
5357
5358 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5359 }
5360
5361
5362 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Mark the given user pmap so that user-mode JOP (pointer authentication)
 * is disabled for it.  Panics if called on the kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5372
/*
 * Disable user-mode JOP for the given pmap.  Routes through the PPL when
 * XNU_MONITOR is configured.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5382 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5383
5384 /*
5385 * Indicates if the pmap layer enforces some additional restrictions on the
5386 * given set of protections.
5387 */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/* The ARM pmap imposes no protection policy beyond what the VM requests. */
	return false;
}
5393
5394 /*
5395 * Set the physical protection on the
5396 * specified range of this map as requested.
5397 * VERY IMPORTANT: Will not increase permissions.
5398 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5399 */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Delegate to the options variant with no options or argument. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5409
5410 MARK_AS_PMAP_TEXT vm_map_address_t
5411 pmap_protect_options_internal(
5412 pmap_t pmap,
5413 vm_map_address_t start,
5414 vm_map_address_t end,
5415 vm_prot_t prot,
5416 unsigned int options,
5417 __unused void *args)
5418 {
5419 tt_entry_t *tte_p;
5420 pt_entry_t *bpte_p, *epte_p;
5421 pt_entry_t *pte_p;
5422 boolean_t set_NX = TRUE;
5423 boolean_t set_XO = FALSE;
5424 boolean_t should_have_removed = FALSE;
5425 bool need_strong_sync = false;
5426
5427 /* Validate the pmap input before accessing its data. */
5428 validate_pmap_mutable(pmap);
5429
5430 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5431
5432 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5433 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5434 }
5435
5436 #if DEVELOPMENT || DEBUG
5437 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5438 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5439 should_have_removed = TRUE;
5440 }
5441 } else
5442 #endif
5443 {
5444 /* Determine the new protection. */
5445 switch (prot) {
5446 case VM_PROT_EXECUTE:
5447 set_XO = TRUE;
5448 OS_FALLTHROUGH;
5449 case VM_PROT_READ:
5450 case VM_PROT_READ | VM_PROT_EXECUTE:
5451 break;
5452 case VM_PROT_READ | VM_PROT_WRITE:
5453 case VM_PROT_ALL:
5454 return end; /* nothing to do */
5455 default:
5456 should_have_removed = TRUE;
5457 }
5458 }
5459
5460 if (should_have_removed) {
5461 panic("%s: should have been a remove operation, "
5462 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5463 __FUNCTION__,
5464 pmap, (void *)start, (void *)end, prot, options, args);
5465 }
5466
5467 #if DEVELOPMENT || DEBUG
5468 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5469 #else
5470 if ((prot & VM_PROT_EXECUTE))
5471 #endif
5472 {
5473 set_NX = FALSE;
5474 } else {
5475 set_NX = TRUE;
5476 }
5477
5478 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5479 vm_map_address_t va = start;
5480 unsigned int npages = 0;
5481
5482 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5483
5484 tte_p = pmap_tte(pmap, start);
5485
5486 if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5487 bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5488 bpte_p = &bpte_p[pte_index(pt_attr, start)];
5489 epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5490 pte_p = bpte_p;
5491
5492 for (pte_p = bpte_p;
5493 pte_p < epte_p;
5494 pte_p += PAGE_RATIO, va += pmap_page_size) {
5495 ++npages;
5496 if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5497 pmap_pending_preemption())) {
5498 break;
5499 }
5500 pt_entry_t spte;
5501 #if DEVELOPMENT || DEBUG
5502 boolean_t force_write = FALSE;
5503 #endif
5504
5505 spte = *((volatile pt_entry_t*)pte_p);
5506
5507 if ((spte == ARM_PTE_TYPE_FAULT) ||
5508 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5509 continue;
5510 }
5511
5512 pmap_paddr_t pa;
5513 unsigned int pai = 0;
5514 boolean_t managed = FALSE;
5515
5516 while (!managed) {
5517 /*
5518 * It may be possible for the pte to transition from managed
5519 * to unmanaged in this timeframe; for now, elide the assert.
5520 * We should break out as a consequence of checking pa_valid.
5521 */
5522 // assert(!ARM_PTE_IS_COMPRESSED(spte));
5523 pa = pte_to_pa(spte);
5524 if (!pa_valid(pa)) {
5525 break;
5526 }
5527 pai = pa_index(pa);
5528 pvh_lock(pai);
5529 spte = *((volatile pt_entry_t*)pte_p);
5530 pa = pte_to_pa(spte);
5531 if (pai == pa_index(pa)) {
5532 managed = TRUE;
5533 break; // Leave the PVH locked as we will unlock it after we free the PTE
5534 }
5535 pvh_unlock(pai);
5536 }
5537
5538 if ((spte == ARM_PTE_TYPE_FAULT) ||
5539 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5540 continue;
5541 }
5542
5543 pt_entry_t tmplate;
5544
5545 if (pmap == kernel_pmap) {
5546 #if DEVELOPMENT || DEBUG
5547 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5548 force_write = TRUE;
5549 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5550 } else
5551 #endif
5552 {
5553 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5554 }
5555 } else {
5556 #if DEVELOPMENT || DEBUG
5557 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5558 assert(pmap->type != PMAP_TYPE_NESTED);
5559 force_write = TRUE;
5560 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5561 } else
5562 #endif
5563 {
5564 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5565 }
5566 }
5567
5568 /*
5569 * XXX Removing "NX" would
5570 * grant "execute" access
5571 * immediately, bypassing any
5572 * checks VM might want to do
5573 * in its soft fault path.
5574 * pmap_protect() and co. are
5575 * not allowed to increase
5576 * access permissions.
5577 */
5578 if (set_NX) {
5579 tmplate |= pt_attr_leaf_xn(pt_attr);
5580 } else {
5581 if (pmap == kernel_pmap) {
5582 /* do NOT clear "PNX"! */
5583 tmplate |= ARM_PTE_NX;
5584 } else {
5585 /* do NOT clear "NX"! */
5586 tmplate |= pt_attr_leaf_x(pt_attr);
5587 if (set_XO) {
5588 tmplate &= ~ARM_PTE_APMASK;
5589 tmplate |= pt_attr_leaf_rona(pt_attr);
5590 }
5591 }
5592 }
5593
5594 #if DEVELOPMENT || DEBUG
5595 if (force_write) {
5596 /*
5597 * TODO: Run CS/Monitor checks here.
5598 */
5599 if (managed) {
5600 /*
5601 * We are marking the page as writable,
5602 * so we consider it to be modified and
5603 * referenced.
5604 */
5605 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5606 tmplate |= ARM_PTE_AF;
5607
5608 if (ppattr_test_reffault(pai)) {
5609 ppattr_clear_reffault(pai);
5610 }
5611
5612 if (ppattr_test_modfault(pai)) {
5613 ppattr_clear_modfault(pai);
5614 }
5615 }
5616 } else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5617 /*
5618 * An immediate request for anything other than
5619 * write should still mark the page as
5620 * referenced if managed.
5621 */
5622 if (managed) {
5623 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5624 tmplate |= ARM_PTE_AF;
5625
5626 if (ppattr_test_reffault(pai)) {
5627 ppattr_clear_reffault(pai);
5628 }
5629 }
5630 }
5631 #endif
5632
5633 /* We do not expect to write fast fault the entry. */
5634 pte_set_was_writeable(tmplate, false);
5635 #if HAS_FEAT_XS
5636 if (pte_is_xs(pt_attr, spte)) {
5637 need_strong_sync = true;
5638 }
5639 #endif /* HAS_FEAT_XS */
5640
5641 write_pte_fast(pte_p, tmplate);
5642
5643 if (managed) {
5644 pvh_assert_locked(pai);
5645 pvh_unlock(pai);
5646 }
5647 }
5648 FLUSH_PTE_STRONG();
5649 PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5650 } else {
5651 va = end;
5652 }
5653
5654 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5655 return va;
5656 }
5657
5658 void
5659 pmap_protect_options(
5660 pmap_t pmap,
5661 vm_map_address_t b,
5662 vm_map_address_t e,
5663 vm_prot_t prot,
5664 unsigned int options,
5665 __unused void *args)
5666 {
5667 vm_map_address_t l, beg;
5668
5669 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5670
5671 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5672 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5673 pmap, (uint64_t)b, (uint64_t)e);
5674 }
5675
5676 /*
5677 * We allow single-page requests to execute non-preemptibly,
5678 * as it doesn't make sense to sample AST_URGENT for a single-page
5679 * operation, and there are a couple of special use cases that
5680 * require a non-preemptible single-page operation.
5681 */
5682 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5683 pmap_verify_preemptible();
5684 }
5685
5686 #if DEVELOPMENT || DEBUG
5687 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5688 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5689 pmap_remove_options(pmap, b, e, options);
5690 return;
5691 }
5692 } else
5693 #endif
5694 {
5695 /* Determine the new protection. */
5696 switch (prot) {
5697 case VM_PROT_EXECUTE:
5698 case VM_PROT_READ:
5699 case VM_PROT_READ | VM_PROT_EXECUTE:
5700 break;
5701 case VM_PROT_READ | VM_PROT_WRITE:
5702 case VM_PROT_ALL:
5703 return; /* nothing to do */
5704 default:
5705 pmap_remove_options(pmap, b, e, options);
5706 return;
5707 }
5708 }
5709
5710 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5711 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5712 VM_KERNEL_ADDRHIDE(e));
5713
5714 beg = b;
5715
5716 while (beg < e) {
5717 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5718
5719 if (l > e) {
5720 l = e;
5721 }
5722
5723 #if XNU_MONITOR
5724 beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5725 #else
5726 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5727 #endif
5728 }
5729
5730 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5731 }
5732
5733 /**
5734 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5735 *
5736 * @param pmap pmap to insert the pages into.
5737 * @param va virtual address to map the pages into.
5738 * @param pa page number of the first physical page to map.
5739 * @param size block size, in number of pages.
5740 * @param prot mapping protection attributes.
5741 * @param attr flags to pass to pmap_enter().
5742 *
5743 * @return KERN_SUCCESS.
5744 */
5745 kern_return_t
5746 pmap_map_block(
5747 pmap_t pmap,
5748 addr64_t va,
5749 ppnum_t pa,
5750 uint32_t size,
5751 vm_prot_t prot,
5752 int attr,
5753 unsigned int flags)
5754 {
5755 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5756 }
5757
5758 /**
5759 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5760 * As opposed to pmap_map_block(), this function takes
5761 * a physical address as an input and operates using the
5762 * page size associated with the input pmap.
5763 *
5764 * @param pmap pmap to insert the pages into.
5765 * @param va virtual address to map the pages into.
5766 * @param pa physical address of the first physical page to map.
5767 * @param size block size, in number of pages.
5768 * @param prot mapping protection attributes.
5769 * @param attr flags to pass to pmap_enter().
5770 *
5771 * @return KERN_SUCCESS.
5772 */
5773 kern_return_t
5774 pmap_map_block_addr(
5775 pmap_t pmap,
5776 addr64_t va,
5777 pmap_paddr_t pa,
5778 uint32_t size,
5779 vm_prot_t prot,
5780 int attr,
5781 unsigned int flags)
5782 {
5783 #if __ARM_MIXED_PAGE_SIZE__
5784 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5785 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5786 #else
5787 const uint64_t pmap_page_size = PAGE_SIZE;
5788 #endif
5789
5790 for (ppnum_t page = 0; page < size; page++) {
5791 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5792 panic("%s: failed pmap_enter_addr, "
5793 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5794 __FUNCTION__,
5795 pmap, va, (uint64_t)pa, size, prot, flags);
5796 }
5797
5798 va += pmap_page_size;
5799 pa += pmap_page_size;
5800 }
5801
5802 return KERN_SUCCESS;
5803 }
5804
5805 kern_return_t
5806 pmap_enter_addr(
5807 pmap_t pmap,
5808 vm_map_address_t v,
5809 pmap_paddr_t pa,
5810 vm_prot_t prot,
5811 vm_prot_t fault_type,
5812 unsigned int flags,
5813 boolean_t wired)
5814 {
5815 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5816 }
5817
5818 /*
5819 * Insert the given physical page (p) at
5820 * the specified virtual address (v) in the
5821 * target physical map with the protection requested.
5822 *
5823 * If specified, the page will be wired down, meaning
5824 * that the related pte can not be reclaimed.
5825 *
 * NB: This is the only routine which MAY NOT lazy-evaluate
 * or lose information. That is, this routine must actually
 * insert this page into the given map eventually (it must make
 * forward progress eventually).
 */
5831 kern_return_t
5832 pmap_enter(
5833 pmap_t pmap,
5834 vm_map_address_t v,
5835 ppnum_t pn,
5836 vm_prot_t prot,
5837 vm_prot_t fault_type,
5838 unsigned int flags,
5839 boolean_t wired,
5840 __unused pmap_mapping_type_t mapping_type)
5841 {
5842 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5843 }
5844
5845 /*
5846 * Attempt to commit the pte.
5847 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5848 * Performs no page table or accounting writes on failures.
5849 */
5850 static inline bool
5851 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5852 {
5853 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5854 bool success = false, changed_wiring = false;
5855
5856 __unreachable_ok_push
5857 if (TEST_PAGE_RATIO_4) {
5858 /*
5859 * 16K virtual pages w/ 4K hw pages.
5860 * We actually need to update 4 ptes here which can't easily be done atomically.
5861 * As a result we require the exclusive pmap lock.
5862 */
5863 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5864 *old_pte = *pte_p;
5865 if (*old_pte == new_pte) {
5866 /* Another thread completed this operation. Nothing to do here. */
5867 success = true;
5868 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5869 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5870 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5871 success = false;
5872 } else {
5873 write_pte_fast(pte_p, new_pte);
5874 success = true;
5875 }
5876 } else {
5877 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5878 }
5879 __unreachable_ok_pop
5880
5881 if (success && *old_pte != new_pte) {
5882 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5883 bool need_strong_sync = false;
5884 FLUSH_PTE_STRONG();
5885 #if HAS_FEAT_XS
5886 if (pte_is_xs(pt_attr, *old_pte)) {
5887 need_strong_sync = true;
5888 }
5889 #endif /* HAS_FEAT_XS */
5890 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5891 } else {
5892 FLUSH_PTE();
5893 __builtin_arm_isb(ISB_SY);
5894 }
5895 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5896 (new_pte & ARM_PTE_WIRED) != 0 :
5897 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5898
5899 if (pmap != kernel_pmap && changed_wiring) {
5900 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5901 if (new_pte & ARM_PTE_WIRED) {
5902 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5903 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5904 } else {
5905 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5906 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5907 }
5908 }
5909
5910 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5911 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5912 }
5913 return success;
5914 }
5915
/*
 * Translate a VM_WIMG_* cacheability/memory-type code into the corresponding
 * PTE attribute bits (attribute index plus, where applicable, shareability).
 *
 * @param wimg VM_WIMG_* code; only the bits under VM_WIMG_MASK are examined.
 * @param pa   physical address being mapped; used to distinguish DRAM from
 *             device/MMIO addresses for the posted/IO attribute variants.
 *
 * @return the attribute portion of a PTE (no address or permission bits).
 *         Note that all device-type mappings are also marked NX|PNX here.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			// Some IO ranges are registered as requiring the stronger
			// XS-variant attribute index; honor that here.
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		// Real-time attribute index; never executable.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		// DRAM gets the combined/reordered posted index; true device
		// addresses get the plain posted index.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		// Non-DRAM addresses use the XS variant of this index.
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		// Normal memory types below carry shareability instead of NX bits.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Unrecognized codes fall back to the default cacheable attribute.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5993
5994
5995 /*
5996 * Construct a PTE (and the physical page attributes) for the given virtual to
5997 * physical mapping.
5998 *
5999 * This function has no side effects and is safe to call so that it is safe to
6000 * call while attempting a pmap_enter transaction.
6001 */
6002 MARK_AS_PMAP_TEXT static pt_entry_t
6003 pmap_construct_pte(
6004 const pmap_t pmap,
6005 vm_map_address_t va,
6006 pmap_paddr_t pa,
6007 vm_prot_t prot,
6008 vm_prot_t fault_type,
6009 boolean_t wired,
6010 const pt_attr_t* const pt_attr,
6011 uint16_t *pp_attr_bits /* OUTPUT */
6012 )
6013 {
6014 bool set_NX = false, set_XO = false;
6015 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
6016 assert(pp_attr_bits != NULL);
6017 *pp_attr_bits = 0;
6018
6019 if (wired) {
6020 pte |= ARM_PTE_WIRED;
6021 }
6022
6023 #if DEVELOPMENT || DEBUG
6024 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
6025 #else
6026 if ((prot & VM_PROT_EXECUTE))
6027 #endif
6028 {
6029 set_NX = false;
6030 } else {
6031 set_NX = true;
6032 }
6033
6034 if (prot == VM_PROT_EXECUTE) {
6035 set_XO = true;
6036 }
6037
6038 if (set_NX) {
6039 pte |= pt_attr_leaf_xn(pt_attr);
6040 } else {
6041 if (pmap == kernel_pmap) {
6042 pte |= ARM_PTE_NX;
6043 } else {
6044 pte |= pt_attr_leaf_x(pt_attr);
6045 }
6046 }
6047
6048 if (pmap == kernel_pmap) {
6049 #if __ARM_KERNEL_PROTECT__
6050 pte |= ARM_PTE_NG;
6051 #endif /* __ARM_KERNEL_PROTECT__ */
6052 if (prot & VM_PROT_WRITE) {
6053 pte |= ARM_PTE_AP(AP_RWNA);
6054 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6055 } else {
6056 pte |= ARM_PTE_AP(AP_RONA);
6057 *pp_attr_bits |= PP_ATTR_REFERENCED;
6058 }
6059 } else {
6060 if (pmap->type != PMAP_TYPE_NESTED) {
6061 pte |= ARM_PTE_NG;
6062 } else if ((pmap->nested_region_unnested_table_bitmap)
6063 && (va >= pmap->nested_region_addr)
6064 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
6065 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
6066
6067 if ((pmap->nested_region_unnested_table_bitmap)
6068 && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
6069 pte |= ARM_PTE_NG;
6070 }
6071 }
6072 if (prot & VM_PROT_WRITE) {
6073 assert(pmap->type != PMAP_TYPE_NESTED);
6074 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6075 if (fault_type & VM_PROT_WRITE) {
6076 pte |= pt_attr_leaf_rw(pt_attr);
6077 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6078 } else {
6079 pte |= pt_attr_leaf_ro(pt_attr);
6080 /*
6081 * Mark the page as MODFAULT so that a subsequent write
6082 * may be handled through arm_fast_fault().
6083 */
6084 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6085 pte_set_was_writeable(pte, true);
6086 }
6087 } else {
6088 pte |= pt_attr_leaf_rw(pt_attr);
6089 *pp_attr_bits |= PP_ATTR_REFERENCED;
6090 }
6091 } else {
6092 if (set_XO) {
6093 pte |= pt_attr_leaf_rona(pt_attr);
6094 } else {
6095 pte |= pt_attr_leaf_ro(pt_attr);
6096 }
6097 *pp_attr_bits |= PP_ATTR_REFERENCED;
6098 }
6099 }
6100
6101 pte |= ARM_PTE_AF;
6102 return pte;
6103 }
6104
6105 MARK_AS_PMAP_TEXT kern_return_t
6106 pmap_enter_options_internal(
6107 pmap_t pmap,
6108 vm_map_address_t v,
6109 pmap_paddr_t pa,
6110 vm_prot_t prot,
6111 vm_prot_t fault_type,
6112 unsigned int flags,
6113 boolean_t wired,
6114 unsigned int options)
6115 {
6116 ppnum_t pn = (ppnum_t)atop(pa);
6117 pt_entry_t pte;
6118 pt_entry_t spte;
6119 pt_entry_t *pte_p;
6120 bool refcnt_updated;
6121 bool wiredcnt_updated;
6122 bool ro_va = false;
6123 unsigned int wimg_bits;
6124 bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6125 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6126 kern_return_t kr = KERN_SUCCESS;
6127 uint16_t pp_attr_bits;
6128 volatile uint16_t *refcnt;
6129 volatile uint16_t *wiredcnt;
6130 pv_free_list_t *local_pv_free;
6131
6132 validate_pmap_mutable(pmap);
6133
6134 #if XNU_MONITOR
6135 if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6136 panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6137 }
6138 #endif
6139
6140 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6141
6142 if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6143 panic("%s: pmap %p v 0x%llx not page-aligned",
6144 __func__, pmap, (unsigned long long)v);
6145 }
6146
6147 if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6148 panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6149 }
6150
6151 /* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6152 if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6153 panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6154 pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6155 }
6156
6157 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6158 panic("pmap_enter_options() pmap %p pa 0x%llx",
6159 pmap, (uint64_t)pa);
6160 }
6161
6162 /* The PA should not extend beyond the architected physical address space */
6163 pa &= ARM_PTE_PAGE_MASK;
6164
6165 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6166 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6167 extern vm_offset_t ctrr_test_page;
6168 if (__probable(v != ctrr_test_page))
6169 #endif
6170 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6171 }
6172 if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6173 if (__improbable(prot != VM_PROT_READ)) {
6174 panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6175 __func__, (unsigned long long)v, prot);
6176 }
6177 ro_va = true;
6178 }
6179 assert(pn != vm_page_fictitious_addr);
6180
6181 refcnt_updated = false;
6182 wiredcnt_updated = false;
6183
6184 if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6185 /*
6186 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6187 *
6188 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6189 */
6190 lock_mode = PMAP_LOCK_EXCLUSIVE;
6191 }
6192
6193 if (!pmap_lock_preempt(pmap, lock_mode)) {
6194 return KERN_ABORTED;
6195 }
6196
6197 /*
6198 * Expand pmap to include this pte. Assume that
6199 * pmap is always expanded to include enough hardware
6200 * pages to map one VM page.
6201 */
6202 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6203 /* Must unlock to expand the pmap. */
6204 pmap_unlock(pmap, lock_mode);
6205
6206 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6207
6208 if (kr != KERN_SUCCESS) {
6209 return kr;
6210 }
6211
6212 if (!pmap_lock_preempt(pmap, lock_mode)) {
6213 return KERN_ABORTED;
6214 }
6215 }
6216
6217 if (options & PMAP_OPTIONS_NOENTER) {
6218 pmap_unlock(pmap, lock_mode);
6219 return KERN_SUCCESS;
6220 }
6221
6222 /*
6223 * Since we may not hold the pmap lock exclusive, updating the pte is
6224 * done via a cmpxchg loop.
6225 * We need to be careful about modifying non-local data structures before commiting
6226 * the new pte since we may need to re-do the transaction.
6227 */
6228 spte = os_atomic_load(pte_p, relaxed);
6229 while (!committed) {
6230 refcnt = NULL;
6231 wiredcnt = NULL;
6232 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6233 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6234
6235 if (pmap != kernel_pmap) {
6236 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6237 refcnt = &ptd_info->refcnt;
6238 wiredcnt = &ptd_info->wiredcnt;
6239 /*
6240 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6241 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6242 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6243 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6244 * have PTDs, so we can't use the check there.
6245 */
6246 if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6247 panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6248 __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6249 }
6250 /*
6251 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
6252 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6253 * or acquire the pmap lock exclusive.
6254 */
6255 if (!wiredcnt_updated) {
6256 OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6257 wiredcnt_updated = true;
6258 }
6259 if (!refcnt_updated) {
6260 OSAddAtomic16(1, (volatile int16_t*)refcnt);
6261 refcnt_updated = true;
6262 drop_refcnt = true;
6263 }
6264 }
6265
6266 if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6267 /*
6268 * There is already a mapping here & it's for a different physical page.
6269 * First remove that mapping.
6270 *
6271 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6272 */
6273 if (lock_mode == PMAP_LOCK_SHARED) {
6274 if (pmap_lock_shared_to_exclusive(pmap)) {
6275 lock_mode = PMAP_LOCK_EXCLUSIVE;
6276 } else {
6277 /*
6278 * We failed to upgrade to an exclusive lock.
6279 * As a result we no longer hold the lock at all,
6280 * so we need to re-acquire it and restart the transaction.
6281 */
6282 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6283 lock_mode = PMAP_LOCK_EXCLUSIVE;
6284 /* pmap might have changed after we dropped the lock. Try again. */
6285 spte = os_atomic_load(pte_p, relaxed);
6286 continue;
6287 }
6288 }
6289 pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6290 spte = ARM_PTE_TYPE_FAULT;
6291 assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6292 }
6293
6294 /*
6295 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6296 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6297 * read-write protection. The PMAP layer though still needs to use the right
6298 * index, which is the older XO-now-TPRO one and that is specially selected
6299 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6300 */
6301 if (options & PMAP_OPTIONS_MAP_TPRO) {
6302 if (__improbable(pmap == kernel_pmap)) {
6303 panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6304 __func__);
6305 }
6306 pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6307 } else {
6308 pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6309 }
6310
6311
6312 if (pa_valid(pa)) {
6313 unsigned int pai;
6314 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6315
6316 is_internal = FALSE;
6317 is_altacct = FALSE;
6318
6319 pai = pa_index(pa);
6320
6321 pvh_lock(pai);
6322
6323 /*
6324 * Make sure that the current per-cpu PV free list has
6325 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6326 * if the transaction succeeds. We're either in the
6327 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6328 * Note that we can still be interrupted, but a primary
6329 * interrupt handler can never enter the pmap.
6330 */
6331 #if !XNU_MONITOR
6332 assert(get_preemption_level() > 0);
6333 #endif
6334 local_pv_free = &pmap_get_cpu_data()->pv_free;
6335 pv_entry_t **pv_h = pai_to_pvh(pai);
6336 const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6337 !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6338
6339 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6340 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6341 int new_allocated_pves = 0;
6342
6343 while (new_allocated_pves < 2) {
6344 local_pv_free = &pmap_get_cpu_data()->pv_free;
6345 pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6346 if (pv_status == PV_ALLOC_FAIL) {
6347 break;
6348 } else if (pv_status == PV_ALLOC_RETRY) {
6349 /*
6350 * In the case that pv_alloc() had to grab a new page of PVEs,
6351 * it will have dropped the pmap lock while doing so.
6352 * On non-PPL devices, dropping the lock re-enables preemption so we may
6353 * be on a different CPU now.
6354 */
6355 local_pv_free = &pmap_get_cpu_data()->pv_free;
6356 } else {
6357 /* If we've gotten this far then a node should've been allocated. */
6358 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6359
6360 new_allocated_pves++;
6361 }
6362 }
6363
6364 for (int i = 0; i < new_allocated_pves; i++) {
6365 pv_free(new_pve_p[i]);
6366 }
6367 }
6368
6369 if (pv_status == PV_ALLOC_FAIL) {
6370 pvh_unlock(pai);
6371 kr = KERN_RESOURCE_SHORTAGE;
6372 break;
6373 } else if (pv_status == PV_ALLOC_RETRY) {
6374 pvh_unlock(pai);
6375 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6376 spte = os_atomic_load(pte_p, relaxed);
6377 continue;
6378 }
6379
6380 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6381 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6382 } else {
6383 wimg_bits = pmap_cache_attributes(pn);
6384 }
6385
6386 /* We may be retrying this operation after dropping the PVH lock.
6387 * Cache attributes for the physical page may have changed while the lock
6388 * was dropped, so clear any cache attributes we may have previously set
6389 * in the PTE template. */
6390 pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6391 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6392
6393
6394
6395 #if XNU_MONITOR
6396 /* The regular old kernel is not allowed to remap PPL pages. */
6397 if (__improbable(ppattr_pa_test_monitor(pa))) {
6398 panic("%s: page belongs to PPL, "
6399 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6400 __FUNCTION__,
6401 pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6402 }
6403
6404 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6405 panic("%s: page locked down, "
6406 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6407 __FUNCTION__,
6408 pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6409 }
6410 #endif
6411
6412
6413
6414
6415
6416 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6417 if (!committed) {
6418 pvh_unlock(pai);
6419 continue;
6420 }
6421 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6422 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6423
6424 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6425 /*
6426 * If there was already a valid pte here then we reuse its reference
6427 * on the ptd and drop the one that we took above.
6428 */
6429 drop_refcnt = had_valid_mapping;
6430
6431 if (!had_valid_mapping) {
6432 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6433 int pve_ptep_idx = 0;
6434 pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6435 /* We did all the allocations up top. So this shouldn't be able to fail. */
6436 if (pv_status != PV_ALLOC_SUCCESS) {
6437 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6438 __func__, pv_status, new_pve_p, pmap);
6439 }
6440
6441 if (pmap != kernel_pmap) {
6442 if (options & PMAP_OPTIONS_INTERNAL) {
6443 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6444 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6445 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6446 /*
6447 * Make a note to ourselves that this
6448 * mapping is using alternative
6449 * accounting. We'll need this in order
6450 * to know which ledger to debit when
6451 * the mapping is removed.
6452 *
6453 * The altacct bit must be set while
6454 * the pv head is locked. Defer the
6455 * ledger accounting until after we've
6456 * dropped the lock.
6457 */
6458 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6459 is_altacct = TRUE;
6460 }
6461 }
6462 if (ppattr_test_reusable(pai) &&
6463 !is_altacct) {
6464 is_reusable = TRUE;
6465 } else if (options & PMAP_OPTIONS_INTERNAL) {
6466 is_internal = TRUE;
6467 } else {
6468 is_external = TRUE;
6469 }
6470 }
6471 }
6472
6473 pvh_unlock(pai);
6474
6475 if (pp_attr_bits != 0) {
6476 ppattr_pa_set_bits(pa, pp_attr_bits);
6477 }
6478
6479 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6480 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6481
6482 if (is_internal) {
6483 /*
6484 * Make corresponding adjustments to
6485 * phys_footprint statistics.
6486 */
6487 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6488 if (is_altacct) {
6489 /*
6490 * If this page is internal and
6491 * in an IOKit region, credit
6492 * the task's total count of
6493 * dirty, internal IOKit pages.
6494 * It should *not* count towards
6495 * the task's total physical
6496 * memory footprint, because
6497 * this entire region was
6498 * already billed to the task
6499 * at the time the mapping was
6500 * created.
6501 *
6502 * Put another way, this is
6503 * internal++ and
6504 * alternate_accounting++, so
6505 * net effect on phys_footprint
6506 * is 0. That means: don't
6507 * touch phys_footprint here.
6508 */
6509 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6510 } else {
6511 if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6512 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6513 skip_footprint_debit = true;
6514 } else {
6515 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6516 }
6517 }
6518 }
6519 if (is_reusable) {
6520 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6521 } else if (is_external) {
6522 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6523 }
6524 }
6525 } else {
6526 if (prot & VM_PROT_EXECUTE) {
6527 kr = KERN_FAILURE;
6528 break;
6529 }
6530
6531 wimg_bits = pmap_cache_attributes(pn);
6532 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6533 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6534 }
6535
6536 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6537
6538 #if XNU_MONITOR
6539 pte = pmap_construct_io_pte(pa, pte);
6540
6541 /**
6542 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6543 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6544 * created and later removed. We must therefore prevent an attacker from downgrading a
6545 * a writable mapping in order to allow it to be removed and remapped to something else.
6546 */
6547 if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6548 ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6549 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6550 (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6551 panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6552 __func__, (uint64_t)pte_to_pa(spte));
6553 }
6554 #endif
6555
6556 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6557 if (committed) {
6558 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6559 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6560
6561 /**
6562 * If there was already a valid pte here then we reuse its
6563 * reference on the ptd and drop the one that we took above.
6564 */
6565 drop_refcnt = had_valid_mapping;
6566 }
6567 }
6568 if (committed) {
6569 if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6570 assert(pmap != kernel_pmap);
6571
6572 /* One less "compressed" */
6573 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6574 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6575
6576 if (spte & ARM_PTE_COMPRESSED_ALT) {
6577 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6578 } else if (!skip_footprint_debit) {
6579 /* Was part of the footprint */
6580 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6581 }
6582 /* The old entry held a reference so drop the extra one that we took above. */
6583 drop_refcnt = true;
6584 }
6585 }
6586 }
6587
6588 if (drop_refcnt && refcnt != NULL) {
6589 assert(refcnt_updated);
6590 if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6591 panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6592 }
6593 }
6594
6595 if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6596 panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6597 }
6598
6599 pmap_unlock(pmap, lock_mode);
6600
6601 if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6602 pmap_phys_write_disable(v);
6603 }
6604
6605 return kr;
6606 }
6607
/*
 * Enter a mapping for physical address 'pa' at virtual address 'v', retrying
 * internally on transient failures. This is the retry wrapper around
 * pmap_enter_options_internal()/pmap_enter_options_ppl(): it loops while the
 * inner call reports KERN_RESOURCE_SHORTAGE (page-table memory exhausted) or
 * KERN_ABORTED (lock acquisition bailed out due to pending preemption),
 * unless the caller asked for non-blocking behavior via PMAP_OPTIONS_NOWAIT.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * The PPL call is always made with NOWAIT forced on: the PPL cannot
		 * block for memory, so allocation is performed below (outside the
		 * PPL) and the call is retried.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Donate a page to the PPL's free list before retrying. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller doesn't want to wait: report the shortage. */
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6654
6655 kern_return_t
6656 pmap_enter_options(
6657 pmap_t pmap,
6658 vm_map_address_t v,
6659 ppnum_t pn,
6660 vm_prot_t prot,
6661 vm_prot_t fault_type,
6662 unsigned int flags,
6663 boolean_t wired,
6664 unsigned int options,
6665 __unused void *arg,
6666 pmap_mapping_type_t mapping_type)
6667 {
6668 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6669 }
6670
6671 /*
6672 * Routine: pmap_change_wiring
6673 * Function: Change the wiring attribute for a map/virtual-address
6674 * pair.
6675 * In/out conditions:
6676 * The mapping must already exist in the pmap.
6677 */
6678 MARK_AS_PMAP_TEXT kern_return_t
6679 pmap_change_wiring_internal(
6680 pmap_t pmap,
6681 vm_map_address_t v,
6682 boolean_t wired)
6683 {
6684 pt_entry_t *pte_p;
6685 pmap_paddr_t pa;
6686
6687 validate_pmap_mutable(pmap);
6688
6689 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6690 return KERN_ABORTED;
6691 }
6692
6693 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6694
6695 pte_p = pmap_pte(pmap, v);
6696 if (pte_p == PT_ENTRY_NULL) {
6697 if (!wired) {
6698 /*
6699 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6700 * may have been freed by a remove operation.
6701 */
6702 goto pmap_change_wiring_return;
6703 } else {
6704 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6705 }
6706 }
6707 /*
6708 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6709 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6710 */
6711 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6712
6713 while (pa_valid(pa)) {
6714 pmap_paddr_t new_pa;
6715
6716 pvh_lock(pa_index(pa));
6717 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6718
6719 if (pa == new_pa) {
6720 break;
6721 }
6722
6723 pvh_unlock(pa_index(pa));
6724 pa = new_pa;
6725 }
6726
6727 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6728 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6729 if (!wired) {
6730 /* PTE cleared by prior remove/disconnect operation */
6731 goto pmap_change_wiring_cleanup;
6732 } else {
6733 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6734 __func__, pte_p, (uint64_t)*pte_p, pmap);
6735 }
6736 }
6737
6738 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6739 if (wired != pte_is_wired(*pte_p)) {
6740 pte_set_wired(pmap, pte_p, wired);
6741 if (pmap != kernel_pmap) {
6742 if (wired) {
6743 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6744 } else if (!wired) {
6745 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6746 }
6747 }
6748 }
6749
6750 pmap_change_wiring_cleanup:
6751 if (pa_valid(pa)) {
6752 pvh_unlock(pa_index(pa));
6753 }
6754
6755 pmap_change_wiring_return:
6756 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6757
6758 return KERN_SUCCESS;
6759 }
6760
/*
 * Public entry point for changing the wired attribute of an existing mapping.
 * Retries the (PPL) internal call if it aborts due to pending preemption, and
 * panics on any other failure, since callers expect this operation to succeed.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6788
6789 MARK_AS_PMAP_TEXT pmap_paddr_t
6790 pmap_find_pa_internal(
6791 pmap_t pmap,
6792 addr64_t va)
6793 {
6794 pmap_paddr_t pa = 0;
6795
6796 validate_pmap(pmap);
6797
6798 if (pmap != kernel_pmap) {
6799 pmap_lock(pmap, PMAP_LOCK_SHARED);
6800 }
6801
6802 pa = pmap_vtophys(pmap, va);
6803
6804 if (pmap != kernel_pmap) {
6805 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6806 }
6807
6808 return pa;
6809 }
6810
6811 pmap_paddr_t
6812 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6813 {
6814 pmap_paddr_t pa = 0;
6815
6816 if (pmap == kernel_pmap) {
6817 pa = mmu_kvtop(va);
6818 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6819 /*
6820 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6821 * translation even if PAN would prevent kernel access through the translation.
6822 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6823 */
6824 pa = mmu_uvtop(va);
6825 }
6826 return pa;
6827 }
6828
/*
 * Translate 'va' to a physical address, first attempting the fast hardware
 * (AT-instruction) path and falling back to a software table walk. Outside
 * the debugger (not_in_kdp) the walk is performed with proper locking via
 * pmap_find_pa_internal()/pmap_find_pa_ppl(); in the debugger context the
 * tables are walked directly without locks.
 */
pmap_paddr_t
pmap_find_pa(
	pmap_t pmap,
	addr64_t va)
{
	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);

	if (pa != 0) {
		return pa;
	}

	if (not_in_kdp) {
#if XNU_MONITOR
		return pmap_find_pa_ppl(pmap, va);
#else
		return pmap_find_pa_internal(pmap, va);
#endif
	} else {
		/* Debugger context: no locks available, walk the tables directly. */
		return pmap_vtophys(pmap, va);
	}
}
6850
6851 ppnum_t
6852 pmap_find_phys_nofault(
6853 pmap_t pmap,
6854 addr64_t va)
6855 {
6856 ppnum_t ppn;
6857 ppn = atop(pmap_find_pa_nofault(pmap, va));
6858 return ppn;
6859 }
6860
6861 ppnum_t
6862 pmap_find_phys(
6863 pmap_t pmap,
6864 addr64_t va)
6865 {
6866 ppnum_t ppn;
6867 ppn = atop(pmap_find_pa(pmap, va));
6868 return ppn;
6869 }
6870
6871 /**
6872 * Translate a kernel virtual address into a physical address.
6873 *
6874 * @param va The kernel virtual address to translate. Does not work on user
6875 * virtual addresses.
6876 *
6877 * @return The physical address if the translation was successful, or zero if
6878 * no valid mappings were found for the given virtual address.
6879 */
6880 pmap_paddr_t
6881 kvtophys(vm_offset_t va)
6882 {
6883 /**
6884 * Attempt to do the translation first in hardware using the AT (address
6885 * translation) instruction. This will attempt to use the MMU to do the
6886 * translation for us.
6887 */
6888 pmap_paddr_t pa = mmu_kvtop(va);
6889
6890 if (pa) {
6891 return pa;
6892 }
6893
6894 /* If the MMU can't find the mapping, then manually walk the page tables. */
6895 return pmap_vtophys(kernel_pmap, va);
6896 }
6897
6898 /**
6899 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6900 * points to a non-kernel-managed physical page, then this call will panic().
6901 *
6902 * @note The output of this function is guaranteed to be a kernel-managed
6903 * physical page, which means it's safe to pass the output directly to
6904 * pa_index() to create a physical address index for various pmap data
6905 * structures.
6906 *
6907 * @param va The kernel virtual address to translate. Does not work on user
6908 * virtual addresses.
6909 *
6910 * @return The translated physical address for the given virtual address.
6911 */
6912 pmap_paddr_t
6913 kvtophys_nofail(vm_offset_t va)
6914 {
6915 pmap_paddr_t pa = kvtophys(va);
6916
6917 if (!pa_valid(pa)) {
6918 panic("%s: Invalid or non-kernel-managed physical page returned, "
6919 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6920 }
6921
6922 return pa;
6923 }
6924
6925 pmap_paddr_t
6926 pmap_vtophys(
6927 pmap_t pmap,
6928 addr64_t va)
6929 {
6930 if ((va < pmap->min) || (va >= pmap->max)) {
6931 return 0;
6932 }
6933
6934 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6935
6936 tt_entry_t * ttp = NULL;
6937 tt_entry_t * ttep = NULL;
6938 tt_entry_t tte = ARM_TTE_EMPTY;
6939 pmap_paddr_t pa = 0;
6940 unsigned int cur_level;
6941
6942 ttp = pmap->tte;
6943
6944 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6945 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6946
6947 tte = *ttep;
6948
6949 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6950 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6951 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6952 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6953
6954 if ((tte & valid_mask) != valid_mask) {
6955 return (pmap_paddr_t) 0;
6956 }
6957
6958 /* This detects both leaf entries and intermediate block mappings. */
6959 if ((tte & type_mask) == type_block) {
6960 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6961 break;
6962 }
6963
6964 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6965 }
6966
6967 return pa;
6968 }
6969
6970 /*
6971 * pmap_init_pte_page - Initialize a page table page.
6972 */
/*
 * Initialize a page used as a page-table page: locate (or, during early boot,
 * allocate) the page table descriptor (PTD) tracked in the page's PV head,
 * then record the table's owning pmap, mapped VA range, and level in the PTD.
 *
 * @param pmap      The pmap that will own this table page.
 * @param pte_p     KVA of the page-table page being initialized.
 * @param va        Base virtual address the table will map.
 * @param ttlevel   Translation-table level of the new table.
 * @param alloc_ptd TRUE to allocate a PTD if the page doesn't have one
 *                  (only expected from early boot; see comment below).
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t *ptdp = NULL;
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* Page already has a PTD (e.g. pre-allocated during bootstrap): reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
7012
7013 /*
7014 * Routine: pmap_expand
7015 *
7016 * Expands a pmap to be able to map the specified virtual address.
7017 *
7018 * Allocates new memory for the default (COARSE) translation table
7019 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
7020 * also allocates space for the corresponding pv entries.
7021 *
7022 * Nothing should be locked.
7023 */
/*
 * Grow the pmap's translation-table hierarchy until a table exists at
 * 'level' covering virtual address 'v'. For each missing level the function
 * drops the pmap lock, allocates a table page, retakes the lock exclusively,
 * and installs the page only if no other thread raced to install one first
 * (the losing allocation is freed).
 *
 * Returns KERN_ABORTED if a preemption-aware lock acquisition bailed out,
 * KERN_INVALID_ADDRESS if 'v' lies outside the pmap's range, an allocation
 * error when PMAP_OPTIONS_NOWAIT is set, or KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t pa;
	unsigned int ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t *tte_p;
	tt_entry_t *tt_p;

	pa = 0x0ULL;
	tt_p = (tt_entry_t *)NULL;

	/* Walk down from the root, filling in each missing intermediate table. */
	for (; ttlevel < level; ttlevel++) {
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Next-level table is missing: drop the lock to allocate one. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Couldn't retake the lock: give the fresh table back. */
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock: another thread may have raced us. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				/* Ownership transferred to the table hierarchy; don't free below. */
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If we lost the race, free the table page we allocated. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
7095
7096 /*
7097 * Routine: pmap_gc
7098 * Function:
7099 * Pmap garbage collection
7100 * Called by the pageout daemon when pages are scarce.
7101 *
7102 */
7103 void
7104 pmap_gc(void)
7105 {
7106 /*
7107 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
7108 * We can't just destroy any old pmap on the chance that it may be active on a CPU
7109 * or may contain wired mappings. However, with the relatively recent change to
7110 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
7111 * page, it may make sense to call that function here.
7112 */
7113 }
7114
7115 /*
7116 * By default, don't attempt pmap GC more frequently
7117 * than once / 1 minutes.
7118 */
7119
7120 void
7121 compute_pmap_gc_throttle(
7122 void *arg __unused)
7123 {
7124 }
7125
7126 /*
7127 * pmap_attribute_cache_sync(vm_offset_t pa)
7128 *
7129 * Invalidates all of the instruction cache on a physical page and
7130 * pushes any dirty data from the data cache for the same physical page
7131 */
7132
7133 kern_return_t
7134 pmap_attribute_cache_sync(
7135 ppnum_t pp,
7136 vm_size_t size,
7137 __unused vm_machine_attribute_t attribute,
7138 __unused vm_machine_attribute_val_t * value)
7139 {
7140 if (size > PAGE_SIZE) {
7141 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7142 } else {
7143 cache_sync_page(pp);
7144 }
7145
7146 return KERN_SUCCESS;
7147 }
7148
7149 /*
7150 * pmap_sync_page_data_phys(ppnum_t pp)
7151 *
7152 * Invalidates all of the instruction cache on a physical page and
7153 * pushes any dirty data from the data cache for the same physical page
7154 */
7155 void
7156 pmap_sync_page_data_phys(
7157 ppnum_t pp)
7158 {
7159 cache_sync_page(pp);
7160 }
7161
7162 /*
7163 * pmap_sync_page_attributes_phys(ppnum_t pp)
7164 *
7165 * Write back and invalidate all cachelines on a physical page.
7166 */
7167 void
7168 pmap_sync_page_attributes_phys(
7169 ppnum_t pp)
7170 {
7171 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7172 }
7173
7174 #if CONFIG_COREDUMP
7175 /* temporary workaround */
7176 boolean_t
7177 coredumpok(
7178 vm_map_t map,
7179 mach_vm_offset_t va)
7180 {
7181 pt_entry_t *pte_p;
7182 pt_entry_t spte;
7183
7184 pte_p = pmap_pte(map->pmap, va);
7185 if (0 == pte_p) {
7186 return FALSE;
7187 }
7188 if (vm_map_entry_has_device_pager(map, va)) {
7189 return FALSE;
7190 }
7191 spte = *pte_p;
7192 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7193 }
7194 #endif
7195
7196 void
7197 fillPage(
7198 ppnum_t pn,
7199 unsigned int fill)
7200 {
7201 unsigned int *addr;
7202 int count;
7203
7204 addr = (unsigned int *) phystokv(ptoa(pn));
7205 count = PAGE_SIZE / sizeof(unsigned int);
7206 while (count--) {
7207 *addr++ = fill;
7208 }
7209 }
7210
extern void mapping_set_mod(ppnum_t pn);

/* Legacy alias: mark the page's software "modified" attribute. */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7219
extern void mapping_set_ref(ppnum_t pn);

/* Legacy alias: mark the page's software "referenced" attribute. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7228
7229 /*
7230 * Clear specified attribute bits.
7231 *
7232 * Try to force an arm_fast_fault() for all mappings of
7233 * the page - to force attributes to be set again at fault time.
7234 * If the forcing succeeds, clear the cached bits at the head.
7235 * Otherwise, something must have been wired, so leave the cached
7236 * attributes alone.
7237 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t pa = ptoa(pn);
	vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may not be cleared from outside the PPL. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-supplied flush mechanism supersedes the NOFLUSH option. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Revoke the access modes whose next use should re-set the cleared bits. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if all mappings were successfully downgraded. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7311
/* Single-page entry point: clear attribute bits with no batched flush range. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7321
7322 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped in [start, end), a range
 * that must lie within a single twig (leaf-table) region of 'pmap'.
 * Returns the address at which processing stopped: 'end' on completion, or
 * an earlier address if preemption became pending mid-walk so the caller can
 * resume there.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	if (tte_p == (tt_entry_t *) NULL) {
		/* No table maps this range: nothing to clear. */
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/*
			 * 'npages++ &&' deliberately skips the preemption check on the
			 * first iteration so at least one page makes progress per call.
			 */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				/* Only kernel-managed pages carry software attribute bits. */
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7366
/*
 * Clear attribute bits for all managed pages mapped in [start, end),
 * processing one twig-sized chunk at a time and coalescing TLB invalidations
 * into a single ranged flush at the end. Returns the address at which the
 * walk stopped (== end on completion); the caller loops until done.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		/* Lock acquisition aborted for pending preemption; caller retries from va. */
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to the requested end. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			/* Stopped early (preemption pending): return progress to the caller. */
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the single coalesced ranged TLB flush, if any PTE was modified. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7419
/*
 * Public-side loop for ranged attribute clearing: repeatedly calls the
 * (PPL) internal routine, which may return early on pending preemption,
 * until the entire [start, end) range has been processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7450 #endif /* __ARM_RANGE_TLBI__ */
7451
/*
 * Single-page attribute clear: traces the operation and dispatches to the
 * PPL or non-PPL internal implementation.
 */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7473
7474 /*
7475 * Set specified attribute bits.
7476 *
7477 * Set cached value in the pv head because we have
7478 * no per-mapping hardware support for referenced and
7479 * modify bits.
7480 */
7481 MARK_AS_PMAP_TEXT void
7482 phys_attribute_set_internal(
7483 ppnum_t pn,
7484 unsigned int bits)
7485 {
7486 pmap_paddr_t pa = ptoa(pn);
7487 assert(pn != vm_page_fictitious_addr);
7488
7489 #if XNU_MONITOR
7490 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7491 panic("%s: illegal request, "
7492 "pn=%u, bits=%#x",
7493 __FUNCTION__,
7494 pn, bits);
7495 }
7496 #endif
7497
7498 ppattr_pa_set_bits(pa, (uint16_t)bits);
7499
7500 return;
7501 }
7502
/* Dispatch a single-page attribute set to the PPL or non-PPL implementation. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7514
7515
7516 /*
7517 * Check specified attribute bits.
7518 *
7519 * use the software cached bits (since no hw support).
7520 */
7521 static boolean_t
7522 phys_attribute_test(
7523 ppnum_t pn,
7524 unsigned int bits)
7525 {
7526 pmap_paddr_t pa = ptoa(pn);
7527 assert(pn != vm_page_fictitious_addr);
7528 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7529 }
7530
7531
7532 /*
7533 * Set the modify/reference bits on the specified physical page.
7534 */
/* Set the software "modified" attribute on the specified physical page. */
void
pmap_set_modify(ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7540
7541
7542 /*
7543 * Clear the modify bits on the specified physical page.
7544 */
/* Clear the software "modified" attribute on the specified physical page. */
void
pmap_clear_modify(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7551
7552
7553 /*
7554 * pmap_is_modified:
7555 *
7556 * Return whether or not the specified physical page is modified
7557 * by any physical maps.
7558 */
/* Return whether the page's software "modified" attribute is set. */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7565
7566
7567 /*
7568 * Set the reference bit on the specified physical page.
7569 */
/* Set the software "referenced" attribute on the specified physical page. */
static void
pmap_set_reference(
	ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7576
7577 /*
7578 * Clear the reference bits on the specified physical page.
7579 */
/* Clear the software "referenced" attribute on the specified physical page. */
void
pmap_clear_reference(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7586
7587
7588 /*
7589 * pmap_is_referenced:
7590 *
7591 * Return whether or not the specified physical page is referenced
7592 * by any physical maps.
7593 */
/* Return whether the page's software "referenced" attribute is set. */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7600
7601 /*
7602 * pmap_get_refmod(phys)
7603 * returns the referenced and modified bits of the specified
7604 * physical page.
7605 */
7606 unsigned int
7607 pmap_get_refmod(
7608 ppnum_t pn)
7609 {
7610 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7611 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7612 }
7613
7614 static inline unsigned int
7615 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7616 {
7617 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7618 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7619 }
7620
7621 /*
7622 * pmap_clear_refmod(phys, mask)
7623 * clears the referenced and modified bits as specified by the mask
7624 * of the specified physical page.
7625 */
7626 void
7627 pmap_clear_refmod_options(
7628 ppnum_t pn,
7629 unsigned int mask,
7630 unsigned int options,
7631 void *arg)
7632 {
7633 unsigned int bits;
7634
7635 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7636 phys_attribute_clear(pn, bits, options, arg);
7637 }
7638
/*
 * Perform pmap_clear_refmod_options on a virtual address range.
 * The operation will be performed in bulk & tlb flushes will be coalesced
 * if possible.
 *
 * Returns true if the operation is supported on this platform.
 * If this function returns false, the operation is not supported and
 * nothing has been modified in the pmap.
 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	/* Translate VM_MEM_* mask bits to PP_ATTR_* bits and clear in bulk. */
	unsigned int bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7675
7676 void
7677 pmap_clear_refmod(
7678 ppnum_t pn,
7679 unsigned int mask)
7680 {
7681 pmap_clear_refmod_options(pn, mask, 0, NULL);
7682 }
7683
7684 unsigned int
7685 pmap_disconnect_options(
7686 ppnum_t pn,
7687 unsigned int options,
7688 void *arg)
7689 {
7690 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7691 /*
7692 * On ARM, the "modified" bit is managed by software, so
7693 * we know up-front if the physical page is "modified",
7694 * without having to scan all the PTEs pointing to it.
7695 * The caller should have made the VM page "busy" so noone
7696 * should be able to establish any new mapping and "modify"
7697 * the page behind us.
7698 */
7699 if (pmap_is_modified(pn)) {
7700 /*
7701 * The page has been modified and will be sent to
7702 * the VM compressor.
7703 */
7704 options |= PMAP_OPTIONS_COMPRESSOR;
7705 } else {
7706 /*
7707 * The page hasn't been modified and will be freed
7708 * instead of compressed.
7709 */
7710 }
7711 }
7712
7713 /* disconnect the page */
7714 pmap_page_protect_options(pn, 0, options, arg);
7715
7716 /* return ref/chg status */
7717 return pmap_get_refmod(pn);
7718 }
7719
7720 /*
7721 * Routine:
7722 * pmap_disconnect
7723 *
7724 * Function:
7725 * Disconnect all mappings for this page and return reference and change status
7726 * in generic format.
7727 *
7728 */
7729 unsigned int
7730 pmap_disconnect(
7731 ppnum_t pn)
7732 {
7733 pmap_page_protect(pn, 0); /* disconnect the page */
7734 return pmap_get_refmod(pn); /* return ref/chg status */
7735 }
7736
7737 boolean_t
7738 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7739 {
7740 if (ptoa(first) >= vm_last_phys) {
7741 return FALSE;
7742 }
7743 if (ptoa(last) < vm_first_phys) {
7744 return FALSE;
7745 }
7746
7747 return TRUE;
7748 }
7749
7750 /*
7751 * The state maintained by the noencrypt functions is used as a
7752 * debugging aid on ARM. This incurs some overhead on the part
7753 * of the caller. A special case check in phys_attribute_clear
7754 * (the most expensive path) currently minimizes this overhead,
7755 * but stubbing these functions out on RELEASE kernels yields
7756 * further wins.
7757 */
7758 boolean_t
7759 pmap_is_noencrypt(
7760 ppnum_t pn)
7761 {
7762 #if DEVELOPMENT || DEBUG
7763 boolean_t result = FALSE;
7764
7765 if (!pa_valid(ptoa(pn))) {
7766 return FALSE;
7767 }
7768
7769 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7770
7771 return result;
7772 #else
7773 #pragma unused(pn)
7774 return FALSE;
7775 #endif
7776 }
7777
7778 void
7779 pmap_set_noencrypt(
7780 ppnum_t pn)
7781 {
7782 #if DEVELOPMENT || DEBUG
7783 if (!pa_valid(ptoa(pn))) {
7784 return;
7785 }
7786
7787 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7788 #else
7789 #pragma unused(pn)
7790 #endif
7791 }
7792
7793 void
7794 pmap_clear_noencrypt(
7795 ppnum_t pn)
7796 {
7797 #if DEVELOPMENT || DEBUG
7798 if (!pa_valid(ptoa(pn))) {
7799 return;
7800 }
7801
7802 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7803 #else
7804 #pragma unused(pn)
7805 #endif
7806 }
7807
7808 #if XNU_MONITOR
/* Return TRUE if the given (managed) page is owned by the PPL monitor. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	/* Attribute state only exists for managed pages. */
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7815 #endif
7816
/*
 * Lock the PV head lock for a managed physical page, or the global
 * phys_backup_lock for unmanaged pages (and always the latter on
 * PPL-enabled kernels, where the PVH locks are PPL-private).
 *
 * NOTE: the braces below intentionally serve as the body of either the
 * dangling "else" (non-XNU_MONITOR, unmanaged page) or the whole function
 * tail (XNU_MONITOR) depending on preprocessing — edit with care.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7833
7834
/*
 * Release the lock taken by pmap_lock_phys_page() for the same page:
 * the PV head lock for managed pages, otherwise phys_backup_lock.
 * Mirrors the preprocessor-dependent dangling-else structure of
 * pmap_lock_phys_page() — edit with care.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7851
/*
 * Switch the current CPU's user translation table base (TTBR0) to the
 * given pmap, updating the per-CPU cached nested-pmap/shared-region state
 * used by the fault path.  For the kernel pmap, the user TTB is instead
 * pointed at the invalid table (if not already).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested (shared-region) pmap info for this CPU's fault handling. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR if the target pmap uses a different page-size configuration. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the new table base along with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7880
7881 MARK_AS_PMAP_TEXT void
7882 pmap_clear_user_ttb_internal(void)
7883 {
7884 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7885 }
7886
/*
 * Externally callable wrapper for clearing the user TTB; dispatches to the
 * PPL on monitor-enabled kernels and emits tracepoints around the call.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7898
7899
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* The marker bit must not alias other SW PTE bits the fast-fault code reads. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7913
7914
/*
 * Walk the PV list of the physical page "ppnum" and downgrade every mapping
 * so that access types NOT present in allow_mode will fault, allowing
 * ref/mod state to be re-gathered via software fault handling.  Also
 * updates reusable/internal ledger accounting per the PMAP_OPTIONS_*
 * reusable flags.
 *
 * Performed in two passes: pass 1 rewrites PTEs and tags those needing TLB
 * invalidation with ARM_PTE_FF_MARKER; pass 2 clears the marker and issues
 * the invalidations.  If flush_range is non-NULL, invalidations for VAs
 * covered by the range are deferred to the caller.
 *
 * Expects the PVH lock to be held by the caller iff PMAP_OPTIONS_FF_LOCKED
 * is set; otherwise takes/releases it here.
 *
 * Returns FALSE if the page is unmanaged or a wired mapping was skipped
 * (unless PMAP_OPTIONS_FF_WIRED allows modifying wired mappings).
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; leave them alone. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are skipped (and the call reported as failed) unless FF_WIRED. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/*
			 * Only hardware-visible permission changes (i.e. changes outside the
			 * SW "writeable" bit) require TLB invalidation.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		/* Make pass-1 PTE stores visible before any TLB invalidation. */
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* The two passes must have visited the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush.  Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8228
8229 MARK_AS_PMAP_TEXT boolean_t
8230 arm_force_fast_fault_internal(
8231 ppnum_t ppnum,
8232 vm_prot_t allow_mode,
8233 int options)
8234 {
8235 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8236 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8237 }
8238 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8239 }
8240
/*
 * Routine:	arm_force_fast_fault
 *
 * Function:
 *	Force all mappings for this page to fault according
 *	to the access modes allowed, so we can gather ref/modify
 *	bits again.
 *
 * Returns FALSE for unmanaged pages; otherwise dispatches to the PPL
 * (XNU_MONITOR) or the internal implementation.
 */

boolean_t
arm_force_fast_fault(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	__unused void *arg)
{
	pmap_paddr_t phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

#if XNU_MONITOR
	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
#else
	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
#endif
}
8271
/*
 * Routine:	arm_clear_fast_fault
 *
 * Function:
 *	Clear pending force fault for all mappings for this page based on
 *	the observed fault type, update ref/modify bits.
 *
 * If pte_p is non-NULL, only that single mapping is repaired; otherwise
 * the page's full PV list is walked (in chunks of at most
 * PMAP_MAX_PV_LIST_CHUNK_SIZE entries per call).  Uses the same two-pass
 * scheme as arm_force_fast_fault_with_flush_range(): pass 1 rewrites PTEs
 * and tags those needing invalidation with ARM_PTE_FF_MARKER, pass 2
 * clears the marker and issues the TLB invalidations.
 *
 * Must be called with the PVH lock for the page held.
 * Returns TRUE if at least one PTE was updated.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no target PTE was given, walk the page's entire PV list. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; leave them alone. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Write fault on a software-RO mapping: restore write access and set mod/ref. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault with AF clear: set the access flag and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* Hardware-visible changes (outside the SW writeable bit) require a TLBI. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the work done under the PVH lock; caller may redrive. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make pass-1 PTE stores visible before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must stop at the same chunk boundary as pass 1. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* The two passes must have visited the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8480
/*
 * Determine if the fault was induced by software tracking of
 * modify/reference bits.  If so, re-enable the mapping (and set
 * the appropriate bits).
 *
 * Returns KERN_SUCCESS if the fault was induced and was
 * successfully handled.
 *
 * Returns KERN_FAILURE if the fault was not induced and
 * the function was unable to deal with it.
 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
 * disallows this type of access.
 *
 * Returns KERN_ABORTED if the pmap lock is taken and a
 * preemption is pending.
 *
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_TYPE_FAULT;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Take the pmap lock, but bail out rather than block a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we can take the PVH lock for a stable PTE snapshot:
		 * the PTE may change between reading it and acquiring the lock.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Faults on PPL-owned pages are protection failures, not refmod faults. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings.  Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8638
/*
 * Public entry point for software-tracked ref/mod fault handling.
 * Rejects addresses outside the pmap's VA range, then dispatches to the
 * PPL (XNU_MONITOR) or internal implementation, retrying as long as the
 * implementation reports KERN_ABORTED (lock dropped for pending
 * preemption).
 */
kern_return_t
arm_fast_fault(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;

	if (va < pmap->min || va >= pmap->max) {
		return result;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
	    from_user);

	do {
#if XNU_MONITOR
		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
	} while (result == KERN_ABORTED);

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

	return result;
}
8669
8670 void
8671 pmap_copy_page(
8672 ppnum_t psrc,
8673 ppnum_t pdst)
8674 {
8675 bcopy_phys((addr64_t) (ptoa(psrc)),
8676 (addr64_t) (ptoa(pdst)),
8677 PAGE_SIZE);
8678 }
8679

/*
 * pmap_copy_part_page copies "len" bytes from offset src_offset of page
 * psrc to offset dst_offset of page pdst.
 */
void
pmap_copy_part_page(
	ppnum_t psrc,
	vm_offset_t src_offset,
	ppnum_t pdst,
	vm_offset_t dst_offset,
	vm_size_t len)
{
	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
	    (addr64_t) (ptoa(pdst) + dst_offset),
	    len);
}
8696
8697
8698 /*
8699 * pmap_zero_page zeros the specified (machine independent) page.
8700 */
8701 void
8702 pmap_zero_page(
8703 ppnum_t pn)
8704 {
8705 assert(pn != vm_page_fictitious_addr);
8706 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8707 }
8708
8709 /*
8710 * pmap_zero_part_page
8711 * zeros the specified (machine independent) part of a page.
8712 */
8713 void
8714 pmap_zero_part_page(
8715 ppnum_t pn,
8716 vm_offset_t offset,
8717 vm_size_t len)
8718 {
8719 assert(pn != vm_page_fictitious_addr);
8720 assert(offset + len <= PAGE_SIZE);
8721 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8722 }
8723
/*
 * Map the lowGlo structure at its fixed alias address (LOWGLOBAL_ALIAS) with a
 * kernel read-only, never-executable mapping.  The alias PTE slot must already
 * exist and be empty; this is a one-shot mapping.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	/* Locate the pre-allocated leaf PTE slot for the alias address. */
	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Build a valid, accessed, RO (kernel-only), NX+PNX PTE for lowGlo's physical page. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	/* Non-global: tie the mapping to the kernel's ASID. */
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Make the PTE store visible before invalidating any stale TLB entry. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Inform KASAN that the alias range is now backed and accessible. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8748
8749 vm_offset_t
8750 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8751 {
8752 if (__improbable(index >= CPUWINDOWS_MAX)) {
8753 panic("%s: invalid index %u", __func__, index);
8754 }
8755 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8756 }
8757
/*
 * Establish a temporary per-CPU kernel mapping ("copy window") to physical
 * page 'pn' so its contents can be accessed through a well-known VA.
 *
 * @param pn        physical page number to map.
 * @param prot      VM_PROT_WRITE requests a writable window; otherwise read-only.
 * @param wimg_bits cacheability/memory-type attributes for the mapping.
 *
 * @return the index of the window used; pass it to
 *         pmap_unmap_cpu_windows_copy_internal() to tear the mapping down.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	/* Only non-managed (I/O) addresses carry per-range cache attribute flags. */
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	/* The PPL forbids copy windows onto managed pages (unless PPL enforcement is disabled). */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	/* PPL-protected I/O may only be mapped read-only from outside the PPL. */
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Scan this CPU's windows for one whose PTE is currently empty. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a valid, accessed, never-executable PTE for the target page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	/* Kernel-only access: RW if the caller asked for write, RO otherwise. */
	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Flush using the *previous* mapping's strong-sync requirement, then record the new one. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8832
8833 unsigned int
8834 pmap_map_cpu_windows_copy(
8835 ppnum_t pn,
8836 vm_prot_t prot,
8837 unsigned int wimg_bits)
8838 {
8839 #if XNU_MONITOR
8840 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8841 #else
8842 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8843 #endif
8844 }
8845
/*
 * Tear down a per-CPU copy window previously established by
 * pmap_map_cpu_windows_copy_internal() on the current CPU.
 *
 * @param index window index returned by the corresponding map call.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t *ptep;
	unsigned int cpu_num;
	vm_offset_t cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE with a strong (synchronized) store, then invalidate the TLB entry. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8866
/*
 * Tear down the per-CPU copy window identified by 'index' on the current CPU.
 */
void
pmap_unmap_cpu_windows_copy(unsigned int index)
{
#if XNU_MONITOR
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8877
8878 #if XNU_MONITOR
8879
8880 MARK_AS_PMAP_TEXT void
8881 pmap_invoke_with_page(
8882 ppnum_t page_number,
8883 void *ctx,
8884 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8885 {
8886 #pragma unused(page_number, ctx, callback)
8887 }
8888
8889 /*
8890 * Loop over every pmap_io_range (I/O ranges marked as owned by
8891 * the PPL in the device tree) and conditionally call callback() on each range
8892 * that needs to be included in the hibernation image.
8893 *
8894 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8895 * context is needed in the callback.
8896 * @param callback Callback function invoked on each range (gated by flag).
8897 */
8898 MARK_AS_PMAP_TEXT void
8899 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8900 {
8901 extern const pmap_io_range_t* io_attr_table;
8902 extern const unsigned int num_io_rgns;
8903 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8904 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8905 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8906 }
8907 }
8908 }
8909
8910 /**
8911 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8912 * PPL-owned page. Otherwise, do nothing.
8913 *
8914 * @param addr Physical address of the page to set the HASHED flag on.
8915 */
8916 MARK_AS_PMAP_TEXT void
8917 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8918 {
8919 /* Ignore non-managed kernel memory. */
8920 if (!pa_valid(addr)) {
8921 return;
8922 }
8923
8924 const unsigned int pai = pa_index(addr);
8925 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8926 pv_entry_t **pv_h = pai_to_pvh(pai);
8927
8928 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8929 pvh_lock(pai);
8930 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8931 pvh_unlock(pai);
8932 }
8933 }
8934
8935 /**
8936 * Loop through every physical page in the system and clear out the HASHED flag
8937 * on every PPL-owned page. That flag is used to keep track of which pages have
8938 * been hashed into the hibernation image during the hibernation entry process.
8939 *
8940 * The HASHED flag needs to be cleared out between hibernation cycles because the
8941 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8942 * image with the HASHED flag set on certain pages. It's important to clear the
8943 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8944 * into the hibernation image can't be compromised across hibernation cycles.
8945 */
8946 MARK_AS_PMAP_TEXT void
8947 pmap_clear_ppl_hashed_flag_all(void)
8948 {
8949 const unsigned int last_index = pa_index(vm_last_phys);
8950 pv_entry_t **pv_h = NULL;
8951
8952 for (int pai = 0; pai < last_index; ++pai) {
8953 pv_h = pai_to_pvh(pai);
8954
8955 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8956 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8957 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8958 pvh_lock(pai);
8959 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8960 pvh_unlock(pai);
8961 }
8962 }
8963 }
8964
8965 /**
8966 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8967 * ppl_hib driver will call this after all wired pages have been copied into the
8968 * hibernation image.
8969 */
8970 MARK_AS_PMAP_TEXT void
8971 pmap_check_ppl_hashed_flag_all(void)
8972 {
8973 const unsigned int last_index = pa_index(vm_last_phys);
8974 pv_entry_t **pv_h = NULL;
8975
8976 for (int pai = 0; pai < last_index; ++pai) {
8977 pv_h = pai_to_pvh(pai);
8978
8979 /**
8980 * The PMAP stacks are explicitly not saved into the image so skip checking
8981 * the pages that contain the PMAP stacks.
8982 */
8983 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8984 (pai < pa_index(pmap_stacks_end_pa));
8985
8986 if (!is_pmap_stack &&
8987 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8988 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8989 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8990 }
8991 }
8992 }
8993
8994 #endif /* XNU_MONITOR */
8995
8996 /*
8997 * Indicate that a pmap is intended to be used as a nested pmap
8998 * within one or more larger address spaces. This must be set
8999 * before pmap_nest() is called with this pmap as the 'subordinate'.
9000 */
/*
 * Convert a user pmap into a nested pmap.  Panics if the pmap is of any type
 * other than PMAP_TYPE_USER, is currently active on a CPU (PPL builds), or
 * already has its own nested pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Atomically flip USER -> NESTED; the CAS also defeats racing converters. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	/* NOTE(review): 'i' is unsigned while max_cpu is int; usual conversions make
	 * 'i <= max_cpu' safe here since ml_get_max_cpu_number() is non-negative — confirm. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* The pmap is now nested-only; release its ID through the pt ops layer. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9047
9048 void
9049 pmap_set_nested(
9050 pmap_t pmap)
9051 {
9052 #if XNU_MONITOR
9053 pmap_set_nested_ppl(pmap);
9054 #else
9055 pmap_set_nested_internal(pmap);
9056 #endif
9057 }
9058
9059 bool
9060 pmap_is_nested(
9061 pmap_t pmap)
9062 {
9063 return pmap->type == PMAP_TYPE_NESTED;
9064 }
9065
9066 /*
9067 * pmap_trim_range(pmap, start, end)
9068 *
9069 * pmap = pmap to operate on
9070 * start = start of the range
9071 * end = end of the range
9072 *
9073 * Attempts to deallocate TTEs for the given range in the nested range.
9074 */
/*
 * pmap_trim_range(pmap, start, end)
 *
 * pmap  = pmap to operate on
 * start = start of the range
 * end   = end of the range
 *
 * Attempts to deallocate TTEs for the given range in the nested region.
 * The range is first contracted inward to leaf-table boundaries, so partial
 * tables at the edges are left alone.  A second pass removes L1 entries
 * whose entire L2 table has become empty.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The range must lie wholly within the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	/* The (cur >= adjusted_start) clause guards against wrap-around of 'cur'. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table this L1 entry points to for any live entries. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9190
9191 /**
9192 * State machine for multi-step pmap trimming. Trimming is the action of
9193 * deallocating the TTEs of the shared region of pmaps down to a given range.
9194 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9195 * disabling preemption for too long. These steps include computing the bounds
9196 * of the shared region, trimming the head of the "grand", trimming the tail of
9197 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9198 * different conditions.
9199 *
9200 * @param grand the pmap in which the pages are nested
9201 * @param subord the pmap from which the pages are shared, or nested
9202 * @param vstart start of the used range in "grand"
9203 * @param size size of the used range
9204 * @param state the current state of the state machine
9205 *
9206 * @return the next state of the state machine, to be used in the next call
9207 * into this function.
9208 */
MARK_AS_PMAP_TEXT pmap_trim_state_t
pmap_trim_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	pmap_trim_state_t state)
{
	/* Validation needs to be done regardless of state. */
	addr64_t vend;

	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: grand addr wraps around, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand, subord, (void*)vstart, size, state);
	}

	validate_pmap_mutable(grand);
	validate_pmap(subord);

	/* Trimming only makes sense for a nested pmap inside a user pmap. */
	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subord is of non-nestable type 0x%hhx, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
	}

	if (__improbable(grand->nested_pmap != subord)) {
		panic("%s: grand->nested != subord, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand, subord, (void*)vstart, size, state);
	}

	if (__improbable((size != 0) &&
	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
		panic("%s: grand range not in nested region, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand, subord, (void*)vstart, size, state);
	}


	/* Trimming starts with figuring out the bounds for the grand. */
	if (state == PMAP_TRIM_STATE_START) {
		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

		/**
		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
		 * PMAP_TRIM_STATE_DONE.
		 */
		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
			assert(subord->nested_bounds_set);

			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
			if (!grand->nested_bounds_set) {
				/* Inherit the bounds from subord. */
				grand->nested_region_true_start = subord->nested_region_true_start;
				grand->nested_region_true_end = subord->nested_region_true_end;
				grand->nested_bounds_set = true;
			}

			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

			/* Now that the grand has bounds, we are done. */
			return PMAP_TRIM_STATE_DONE;
		}

		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
		if ((!subord->nested_bounds_set) && size) {
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);

			/* Expand [vstart, vend) outward to leaf-table boundaries. */
			subord->nested_region_true_start = vstart;
			subord->nested_region_true_end = vend;
			subord->nested_region_true_start &= ~adjust_offmask;

			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
				panic("%s: padded true end wraps around, "
				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
				    __func__, grand, subord, (void*)vstart, size, state);
			}

			subord->nested_region_true_end &= ~adjust_offmask;
			subord->nested_bounds_set = true;
		}

		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
		if (subord->nested_bounds_set) {
			/* Inherit the bounds from subord. */
			grand->nested_region_true_start = subord->nested_region_true_start;
			grand->nested_region_true_end = subord->nested_region_true_end;
			grand->nested_bounds_set = true;

			/* If we know the bounds, we can trim the pmap. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

			state = PMAP_TRIM_STATE_GRAND_BEFORE;
		} else {
			/* Don't trim if we don't know the bounds. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

			return PMAP_TRIM_STATE_DONE;
		}
	}

	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
	if (!grand->nested_bounds_set) {
		panic("%s: !grand->nested_bounds_set, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand, subord, (void*)vstart, size, state);
	}

	/* Trim the portion of grand's nested region below the true start. */
	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
		/* The CAS enforces the BEFORE_AND_AFTER -> AFTER state transition exactly once. */
		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
			    (unsigned int)grand->nested_no_bounds_ref_state);
		}

#if XNU_MONITOR
		/* Yield back to the caller if an urgent preemption is pending. */
		if (pmap_pending_preemption()) {
			return PMAP_TRIM_STATE_GRAND_AFTER;
		}
#endif

		state = PMAP_TRIM_STATE_GRAND_AFTER;
	}

	/* Trim the portion of grand's nested region above the true end. */
	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
			    (unsigned int)grand->nested_no_bounds_ref_state);
		}

#if XNU_MONITOR
		if (pmap_pending_preemption()) {
			return PMAP_TRIM_STATE_SUBORD;
		}
#endif

		state = PMAP_TRIM_STATE_SUBORD;
	}

	/* START state is guaranteed to compute the bounds for the subord. */
	if (!subord->nested_bounds_set) {
		panic("%s: !subord->nested_bounds_set, "
		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
		    __func__, grand, subord, (void*)vstart, size, state);
	}

	if (state == PMAP_TRIM_STATE_SUBORD) {
		/**
		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
		 * called once grand's nested tables have been fully trimmed, and can only be called once
		 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
		 * the state update is visible only once the preceding trim operation is complete. An
		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
		 * but locking within pmap_trim_range() should make that harmless (and all but one will
		 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
		 * of the state CAS.
		 */
		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
			    (unsigned int)grand->nested_no_bounds_ref_state);
		}
		pmap_trim_subord(subord);
	}

	return PMAP_TRIM_STATE_DONE;
}
9393
/*
 * Drop this pmap's "no bounds" reference on its nested pmap, if it holds one:
 * trim the pmap's own nested region down to the nested pmap's true bounds
 * (when those are known), then give the nested pmap a chance to trim itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		/* Snapshot the nested pmap's bounds under its lock; use the copies after unlocking. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the slop below the true start and above the true end. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9417
9418 /*
9419 * pmap_trim_subord(grand, subord)
9420 *
9421 * grand = pmap that we have nested subord in
9422 * subord = nested pmap we are attempting to trim
9423 *
9424 * Trims subord if possible
9425 */
9426 MARK_AS_PMAP_TEXT static void
9427 pmap_trim_subord(pmap_t subord)
9428 {
9429 bool contract_subord = false;
9430
9431 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9432
9433 subord->nested_no_bounds_refcnt--;
9434
9435 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9436 /* If this was the last no bounds reference, trim subord. */
9437 contract_subord = true;
9438 }
9439
9440 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9441
9442 if (contract_subord) {
9443 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9444 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9445 }
9446 }
9447
9448 /**
9449 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9450 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9451 * disabling preemption for too long.
9452 *
9453 * @note When we load the shared region we always create pages tables for the
9454 * entire region. In practice, the shared cache may use just a portion
9455 * of that. Before we know the bounds of the shared region, it can
9456 * already be mapped into processes. Therefore, once the bounds are
9457 * known, "trimming" comes in handy to remove the unnecessary page
9458 * tables in the processes the shared region is mapped in, and eventually
9459 * those in the shared region itself. Note that the shared region must
9460 * be trimmed after the user processes because it has the L3 entries
9461 * everyone else is pointing to.
9462 *
9463 * @param grand the pmap in which the pages are nested
9464 * @param subord the pmap from which the pages are shared, or nested
9465 * @param vstart start of the used range in "grand"
9466 * @param size size of the used range
9467 */
9468 void
9469 pmap_trim(
9470 pmap_t grand,
9471 pmap_t subord,
9472 addr64_t vstart,
9473 uint64_t size)
9474 {
9475 pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9476
9477 #if XNU_MONITOR
9478 /* On PPL systems, drives the state machine until its done. */
9479 while (state != PMAP_TRIM_STATE_DONE) {
9480 __assert_only pmap_trim_state_t old_state = state;
9481 state = pmap_trim_ppl(grand, subord, vstart, size, state);
9482
9483 /* Are we making progress? */
9484 assert(old_state != state);
9485 }
9486
9487 pmap_ledger_check_balance(grand);
9488 pmap_ledger_check_balance(subord);
9489 #else
9490 state = pmap_trim_internal(grand, subord, vstart, size, state);
9491
9492 /* On non-PPL systems, we expect the implementation to finish in one call. */
9493 assert(state == PMAP_TRIM_STATE_DONE);
9494 #endif
9495 }
9496
9497 #if HAS_APPLE_PAC
/*
 * Sign a user-space pointer with a process-independent ptrauth key while the
 * caller-supplied user JOP key is temporarily loaded.  Interrupts are disabled
 * around the key swap so execution cannot leave this window with the user key
 * still live.  Only the ASIA and ASDA keys are permitted.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Keep the sign operation inside the enabled-user-key window: prevent the
	 * compiler from reordering it relative to the key enable/disable calls. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: the key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9529
9530 void *
9531 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9532 {
9533 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9534 }
9535
/*
 * Authenticate a user-space pointer signed with a process-independent ptrauth
 * key, with the caller-supplied user JOP key temporarily loaded.  Interrupts
 * are disabled around the key swap so execution cannot leave this window with
 * the user key still live.  Only the ASIA and ASDA keys are permitted.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the auth operation inside the enabled-user-key window: prevent the
	 * compiler from reordering it relative to the key enable/disable calls. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9556
9557 void *
9558 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9559 {
9560 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9561 }
9562 #endif /* HAS_APPLE_PAC */
9563
9564 /*
9565 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9566 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9567 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9568 * return value, to indicate where a preempted [un]nest operation should resume.
9569 * When the return value contains the ending address of the nested region with
9570 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9571 */
9572 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9573
9574 /*
9575 * kern_return_t pmap_nest(grand, subord, vstart, size)
9576 *
9577 * grand = the pmap that we will nest subord into
9578 * subord = the pmap that goes into the grand
9579 * vstart = start of range in pmap to be inserted
9580 * size = Size of nest area (up to 16TB)
9581 *
9582 * Inserts a pmap into another. This is used to implement shared segments.
9583 *
9584 */
9585
9586 /**
9587 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9588 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9589 * This function operates in 3 main phases:
9590 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9591 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9592 * the mapping range are present in subord.
9593 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9594 * contains pointers to subord's leaf-level pagetable pages for the specified
9595 * VA range.
9596 *
9597 * This function may return early due to pending AST_URGENT preemption; if so
9598 * it will indicate the need to be re-entered.
9599 *
9600 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9601 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9602 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9603 * @param size twig-aligned size of the nesting range
9604 * @param vrestart the twig-aligned starting address of the current call. May contain
9605 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9606 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9607 * KERN_RESOURCE_SHORTAGE on allocation failure.
9608 *
9609 * @return the virtual address at which to restart the operation, possibly including
9610 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9611 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9612 */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t *stte_p;
	tt_entry_t *gtte_p;
	uint64_t nested_region_unnested_table_bitmap_size;
	unsigned int* nested_region_unnested_table_bitmap = NULL;
	uint64_t new_nested_region_unnested_table_bitmap_size;
	unsigned int* new_nested_region_unnested_table_bitmap = NULL;
	int expand_options = 0;
	bool deref_subord = true;
	bool grand_locked = false;

	/*
	 * Sanity-check the requested range: it must not wrap, and the restart
	 * cursor (with the PMAP_NEST_GRAND phase bit masked off) must fall
	 * within [vstart, vend].
	 */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here. validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed. If destruction is already committed,
	 * then the check of ref_count below will cover us. If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/*
	 * Take a reference on subord for the duration of this call.  If this
	 * turns out to be grand's first nesting operation, the reference is
	 * kept (deref_subord is cleared below) and released only when grand is
	 * destroyed.
	 */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	/* The PPL must not block on allocation; callers retry on KERN_RESOURCE_SHORTAGE. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1a: if subord has never been nested, allocate and install its
	 * unnested-table bitmap and record the nested region's address/size.
	 */
	if (subord->nested_region_unnested_table_bitmap == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		/**
		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
		 */
		nested_region_unnested_table_bitmap_size <<= 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/*
		 * Recheck under the lock: a concurrent nest may have installed the
		 * bitmap while we were allocating (lock was not held during alloc).
		 */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* If we lost the race, free the bitmap we allocated but never installed. */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * Phase 1b: if the requested range extends beyond subord's current
	 * nested region, grow the unnested-table bitmap to cover the new end.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t new_size;

		nested_region_unnested_table_bitmap = NULL;
		nested_region_unnested_table_bitmap_size = 0ULL;
		new_size = vend - subord->nested_region_addr;

		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
		new_nested_region_unnested_table_bitmap_size <<= 1;

		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_unnested_table_bitmap = kalloc_data(
			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/*
		 * Recheck under the lock: copy the old bitmap into the new one and
		 * swap it in only if nobody else already grew the region past us.
		 */
		if (subord->nested_region_size < new_size) {
			bcopy(subord->nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
			nested_region_unnested_table_bitmap_size = subord->nested_region_unnested_table_bitmap_size;
			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free whichever bitmap is left over: the replaced old one... */
		if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_unnested_table_bitmap,
			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_unnested_table_bitmap = NULL;
		}
		/* ...or the new one we allocated but did not install. */
		if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_unnested_table_bitmap,
			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
			new_nested_region_unnested_table_bitmap = NULL;
		}
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	/* Phase 1c: attempt to claim grand's (single) nested-pmap slot. */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
		/**
		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
		 * into a nested pmap, which would then produce multiple levels of nesting.
		 */
		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
		}
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known. We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		if (__improbable(vstart < subord->nested_region_addr ||
		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* Grand already has a nested pmap; it must be subord and the range must only grow forward. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor to subord's trimmed ("true") bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* If a prior pass already finished phase 2, skip straight to phase 3. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/*
	 * Phase 2: expand subord so leaf-level page tables exist for the whole
	 * range, periodically yielding if urgent preemption is pending.
	 */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Phase 2 complete: set PMAP_NEST_GRAND so a re-entry resumes at phase 3. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 3: copy twig-level TTEs from subord into grand. */
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page. Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate. In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached the trimmed end: report the full requested range as complete. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make any newly written TTEs visible before translations can use them. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
	/* Report failure (if any) to the caller through krp. */
#if XNU_MONITOR
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	/* Free any bitmap allocations that were never installed into subord. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (new_nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(new_nested_region_unnested_table_bitmap,
		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}
	/* Drop the transient reference unless grand now owns it (first nest). */
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
10047
/*
 * Kernel-facing entry point for nesting 'subord' into 'grand' over
 * [vstart, vstart + size).  Drives the preemptible internal (or PPL)
 * implementation until it reports completion, which is signalled by it
 * returning the range's end address with PMAP_NEST_GRAND set.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful pass must advance the cursor; otherwise we'd spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10106
10107 /*
10108 * kern_return_t pmap_unnest(grand, vaddr)
10109 *
10110 * grand = the pmap that will have the virtual range unnested
10111 * vaddr = start of range in pmap to be unnested
10112 * size = size of range in pmap to be unnested
10113 *
10114 */
10115
10116 kern_return_t
10117 pmap_unnest(
10118 pmap_t grand,
10119 addr64_t vaddr,
10120 uint64_t size)
10121 {
10122 return pmap_unnest_options(grand, vaddr, size, 0);
10123 }
10124
10125 /**
10126 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10127 * from a top-level pmap ('grand'). The corresponding mappings in the nested
10128 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10129 * still have the region nested. The mappings in 'grand' will be left empty
10130 * with the assumption that they will be demand-filled by subsequent access faults.
10131 *
10132 * This function operates in 2 main phases:
10133 * 1. Iteration over the nested pmap's mappings for the specified range to mark
10134 * them non-global.
10135 * 2. Clearing of the twig-level TTEs for the address range in grand.
10136 *
10137 * This function may return early due to pending AST_URGENT preemption; if so
10138 * it will indicate the need to be re-entered.
10139 *
10140 * @param grand pmap from which to unnest mappings
10141 * @param vaddr twig-aligned virtual address for the beginning of the nested range
10142 * @param size twig-aligned size of the nested range
10143 * @param vrestart the page-aligned starting address of the current call. May contain
10144 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10145 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10146 * grand is being torn down and step 1) above is not needed.
10147 *
10148 * @return the virtual address at which to restart the operation, possibly including
10149 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
10150 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10151 */
10152 MARK_AS_PMAP_TEXT vm_map_offset_t
10153 pmap_unnest_options_internal(
10154 pmap_t grand,
10155 addr64_t vaddr,
10156 uint64_t size,
10157 vm_map_offset_t vrestart,
10158 unsigned int option)
10159 {
10160 vm_map_offset_t start;
10161 vm_map_offset_t addr;
10162 tt_entry_t *tte_p;
10163 unsigned int current_index;
10164 unsigned int start_index;
10165 unsigned int max_index;
10166 unsigned int entry_count = 0;
10167
10168 addr64_t vend;
10169 addr64_t true_end;
10170 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
10171 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
10172 }
10173 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
10174 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
10175 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
10176 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
10177 }
10178
10179 validate_pmap_mutable(grand);
10180
10181 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10182 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10183 }
10184
10185 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
10186
10187 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10188 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
10189 (unsigned long long)vaddr, (unsigned long long)size);
10190 }
10191
10192 if (__improbable(grand->nested_pmap == NULL)) {
10193 panic("%s: %p has no nested pmap", __func__, grand);
10194 }
10195
10196 true_end = vend;
10197 if (true_end > grand->nested_pmap->nested_region_true_end) {
10198 true_end = grand->nested_pmap->nested_region_true_end;
10199 }
10200
10201 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10202 if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10203 return vrestart;
10204 }
10205
10206 start = vrestart;
10207 if (start < grand->nested_pmap->nested_region_true_start) {
10208 start = grand->nested_pmap->nested_region_true_start;
10209 }
10210 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10211 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10212 bool flush_tlb = false;
10213
10214 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10215 pt_entry_t *bpte, *cpte;
10216
10217 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10218
10219 bpte = pmap_pte(grand->nested_pmap, addr);
10220
10221 /*
10222 * If we've re-entered this function partway through unnesting a leaf region, the
10223 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10224 * the run of PTEs and the adjacent "in-progress" bit will be set.
10225 */
10226 if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10227 testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
10228 /*
10229 * Mark the 'twig' region as being unnested. Every mapping entered within
10230 * the nested pmap in this region will now be marked non-global. Do this
10231 * before marking any of the PTEs within the region as non-global to avoid
10232 * the possibility of pmap_enter() subsequently inserting a global mapping
10233 * in the region, which could lead to a TLB conflict if a non-global entry
10234 * is later inserted for the same VA in a pmap which has fully unnested this
10235 * region.
10236 */
10237 setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10238 setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10239 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10240 pmap_paddr_t pa;
10241 unsigned int pai = 0;
10242 boolean_t managed = FALSE;
10243 pt_entry_t spte;
10244
10245 if ((*cpte != ARM_PTE_TYPE_FAULT)
10246 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10247 spte = *((volatile pt_entry_t*)cpte);
10248 while (!managed) {
10249 pa = pte_to_pa(spte);
10250 if (!pa_valid(pa)) {
10251 break;
10252 }
10253 pai = pa_index(pa);
10254 pvh_lock(pai);
10255 spte = *((volatile pt_entry_t*)cpte);
10256 pa = pte_to_pa(spte);
10257 if (pai == pa_index(pa)) {
10258 managed = TRUE;
10259 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10260 }
10261 pvh_unlock(pai);
10262 }
10263
10264 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10265 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10266 flush_tlb = true;
10267 }
10268
10269 if (managed) {
10270 pvh_assert_locked(pai);
10271 pvh_unlock(pai);
10272 }
10273 }
10274
10275 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10276 vrestart = addr;
10277 ++entry_count;
10278 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10279 pmap_pending_preemption())) {
10280 goto unnest_subord_done;
10281 }
10282 }
10283 clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10284 }
10285 addr = vlim;
10286 vrestart = addr;
10287 ++entry_count;
10288 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10289 pmap_pending_preemption())) {
10290 break;
10291 }
10292 }
10293
10294 unnest_subord_done:
10295 if (flush_tlb) {
10296 FLUSH_PTE_STRONG();
10297 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10298 }
10299
10300 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10301 if (current_index < max_index) {
10302 return vrestart;
10303 }
10304 }
10305
10306 /*
10307 * invalidate all pdes for segment at vaddr in pmap grand
10308 */
10309 if (vrestart & PMAP_NEST_GRAND) {
10310 addr = vrestart & ~PMAP_NEST_GRAND;
10311 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10312 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10313 }
10314 } else {
10315 addr = vaddr;
10316 vrestart = vaddr | PMAP_NEST_GRAND;
10317 }
10318
10319 /**
10320 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10321 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10322 * upon reentry.
10323 */
10324 if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10325 return vrestart;
10326 }
10327
10328 if (addr < grand->nested_pmap->nested_region_true_start) {
10329 addr = grand->nested_pmap->nested_region_true_start;
10330 }
10331
10332 start = addr;
10333
10334 while (addr < true_end) {
10335 tte_p = pmap_tte(grand, addr);
10336 /*
10337 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10338 * so it's possible that a region we're trying to unnest may not have been
10339 * nested in the first place.
10340 */
10341 if (tte_p != NULL) {
10342 *tte_p = ARM_TTE_TYPE_FAULT;
10343 }
10344 addr += pt_attr_twig_size(pt_attr);
10345 vrestart = addr | PMAP_NEST_GRAND;
10346 ++entry_count;
10347 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10348 pmap_pending_preemption())) {
10349 break;
10350 }
10351 }
10352 if (addr >= true_end) {
10353 vrestart = vend | PMAP_NEST_GRAND;
10354 }
10355
10356 FLUSH_PTE_STRONG();
10357 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10358
10359 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10360
10361 return vrestart;
10362 }
10363
10364 kern_return_t
10365 pmap_unnest_options(
10366 pmap_t grand,
10367 addr64_t vaddr,
10368 uint64_t size,
10369 unsigned int option)
10370 {
10371 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10372 vm_map_offset_t vend = vaddr + size;
10373
10374 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10375 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10376
10377 pmap_verify_preemptible();
10378 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10379 #if XNU_MONITOR
10380 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10381 #else
10382 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10383 #endif
10384 }
10385
10386 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10387
10388 return KERN_SUCCESS;
10389 }
10390
/*
 * No start/end adjustment is needed for unnesting on this architecture;
 * unconditionally returning TRUE lets the caller proceed (per the original
 * comment, "to get to log_unnest_badness()").
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10399
#if PMAP_FORK_NEST
/**
 * Pre-nest the parent's shared region into the child's pmap at fork() time.
 *
 * @note This should only be called from vm_map_fork().
 *
 * @param old_pmap The pmap of the parent task.
 * @param new_pmap The pmap of the child task.
 * @param nesting_start Output parameter updated with the start address of the
 *                      range that was pre-nested.
 * @param nesting_end Output parameter updated with the end address of the
 *                    range that was pre-nested.
 *
 * @return KERN_SUCCESS if the pre-nesting completed (or the parent had
 *         nothing nested); KERN_INVALID_ARGUMENT if either pmap is NULL.
 */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if ((old_pmap == NULL) || (new_pmap == NULL)) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Parent has no nested shared-region pmap: nothing to pre-nest. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}

	/* Nest the parent's shared-region pmap into the child at the same range. */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);

	/* The child must now record the same nesting parameters as the parent. */
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);

	const vm_map_offset_t region_start = old_pmap->nested_region_addr;

	*nesting_start = region_start;
	*nesting_end = region_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
#endif /* PMAP_FORK_NEST */
10449
10450 /*
10451 * disable no-execute capability on
10452 * the specified pmap
10453 */
10454 #if DEVELOPMENT || DEBUG
10455 void
10456 pmap_disable_NX(
10457 pmap_t pmap)
10458 {
10459 pmap->nx_enabled = FALSE;
10460 }
10461 #else
10462 void
10463 pmap_disable_NX(
10464 __unused pmap_t pmap)
10465 {
10466 }
10467 #endif
10468
10469 /*
10470 * flush a range of hardware TLB entries.
10471 * NOTE: assumes the smallest TLB entry in use will be for
10472 * an ARM small page (4K).
10473 */
10474
10475 #if __ARM_RANGE_TLBI__
10476 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10477 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10478 #else
10479 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10480 #endif // __ARM_RANGE_TLBI__
10481 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10482 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10483 "of npages to 32 bits below may truncate.");
10484
/*
 * Issue (but do not synchronize) TLB invalidations covering [va, va + length)
 * for the given pmap, choosing among per-entry, range, ASID, and full flushes
 * based on the number of pmap pages covered.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Large ranges: cheaper to flush by ASID (or everything) than per entry. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * ASID 0 or a nested pmap cannot be flushed by ASID alone —
		 * presumably their entries are not confined to a single ASID;
		 * fall back to a full TLB flush. (NOTE(review): confirm.)
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-size ranges: use the hardware range-TLBI instruction when available. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small ranges: flush entry by entry; encode ASID + address into the TLBI operands. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10537
10538 MARK_AS_PMAP_TEXT static void
10539 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10540 {
10541 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10542 }
10543
/*
 * Flush TLB entries for a kernel virtual address range and wait for the
 * invalidations to complete (synchronous wrapper over the _async variant,
 * always operating on kernel_pmap).
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10552
10553 unsigned int
10554 pmap_cache_attributes(
10555 ppnum_t pn)
10556 {
10557 pmap_paddr_t paddr;
10558 unsigned int pai;
10559 unsigned int result;
10560 pp_attr_t pp_attr_current;
10561
10562 paddr = ptoa(pn);
10563
10564 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10565
10566 if (!pa_valid(paddr)) {
10567 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10568 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10569 }
10570
10571 result = VM_WIMG_DEFAULT;
10572
10573 pai = pa_index(paddr);
10574
10575 pp_attr_current = pp_attr_table[pai];
10576 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10577 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10578 }
10579 return result;
10580 }
10581
/*
 * Perform the cache maintenance required after page pn transitions from
 * wimg_bits_prev to wimg_bits_new: synchronize I/D caches when leaving a
 * cacheable type, and force-clean the data cache when entering VM_WIMG_RT.
 *
 * NOTE(review): the clause
 *   ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))
 * is tautologically true (a value cannot equal both), so any change away from
 * VM_WIMG_WTHRU always syncs. Presumably '&&' was intended; the current form
 * is at worst an extra (safe) flush — confirm intent before changing.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Newly-RT pages must not have stale dirty lines: force a clean to PoC. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10598
/*
 * Switch the cache attributes of a single managed (and, under PPL,
 * non-PPL-owned) page between prev_cacheattr and new_cacheattr, updating its
 * mappings under the PVH lock and then performing any required cache
 * maintenance. Used by the compressor map/unmap paths below.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must not have their attributes changed by the kernel. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* Rewrite all PTEs mapping this page (perform_tlbi == true: flush now). */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance must happen outside the PVH lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10623
/*
 * Return a kernel virtual mapping (via the physical aperture) for a
 * compressor page. On configurations with a PTE-level physical aperture,
 * any non-default cache attribute is first switched to VM_WIMG_DEFAULT so
 * the aperture access is well-behaved; the page's original attribute is
 * restored by pmap_unmap_compressor_page().
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10639
/*
 * Undo pmap_map_compressor_page(): re-read the page's recorded cache
 * attribute (pp_attr_table is left unchanged by the map path) and, if it is
 * not the default, restore the page's mappings from VM_WIMG_DEFAULT back to
 * that attribute.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10654
10655 /**
10656 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10657 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10658 *
10659 * @param page_list List of pages to be updated.
10660 * @param cacheattr The new cache attribute.
10661 */
10662 void
10663 pmap_batch_set_cache_attributes(
10664 const unified_page_list_t *page_list,
10665 unsigned int cacheattr)
10666 {
10667 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10668
10669 if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
10670 /**
10671 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
10672 * In an ideal world we would just use these iterator functions within
10673 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
10674 * that means we'll need to take special care to handle pending preemption and
10675 * if necessary return the iterator position out to this function and then re-enter
10676 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
10677 * secure manner. Not impossible, but also not trivial, so unless someone asks for
10678 * this perf improvement on the PPL I'm going to take the lazy approach here.
10679 */
10680 unified_page_list_iterator_t iter;
10681
10682 for (unified_page_list_iterator_init(page_list, &iter);
10683 !unified_page_list_iterator_end(&iter);
10684 unified_page_list_iterator_next(&iter)) {
10685 bool is_fictitious = false;
10686 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10687 if (__probable(!is_fictitious)) {
10688 #if XNU_MONITOR
10689 pmap_set_cache_attributes_ppl(pn, cacheattr);
10690 #else /* !XNU_MONITOR */
10691 pmap_set_cache_attributes_internal(pn, cacheattr);
10692 #endif /* XNU_MONITOR */
10693 }
10694 }
10695 return;
10696 }
10697
10698 if (page_list->upl.upl_size == 0) {
10699 return;
10700 }
10701
10702 batch_set_cache_attr_state_t states;
10703 states.page_index = 0;
10704 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10705 states.tlb_flush_pass_needed = false;
10706 states.rt_cache_flush_pass_needed = false;
10707
10708 /* Verify we are being called from a preemptible context. */
10709 pmap_verify_preemptible();
10710
10711 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10712 #if XNU_MONITOR
10713 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
10714 states, page_list->upl.upl_size, cacheattr);
10715 #else /* !XNU_MONITOR */
10716 states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
10717 states, page_list->upl.upl_size, cacheattr);
10718 #endif /* XNU_MONITOR */
10719 }
10720
10721 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10722 }
10723
10724 /**
10725 * Flushes TLB entries associated with the page specified by paddr, but do not
10726 * issue barriers yet.
10727 *
10728 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10729 */
10730 MARK_AS_PMAP_TEXT static void
10731 pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
10732 {
10733 #if __ARM_PTE_PHYSMAP__
10734 /* Flush the physical aperture mappings. */
10735 const vm_offset_t kva = phystokv(paddr);
10736 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10737 #endif /* __ARM_PTE_PHYSMAP__ */
10738
10739 /* Flush the mappings tracked in the ptes. */
10740 const unsigned int pai = pa_index(paddr);
10741 pv_entry_t **pv_h = pai_to_pvh(pai);
10742
10743 pt_entry_t *pte_p = PT_ENTRY_NULL;
10744 pv_entry_t *pve_p = PV_ENTRY_NULL;
10745
10746 pvh_assert_locked(pai);
10747
10748 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10749 pte_p = pvh_ptep(pv_h);
10750 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10751 pve_p = pvh_pve_list(pv_h);
10752 pte_p = PT_ENTRY_NULL;
10753 }
10754
10755 int pve_ptep_idx = 0;
10756 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10757 if (pve_p != PV_ENTRY_NULL) {
10758 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10759 if (pte_p == PT_ENTRY_NULL) {
10760 goto flush_tlb_skip_pte;
10761 }
10762 }
10763
10764 #ifdef PVH_FLAG_IOMMU
10765 if (pvh_ptep_is_iommu(pte_p)) {
10766 goto flush_tlb_skip_pte;
10767 }
10768 #endif /* PVH_FLAG_IOMMU */
10769 pmap_t pmap = ptep_get_pmap(pte_p);
10770 vm_map_address_t va = ptep_get_va(pte_p);
10771
10772 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
10773 pmap, true, false);
10774
10775 flush_tlb_skip_pte:
10776 pte_p = PT_ENTRY_NULL;
10777 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10778 pve_ptep_idx = 0;
10779 pve_p = pve_next(pve_p);
10780 }
10781 }
10782 }
10783
10784 /**
10785 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10786 *
10787 * @param pai The Physical Address Index of the entry.
10788 * @param cacheattr The new cache attribute.
10789 */
10790 MARK_AS_PMAP_TEXT static void
10791 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10792 {
10793 pvh_assert_locked(pai);
10794
10795 pp_attr_t pp_attr_current, pp_attr_template;
10796 do {
10797 pp_attr_current = pp_attr_table[pai];
10798 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10799
10800 /**
10801 * WIMG bits should only be updated under the PVH lock, but we should do
10802 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10803 */
10804 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10805 }
10806
10807 /**
10808 * Batch updates the cache attributes of a list of pages in three passes.
10809 *
10810 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10811 * In pass two, TLB entries are flushed for each page in the list if necessary.
10812 * In pass three, caches are cleaned for each page in the list if necessary.
10813 *
10814 * When running in PPL, this function may decide to return to the caller in response
10815 * to AST_URGENT.
10816 *
10817 * @param user_page_list List of pages to be updated.
10818 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10819 * @param page_cnt Number of pages in total in user_page_list.
10820 * @param cacheattr The new cache attributes.
10821 *
10822 * @return The new state of the state machine.
10823 */
10824 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10825 pmap_batch_set_cache_attributes_internal(
10826 #if XNU_MONITOR
10827 volatile upl_page_info_t *user_page_list,
10828 #else /* !XNU_MONITOR */
10829 upl_page_info_array_t user_page_list,
10830 #endif /* XNU_MONITOR */
10831 batch_set_cache_attr_state_t states,
10832 unsigned int page_cnt,
10833 unsigned int cacheattr)
10834 {
10835 uint64_t page_index = states.page_index;
10836 uint64_t state = states.state;
10837 bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10838 bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10839
10840 /* For verifying progress. */
10841 __assert_only const uint64_t page_index_old = page_index;
10842 __assert_only const uint64_t state_old = state;
10843
10844 /* Assert page_index and state are within their range. */
10845 if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10846 panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10847 }
10848
10849 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10850 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10851 /* Update cache attributes of the pages until there's an urgent AST or it's done. */
10852 while (page_index < page_cnt) {
10853 const ppnum_t pn = user_page_list[page_index].phys_addr;
10854 const pmap_paddr_t paddr = ptoa(pn);
10855
10856 if (!pa_valid(paddr)) {
10857 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10858 }
10859
10860 const unsigned int pai = pa_index(paddr);
10861
10862 /* Lock the page. */
10863 pvh_lock(pai);
10864
10865 #if XNU_MONITOR
10866 if (ppattr_pa_test_monitor(paddr)) {
10867 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10868 }
10869 #endif /* XNU_MONITOR */
10870 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10871
10872 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10873 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10874 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10875 }
10876
10877 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10878
10879 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10880 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10881 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10882 }
10883
10884 /* Update the cache attributes in PTE and PP_ATTR table. */
10885 if (wimg_bits_new != wimg_bits_prev) {
10886 tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10887 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10888 }
10889
10890 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10891 rt_cache_flush_pass_needed = true;
10892 }
10893
10894 pvh_unlock(pai);
10895
10896 page_index++;
10897
10898 #if XNU_MONITOR
10899 /**
10900 * Check for AST_URGENT every page, as the pve list search in cache
10901 * update can take non-constant time.
10902 */
10903 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10904 goto pbscai_exit;
10905 }
10906 #endif /* XNU_MONITOR */
10907 }
10908
10909 /* page_index == page_cnt && !pmap_pending_preemption() */
10910 if (tlb_flush_pass_needed) {
10911 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10912 } else if (rt_cache_flush_pass_needed) {
10913 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10914 } else {
10915 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10916 }
10917 page_index = 0;
10918
10919 /* Sync the PTE writes before potential TLB/Cache flushes. */
10920 FLUSH_PTE_STRONG();
10921
10922 #if XNU_MONITOR
10923 if (__improbable(pmap_pending_preemption())) {
10924 goto pbscai_exit;
10925 }
10926 #endif /* XNU_MONITOR */
10927 }
10928
10929 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10930 /**
10931 * Pass 2: for each physical page and for each mapping, we need to flush
10932 * the TLB for it.
10933 */
10934 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10935 while (page_index < page_cnt) {
10936 const ppnum_t pn = user_page_list[page_index].phys_addr;
10937
10938 const pmap_paddr_t paddr = ptoa(pn);
10939 if (!pa_valid(paddr)) {
10940 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10941 }
10942
10943 const unsigned int pai = pa_index(paddr);
10944
10945 pvh_lock(pai);
10946 pmap_flush_tlb_for_paddr_locked_async(paddr);
10947 pvh_unlock(pai);
10948
10949 page_index++;
10950
10951 #if XNU_MONITOR
10952 /**
10953 * Check for AST_URGENT every page, as the pve list search in cache
10954 * update can take non-constant time.
10955 */
10956 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10957 goto pbscai_exit;
10958 }
10959 #endif /* XNU_MONITOR */
10960 }
10961
10962 #if HAS_FEAT_XS
10963 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10964 arm64_sync_tlb(false);
10965 #else
10966 /**
10967 * For targets that distinguish between mild and strong DSB, mild DSB
10968 * will not drain the prefetcher. This can lead to prefetch-driven
10969 * cache fills that defeat the uncacheable requirement of the RT memory type.
10970 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10971 */
10972 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10973 #endif
10974
10975 if (rt_cache_flush_pass_needed) {
10976 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10977 } else {
10978 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10979 }
10980 page_index = 0;
10981
10982 #if XNU_MONITOR
10983 if (__improbable(pmap_pending_preemption())) {
10984 goto pbscai_exit;
10985 }
10986 #endif /* XNU_MONITOR */
10987 }
10988
10989 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
10990 /* Pass 3: Flush the cache if the page is recently set to RT */
10991 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
10992 #if !XNU_MONITOR
10993 /**
10994 * On non-PPL platforms, we disable preemption to ensure we are not preempted
10995 * in the state where DC by VA instructions remain enabled.
10996 */
10997 disable_preemption();
10998 #endif /* !XNU_MONITOR */
10999
11000 assert(get_preemption_level() > 0);
11001
11002 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11003 /**
11004 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
11005 * and the host will handle cache maintenance for it. So we don't need to
11006 * worry about enabling the ops here for AVP.
11007 */
11008 enable_dc_mva_ops();
11009 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11010
11011 while (page_index < page_cnt) {
11012 const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
11013
11014 if (!pa_valid(paddr)) {
11015 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
11016 }
11017
11018 CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
11019
11020 page_index++;
11021
11022 #if XNU_MONITOR
11023 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
11024 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11025 disable_dc_mva_ops();
11026 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11027 goto pbscai_exit;
11028 }
11029 #endif /* XNU_MONITOR */
11030 }
11031
11032 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11033 disable_dc_mva_ops();
11034 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11035
11036 #if !XNU_MONITOR
11037 enable_preemption();
11038 #endif /* !XNU_MONITOR */
11039
11040 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
11041 page_index = 0;
11042 }
11043
11044 #if XNU_MONITOR
11045 pbscai_exit:
11046 #endif /* XNU_MONITOR */
11047 /* Assert page_index and state are within their range. */
11048 assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
11049
11050 /* Make sure we are making progress in this call. */
11051 assert(page_index > page_index_old || state > state_old);
11052
11053 batch_set_cache_attr_state_t states_new;
11054 states_new.page_index = page_index;
11055 states_new.state = state;
11056 states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
11057 states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
11058 return states_new;
11059 }
11060
/*
 * Set the cache attributes of a single managed page: record the new WIMG bits
 * in pp_attr_table (CAS loop under the PVH lock), rewrite all mappings if the
 * effective attribute changed, then perform any required cache maintenance.
 *
 * 'external' distinguishes kernel-initiated calls from PPL-internal ones and
 * is only used (under XNU_MONITOR) to validate page ownership.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may only touch non-PPL pages; internal only PPL pages. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Absent WIMG bits mean VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the mappings (and flush TLBs) if the attribute changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance happens after dropping the PVH lock. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11123
/* Kernel-facing entry point: external == TRUE marks the caller as non-PPL. */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
11131
/*
 * Public wrapper: dispatch to the PPL entry point on PPL-enabled platforms,
 * or call the _internal helper directly elsewhere.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
11143
11144 /**
11145 * Updates the page numbered ppnum to have attribute specified by attributes.
11146 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11147 * The necessity of the TLB flush is returned in case this function is called
11148 * in a batched manner and the TLB flush is intended to be done at a different
11149 * timing.
11150 *
11151 * @param ppnum Page Number of the page to be updated.
11152 * @param attributes The new cache attributes.
11153 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11154 * immediately.
11155 *
11156 * @return Returns true if a TLB flush is needed for this update regardless of
11157 * whether a flush has occurred already.
11158 */
11159 MARK_AS_PMAP_TEXT bool
11160 pmap_update_cache_attributes_locked(
11161 ppnum_t ppnum,
11162 unsigned attributes,
11163 bool perform_tlbi)
11164 {
11165 pmap_paddr_t phys = ptoa(ppnum);
11166 pv_entry_t *pve_p;
11167 pt_entry_t *pte_p;
11168 pv_entry_t **pv_h;
11169 pt_entry_t tmplate;
11170 unsigned int pai;
11171 boolean_t tlb_flush_needed = false;
11172
11173 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
11174
11175 if (pmap_panic_dev_wimg_on_managed) {
11176 switch (attributes & VM_WIMG_MASK) {
11177 case VM_WIMG_IO: // nGnRnE
11178 case VM_WIMG_POSTED: // nGnRE
11179 /* supported on DRAM, but slow, so we disallow */
11180
11181 case VM_WIMG_POSTED_REORDERED: // nGRE
11182 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
11183 /* unsupported on DRAM */
11184
11185 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
11186 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
11187 break;
11188
11189 default:
11190 /* not device type memory, all good */
11191
11192 break;
11193 }
11194 }
11195
11196 #if __ARM_PTE_PHYSMAP__
11197 vm_offset_t kva = phystokv(phys);
11198 pte_p = pmap_pte(kernel_pmap, kva);
11199
11200 tmplate = *pte_p;
11201 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11202 #if XNU_MONITOR
11203 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
11204 #else
11205 tmplate |= wimg_to_pte(attributes, phys);
11206 #endif
11207 if (tmplate & ARM_PTE_HINT_MASK) {
11208 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
11209 __FUNCTION__, pte_p, (void *)kva, tmplate);
11210 }
11211
11212 if (perform_tlbi) {
11213 write_pte_strong(pte_p, tmplate);
11214 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
11215 } else {
11216 write_pte_fast(pte_p, tmplate);
11217 }
11218 tlb_flush_needed = true;
11219 #endif
11220
11221 pai = pa_index(phys);
11222
11223 pv_h = pai_to_pvh(pai);
11224
11225 pte_p = PT_ENTRY_NULL;
11226 pve_p = PV_ENTRY_NULL;
11227 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11228 pte_p = pvh_ptep(pv_h);
11229 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11230 pve_p = pvh_pve_list(pv_h);
11231 pte_p = PT_ENTRY_NULL;
11232 }
11233
11234 int pve_ptep_idx = 0;
11235 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11236 vm_map_address_t va;
11237 pmap_t pmap;
11238
11239 if (pve_p != PV_ENTRY_NULL) {
11240 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11241 if (pte_p == PT_ENTRY_NULL) {
11242 goto cache_skip_pve;
11243 }
11244 }
11245
11246 #ifdef PVH_FLAG_IOMMU
11247 if (pvh_ptep_is_iommu(pte_p)) {
11248 goto cache_skip_pve;
11249 }
11250 #endif
11251 pmap = ptep_get_pmap(pte_p);
11252 #if HAS_FEAT_XS
11253 /**
11254 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11255 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11256 */
11257 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11258 #endif /* HAS_FEAT_XS */
11259 va = ptep_get_va(pte_p);
11260
11261 tmplate = *pte_p;
11262 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11263 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11264
11265 if (perform_tlbi) {
11266 write_pte_strong(pte_p, tmplate);
11267 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11268 pmap, true, false);
11269 } else {
11270 write_pte_fast(pte_p, tmplate);
11271 }
11272 tlb_flush_needed = true;
11273
11274 cache_skip_pve:
11275 pte_p = PT_ENTRY_NULL;
11276 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11277 pve_ptep_idx = 0;
11278 pve_p = pve_next(pve_p);
11279 }
11280 }
11281 if (perform_tlbi && tlb_flush_needed) {
11282 #if HAS_FEAT_XS
11283 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11284 arm64_sync_tlb(false);
11285 #else
11286 /**
11287 * For targets that distinguish between mild and strong DSB, mild DSB
11288 * will not drain the prefetcher. This can lead to prefetch-driven
11289 * cache fills that defeat the uncacheable requirement of the RT memory type.
11290 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11291 */
11292 arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11293 #endif
11294 }
11295
11296 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11297
11298 return tlb_flush_needed;
11299 }
11300
11301 /**
11302 * Mark a pmap as being dedicated to use for a commpage mapping.
11303 * The pmap itself will never be activated on a CPU; its mappings will
11304 * only be embedded in userspace pmaps at a fixed virtual address.
11305 *
11306 * @param pmap the pmap to mark as belonging to a commpage.
11307 */
11308 static void
11309 pmap_set_commpage(pmap_t pmap)
11310 {
11311 #if XNU_MONITOR
11312 assert(!pmap_ppl_locked_down);
11313 #endif
11314 assert(pmap->type == PMAP_TYPE_USER);
11315 pmap->type = PMAP_TYPE_COMMPAGE;
11316 /*
11317 * Free the pmap's ASID. This pmap should not ever be directly
11318 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
11319 * ASID space contention but will also cause pmap_switch() to panic
11320 * if an attacker tries to activate this pmap. Disable preemption to
11321 * accommodate the *_nopreempt spinlock in free_asid().
11322 */
11323 mp_disable_preemption();
11324 pmap_get_pt_ops(pmap)->free_id(pmap);
11325 mp_enable_preemption();
11326 }
11327
11328 static void
11329 pmap_update_tt3e(
11330 pmap_t pmap,
11331 vm_address_t address,
11332 tt_entry_t template)
11333 {
11334 tt_entry_t *ptep, pte;
11335
11336 ptep = pmap_tt3e(pmap, address);
11337 if (ptep == NULL) {
11338 panic("%s: no ptep?", __FUNCTION__);
11339 }
11340
11341 pte = *ptep;
11342 pte = tte_to_pa(pte) | template;
11343 write_pte_strong(ptep, pte);
11344 }
11345
/* Commpage data-page PTE template. Note absence of non-global bit */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	| ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/* Commpage text-page PTE template. Note absence of non-global bit and no-execute bit. */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	| ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	| ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	| ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11357
/**
 * Allocate the physical pages backing the commpage and construct the
 * dedicated commpage pmap(s) whose translation tables are later nested into
 * each user pmap (see pmap_insert_commpage_internal()).
 *
 * @param kernel_data_addr    [out] KVA the kernel uses to update commpage data.
 * @param kernel_text_addr    [out] KVA of the commpage text page, or 0 if no
 *                            text page was allocated.
 * @param kernel_ro_data_addr [out] KVA of the kernel read-only commpage data.
 * @param user_text_addr      [out] randomized user VA chosen for the commpage
 *                            text page (0 unless CONFIG_ARM_PFZ is enabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* PPL configurations: pull pages from the PPL page free list and zero them. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	/* Mark it PMAP_TYPE_COMMPAGE and free its ASID so it can never be activated directly. */
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the PTE with the global (non-NG) commpage template. */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	/* Number of leaf (L3) entries per page table page. */
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11508
11509
11510 /*
11511 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11512 * with user controlled TTEs for regions that aren't explicitly reserved by the
11513 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11514 */
11515 #if (ARM_PGSHIFT == 14)
11516 /**
11517 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11518 * commpage completely above the maximum 32-bit userspace VA.
11519 */
11520 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11521
11522 /**
11523 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11524 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
11526 * commpage lives within the userspace VA range, but is protected by the VM as
11527 * a reserved region (see vm_reserved_regions[] definition for more info).
11528 */
11529
11530 #elif (ARM_PGSHIFT == 12)
11531 /**
11532 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11533 * above the maximum userspace VA.
11534 */
11535 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11536 #else
11537 #error Nested shared page mapping is unsupported on this config
11538 #endif
11539
/**
 * Map the commpage into a user pmap by nesting the commpage pmap's pre-built
 * translation table at the configured "commpage level" for this page-table
 * geometry.
 *
 * @param pmap user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (or, under XNU_MONITOR,
 *         KERN_RESOURCE_SHORTAGE) when pmap_expand() could not complete and
 *         the caller should retry. Any other expansion failure panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL we cannot block; let pmap_expand() fail instead of waiting. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock: pmap_expand() takes it itself and may allocate. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* Nothing else may already be mapped at the commpage slot. */
	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the commpage pmap's table pointer into this pmap's twig entry. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11642
/**
 * Remove the nested commpage table pointer from a pmap, if present, and
 * flush the TLB for the commpage VA. Panics if the slot holds anything
 * other than the commpage pmap's table (or empty).
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* No twig table at the commpage level means nothing to unmap. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any stale commpage translations for this pmap's ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11699
/**
 * Kernel-side entry point for mapping the commpage into a pmap.
 *
 * Retries the internal (PPL) call while it reports a transient failure
 * (KERN_ABORTED, or KERN_RESOURCE_SHORTAGE under XNU_MONITOR, in which case
 * a fresh page is donated to the PPL before retrying). Panics if the
 * insertion ultimately fails.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of pages; give it one and try again. */
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11728
11729 static boolean_t
11730 pmap_is_64bit(
11731 pmap_t pmap)
11732 {
11733 return pmap->is_64bit;
11734 }
11735
11736 bool
11737 pmap_is_exotic(
11738 pmap_t pmap __unused)
11739 {
11740 return false;
11741 }
11742
11743
11744 /* ARMTODO -- an implementation that accounts for
11745 * holes in the physical map, if any.
11746 */
11747 boolean_t
11748 pmap_valid_page(
11749 ppnum_t pn)
11750 {
11751 return pa_valid(ptoa(pn));
11752 }
11753
11754 boolean_t
11755 pmap_bootloader_page(
11756 ppnum_t pn)
11757 {
11758 pmap_paddr_t paddr = ptoa(pn);
11759
11760 if (pa_valid(paddr)) {
11761 return FALSE;
11762 }
11763 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11764 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11765 }
11766
/**
 * Scan [va_start, va_end) in `pmap` for any valid leaf mapping.
 *
 * @param pmap     pmap to scan (NULL is treated as trivially empty).
 * @param va_start inclusive start of the VA range.
 * @param va_end   exclusive end of the VA range.
 *
 * @return TRUE if no PTE in the range is non-empty, FALSE as soon as one
 *         mapping is found.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/*
	 * Capture not_in_kdp once so the lock and unlock decisions below agree
	 * even if the global changes while we run.
	 */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		/* Advance one twig (page-table page) at a time, clamped to va_end. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11827
/**
 * Public wrapper for pmap_is_empty_internal(); routes through the PPL when
 * XNU_MONITOR is enabled.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11840
11841 vm_map_offset_t
11842 pmap_max_offset(
11843 boolean_t is64,
11844 unsigned int option)
11845 {
11846 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11847 }
11848
11849 vm_map_offset_t
11850 pmap_max_64bit_offset(
11851 __unused unsigned int option)
11852 {
11853 vm_map_offset_t max_offset_ret = 0;
11854
11855 #if defined(__arm64__)
11856 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11857 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11858 max_offset_ret = arm64_pmap_max_offset_default;
11859 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11860 max_offset_ret = min_max_offset;
11861 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11862 max_offset_ret = MACH_VM_MAX_ADDRESS;
11863 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11864 if (arm64_pmap_max_offset_default) {
11865 max_offset_ret = arm64_pmap_max_offset_default;
11866 } else if (max_mem > 0xC0000000) {
11867 // devices with > 3GB of memory
11868 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11869 } else if (max_mem > 0x40000000) {
11870 // devices with > 1GB and <= 3GB of memory
11871 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11872 } else {
11873 // devices with <= 1 GB of memory
11874 max_offset_ret = min_max_offset;
11875 }
11876 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11877 if (arm64_pmap_max_offset_default) {
11878 // Allow the boot-arg to override jumbo size
11879 max_offset_ret = arm64_pmap_max_offset_default;
11880 } else {
11881 max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11882 }
11883 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
11884 } else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
11885 max_offset_ret = MACH_VM_MAX_ADDRESS;
11886 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
11887 } else {
11888 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11889 }
11890
11891 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11892 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11893 assert(max_offset_ret >= min_max_offset);
11894 }
11895 #else
11896 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11897 #endif
11898
11899 return max_offset_ret;
11900 }
11901
11902 vm_map_offset_t
11903 pmap_max_32bit_offset(
11904 unsigned int option)
11905 {
11906 vm_map_offset_t max_offset_ret = 0;
11907
11908 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11909 max_offset_ret = arm_pmap_max_offset_default;
11910 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11911 max_offset_ret = VM_MAX_ADDRESS;
11912 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11913 max_offset_ret = VM_MAX_ADDRESS;
11914 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11915 if (arm_pmap_max_offset_default) {
11916 max_offset_ret = arm_pmap_max_offset_default;
11917 } else if (max_mem > 0x20000000) {
11918 max_offset_ret = VM_MAX_ADDRESS;
11919 } else {
11920 max_offset_ret = VM_MAX_ADDRESS;
11921 }
11922 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11923 max_offset_ret = VM_MAX_ADDRESS;
11924 } else {
11925 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11926 }
11927
11928 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11929 return max_offset_ret;
11930 }
11931
11932 #if CONFIG_DTRACE
11933 /*
11934 * Constrain DTrace copyin/copyout actions
11935 */
11936 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11937 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11938
11939 kern_return_t
11940 dtrace_copyio_preflight(
11941 __unused addr64_t va)
11942 {
11943 if (current_map() == kernel_map) {
11944 return KERN_FAILURE;
11945 } else {
11946 return KERN_SUCCESS;
11947 }
11948 }
11949
/* No per-copyio cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11956 #endif /* CONFIG_DTRACE */
11957
11958
/* No deferred-flush state exists on this pmap; nothing to initialize. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11963
11964
11965 void
11966 pmap_flush(
11967 __unused pmap_flush_context *cpus_to_flush)
11968 {
11969 /* not implemented yet */
11970 return;
11971 }
11972
11973 #if XNU_MONITOR
11974
11975 /*
11976 * Enforce that the address range described by kva and nbytes is not currently
11977 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11978 * unintentionally writing to PPL-owned memory.
11979 */
11980 void
11981 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11982 {
11983 vm_offset_t end;
11984 if (os_add_overflow(kva, nbytes, &end)) {
11985 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11986 }
11987 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11988 pmap_paddr_t pa = kvtophys_nofail(ckva);
11989 unsigned int pai = pa_index(pa);
11990 pp_attr_t attr;
11991 if (__improbable(!pa_valid(pa))) {
11992 panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11993 }
11994 pvh_lock(pai);
11995 if (__improbable(ckva == phystokv(pa))) {
11996 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11997 }
11998 do {
11999 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12000 if (__improbable(attr & PP_ATTR_MONITOR)) {
12001 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12002 }
12003 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12004 pvh_unlock(pai);
12005 if (__improbable(kvtophys_nofail(ckva) != pa)) {
12006 panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12007 }
12008 }
12009 }
12010
/**
 * Release the NO_MONITOR pin taken by pmap_pin_kernel_pages() on every page
 * backing [kva, kva + nbytes). Panics if a page in the range is not pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A page can't be both pinned-against-PPL and PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
12028
12029 /**
12030 * Lock down a page, making all mappings read-only, and preventing further
12031 * mappings or removal of this particular kva's mapping. Effectively, it makes
12032 * the physical page at kva immutable (see the ppl_writable parameter for an
12033 * exception to this).
12034 *
12035 * @param kva Valid address to any mapping of the physical page to lockdown.
12036 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12037 * @param ppl_writable True if the PPL should still be able to write to the page
12038 * using the physical aperture mapping. False will make the
12039 * page read-only for both the kernel and PPL in the
12040 * physical aperture.
12041 */
12042
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown: demote all existing alias mappings to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12048
12049 /**
12050 * Lock down a page, giving all mappings the specified maximum permissions, and
12051 * preventing further mappings or removal of this particular kva's mapping.
12052 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12053 * parameter for an exception to this).
12054 *
12055 * @param kva Valid address to any mapping of the physical page to lockdown.
12056 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12057 * @param ppl_writable True if the PPL should still be able to write to the page
12058 * using the physical aperture mapping. False will make the
12059 * page read-only for both the kernel and PPL in the
12060 * physical aperture.
12061 * @param prot Maximum permissions to allow in existing alias mappings
12062 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* Caller must pass exactly a lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot additionally be locked down. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Double lockdown, or lockdown of executable pages, is disallowed. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12110
12111 /**
12112 * Helper for releasing a page from being locked down to the PPL, making it writable to the
12113 * kernel once again.
12114 *
12115 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12116 * to unlockdown a page that was never locked down, will panic.
12117 *
12118 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
12119 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12120 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12121 * passed to the paired pmap_ppl_lockdown_page() call. Any
12122 * deviation will result in a panic.
12123 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* The page must have been locked down with this exact reason bit. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12144
12145 /**
12146 * Release a page from being locked down to the PPL, making it writable to the
12147 * kernel once again.
12148 *
12149 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12150 * to unlockdown a page that was never locked down, will panic.
12151 *
12152 * @param kva Valid address to any mapping of the physical page to unlockdown.
12153 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12154 * @param ppl_writable This must match whatever `ppl_writable` parameter was
12155 * passed to the paired pmap_ppl_lockdown_page() call. Any
12156 * deviation will result in a panic.
12157 */
12158 MARK_AS_PMAP_TEXT static void
12159 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
12160 {
12161 const pmap_paddr_t pa = kvtophys_nofail(kva);
12162 const unsigned int pai = pa_index(pa);
12163
12164 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12165 pvh_lock(pai);
12166 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
12167 pvh_unlock(pai);
12168 }
12169
12170 #else /* XNU_MONITOR */
12171
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
	/* Without XNU_MONITOR there is no PPL ownership to pin against. */
}
12176
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
	/* No-op: pairs with the non-XNU_MONITOR pmap_pin_kernel_pages() stub. */
}
12181
12182 #endif /* !XNU_MONITOR */
12183
12184
/**
 * Lock down a range of pages for code-signing purposes. Under XNU_MONITOR
 * the CS lockdown-reason flag is recorded; otherwise no flag is available.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12194
/**
 * Release a code-signing lockdown previously taken via
 * pmap_cs_lockdown_pages(); the flag argument mirrors that call.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12204
12205 /**
12206 * Perform basic validation checks on the destination only and
12207 * corresponding offset/sizes prior to writing to a read only allocation.
12208 *
12209 * @note Should be called before writing to an allocation from the read
12210 * only allocator.
12211 *
12212 * @param zid The ID of the zone the allocation belongs to.
12213 * @param va VA of element being modified (destination).
12214 * @param offset Offset being written to, in the element.
12215 * @param new_data_size Size of modification.
12216 *
12217 */
12218
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t new_data_size)
{
	/* Writes are only permitted into elements of the read-only zones. */
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/*
	 * Note: if offset > elem_size, (elem_size - offset) wraps (vm_size_t is
	 * unsigned) and this check can pass vacuously; the explicit offset check
	 * below still panics in that case, so both checks are required.
	 */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
12245
12246
12247 /**
12248 * Perform basic validation checks on the source, destination and
12249 * corresponding offset/sizes prior to writing to a read only allocation.
12250 *
12251 * @note Should be called before writing to an allocation from the read
12252 * only allocator.
12253 *
12254 * @param zid The ID of the zone the allocation belongs to.
12255 * @param va VA of element being modified (destination).
12256 * @param offset Offset being written to, in the element.
12257 * @param new_data Pointer to new data (source).
12258 * @param new_data_size Size of modification.
12259 *
12260 */
12261
12262 MARK_AS_PMAP_TEXT static void
12263 pmap_ro_zone_validate_element(
12264 zone_id_t zid,
12265 vm_offset_t va,
12266 vm_offset_t offset,
12267 const vm_offset_t new_data,
12268 vm_size_t new_data_size)
12269 {
12270 vm_offset_t sum = 0;
12271
12272 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12273 panic("%s: Integer addition overflow %p + %lu = %lu",
12274 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12275 }
12276
12277 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12278 }
12279
12280 /**
12281 * Ensure that physical page is locked down before writing to it.
12282 *
12283 * @note Should be called before writing to an allocation from the read
12284 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12285 * ensure that it is called after the modification.
12286 *
12287 *
12288 * @param pa Physical address of the element being modified.
12289 * @param va Virtual address of element being modified.
12290 * @param size Size of the modification.
12291 *
12292 */
12293
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t pa,
	vm_offset_t va,
	vm_size_t size)
{
	/* A single PVH lock covers one physical page: reject writes that span pages. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12315
12316 /**
12317 * Unlock physical page after writing to it.
12318 *
12319 * @note Should be called after writing to an allocation from the read
12320 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12321 * ensure that it has been called prior to the modification.
12322 *
12323 * @param pa Physical address of the element that was modified.
12324 * @param va Virtual address of element that was modified.
12325 * @param size Size of the modification.
12326 *
12327 */
12328
12329 MARK_AS_PMAP_TEXT static void
12330 pmap_ro_zone_unlock_phy_page(
12331 const pmap_paddr_t pa,
12332 vm_offset_t va __unused,
12333 vm_size_t size __unused)
12334 {
12335 const unsigned int pai = pa_index(pa);
12336 pvh_unlock(pai);
12337 }
12338
12339 /**
12340 * Function to copy kauth_cred from new_data to kv.
12341 * Function defined in "kern_prot.c"
12342 *
12343 * @note Will be removed upon completion of
12344 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12345 *
12346 * @param kv Address to copy new data to.
12347 * @param new_data Pointer to new data.
12348 *
12349 */
12350
12351 extern void
12352 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12353
12354 /**
12355 * Zalloc-specific memcpy that writes through the physical aperture
12356 * and ensures the element being modified is from a read-only zone.
12357 *
12358 * @note Designed to work only with the zone allocator's read-only submap.
12359 *
12360 * @param zid The ID of the zone to allocate from.
12361 * @param va VA of element to be modified.
12362 * @param offset Offset from element.
12363 * @param new_data Pointer to new data.
12364 * @param new_data_size Size of modification.
12365 *
12366 */
12367
void
pmap_ro_zone_memcpy(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size)
{
#if XNU_MONITOR
	/* On PPL builds the write must happen inside the monitor. */
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12382
12383 MARK_AS_PMAP_TEXT void
12384 pmap_ro_zone_memcpy_internal(
12385 zone_id_t zid,
12386 vm_offset_t va,
12387 vm_offset_t offset,
12388 const vm_offset_t new_data,
12389 vm_size_t new_data_size)
12390 {
12391 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12392
12393 if (!new_data || new_data_size == 0) {
12394 return;
12395 }
12396
12397 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12398 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12399 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12400 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12401 }
12402
12403 /**
12404 * Zalloc-specific function to atomically mutate fields of an element that
12405 * belongs to a read-only zone, via the physcial aperture.
12406 *
12407 * @note Designed to work only with the zone allocator's read-only submap.
12408 *
12409 * @param zid The ID of the zone the element belongs to.
12410 * @param va VA of element to be modified.
12411 * @param offset Offset in element.
12412 * @param op Atomic operation to perform.
12413 * @param value Mutation value.
12414 *
12415 */
12416
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
#if XNU_MONITOR
	/* On PPL builds the mutation must happen inside the monitor. */
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12431
12432 MARK_AS_PMAP_TEXT uint64_t
12433 pmap_ro_zone_atomic_op_internal(
12434 zone_id_t zid,
12435 vm_offset_t va,
12436 vm_offset_t offset,
12437 zro_atomic_op_t op,
12438 uint64_t value)
12439 {
12440 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12441 vm_size_t value_size = op & 0xf;
12442
12443 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
12444 pmap_ro_zone_lock_phy_page(pa, va, value_size);
12445 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
12446 pmap_ro_zone_unlock_phy_page(pa, va, value_size);
12447
12448 return value;
12449 }
12450
12451 /**
12452 * bzero for allocations from read only zones, that writes through the
12453 * physical aperture.
12454 *
12455 * @note This is called by the zfree path of all allocations from read
12456 * only zones.
12457 *
12458 * @param zid The ID of the zone the allocation belongs to.
12459 * @param va VA of element to be zeroed.
12460 * @param offset Offset in the element.
12461 * @param size Size of allocation.
12462 *
12463 */
12464
void
pmap_ro_zone_bzero(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
#if XNU_MONITOR
	/* On PPL builds the zeroing must happen inside the monitor. */
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12478
12479 MARK_AS_PMAP_TEXT void
12480 pmap_ro_zone_bzero_internal(
12481 zone_id_t zid,
12482 vm_offset_t va,
12483 vm_offset_t offset,
12484 vm_size_t size)
12485 {
12486 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12487 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
12488 pmap_ro_zone_lock_phy_page(pa, va, size);
12489 bzero((void*)phystokv(pa), size);
12490 pmap_ro_zone_unlock_phy_page(pa, va, size);
12491 }
12492
12493 /**
12494 * Removes write access from the Physical Aperture.
12495 *
12496 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12497 * @note Designed to work only with the zone allocator's read-only submap.
12498 *
12499 * @param va VA of the page to restore write access to.
12500 *
12501 */
12502 MARK_AS_PMAP_TEXT static void
12503 pmap_phys_write_disable(vm_address_t va)
12504 {
12505 #if XNU_MONITOR
12506 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
12507 #else /* XNU_MONITOR */
12508 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
12509 #endif /* XNU_MONITOR */
12510 }
12511
12512 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12513
/*
 * Count resident and compressed bytes in [start, end), which must be
 * page-aligned and covered by at most one twig (TTE) entry.  Compressed bytes
 * are accumulated into *compressed_bytes_p; resident bytes are returned.
 * Returns PMAP_RESIDENT_INVALID for a NULL pmap or an absent TTE.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Walk the leaf PTEs and classify each page as compressed or resident. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the caller's buffer so it stays writable while updated (PPL builds). */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12575
12576 mach_vm_size_t
12577 pmap_query_resident(
12578 pmap_t pmap,
12579 vm_map_address_t start,
12580 vm_map_address_t end,
12581 mach_vm_size_t *compressed_bytes_p)
12582 {
12583 mach_vm_size_t total_resident_bytes;
12584 mach_vm_size_t compressed_bytes;
12585 vm_map_address_t va;
12586
12587
12588 if (pmap == PMAP_NULL) {
12589 if (compressed_bytes_p) {
12590 *compressed_bytes_p = 0;
12591 }
12592 return 0;
12593 }
12594
12595 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12596
12597 total_resident_bytes = 0;
12598 compressed_bytes = 0;
12599
12600 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12601 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12602 VM_KERNEL_ADDRHIDE(end));
12603
12604 va = start;
12605 while (va < end) {
12606 vm_map_address_t l;
12607 mach_vm_size_t resident_bytes;
12608
12609 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12610
12611 if (l > end) {
12612 l = end;
12613 }
12614 #if XNU_MONITOR
12615 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12616 #else
12617 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12618 #endif
12619 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12620 break;
12621 }
12622
12623 total_resident_bytes += resident_bytes;
12624
12625 va = l;
12626 }
12627
12628 if (compressed_bytes_p) {
12629 *compressed_bytes_p = compressed_bytes;
12630 }
12631
12632 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12633 total_resident_bytes);
12634
12635 return total_resident_bytes;
12636 }
12637
#if MACH_ASSERT
/*
 * Debug-only: verify that a terminating task's pmap ledgers are balanced.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	const int pid = pmap->pmap_pid;

	if (pid == 0 || pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated with a
		 * task (e.g. the old pmap after a fork()/exec() or spawn()).
		 * Its "ledger" still points at a task that is now using a
		 * different (and active) address space, so we can't check
		 * that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so this
		 * task's ledger should not have been re-used and all the
		 * pmap ledgers should be back to 0.
		 */
		return;
	}

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, pmap->pmap_procname);
}
#endif /* MACH_ASSERT */
12668
/* No-op on this architecture; kept for interface parity with other pmaps. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12673
12674 /**
12675 * The minimum shared region nesting size is used by the VM to determine when to
12676 * break up large mappings to nested regions. The smallest size that these
12677 * mappings can be broken into is determined by what page table level those
12678 * regions are being nested in at and the size of the page tables.
12679 *
12680 * For instance, if a nested region is nesting at L2 for a process utilizing
12681 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12682 * block entry).
12683 *
12684 * @param pmap The target pmap to determine the block size based on whether it's
12685 * using 16KB or 4KB page tables.
12686 */
12687 uint64_t
12688 pmap_shared_region_size_min(__unused pmap_t pmap)
12689 {
12690 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12691
12692 /**
12693 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12694 * 4KB pages). This means that a target pmap will contain L2 entries that
12695 * point to shared L3 page tables in the shared region pmap.
12696 */
12697 return pt_attr_twig_size(pt_attr);
12698 }
12699
/* Execute-only mappings are enforced for all user pmaps, never for the kernel pmap. */
boolean_t
pmap_enforces_execute_only(
	pmap_t pmap)
{
	return pmap != kernel_pmap;
}
12706
/* PPL-side setter: validate the pmap before updating its CS-enforced flag. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12715
/* Record whether this pmap's VM map has code-signing enforcement enabled. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12727
12728 extern int cs_process_enforcement_enable;
12729 bool
12730 pmap_get_vm_map_cs_enforced(
12731 pmap_t pmap)
12732 {
12733 if (cs_process_enforcement_enable) {
12734 return true;
12735 }
12736 return pmap->pmap_vm_map_cs_enforced;
12737 }
12738
/* No-op: JIT entitlement tracking is not supported in this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12745
/* Mark a pmap as JIT-entitled (no-op in this configuration; see _internal). */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12756
/* Always false here: JIT entitlement state is not tracked in this configuration. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12763
/* No-op: TPRO is not supported in this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12770
/* Enable TPRO for a pmap (no-op in this configuration; see _internal). */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12781
/* Always false here: TPRO state is not tracked in this configuration. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12788
12789 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12790
/*
 * Report the disposition of the page mapped at (pmap, va) through *disp_p:
 * present, compressed (and alt-accounted), alt-accounted, reusable and/or
 * internal.  The PTE is read without the PVH lock and re-checked after taking
 * it; if it changed, the lookup retries (counted in
 * pmap_query_page_info_retries).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		/* Pin the caller's output so it stays writable (PPL builds). */
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be concurrently modified. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping to read its per-mapping attributes. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12869
/* Query page disposition for (pmap, va); dispatches into the PPL when present. */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12882
12883
12884
/* Number of significant user VA bits for this pmap, derived from T0SZ. */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* With mixed page sizes, the per-pmap pt_attr carries the TCR value to use. */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12895
/* Number of significant kernel VA bits, derived from the boot-time T1SZ. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12901
/* Size in bytes of the user VA space addressable through this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12907
12908
12909
/* Always false in this build: there is no PPL to be "inside" of. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12916
/* Protected I/O-filter writes are unavailable on this platform: always panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12923
/* No reserved PPL pages exist in this build: always NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12930
/* Counterpart of pmap_claim_reserved_ppl_page(); nothing to free in this build. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12936
12937
12938 #if PMAP_CS_PPL_MONITOR
12939
12940 /* Immutable part of the trust cache runtime */
12941 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12942
12943 /* Mutable part of the trust cache runtime */
12944 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12945
12946 /* Lock for the trust cache runtime */
12947 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12948
12949 MARK_AS_PMAP_TEXT kern_return_t
12950 pmap_check_trust_cache_runtime_for_uuid_internal(
12951 const uint8_t check_uuid[kUUIDSize])
12952 {
12953 kern_return_t ret = KERN_DENIED;
12954
12955 if (amfi->TrustCache.version < 3) {
12956 /* AMFI change hasn't landed in the build */
12957 pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12958 return KERN_NOT_SUPPORTED;
12959 }
12960
12961 /* Lock the runtime as shared */
12962 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12963
12964 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12965 &ppl_trust_cache_rt,
12966 check_uuid,
12967 NULL);
12968
12969 /* Unlock the runtime */
12970 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12971
12972 if (tc_ret.error == kTCReturnSuccess) {
12973 ret = KERN_SUCCESS;
12974 } else if (tc_ret.error == kTCReturnNotFound) {
12975 ret = KERN_NOT_FOUND;
12976 } else {
12977 ret = KERN_FAILURE;
12978 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12979 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12980 }
12981
12982 return ret;
12983 }
12984
/* Kernel-side entry: the check always runs inside the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12991
/*
 * PPL-side trust-cache load.  Locks down the caller-supplied image4 payload
 * and manifest pages, has AppleImage4/libTrustCache validate and register the
 * trust cache, then releases what is no longer needed.  On success the
 * payload pages stay locked down (the runtime keeps referencing them); on
 * failure they are unlocked again.  The manifest is always unlocked at the
 * end, as it is only needed during validation.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		/* KERN_RESOURCE_SHORTAGE tells the kernel caller to donate a page and retry. */
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Load failed: unlock the payload data, it will not be referenced by the runtime */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13111
13112 kern_return_t
13113 pmap_load_trust_cache_with_type(
13114 TCType_t type,
13115 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
13116 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
13117 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
13118 {
13119 kern_return_t ret = KERN_DENIED;
13120
13121 ret = pmap_load_trust_cache_with_type_ppl(
13122 type,
13123 pmap_img4_payload, pmap_img4_payload_len,
13124 img4_manifest, img4_manifest_len,
13125 img4_aux_manifest, img4_aux_manifest_len);
13126
13127 while (ret == KERN_RESOURCE_SHORTAGE) {
13128 /* Allocate a page from the free list */
13129 pmap_alloc_page_for_ppl(0);
13130
13131 /* Attempt the call again */
13132 ret = pmap_load_trust_cache_with_type_ppl(
13133 type,
13134 pmap_img4_payload, pmap_img4_payload_len,
13135 img4_manifest, img4_manifest_len,
13136 img4_aux_manifest, img4_aux_manifest_len);
13137 }
13138
13139 return ret;
13140 }
13141
13142 MARK_AS_PMAP_TEXT kern_return_t
13143 pmap_query_trust_cache_safe(
13144 TCQueryType_t query_type,
13145 const uint8_t cdhash[kTCEntryHashSize],
13146 TrustCacheQueryToken_t *query_token)
13147 {
13148 kern_return_t ret = KERN_NOT_FOUND;
13149
13150 /* Validate the query type preemptively */
13151 if (query_type >= kTCQueryTypeTotal) {
13152 pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
13153 return KERN_INVALID_ARGUMENT;
13154 }
13155
13156 /* Lock the runtime as shared */
13157 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
13158
13159 TCReturn_t tc_ret = amfi->TrustCache.query(
13160 &ppl_trust_cache_rt,
13161 query_type,
13162 cdhash,
13163 query_token);
13164
13165 /* Unlock the runtime */
13166 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
13167
13168 if (tc_ret.error == kTCReturnSuccess) {
13169 ret = KERN_SUCCESS;
13170 } else if (tc_ret.error == kTCReturnNotFound) {
13171 ret = KERN_NOT_FOUND;
13172 } else {
13173 ret = KERN_FAILURE;
13174 pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
13175 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
13176 }
13177
13178 return ret;
13179 }
13180
/*
 * PPL entry point for trust-cache queries.  Copies the caller's CDHash into
 * PPL storage before use (ToCToU protection) and copies the resulting query
 * token back out through pinned kernel memory.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the caller's token buffer while writing the result back. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13205
13206 kern_return_t
13207 pmap_query_trust_cache(
13208 TCQueryType_t query_type,
13209 const uint8_t cdhash[kTCEntryHashSize],
13210 TrustCacheQueryToken_t *query_token)
13211 {
13212 kern_return_t ret = KERN_NOT_FOUND;
13213
13214 ret = pmap_query_trust_cache_ppl(
13215 query_type,
13216 cdhash,
13217 query_token);
13218
13219 return ret;
13220 }
13221
13222 MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
13223 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13224
/*
 * PPL-side developer-mode toggle.  Enforces the one-way policy below: once
 * the state has been explicitly set to false, it can never be flipped back
 * to true.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13253
/* Kernel-side entry: developer-mode state is owned and validated by the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13260
13261 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13262 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13263
13264 #pragma mark Image4 - New
13265
/* Pairs an image4 CS trap selector with the AppleImage4 handler that services it. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;          /* which image4 CS trap was requested */
	image4_cs_trap_handler_t handler;   /* AppleImage4 entry point for that trap */
} pmap_image4_dispatch_t;
13270
13271 MARK_AS_PMAP_TEXT static errno_t
13272 _pmap_image4_monitor_trap_set_release_type(
13273 const pmap_image4_dispatch_t *dispatch,
13274 const void *input_data)
13275 {
13276 /*
13277 * csmx_release_type --> __cs_copy
13278 */
13279 image4_cs_trap_argv_kmod_set_release_type_t input = {0};
13280
13281 /* Copy the input data to prevent ToCToU */
13282 memcpy(&input, input_data, sizeof(input));
13283
13284 /* Dispatch to AppleImage4 */
13285 return dispatch->handler(
13286 dispatch->selector,
13287 &input, sizeof(input),
13288 NULL, NULL);
13289 }
13290
13291
13292
13293 MARK_AS_PMAP_TEXT static errno_t
13294 _pmap_image4_monitor_trap_nonce_set(
13295 const pmap_image4_dispatch_t *dispatch,
13296 const void *input_data)
13297 {
13298 /*
13299 * csmx_clear --> __cs_copy
13300 * csmx_cipher --> __cs_copy
13301 */
13302 image4_cs_trap_argv_nonce_set_t input = {0};
13303
13304 /* Copy the input data to prevent ToCToU */
13305 memcpy(&input, input_data, sizeof(input));
13306
13307 /* Dispatch to AppleImage4 */
13308 return dispatch->handler(
13309 dispatch->selector,
13310 &input, sizeof(input),
13311 NULL, NULL);
13312 }
13313
13314 MARK_AS_PMAP_TEXT static errno_t
13315 _pmap_image4_monitor_trap_nonce_roll(
13316 const pmap_image4_dispatch_t *dispatch,
13317 const void *input_data)
13318 {
13319 image4_cs_trap_argv_nonce_roll_t input = {0};
13320
13321 /* Copy the input data to prevent ToCToU */
13322 memcpy(&input, input_data, sizeof(input));
13323
13324 /* Dispatch to AppleImage4 */
13325 return dispatch->handler(
13326 dispatch->selector,
13327 &input, sizeof(input),
13328 NULL, NULL);
13329 }
13330
/*
 * Handle the IMAGE4_CS_TRAP_IMAGE_ACTIVATE selector. Locks down the payload
 * and manifest buffers before dispatching to AppleImage4; the manifest is
 * always returned to the kernel afterwards, while the payload remains owned
 * by the monitor when activation succeeds.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13386
13387 MARK_AS_PMAP_TEXT static errno_t
13388 _pmap_image4_monitor_trap_passthrough(
13389 __unused const pmap_image4_dispatch_t *dispatch,
13390 __unused const void *input_data,
13391 __unused size_t input_size)
13392 {
13393 #if DEVELOPMENT || DEBUG || KASAN
13394 return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13395 #else
13396 pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13397 return ENOSYS;
13398 #endif
13399 }
13400
/*
 * PPL-side dispatcher for image4 CS traps. Resolves and validates the
 * handler for the selector, checks the caller-supplied input size against
 * the selector's expected vector size, reserves a PPL page for CoreCrypto,
 * then routes to the selector-specific trap routine.
 *
 * Returns an errno: EINVAL for a bad selector/input size, ENOMEM when the
 * caller must donate a page and retry, or the handler's own result.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* ENOMEM signals the kernel-side caller to donate a page and retry */
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13466
13467 errno_t
13468 pmap_image4_monitor_trap(
13469 image4_cs_trap_t selector,
13470 const void *input_data,
13471 size_t input_size)
13472 {
13473 errno_t err = EPERM;
13474
13475 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13476 while (err == ENOMEM) {
13477 /* Allocate a page from the free list */
13478 pmap_alloc_page_for_ppl(0);
13479
13480 /* Call the monitor dispatch again */
13481 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13482 }
13483
13484 return err;
13485 }
13486
13487 #endif /* PMAP_CS_PPL_MONITOR */
13488
13489 #if PMAP_CS_INCLUDE_CODE_SIGNING
13490
/*
 * Ordering function for the provisioning profile red-black tree. Profiles
 * are keyed purely by their address: returns a negative, zero, or positive
 * value as profile0 orders before, equal to, or after profile1.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13503
/* Red-black tree for managing provisioning profiles */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the type-specific tree operations (insert/find/remove) keyed by address */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13513
13514 void
13515 pmap_initialize_provisioning_profiles(void)
13516 {
13517 /* Initialize the profiles red-black tree lock */
13518 lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13519 pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13520
13521 /* Initialize the red-black tree itself */
13522 RB_INIT(&pmap_cs_registered_profiles);
13523
13524 printf("initialized PPL provisioning profile data\n");
13525 }
13526
13527 static bool
13528 pmap_is_testflight_profile(
13529 pmap_cs_profile_t *profile_obj)
13530 {
13531 const char *entitlement_name = "beta-reports-active";
13532 const size_t entitlement_length = strlen(entitlement_name);
13533 CEQueryOperation_t query[2] = {0};
13534
13535 /* If the profile provisions no entitlements, then it isn't a test flight one */
13536 if (profile_obj->entitlements_ctx == NULL) {
13537 return false;
13538 }
13539
13540 /* Build our CoreEntitlements query */
13541 query[0].opcode = kCEOpSelectKey;
13542 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13543 query[0].parameters.stringParameter.length = entitlement_length;
13544 query[1] = CEMatchBool(true);
13545
13546 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13547 profile_obj->entitlements_ctx,
13548 query, 2);
13549
13550 if (ce_err == amfi->CoreEntitlements.kNoError) {
13551 return true;
13552 }
13553
13554 return false;
13555 }
13556
13557 static bool
13558 pmap_is_development_profile(
13559 pmap_cs_profile_t *profile_obj)
13560 {
13561 /* Check for UPP */
13562 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13563 *profile_obj->profile_ctx,
13564 CESelectDictValue("ProvisionsAllDevices"));
13565 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13566 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13567 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13568 return false;
13569 }
13570 }
13571
13572 /* Check for TestFlight profile */
13573 if (pmap_is_testflight_profile(profile_obj) == true) {
13574 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13575 return false;
13576 }
13577
13578 pmap_cs_log_info("%p: development profile", profile_obj);
13579 return true;
13580 }
13581
/*
 * Extract and validate the "Entitlements" dictionary from a provisioning
 * profile, and stash a CoreEntitlements query context for it on the profile
 * object.
 *
 * Returns KERN_SUCCESS when the entitlements context is set up,
 * KERN_NOT_FOUND when the profile provisions no entitlements (context is
 * cleared), or KERN_ABORTED when CoreEntitlements validation or context
 * acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13632
/*
 * PPL-side registration of a provisioning profile. The caller-supplied
 * payload is locked down (kept PPL-writable for the embedded profile
 * object), validated through CoreTrust, wrapped in a CoreEntitlements
 * context, and inserted into the tree of registered profiles.
 *
 * Returns KERN_RESOURCE_SHORTAGE when the kernel must donate a page and
 * retry. Most validation failures panic -- reaching this path with malformed
 * data indicates a kernel logic error or an active attack.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13736
13737 kern_return_t
13738 pmap_register_provisioning_profile(
13739 const vm_address_t payload_addr,
13740 const vm_size_t payload_size)
13741 {
13742 kern_return_t ret = KERN_DENIED;
13743
13744 ret = pmap_register_provisioning_profile_ppl(
13745 payload_addr,
13746 payload_size);
13747
13748 while (ret == KERN_RESOURCE_SHORTAGE) {
13749 /* Allocate a page from the free list */
13750 pmap_alloc_page_for_ppl(0);
13751
13752 /* Attempt the call again */
13753 ret = pmap_register_provisioning_profile_ppl(
13754 payload_addr,
13755 payload_size);
13756 }
13757
13758 return ret;
13759 }
13760
/*
 * PPL-side unregistration of a provisioning profile. The profile must be
 * present in the registered tree (panic otherwise) and must have no
 * outstanding references; on success the original payload pages are
 * unlocked and returned to the kernel.
 *
 * Returns KERN_SUCCESS, or KERN_FAILURE when the profile is still
 * referenced by one or more code signatures.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		profile_obj = NULL;
	}
	return ret;
}
13808
/*
 * Kernel-side entry point for unregistering a provisioning profile;
 * trampolines into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13815
/*
 * PPL-side association of a registered provisioning profile with a code
 * signature. Association is only allowed while the signature is still
 * untrusted and has no prior profile; the profile's reference count is
 * bumped under the shared tree lock so it cannot be unregistered while
 * associated.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13864
/*
 * Kernel-side entry point for associating a provisioning profile with a
 * code signature; trampolines into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13872
/*
 * PPL-side disassociation of a provisioning profile from a code signature.
 * Clears the signature's profile pointer under the code directory lock and
 * then drops the profile's reference (with underflow check) once the lock
 * is released.
 *
 * Returns KERN_SUCCESS, or KERN_NOT_FOUND when no profile was associated.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13908
/*
 * Kernel-side entry point for disassociating a provisioning profile from a
 * code signature; trampolines into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13915
13916 kern_return_t
13917 pmap_associate_kernel_entitlements_internal(
13918 pmap_cs_code_directory_t *cd_entry,
13919 const void *kernel_entitlements)
13920 {
13921 kern_return_t ret = KERN_DENIED;
13922
13923 if (kernel_entitlements == NULL) {
13924 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13925 }
13926
13927 /* Acquire the lock on the code directory */
13928 pmap_cs_lock_code_directory(cd_entry);
13929
13930 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13931 ret = KERN_DENIED;
13932 goto out;
13933 } else if (cd_entry->kernel_entitlements != NULL) {
13934 ret = KERN_DENIED;
13935 goto out;
13936 }
13937 cd_entry->kernel_entitlements = kernel_entitlements;
13938
13939 /* Association was a success */
13940 ret = KERN_SUCCESS;
13941
13942 out:
13943 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13944 return ret;
13945 }
13946
/*
 * Kernel-side entry point for associating kernel entitlements with a code
 * signature; trampolines into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13954
/*
 * PPL-side resolution of the kernel entitlements object associated with a
 * pmap's main code signature. On success, writes the entitlements pointer
 * through the (pinned) caller-provided slot.
 *
 * Returns KERN_SUCCESS, KERN_NOT_FOUND when the pmap is the kernel pmap or
 * carries no entitlements, or KERN_ABORTED when the pmap lock could not be
 * taken without violating preemption requirements (caller should retry).
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14017
14018 kern_return_t
14019 pmap_resolve_kernel_entitlements(
14020 pmap_t pmap,
14021 const void **kernel_entitlements)
14022 {
14023 kern_return_t ret = KERN_DENIED;
14024
14025 do {
14026 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14027 } while (ret == KERN_ABORTED);
14028
14029 return ret;
14030 }
14031
/*
 * PPL-side construction of a CoreEntitlements acceleration index for a code
 * signature's entitlements. The index buffer is placed, in order of
 * preference, in (1) unused space at the tail of the locked-down signature,
 * (2) a blob-allocator bucket, or (3) a dedicated page from the free list;
 * CoreEntitlements then builds the index through the global
 * pmap_cs_acceleration_buf hand-off.
 *
 * Returns KERN_SUCCESS (including the "nothing to do" cases), KERN_DENIED
 * for non-reconstituted/untrusted signatures, KERN_ABORTED when the index
 * would exceed a page, or KERN_RESOURCE_SHORTAGE propagated from the page
 * allocator (caller donates a page and retries).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements, or already accelerated -- nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14178
14179 kern_return_t
14180 pmap_accelerate_entitlements(
14181 pmap_cs_code_directory_t *cd_entry)
14182 {
14183 kern_return_t ret = KERN_DENIED;
14184
14185 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14186 while (ret == KERN_RESOURCE_SHORTAGE) {
14187 /* Allocate a page for the PPL */
14188 pmap_alloc_page_for_ppl(0);
14189
14190 /* Try again */
14191 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14192 }
14193
14194 return ret;
14195 }
14196
14197 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14198
14199 MARK_AS_PMAP_TEXT bool
14200 pmap_lookup_in_loaded_trust_caches_internal(
14201 const uint8_t cdhash[CS_CDHASH_LEN])
14202 {
14203 kern_return_t kr = KERN_NOT_FOUND;
14204
14205 #if PMAP_CS_PPL_MONITOR
14206 /*
14207 * If we have the PPL monitor, then this function can only be called from
14208 * within the PPL. Calling it directly would've caused a panic, so we can
14209 * assume that we're in the PPL here.
14210 */
14211 uint8_t cdhash_safe[CS_CDHASH_LEN];
14212 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14213
14214 kr = pmap_query_trust_cache_safe(
14215 kTCQueryTypeLoadable,
14216 cdhash_safe,
14217 NULL);
14218 #else
14219 kr = query_trust_cache(
14220 kTCQueryTypeLoadable,
14221 cdhash,
14222 NULL);
14223 #endif
14224
14225 if (kr == KERN_SUCCESS) {
14226 return true;
14227 }
14228 return false;
14229 }
14230
/*
 * Check whether a CDHash is present in any loaded trust cache, dispatching
 * into the PPL when the PPL monitor is built in.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14241
/*
 * Look up a CDHash in the static (baked-in) trust cache.
 *
 * @param cdhash The CDHash to search for.
 *
 * @return 0 when not found; otherwise a packed result word carrying
 *         TC_LOOKUP_FOUND in the result field plus the entry's hash type and
 *         its flags (truncated to 8 bits) in their respective fields.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the matched entry's metadata from the query token. */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14282
/*
 * Look up a CDHash in the static trust cache, dispatching into the PPL when
 * the PPL monitor is built in. Returns 0 when not found (see the _internal
 * variant for the packed result format).
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14292
14293 #if PMAP_CS_INCLUDE_CODE_SIGNING
14294
/* Protects pmap_compilation_service_cdhash below. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash of the compilation service; all-zero until explicitly set. */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14297
14298 MARK_AS_PMAP_TEXT void
14299 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14300 {
14301
14302 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14303 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14304 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14305
14306 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14307 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14308 }
14309
14310 MARK_AS_PMAP_TEXT bool
14311 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14312 {
14313 bool match = false;
14314
14315 /* Lockdown mode disallows compilation service */
14316 if (ppl_lockdown_mode_enabled == true) {
14317 return false;
14318 }
14319
14320 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14321 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14322 match = true;
14323 }
14324 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14325
14326 if (match) {
14327 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14328 }
14329
14330 return match;
14331 }
14332
/*
 * Set the compilation service CDHash, dispatching into the PPL when the PPL
 * monitor is built in.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14342
/*
 * Match a CDHash against the compilation service CDHash, dispatching into the
 * PPL when the PPL monitor is built in.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14352
14353 /*
14354 * As part of supporting local signing on the device, we need the PMAP layer
14355 * to store the local signing key so that PMAP_CS can validate with it. We
14356 * store it at the PMAP layer such that it is accessible to both AMFI and
14357 * PMAP_CS should they need it.
14358 */
/* Set-once flag guarding pmap_local_signing_public_key below. */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Local signing public key; valid only after the flag above is set. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14361
14362 MARK_AS_PMAP_TEXT void
14363 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14364 {
14365 bool key_set = false;
14366
14367 /*
14368 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14369 * a successful exchange means that the local signing public key has _not_ been
14370 * set. In case the key has been set, we panic as we would never expect the
14371 * kernel to attempt to set the key more than once.
14372 */
14373 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14374
14375 if (key_set) {
14376 panic("attempted to set the local signing public key multiple times");
14377 }
14378
14379 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14380 pmap_cs_log_info("set local signing public key");
14381 }
14382
/*
 * Set the local signing public key, dispatching into the PPL when the PPL
 * monitor is built in.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14392
14393 uint8_t*
14394 pmap_get_local_signing_public_key(void)
14395 {
14396 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14397
14398 if (key_set) {
14399 return pmap_local_signing_public_key;
14400 }
14401
14402 return NULL;
14403 }
14404
14405 /*
14406 * Locally signed applications need to be explicitly authorized by an entitled application
14407 * before we allow them to run.
14408 */
/* CDHash currently unrestricted for local signing; all-zero when none. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash above. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14411
/*
 * Record the CDHash that is authorized to run as a locally-signed binary.
 * Only one CDHash is unrestricted at a time; this overwrites any previous one.
 *
 * @param cdhash The CDHash to unrestrict.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
14424
/*
 * Unrestrict local signing for a CDHash, dispatching into the PPL when the
 * PPL monitor is built in.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14435
14436 #if PMAP_CS
/*
 * Clear the unrestricted local-signing CDHash, returning the system to the
 * state where no locally-signed binary is authorized.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14444
14445 MARK_AS_PMAP_TEXT static bool
14446 pmap_local_signing_restricted(
14447 const uint8_t cdhash[CS_CDHASH_LEN])
14448 {
14449 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14450 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14451 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14452
14453 return ret != 0;
14454 }
14455
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14458
/*
 * Suspend or resume footprint accounting for the current thread against the
 * given map (DEVELOPMENT/DEBUG only; a no-op otherwise).
 *
 * @param map     The map whose pmap records that footprint was suspended.
 * @param suspend TRUE to suspend accounting, FALSE to resume it.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t map,
	boolean_t suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		/* Sticky marker on the pmap: deliberately never cleared on resume. */
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14476
/*
 * Suspend or resume footprint accounting, dispatching into the PPL when the
 * PPL monitor is built in.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14488
/*
 * No-op PPL entry point: only validates that the pmap argument refers to a
 * legitimate, mutable pmap. Useful for measuring PPL call overhead.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14494
/*
 * No-op pmap call, dispatching into the PPL when the PPL monitor is built in.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14504
14505 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14506
/*
 * Header written before each table's raw entries in the buffer produced by
 * pmap_dump_page_tables(); the tt_entry_t array follows immediately after.
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the dumped table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by this table */
	uint64_t end_va;      /* VA just past the range covered */
};
14513
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer,
 * one page_table_dump_header plus raw entries per table whose level is
 * selected by level_mask. Only for kernel-debugger context (no locking).
 *
 * @param ttp          Kernel-virtual pointer to the table at cur_level.
 * @param cur_level    Level of ttp in the translation hierarchy.
 * @param level_mask   Bitmask of levels to include in the dump.
 * @param start_va     First VA translated by ttp.
 * @param buf_start    Start of the output buffer.
 * @param buf_end      End of the output buffer (exclusive).
 * @param bytes_copied In/out running count of bytes written so far.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer cannot
 *         hold the next header + table.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table may be smaller than a full page; size it exactly. */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Bail before writing anything if header + table cannot fit. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit this table: header followed by the raw entries. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Walk the entries, descending into valid table (non-block) entries. */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			continue;
		} else {
			/* A "table" entry at the leaf level is impossible. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14588
/*
 * Dump a pmap's page tables into [bufp, buf_end), starting at the root level.
 * Must only be called from kernel-debugger context, where no other CPU can be
 * mutating the tables.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14598
14599 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14600
/* Page table dumping is only supported on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14607 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14608
14609
14610 #ifdef CONFIG_XNUPOST
14611 #ifdef __arm64__
/* Set by pmap_test_fault_handler when the expected test fault is observed. */
static volatile bool pmap_test_took_fault = false;
14613
/*
 * Expected-fault handler installed around the test accesses in
 * pmap_test_access(). Recognizes EL1 data aborts caused by L3 permission or
 * access-flag faults, records that a fault occurred, and skips the faulting
 * instruction.
 *
 * @return true if the fault was recognized and handled.
 */
static bool
pmap_test_fault_handler(arm_saved_state_t * state)
{
	bool retval = false;
	uint64_t esr = get_saved_state_esr(state);
	esr_exception_class_t class = ESR_EC(esr);
	fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));

	if ((class == ESR_EC_DABORT_EL1) &&
	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
		pmap_test_took_fault = true;
		/* return to the instruction immediately after the call to NX page */
		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
		retval = true;
	}

	return retval;
}
14632
14633 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14634 static NOKASAN bool
14635 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14636 {
14637 pmap_t old_pmap = NULL;
14638
14639 pmap_test_took_fault = false;
14640
14641 /*
14642 * We're potentially switching pmaps without using the normal thread
14643 * mechanism; disable interrupts and preemption to avoid any unexpected
14644 * memory accesses.
14645 */
14646 uint64_t old_int_state = pmap_interrupts_disable();
14647 mp_disable_preemption();
14648
14649 if (pmap != NULL) {
14650 old_pmap = current_pmap();
14651 pmap_switch(pmap);
14652
14653 /* Disable PAN; pmap shouldn't be the kernel pmap. */
14654 #if __ARM_PAN_AVAILABLE__
14655 __builtin_arm_wsr("pan", 0);
14656 #endif /* __ARM_PAN_AVAILABLE__ */
14657 }
14658
14659 ml_expect_fault_begin(pmap_test_fault_handler, va);
14660
14661 if (is_write) {
14662 *((volatile uint64_t*)(va)) = 0xdec0de;
14663 } else {
14664 volatile uint64_t tmp = *((volatile uint64_t*)(va));
14665 (void)tmp;
14666 }
14667
14668 /* Save the fault bool, and undo the gross stuff we did. */
14669 bool took_fault = pmap_test_took_fault;
14670 ml_expect_fault_end();
14671
14672 if (pmap != NULL) {
14673 #if __ARM_PAN_AVAILABLE__
14674 __builtin_arm_wsr("pan", 1);
14675 #endif /* __ARM_PAN_AVAILABLE__ */
14676
14677 pmap_switch(old_pmap);
14678 }
14679
14680 mp_enable_preemption();
14681 pmap_interrupts_restore(old_int_state);
14682 bool retval = (took_fault == should_fault);
14683 return retval;
14684 }
14685
14686 static bool
14687 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14688 {
14689 bool retval = pmap_test_access(pmap, va, should_fault, false);
14690
14691 if (!retval) {
14692 T_FAIL("%s: %s, "
14693 "pmap=%p, va=%p, should_fault=%u",
14694 __func__, should_fault ? "did not fault" : "faulted",
14695 pmap, (void*)va, (unsigned)should_fault);
14696 }
14697
14698 return retval;
14699 }
14700
14701 static bool
14702 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14703 {
14704 bool retval = pmap_test_access(pmap, va, should_fault, true);
14705
14706 if (!retval) {
14707 T_FAIL("%s: %s, "
14708 "pmap=%p, va=%p, should_fault=%u",
14709 __func__, should_fault ? "did not fault" : "faulted",
14710 pmap, (void*)va, (unsigned)should_fault);
14711 }
14712
14713 return retval;
14714 }
14715
14716 static bool
14717 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14718 {
14719 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14720 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14721
14722 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14723
14724 if (!retval) {
14725 T_FAIL("%s: bits=%u, "
14726 "pa=%p, should_be_set=%u",
14727 __func__, bits,
14728 (void*)pa, should_be_set);
14729 }
14730
14731 return retval;
14732 }
14733
/*
 * Test both read and write access at va against the allowed permissions.
 * Note the deliberate bitwise `|` (not `||`): both the read and the write
 * test must always execute, even if the first one fails.
 */
static __attribute__((noinline)) bool
pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
{
	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
	return retval;
}
14740
/*
 * Exercise core pmap functionality for one pmap configuration: mapping entry,
 * protection changes, ref/mod state transitions, wired mappings, page-level
 * protection, and disconnect. Failures are reported via T_FAIL.
 *
 * @param flags pmap_create_options() flags selecting the configuration under
 *              test (e.g. forced 4K pages).
 *
 * @return 0 when the sequence completes.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* page_ratio = number of pmap pages per native kernel page. */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15005 #endif /* __arm64__ */
15006
/*
 * XNUPOST entry point for the pmap tests. On arm64, runs the full test
 * sequence for each supported page-size configuration; elsewhere it only
 * logs and passes.
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	/* Mixed-page-size kernels exercise both 4K and 16K configurations. */
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
15028 #endif /* CONFIG_XNUPOST */
15029
15030 /*
15031 * The following function should never make it to RELEASE code, since
15032 * it provides a way to get the PPL to modify text pages.
15033 */
15034 #if DEVELOPMENT || DEBUG
15035
/* Encodings of a permanently-undefined instruction (A32 and Thumb forms). */
#define ARM_UNDEFINED_INSN 0xe7f000f0
#define ARM_UNDEFINED_INSN_THUMB 0xde00
15038
15039 /**
15040 * Forcibly overwrite executable text with an illegal instruction.
15041 *
15042 * @note Only used for xnu unit testing.
15043 *
15044 * @param pa The physical address to corrupt.
15045 *
15046 * @return KERN_SUCCESS on success.
15047 */
15048 kern_return_t
15049 pmap_test_text_corruption(pmap_paddr_t pa)
15050 {
15051 #if XNU_MONITOR
15052 return pmap_test_text_corruption_ppl(pa);
15053 #else /* XNU_MONITOR */
15054 return pmap_test_text_corruption_internal(pa);
15055 #endif /* XNU_MONITOR */
15056 }
15057
/*
 * Write an undefined instruction at the given physical address, temporarily
 * relaxing the physical aperture protection for executable pages when
 * required, then invalidate the instruction cache for the touched region.
 *
 * @param pa Physical address of the text to corrupt; must be a managed page.
 *
 * @return KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages are mapped read-only in the physical aperture;
	 * temporarily make the aperture mapping writable. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore the read-only aperture protection. */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15100
15101 #endif /* DEVELOPMENT || DEBUG */
15102