1 /*
2 * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <mach_assert.h>
30 #include <mach_ldebug.h>
31
32 #include <mach/shared_region.h>
33 #include <mach/vm_param.h>
34 #include <mach/vm_prot.h>
35 #include <mach/vm_map.h>
36 #include <mach/machine/vm_param.h>
37 #include <mach/machine/vm_types.h>
38
39 #include <mach/boolean.h>
40 #include <kern/bits.h>
41 #include <kern/thread.h>
42 #include <kern/sched.h>
43 #include <kern/zalloc.h>
44 #include <kern/zalloc_internal.h>
45 #include <kern/kalloc.h>
46 #include <kern/spl.h>
47 #include <kern/startup.h>
48 #include <kern/trustcache.h>
49
50 #include <os/overflow.h>
51
52 #include <vm/pmap.h>
53 #include <vm/pmap_cs.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_protos.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/cpm.h>
61
62 #include <libkern/img4/interface.h>
63 #include <libkern/section_keywords.h>
64 #include <sys/errno.h>
65
66 #include <machine/atomic.h>
67 #include <machine/thread.h>
68 #include <machine/lowglobals.h>
69
70 #include <arm/caches_internal.h>
71 #include <arm/cpu_data.h>
72 #include <arm/cpu_data_internal.h>
73 #include <arm/cpu_capabilities.h>
74 #include <arm/cpu_number.h>
75 #include <arm/machine_cpu.h>
76 #include <arm/misc_protos.h>
77 #include <arm/pmap/pmap_internal.h>
78 #include <arm/trap.h>
79
80 #if (__ARM_VMSA__ > 7)
81 #include <arm64/proc_reg.h>
82 #include <pexpert/arm64/boot.h>
83 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
84 #include <arm64/amcc_rorgn.h>
85 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
86 #endif
87
88 #include <pexpert/device_tree.h>
89
90 #include <san/kasan.h>
91 #include <sys/cdefs.h>
92
93 #if defined(HAS_APPLE_PAC)
94 #include <ptrauth.h>
95 #endif
96
97 #ifdef CONFIG_XNUPOST
98 #include <tests/xnupost.h>
99 #endif
100
101
102 #if HIBERNATION
103 #include <IOKit/IOHibernatePrivate.h>
104 #endif /* HIBERNATION */
105
106 #ifdef __ARM64_PMAP_SUBPAGE_L1__
107 #if (__ARM_VMSA__ <= 7)
108 #error This is not supported for old-style page tables
109 #endif
110 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
111 #else
112 #if (__ARM_VMSA__ <= 7)
113 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES * 2)
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 #endif
118
119 extern u_int32_t random(void); /* from <libkern/libkern.h> */
120
121 static bool alloc_asid(pmap_t pmap);
122 static void free_asid(pmap_t pmap);
123 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
124 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
125 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
126
127 static const struct page_table_ops native_pt_ops =
128 {
129 .alloc_id = alloc_asid,
130 .free_id = free_asid,
131 .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
132 .flush_tlb_async = flush_mmu_tlb_full_asid_async,
133 .wimg_to_pte = wimg_to_pte,
134 };
135
136 #if (__ARM_VMSA__ > 7)
137 const struct page_table_level_info pmap_table_level_info_16k[] =
138 {
139 [0] = {
140 .size = ARM_16K_TT_L0_SIZE,
141 .offmask = ARM_16K_TT_L0_OFFMASK,
142 .shift = ARM_16K_TT_L0_SHIFT,
143 .index_mask = ARM_16K_TT_L0_INDEX_MASK,
144 .valid_mask = ARM_TTE_VALID,
145 .type_mask = ARM_TTE_TYPE_MASK,
146 .type_block = ARM_TTE_TYPE_BLOCK
147 },
148 [1] = {
149 .size = ARM_16K_TT_L1_SIZE,
150 .offmask = ARM_16K_TT_L1_OFFMASK,
151 .shift = ARM_16K_TT_L1_SHIFT,
152 .index_mask = ARM_16K_TT_L1_INDEX_MASK,
153 .valid_mask = ARM_TTE_VALID,
154 .type_mask = ARM_TTE_TYPE_MASK,
155 .type_block = ARM_TTE_TYPE_BLOCK
156 },
157 [2] = {
158 .size = ARM_16K_TT_L2_SIZE,
159 .offmask = ARM_16K_TT_L2_OFFMASK,
160 .shift = ARM_16K_TT_L2_SHIFT,
161 .index_mask = ARM_16K_TT_L2_INDEX_MASK,
162 .valid_mask = ARM_TTE_VALID,
163 .type_mask = ARM_TTE_TYPE_MASK,
164 .type_block = ARM_TTE_TYPE_BLOCK
165 },
166 [3] = {
167 .size = ARM_16K_TT_L3_SIZE,
168 .offmask = ARM_16K_TT_L3_OFFMASK,
169 .shift = ARM_16K_TT_L3_SHIFT,
170 .index_mask = ARM_16K_TT_L3_INDEX_MASK,
171 .valid_mask = ARM_PTE_TYPE_VALID,
172 .type_mask = ARM_PTE_TYPE_MASK,
173 .type_block = ARM_TTE_TYPE_L3BLOCK
174 }
175 };
176
177 const struct page_table_level_info pmap_table_level_info_4k[] =
178 {
179 [0] = {
180 .size = ARM_4K_TT_L0_SIZE,
181 .offmask = ARM_4K_TT_L0_OFFMASK,
182 .shift = ARM_4K_TT_L0_SHIFT,
183 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
184 .valid_mask = ARM_TTE_VALID,
185 .type_mask = ARM_TTE_TYPE_MASK,
186 .type_block = ARM_TTE_TYPE_BLOCK
187 },
188 [1] = {
189 .size = ARM_4K_TT_L1_SIZE,
190 .offmask = ARM_4K_TT_L1_OFFMASK,
191 .shift = ARM_4K_TT_L1_SHIFT,
192 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
193 .valid_mask = ARM_TTE_VALID,
194 .type_mask = ARM_TTE_TYPE_MASK,
195 .type_block = ARM_TTE_TYPE_BLOCK
196 },
197 [2] = {
198 .size = ARM_4K_TT_L2_SIZE,
199 .offmask = ARM_4K_TT_L2_OFFMASK,
200 .shift = ARM_4K_TT_L2_SHIFT,
201 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
202 .valid_mask = ARM_TTE_VALID,
203 .type_mask = ARM_TTE_TYPE_MASK,
204 .type_block = ARM_TTE_TYPE_BLOCK
205 },
206 [3] = {
207 .size = ARM_4K_TT_L3_SIZE,
208 .offmask = ARM_4K_TT_L3_OFFMASK,
209 .shift = ARM_4K_TT_L3_SHIFT,
210 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
211 .valid_mask = ARM_PTE_TYPE_VALID,
212 .type_mask = ARM_PTE_TYPE_MASK,
213 .type_block = ARM_TTE_TYPE_L3BLOCK
214 }
215 };
216
217 const struct page_table_attr pmap_pt_attr_4k = {
218 .pta_level_info = pmap_table_level_info_4k,
219 .pta_root_level = (T0SZ_BOOT - 16) / 9,
220 #if __ARM_MIXED_PAGE_SIZE__
221 .pta_commpage_level = PMAP_TT_L2_LEVEL,
222 #else /* __ARM_MIXED_PAGE_SIZE__ */
223 #if __ARM_16K_PG__
224 .pta_commpage_level = PMAP_TT_L2_LEVEL,
225 #else /* __ARM_16K_PG__ */
226 .pta_commpage_level = PMAP_TT_L1_LEVEL,
227 #endif /* __ARM_16K_PG__ */
228 #endif /* __ARM_MIXED_PAGE_SIZE__ */
229 .pta_max_level = PMAP_TT_L3_LEVEL,
230 .pta_ops = &native_pt_ops,
231 .ap_ro = ARM_PTE_AP(AP_RORO),
232 .ap_rw = ARM_PTE_AP(AP_RWRW),
233 .ap_rona = ARM_PTE_AP(AP_RONA),
234 .ap_rwna = ARM_PTE_AP(AP_RWNA),
235 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
236 .ap_x = ARM_PTE_PNX,
237 #if __ARM_MIXED_PAGE_SIZE__
238 .pta_tcr_value = TCR_EL1_4KB,
239 #endif /* __ARM_MIXED_PAGE_SIZE__ */
240 .pta_page_size = 4096,
241 .pta_page_shift = 12,
242 };
243
244 const struct page_table_attr pmap_pt_attr_16k = {
245 .pta_level_info = pmap_table_level_info_16k,
246 .pta_root_level = PMAP_TT_L1_LEVEL,
247 .pta_commpage_level = PMAP_TT_L2_LEVEL,
248 .pta_max_level = PMAP_TT_L3_LEVEL,
249 .pta_ops = &native_pt_ops,
250 .ap_ro = ARM_PTE_AP(AP_RORO),
251 .ap_rw = ARM_PTE_AP(AP_RWRW),
252 .ap_rona = ARM_PTE_AP(AP_RONA),
253 .ap_rwna = ARM_PTE_AP(AP_RWNA),
254 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
255 .ap_x = ARM_PTE_PNX,
256 #if __ARM_MIXED_PAGE_SIZE__
257 .pta_tcr_value = TCR_EL1_16KB,
258 #endif /* __ARM_MIXED_PAGE_SIZE__ */
259 .pta_page_size = 16384,
260 .pta_page_shift = 14,
261 };
262
263 #if __ARM_16K_PG__
264 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
265 #else /* !__ARM_16K_PG__ */
266 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
267 #endif /* !__ARM_16K_PG__ */
268
269
270 #else /* (__ARM_VMSA__ > 7) */
271 /*
272 * We don't support pmap parameterization for VMSA7, so use an opaque
273 * page_table_attr structure.
274 */
275 const struct page_table_attr * const native_pt_attr = NULL;
276 #endif /* (__ARM_VMSA__ > 7) */
277
278
279 static inline void
280 pmap_sync_tlb(bool strong __unused)
281 {
282 sync_tlb_flush();
283 }
284
285 #if MACH_ASSERT
286 int vm_footprint_suspend_allowed = 1;
287
288 extern int pmap_ledgers_panic;
289 extern int pmap_ledgers_panic_leeway;
290
291 #endif /* MACH_ASSERT */
292
293 #if DEVELOPMENT || DEBUG
294 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
295 (current_thread()->pmap_footprint_suspended)
296 #else /* DEVELOPMENT || DEBUG */
297 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
298 #endif /* DEVELOPMENT || DEBUG */
299
300
301 #ifdef PLATFORM_BridgeOS
302 static struct pmap_legacy_trust_cache *pmap_legacy_trust_caches MARK_AS_PMAP_DATA = NULL;
303 #endif
304 static struct pmap_image4_trust_cache *pmap_image4_trust_caches MARK_AS_PMAP_DATA = NULL;
305
306 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_loaded_trust_caches_lock, 0);
307
308 SECURITY_READ_ONLY_LATE(int) srd_fused = 0;
309
310 /*
311 * Represents a tlb range that will be flushed before exiting
312 * the ppl.
313 * Used by phys_attribute_clear_range to defer flushing pages in
314 * this range until the end of the operation.
315 */
316 typedef struct pmap_tlb_flush_range {
317 pmap_t ptfr_pmap;
318 vm_map_address_t ptfr_start;
319 vm_map_address_t ptfr_end;
320 bool ptfr_flush_needed;
321 } pmap_tlb_flush_range_t;
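/*
 * Illustrative sketch only (not the actual phys_attribute_clear_range logic):
 * a caller would typically seed the range, mark ptfr_flush_needed while
 * modifying mappings, and drain the deferred flush once at the end, e.g.:
 *
 *   pmap_tlb_flush_range_t range = {
 *       .ptfr_pmap = pmap,
 *       .ptfr_start = start,
 *       .ptfr_end = end,
 *       .ptfr_flush_needed = false,
 *   };
 *   ... clear attributes, setting range.ptfr_flush_needed when a PTE changes ...
 *   if (range.ptfr_flush_needed) {
 *       pmap_get_pt_ops(range.ptfr_pmap)->flush_tlb_region_async(
 *           range.ptfr_start,
 *           (size_t)(range.ptfr_end - range.ptfr_start),
 *           range.ptfr_pmap,
 *           true);
 *       sync_tlb_flush();
 *   }
 */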
322
323 #if XNU_MONITOR
324 /*
325 * PPL External References.
326 */
327 extern vm_offset_t segPPLDATAB;
328 extern unsigned long segSizePPLDATA;
329 extern vm_offset_t segPPLTEXTB;
330 extern unsigned long segSizePPLTEXT;
331 extern vm_offset_t segPPLDATACONSTB;
332 extern unsigned long segSizePPLDATACONST;
333
334
335 /*
336 * PPL Global Variables
337 */
338
339 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
340 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
341 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
342 #else
343 const boolean_t pmap_ppl_disable = FALSE;
344 #endif
345
346 /*
347 * Indicates if the PPL has started applying APRR.
348 * This variable is accessed from various assembly trampolines, so be sure to change
349 * those if you change the size or layout of this variable.
350 */
351 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
352
353 extern void *pmap_stacks_start;
354 extern void *pmap_stacks_end;
355
356 #endif /* XNU_MONITOR */
357
358
359 /* Virtual memory region for early allocation */
360 #if (__ARM_VMSA__ == 7)
361 #define VREGION1_HIGH_WINDOW (0)
362 #else
363 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
364 #endif
365 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
366 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
367
368 extern uint8_t bootstrap_pagetables[];
369
370 extern unsigned int not_in_kdp;
371
372 extern vm_offset_t first_avail;
373
374 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
375 extern vm_offset_t virtual_space_end; /* End of kernel address space */
376 extern vm_offset_t static_memory_end;
377
378 extern const vm_map_address_t physmap_base;
379 extern const vm_map_address_t physmap_end;
380
381 extern int maxproc, hard_maxproc;
382
383 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
384 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
385
386 #if (__ARM_VMSA__ > 7)
387 /* The number of address bits one TTBR can cover. */
388 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
389
390 /*
391 * The bounds on our TTBRs. These are for sanity checking that
392 * an address is accessible by a TTBR before we attempt to map it.
393 */
394
395 /* The level of the root of a page table. */
396 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
397
398 /* The number of entries in the root TT of a page table. */
399 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
400 #else
401 const uint64_t arm64_root_pgtable_level = 0;
402 const uint64_t arm64_root_pgtable_num_ttes = 0;
403 #endif
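/*
 * Worked example of the computation above (the numbers are illustrative only;
 * the real values depend on the configured T0SZ_BOOT and page-table geometry):
 * assuming T0SZ_BOOT == 25, ARM_PGSHIFT == 12 and TTE_SHIFT == 3,
 *   PGTABLE_ADDR_BITS           = 64 - 25 = 39
 *   bits resolved per level     = ARM_PGSHIFT - TTE_SHIFT = 9
 *   arm64_root_pgtable_level    = 3 - ((39 - 1 - 12) / 9) = 3 - 2 = 1
 *   arm64_root_pgtable_num_ttes = 2 << ((39 - 1 - 12) % 9) = 2 << 8 = 512
 * i.e. a 39-bit VA space rooted at L1 with a 512-entry root table.
 */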
404
405 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
406 const pmap_t kernel_pmap = &kernel_pmap_store;
407
408 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
409
410 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
411 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
412 unsigned int pmap_stamp MARK_AS_PMAP_DATA;
413 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
414
415 typedef struct tt_free_entry {
416 struct tt_free_entry *next;
417 } tt_free_entry_t;
418
419 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
420
421 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
422 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
423 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
424 #define FREE_PAGE_SIZE_TT_MAX 4
425 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
426 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
427 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
428 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
429 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
430 unsigned int free_tt_count MARK_AS_PMAP_DATA;
431 unsigned int free_tt_max MARK_AS_PMAP_DATA;
432
433 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
434
435 boolean_t pmap_gc_allowed MARK_AS_PMAP_DATA = TRUE;
436 boolean_t pmap_gc_forced MARK_AS_PMAP_DATA = FALSE;
437 boolean_t pmap_gc_allowed_by_time_throttle = TRUE;
438
439 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
440 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
441 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
442 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
443 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
444 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
445
446 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
447 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
448
449 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
450 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
451
452 /* Lock group used for all pmap object locks. */
453 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
454
455 #if DEVELOPMENT || DEBUG
456 int nx_enabled = 1; /* enable no-execute protection */
457 int allow_data_exec = 0; /* No apps may execute data */
458 int allow_stack_exec = 0; /* No apps may execute from the stack */
459 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
460 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
461 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
462 #else /* DEVELOPMENT || DEBUG */
463 const int nx_enabled = 1; /* enable no-execute protection */
464 const int allow_data_exec = 0; /* No apps may execute data */
465 const int allow_stack_exec = 0; /* No apps may execute from the stack */
466 #endif /* DEVELOPMENT || DEBUG */
467
468 /**
469 * This variable is set to true during hibernation entry to protect pmap data structures
470 * during image copying, and reset to false on hibernation exit.
471 */
472 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
473
474 #if MACH_ASSERT
475 static void pmap_check_ledgers(pmap_t pmap);
476 #else
477 static inline void
478 pmap_check_ledgers(__unused pmap_t pmap)
479 {
480 }
481 #endif /* MACH_ASSERT */
482
483 /**
484 * This helper function ensures that potentially-long-running batched PPL operations are
485 * called in preemptible context before entering the PPL, so that the PPL call may
486 * periodically exit to allow pending urgent ASTs to be taken.
487 */
488 static inline void
489 pmap_verify_preemptible(void)
490 {
491 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
492 }
493
494 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
495
496 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
497 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
498
499 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
500
501 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
502 #if defined(__arm64__)
503 # ifdef XNU_TARGET_OS_OSX
504 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
505 # else
506 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
507 # endif
508 #endif /* __arm64__ */
509
510 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
511 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
512 #else
513 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
514 #endif
515
516 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
517 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
518 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
519 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
520 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
521 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
522 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
523 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
524
525
526 #if (__ARM_VMSA__ > 7)
527 #if __ARM_MIXED_PAGE_SIZE__
528 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_4k;
529 #endif
530 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_default;
531 #endif
532
533 /* PTE Define Macros */
534
535 #define pte_is_wired(pte) \
536 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
537
538 #define pte_was_writeable(pte) \
539 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
540
541 #define pte_set_was_writeable(pte, was_writeable) \
542 do { \
543 if ((was_writeable)) { \
544 (pte) |= ARM_PTE_WRITEABLE; \
545 } else { \
546 (pte) &= ~ARM_PTE_WRITEABLE; \
547 } \
548 } while(0)
549
550 static inline void
551 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
552 {
553 if (wired) {
554 *ptep |= ARM_PTE_WIRED;
555 } else {
556 *ptep &= ~ARM_PTE_WIRED;
557 }
558 /*
559 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
560 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
561 * never reclaimed.
562 */
563 if (pmap == kernel_pmap) {
564 return;
565 }
566 unsigned short *ptd_wiredcnt_ptr;
567 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
568 if (wired) {
569 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
570 } else {
571 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
572 if (__improbable(prev_wired == 0)) {
573 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
574 }
575 }
576 }
577
578 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
579 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
580 pmap_sync_tlb(strong); \
581 }
582
583 /*
584 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
585 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
586 * will observe the updated PTE.
587 */
588 #define FLUSH_PTE() \
589 __builtin_arm_dmb(DMB_ISH);
590
591 /*
592 * Synchronize updates to PTEs that were previously valid and thus may be cached in
593 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
594 * TLBI. This should only require a store-store barrier, as subsequent accesses in
595 * program order will not issue until the DSB completes. Prior loads may be reordered
596 * after the barrier, but their behavior should not be materially affected by the
597 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
598 * matter for loads until the access is re-driven well after the TLB update is
599 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
600 * we should be in a position to handle access faults. For "voluntary" PTE access
601 * restriction due to unmapping or protection, the decision to restrict access should
602 * have a data dependency on prior loads in order to avoid a data race.
603 */
604 #define FLUSH_PTE_STRONG() \
605 __builtin_arm_dsb(DSB_ISHST);
606
607 /**
608 * Write enough page table entries to map a single VM page. On systems where the
609 * VM page size does not match the hardware page size, multiple page table
610 * entries will need to be written.
611 *
612 * @note This function does not emit a barrier to ensure these page table writes
613 * have completed before continuing, and such a barrier is commonly needed. In
614 * cases where a DMB or DSB barrier is required, use the write_pte() or
615 * write_pte_strong() function respectively instead of this one.
616 *
617 * @param ptep Pointer to the first page table entry to update.
618 * @param pte The value to write into each page table entry. In the case that
619 * multiple PTEs are updated to a non-empty value, then the address
620 * in this value will automatically be incremented for each PTE
621 * write.
622 */
623 static void
624 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
625 {
626 /**
627 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can vary at runtime on some
628 * systems, which is why it's checked at runtime instead of at compile time.
629 * The "unreachable" warning needs to be suppressed because on other systems
630 * it is still a compile-time constant, which makes one branch statically dead.
631 */
632 __unreachable_ok_push
633 if (TEST_PAGE_RATIO_4) {
634 if (((uintptr_t)ptep) & 0x1f) {
635 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
636 __func__, ptep, (void*)pte);
637 }
638
639 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
640 /**
641 * If we're writing an empty/compressed PTE value, then don't
642 * auto-increment the address for each PTE write.
643 */
644 *ptep = pte;
645 *(ptep + 1) = pte;
646 *(ptep + 2) = pte;
647 *(ptep + 3) = pte;
648 } else {
649 *ptep = pte;
650 *(ptep + 1) = pte | 0x1000;
651 *(ptep + 2) = pte | 0x2000;
652 *(ptep + 3) = pte | 0x3000;
653 }
654 } else {
655 *ptep = pte;
656 }
657 __unreachable_ok_pop
658 }
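/*
 * Example (illustrative): on a system where the VM page size is 16K but the
 * hardware granule is 4K (TEST_PAGE_RATIO_4), a single call such as
 *
 *   write_pte_fast(ptep, pte);   // "pte" maps the 4K page at physical address pa
 *
 * populates four consecutive 4K PTEs covering one 16K VM page:
 *   ptep[0] = pte            (maps pa)
 *   ptep[1] = pte | 0x1000   (maps pa + 4K)
 *   ptep[2] = pte | 0x2000   (maps pa + 8K)
 *   ptep[3] = pte | 0x3000   (maps pa + 12K)
 * Empty/compressed values are instead replicated verbatim into all four slots.
 */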
659
660 /**
661 * Writes enough page table entries to map a single VM page and then ensures
662 * those writes complete by executing a Data Memory Barrier.
663 *
664 * @note The DMB issued by this function is not strong enough to protect against
665 * TLB invalidates from being reordered above the PTE writes. If a TLBI
666 * instruction is going to immediately be called after this write, it's
667 * recommended to call write_pte_strong() instead of this function.
668 *
669 * See the function header for write_pte_fast() for more details on the
670 * parameters.
671 */
672 void
673 write_pte(pt_entry_t *ptep, pt_entry_t pte)
674 {
675 write_pte_fast(ptep, pte);
676 FLUSH_PTE();
677 }
678
679 /**
680 * Writes enough page table entries to map a single VM page and then ensures
681 * those writes complete by executing a Data Synchronization Barrier. This
682 * barrier provides stronger guarantees than the DMB executed by write_pte().
683 *
684 * @note This function is useful if you're going to immediately flush the TLB
685 * after making the PTE write. A DSB is required to protect against the
686 * TLB invalidate being reordered before the PTE write.
687 *
688 * See the function header for write_pte_fast() for more details on the
689 * parameters.
690 */
691 static void
692 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
693 {
694 write_pte_fast(ptep, pte);
695 FLUSH_PTE_STRONG();
696 }
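/*
 * Putting the above together, a typical unmap/permission-change path looks
 * roughly like the following sketch (illustrative only; the real callers
 * below add locking, accounting and batching):
 *
 *   write_pte_strong(ptep, new_pte);                              // PTE store + DSB
 *   pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, PAGE_SIZE, pmap, true);
 *   sync_tlb_flush();                                             // wait for the TLBI
 *
 * whereas a transition from an invalid PTE to a valid one only needs
 * write_pte() (DMB), since no stale TLB entry can exist for the mapping.
 */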
697
698 /**
699 * Retrieve the pmap structure for the thread running on the current CPU.
700 */
701 pmap_t
702 current_pmap()
703 {
704 const pmap_t current = vm_map_pmap(current_thread()->map);
705
706 assert(current != NULL);
707
708 #if XNU_MONITOR
709 /**
710 * On PPL-enabled systems, it's important that PPL policy decisions aren't
711 * driven by kernel-writable memory. This function is used in various parts
712 * of the PPL, and besides validating that the pointer returned by this
713 * function is indeed a pmap structure, it's also important to ensure that
714 * it's actually the current thread's pmap. This is because different pmaps
715 * will have access to different entitlements based on the code signature of
716 * their loaded process. So if a different user pmap is set in the current
717 * thread structure (in an effort to bypass code signing restrictions), even
718 * though the structure would validate correctly as it is a real pmap
719 * structure, it should fail here.
720 *
721 * This only needs to occur for user pmaps because the kernel pmap's root
722 * page table is always the same as TTBR1 (it's set during bootstrap and not
723 * changed so it'd be redundant to check), and its code signing fields are
724 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
725 * it shouldn't be possible to set those fields. Due to that, an attacker
726 * setting the current thread's pmap to the kernel pmap as a way to bypass
727 * this check won't accomplish anything as it doesn't provide any extra code
728 * signing entitlements.
729 */
730 if ((current != kernel_pmap) &&
731 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
732 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
733 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
734 }
735 #endif /* XNU_MONITOR */
736
737 return current;
738 }
739
740 #if DEVELOPMENT || DEBUG
741
742 /*
743 * Trace levels are controlled by a bitmask in which each
744 * level can be enabled/disabled via its (1 << level) bit
745 * in the boot-arg:
746 * Level 0: PPL extension functionality
747 * Level 1: pmap lifecycle (create/destroy/switch)
748 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
749 * Level 3: internal state management (attributes/fast-fault)
750 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
751 */
752
753 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
754
755 #define PMAP_TRACE(level, ...) \
756 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
757 KDBG_RELEASE(__VA_ARGS__); \
758 }
759 #else /* DEVELOPMENT || DEBUG */
760
761 #define PMAP_TRACE(level, ...)
762
763 #endif /* DEVELOPMENT || DEBUG */
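/*
 * Example (illustrative, DEVELOPMENT || DEBUG kernels only): to trace pmap
 * lifecycle (level 1) and mapping lifecycle (level 2) events, pmap_trace_mask
 * would be set to (1 << 1) | (1 << 2) == 0x6, and each PMAP_TRACE(level, ...)
 * call site only emits its kdebug event when its level's bit is set in the mask.
 */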
764
765
766 /*
767 * Internal function prototypes (forward declarations).
768 */
769
770 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
771
772 static void pmap_set_reference(ppnum_t pn);
773
774 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
775
776 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
777
778 static kern_return_t pmap_expand(
779 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
780
781 static int pmap_remove_range(
782 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
783
784 static tt_entry_t *pmap_tt1_allocate(
785 pmap_t, vm_size_t, unsigned int);
786
787 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
788
789 static void pmap_tt1_deallocate(
790 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
791
792 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
793
794 static kern_return_t pmap_tt_allocate(
795 pmap_t, tt_entry_t **, unsigned int, unsigned int);
796
797 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
798
799 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
800 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
801 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
802
803 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
804
805 #if (__ARM_VMSA__ > 7)
806
807 static void pmap_unmap_sharedpage(
808 pmap_t pmap);
809
810 static boolean_t
811 pmap_is_64bit(pmap_t);
812
813
814 #endif /* (__ARM_VMSA__ > 7) */
815
816 static void pmap_update_cache_attributes_locked(
817 ppnum_t, unsigned);
818
819 static boolean_t arm_clear_fast_fault(
820 ppnum_t ppnum,
821 vm_prot_t fault_type,
822 pt_entry_t *pte_p);
823
824 static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes);
825
826 static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
827
828 static void pmap_trim_self(pmap_t pmap);
829 static void pmap_trim_subord(pmap_t subord);
830
831
832 /*
833 * Temporary prototypes, while we wait for pmap_enter to move to taking an
834 * address instead of a page number.
835 */
836 static kern_return_t
837 pmap_enter_addr(
838 pmap_t pmap,
839 vm_map_address_t v,
840 pmap_paddr_t pa,
841 vm_prot_t prot,
842 vm_prot_t fault_type,
843 unsigned int flags,
844 boolean_t wired);
845
846 kern_return_t
847 pmap_enter_options_addr(
848 pmap_t pmap,
849 vm_map_address_t v,
850 pmap_paddr_t pa,
851 vm_prot_t prot,
852 vm_prot_t fault_type,
853 unsigned int flags,
854 boolean_t wired,
855 unsigned int options,
856 __unused void *arg);
857
858 #ifdef CONFIG_XNUPOST
859 kern_return_t pmap_test(void);
860 #endif /* CONFIG_XNUPOST */
861
862 PMAP_SUPPORT_PROTOTYPES(
863 kern_return_t,
864 arm_fast_fault, (pmap_t pmap,
865 vm_map_address_t va,
866 vm_prot_t fault_type,
867 bool was_af_fault,
868 bool from_user), ARM_FAST_FAULT_INDEX);
869
870 PMAP_SUPPORT_PROTOTYPES(
871 boolean_t,
872 arm_force_fast_fault, (ppnum_t ppnum,
873 vm_prot_t allow_mode,
874 int options), ARM_FORCE_FAST_FAULT_INDEX);
875
876 MARK_AS_PMAP_TEXT static boolean_t
877 arm_force_fast_fault_with_flush_range(
878 ppnum_t ppnum,
879 vm_prot_t allow_mode,
880 int options,
881 pmap_tlb_flush_range_t *flush_range);
882
883 PMAP_SUPPORT_PROTOTYPES(
884 boolean_t,
885 pmap_batch_set_cache_attributes, (ppnum_t pn,
886 unsigned int cacheattr,
887 unsigned int page_cnt,
888 unsigned int page_index,
889 boolean_t doit,
890 unsigned int *res), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
891
892 PMAP_SUPPORT_PROTOTYPES(
893 void,
894 pmap_change_wiring, (pmap_t pmap,
895 vm_map_address_t v,
896 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
897
898 PMAP_SUPPORT_PROTOTYPES(
899 pmap_t,
900 pmap_create_options, (ledger_t ledger,
901 vm_map_size_t size,
902 unsigned int flags,
903 kern_return_t * kr), PMAP_CREATE_INDEX);
904
905 PMAP_SUPPORT_PROTOTYPES(
906 void,
907 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
908
909 PMAP_SUPPORT_PROTOTYPES(
910 kern_return_t,
911 pmap_enter_options, (pmap_t pmap,
912 vm_map_address_t v,
913 pmap_paddr_t pa,
914 vm_prot_t prot,
915 vm_prot_t fault_type,
916 unsigned int flags,
917 boolean_t wired,
918 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
919
920 PMAP_SUPPORT_PROTOTYPES(
921 pmap_paddr_t,
922 pmap_find_pa, (pmap_t pmap,
923 addr64_t va), PMAP_FIND_PA_INDEX);
924
925 #if (__ARM_VMSA__ > 7)
926 PMAP_SUPPORT_PROTOTYPES(
927 kern_return_t,
928 pmap_insert_sharedpage, (pmap_t pmap), PMAP_INSERT_SHAREDPAGE_INDEX);
929 #endif
930
931
932 PMAP_SUPPORT_PROTOTYPES(
933 boolean_t,
934 pmap_is_empty, (pmap_t pmap,
935 vm_map_offset_t va_start,
936 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
937
938
939 PMAP_SUPPORT_PROTOTYPES(
940 unsigned int,
941 pmap_map_cpu_windows_copy, (ppnum_t pn,
942 vm_prot_t prot,
943 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
944
945 PMAP_SUPPORT_PROTOTYPES(
946 void,
947 pmap_ro_zone_memcpy, (zone_id_t zid,
948 vm_offset_t va,
949 vm_offset_t offset,
950 const vm_offset_t new_data,
951 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
952
953 PMAP_SUPPORT_PROTOTYPES(
954 uint64_t,
955 pmap_ro_zone_atomic_op, (zone_id_t zid,
956 vm_offset_t va,
957 vm_offset_t offset,
958 zro_atomic_op_t op,
959 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
960
961 PMAP_SUPPORT_PROTOTYPES(
962 void,
963 pmap_ro_zone_bzero, (zone_id_t zid,
964 vm_offset_t va,
965 vm_offset_t offset,
966 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
967
968 PMAP_SUPPORT_PROTOTYPES(
969 vm_map_offset_t,
970 pmap_nest, (pmap_t grand,
971 pmap_t subord,
972 addr64_t vstart,
973 uint64_t size,
974 vm_map_offset_t vrestart,
975 kern_return_t * krp), PMAP_NEST_INDEX);
976
977 PMAP_SUPPORT_PROTOTYPES(
978 void,
979 pmap_page_protect_options, (ppnum_t ppnum,
980 vm_prot_t prot,
981 unsigned int options,
982 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
983
984 PMAP_SUPPORT_PROTOTYPES(
985 vm_map_address_t,
986 pmap_protect_options, (pmap_t pmap,
987 vm_map_address_t start,
988 vm_map_address_t end,
989 vm_prot_t prot,
990 unsigned int options,
991 void *args), PMAP_PROTECT_OPTIONS_INDEX);
992
993 PMAP_SUPPORT_PROTOTYPES(
994 kern_return_t,
995 pmap_query_page_info, (pmap_t pmap,
996 vm_map_offset_t va,
997 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
998
999 PMAP_SUPPORT_PROTOTYPES(
1000 mach_vm_size_t,
1001 pmap_query_resident, (pmap_t pmap,
1002 vm_map_address_t start,
1003 vm_map_address_t end,
1004 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1005
1006 PMAP_SUPPORT_PROTOTYPES(
1007 void,
1008 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1009
1010 PMAP_SUPPORT_PROTOTYPES(
1011 vm_map_address_t,
1012 pmap_remove_options, (pmap_t pmap,
1013 vm_map_address_t start,
1014 vm_map_address_t end,
1015 int options), PMAP_REMOVE_OPTIONS_INDEX);
1016
1017
1018 PMAP_SUPPORT_PROTOTYPES(
1019 void,
1020 pmap_set_cache_attributes, (ppnum_t pn,
1021 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1022
1023 PMAP_SUPPORT_PROTOTYPES(
1024 void,
1025 pmap_update_compressor_page, (ppnum_t pn,
1026 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1027
1028 PMAP_SUPPORT_PROTOTYPES(
1029 void,
1030 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1031
1032 #if MACH_ASSERT || XNU_MONITOR
1033 PMAP_SUPPORT_PROTOTYPES(
1034 void,
1035 pmap_set_process, (pmap_t pmap,
1036 int pid,
1037 char *procname), PMAP_SET_PROCESS_INDEX);
1038 #endif
1039
1040 PMAP_SUPPORT_PROTOTYPES(
1041 void,
1042 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1043
1044 PMAP_SUPPORT_PROTOTYPES(
1045 vm_map_offset_t,
1046 pmap_unnest_options, (pmap_t grand,
1047 addr64_t vaddr,
1048 uint64_t size,
1049 vm_map_offset_t vrestart,
1050 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1051
1052 PMAP_SUPPORT_PROTOTYPES(
1053 void,
1054 phys_attribute_set, (ppnum_t pn,
1055 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1056
1057 PMAP_SUPPORT_PROTOTYPES(
1058 void,
1059 phys_attribute_clear, (ppnum_t pn,
1060 unsigned int bits,
1061 int options,
1062 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1063
1064 #if __ARM_RANGE_TLBI__
1065 PMAP_SUPPORT_PROTOTYPES(
1066 vm_map_address_t,
1067 phys_attribute_clear_range, (pmap_t pmap,
1068 vm_map_address_t start,
1069 vm_map_address_t end,
1070 unsigned int bits,
1071 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1072 #endif /* __ARM_RANGE_TLBI__ */
1073
1074
1075 PMAP_SUPPORT_PROTOTYPES(
1076 void,
1077 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1078
1079 PMAP_SUPPORT_PROTOTYPES(
1080 void,
1081 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1082
1083 PMAP_SUPPORT_PROTOTYPES(
1084 void,
1085 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1086
1087 PMAP_SUPPORT_PROTOTYPES(
1088 void,
1089 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1090
1091 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1092 PMAP_SUPPORT_PROTOTYPES(
1093 void,
1094 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1095 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1096
1097 PMAP_SUPPORT_PROTOTYPES(
1098 void,
1099 pmap_trim, (pmap_t grand,
1100 pmap_t subord,
1101 addr64_t vstart,
1102 uint64_t size), PMAP_TRIM_INDEX);
1103
1104 #if HAS_APPLE_PAC
1105 PMAP_SUPPORT_PROTOTYPES(
1106 void *,
1107 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1108 PMAP_SUPPORT_PROTOTYPES(
1109 void *,
1110 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1111 #endif /* HAS_APPLE_PAC */
1112
1113
1114
1115
1116 PMAP_SUPPORT_PROTOTYPES(
1117 bool,
1118 pmap_is_trust_cache_loaded, (const uuid_t uuid), PMAP_IS_TRUST_CACHE_LOADED_INDEX);
1119
1120 PMAP_SUPPORT_PROTOTYPES(
1121 uint32_t,
1122 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1123
1124 PMAP_SUPPORT_PROTOTYPES(
1125 bool,
1126 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1127
1128 PMAP_SUPPORT_PROTOTYPES(
1129 void,
1130 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1131 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1132
1133 PMAP_SUPPORT_PROTOTYPES(
1134 bool,
1135 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1136 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1137
1138 PMAP_SUPPORT_PROTOTYPES(
1139 void,
1140 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE]),
1141 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1142
1143 PMAP_SUPPORT_PROTOTYPES(
1144 void,
1145 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1146 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1147
1148 PMAP_SUPPORT_PROTOTYPES(
1149 void,
1150 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1151
1152 void pmap_footprint_suspend(vm_map_t map,
1153 boolean_t suspend);
1154 PMAP_SUPPORT_PROTOTYPES(
1155 void,
1156 pmap_footprint_suspend, (vm_map_t map,
1157 boolean_t suspend),
1158 PMAP_FOOTPRINT_SUSPEND_INDEX);
1159
1160
1161
1162
1163 #if DEVELOPMENT || DEBUG
1164 PMAP_SUPPORT_PROTOTYPES(
1165 kern_return_t,
1166 pmap_test_text_corruption, (pmap_paddr_t),
1167 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1168 #endif /* DEVELOPMENT || DEBUG */
1169
1170 #if (__ARM_VMSA__ > 7)
1171 /*
1172 * The low global vector page is mapped at a fixed alias.
1173 * Since the page size is 16k for H8 and newer, we map the globals to a
1174 * 16k-aligned address. Readers of the globals (e.g. lldb, panic server) need
1175 * to check both addresses anyway for backward compatibility. So for now
1176 * we leave H6 and H7 where they were.
1177 */
1178 #if (ARM_PGSHIFT == 14)
1179 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1180 #else
1181 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1182 #endif
1183
1184 #else
1185 #define LOWGLOBAL_ALIAS (0xFFFF1000)
1186 #endif
1187
1188 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1189 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1190 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1191
1192 #if XNU_MONITOR
1193
1194 #if __has_feature(ptrauth_calls)
1195 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1196 #else
1197 #define __ptrauth_ppl_handler
1198 #endif
1199
1200 /*
1201 * Table of function pointers used for PPL dispatch.
1202 */
1203 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1204 [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1205 [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1206 [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1207 [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1208 [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1209 [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1210 [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1211 [PMAP_CREATE_INDEX] = pmap_create_options_internal,
1212 [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1213 [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1214 [PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1215 [PMAP_INSERT_SHAREDPAGE_INDEX] = pmap_insert_sharedpage_internal,
1216 [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1217 [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1218 [PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1219 [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1220 [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1221 [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1222 [PMAP_NEST_INDEX] = pmap_nest_internal,
1223 [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1224 [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1225 [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1226 [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1227 [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1228 [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1229 [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1230 [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1231 [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1232 [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1233 [PMAP_SWITCH_INDEX] = pmap_switch_internal,
1234 [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1235 [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1236 [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1237 [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1238 [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1239 [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1240 [PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1241 [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1242 [PMAP_IS_TRUST_CACHE_LOADED_INDEX] = pmap_is_trust_cache_loaded_internal,
1243 [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1244 [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1245 [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1246 [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1247 [PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1248 [PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1249 [PMAP_TRIM_INDEX] = pmap_trim_internal,
1250 [PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1251 [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1252 [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1253 #if HAS_APPLE_PAC
1254 [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1255 [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1256 #endif /* HAS_APPLE_PAC */
1257 #if __ARM_RANGE_TLBI__
1258 [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1259 #endif /* __ARM_RANGE_TLBI__ */
1260 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1261 [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1262 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1263 [PMAP_NOP_INDEX] = pmap_nop_internal,
1264
1265 #if DEVELOPMENT || DEBUG
1266 [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1267 #endif /* DEVELOPMENT || DEBUG */
1268 };
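/*
 * Conceptually, the PPL entry path selects a handler by indexing this table
 * with one of the PMAP_*_INDEX selectors and calling through the
 * (pointer-signed) entry, e.g. something along the lines of:
 *
 *   ppl_handler_table[PMAP_DESTROY_INDEX](pmap);
 *
 * This is an illustrative sketch of the dispatch idea only; the actual entry
 * sequence lives in the assembly trampolines.
 */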
1269 #endif
1270
1271 #if XNU_MONITOR
1272 /**
1273 * A convenience function for setting protections on a single physical
1274 * aperture or static region mapping without invalidating the TLB.
1275 *
1276 * @note This function does not perform any TLB invalidations. That must be done
1277 * separately to be able to safely use the updated mapping.
1278 *
1279 * @note This function understands the difference between the VM page size and
1280 * the kernel page size and will update multiple PTEs if the sizes differ.
1281 * In other words, enough PTEs will always get updated to change the
1282 * permissions on a PAGE_SIZE amount of memory.
1283 *
1284 * @note The PVH lock for the physical page represented by this mapping must
1285 * already be locked.
1286 *
1287 * @note This function assumes the caller has already verified that the PTE
1288 * pointer does indeed point to a physical aperture or static region page
1289 * table. Please validate your inputs before passing them along to this
1290 * function.
1291 *
1292 * @param ptep Pointer to the physical aperture or static region page table to
1293 * update with a new XPRR index.
1294 * @param expected_perm The XPRR index that is expected to already exist at the
1295 * current mapping. If the current index doesn't match this
1296 * then the system will panic.
1297 * @param new_perm The new XPRR index to update the mapping with.
1298 */
1299 MARK_AS_PMAP_TEXT static void
1300 pmap_set_pte_xprr_perm(
1301 pt_entry_t * const ptep,
1302 unsigned int expected_perm,
1303 unsigned int new_perm)
1304 {
1305 assert(ptep != NULL);
1306
1307 pt_entry_t spte = *ptep;
1308 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1309
1310 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1311 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1312 __func__, ptep, new_perm, expected_perm);
1313 }
1314
1315 /**
1316 * The PTE involved should be valid, should not have the hint bit set, and
1317 * should have the expected XPRR index.
1318 */
1319 if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1320 panic_plain("%s: physical aperture or static region PTE is invalid, "
1321 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1322 __func__, ptep, spte, new_perm, expected_perm);
1323 }
1324
1325 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1326 panic_plain("%s: physical aperture or static region PTE has hint bit "
1327 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1328 __func__, ptep, spte, new_perm, expected_perm);
1329 }
1330
1331 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1332 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1333 "ptep=%p, new_perm=%u, expected_perm=%u",
1334 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1335 }
1336
1337 pt_entry_t template = spte;
1338 template &= ~ARM_PTE_XPRR_MASK;
1339 template |= xprr_perm_to_pte(new_perm);
1340
1341 write_pte_strong(ptep, template);
1342 }
1343
1344 /**
1345 * Update the protections on a single physical aperture mapping and invalidate
1346 * the TLB so the mapping can be used.
1347 *
1348 * @note The PVH lock for the physical page must already be locked.
1349 *
1350 * @param pai The physical address index of the page whose physical aperture
1351 * mapping will be updated with new permissions.
1352 * @param expected_perm The XPRR index that is expected to already exist at the
1353 * current mapping. If the current index doesn't match this
1354 * then the system will panic.
1355 * @param new_perm The new XPRR index to update the mapping with.
1356 */
1357 MARK_AS_PMAP_TEXT void
1358 pmap_set_xprr_perm(
1359 unsigned int pai,
1360 unsigned int expected_perm,
1361 unsigned int new_perm)
1362 {
1363 pvh_assert_locked(pai);
1364
1365 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1366 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1367
1368 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1369
1370 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1371 sync_tlb_flush();
1372 }
1373
1374 /**
1375 * Update the protections on a range of physical aperture or static region
1376 * mappings and invalidate the TLB so the mappings can be used.
1377 *
1378 * @note Static region mappings can only be updated before machine_lockdown().
1379 * Physical aperture mappings can be updated at any time.
1380 *
1381 * @param start The starting virtual address of the static region or physical
1382 * aperture range whose permissions will be updated.
1383 * @param end The final (inclusive) virtual address of the static region or
1384 * physical aperture range whose permissions will be updated.
1385 * @param expected_perm The XPRR index that is expected to already exist at the
1386 * current mappings. If the current indices don't match
1387 * this then the system will panic.
1388 * @param new_perm The new XPRR index to update the mappings with.
1389 */
1390 MARK_AS_PMAP_TEXT static void
1391 pmap_set_range_xprr_perm(
1392 vm_address_t start,
1393 vm_address_t end,
1394 unsigned int expected_perm,
1395 unsigned int new_perm)
1396 {
1397 #if (__ARM_VMSA__ == 7)
1398 #error This function is not supported on older ARM hardware.
1399 #endif /* (__ARM_VMSA__ == 7) */
1400
1401 /**
1402 * Validate our arguments; any invalid argument will be grounds for a panic.
1403 */
1404 if (__improbable((start | end) & ARM_PGMASK)) {
1405 panic_plain("%s: start or end not page aligned, "
1406 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1407 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1408 }
1409
1410 if (__improbable(start > end)) {
1411 panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
1412 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1413 }
1414
1415 const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
1416 const bool in_static = (start >= gVirtBase) && (end < static_memory_end);
1417
1418 if (__improbable(!(in_physmap || in_static))) {
1419 panic_plain("%s: address not in static region or physical aperture, "
1420 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1421 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1422 }
1423
1424 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1425 panic_plain("%s: invalid XPRR index, "
1426 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1427 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1428 }
1429
1430 /*
1431 * Walk over the PTEs for the given range, and set the protections on those
1432 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
1433 * one twig entry (whichever twig entry currently maps "va").
1434 */
1435 vm_address_t va = start;
1436 while (va < end) {
1437 /**
1438 * Get the last VA that the twig entry for "va" maps. All of the leaf
1439 * PTEs from va to tte_va_end will have their permissions updated.
1440 */
1441 vm_address_t tte_va_end =
1442 (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);
1443
1444 if (tte_va_end > end) {
1445 tte_va_end = end;
1446 }
1447
1448 tt_entry_t *ttep = pmap_tte(kernel_pmap, va);
1449
1450 if (ttep == NULL) {
1451 panic_plain("%s: physical aperture or static region tte is NULL, "
1452 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1453 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1454 }
1455
1456 tt_entry_t tte = *ttep;
1457
1458 if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1459 panic_plain("%s: tte=0x%llx is not a table type entry, "
1460 "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
1461 tte, (void *)start, (void *)end, new_perm, expected_perm);
1462 }
1463
1464 /* Walk over the given L3 page table page and update the PTEs. */
1465 pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
1466 pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
1467 const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
1468 pt_entry_t * const end_ptep = begin_ptep + num_ptes;
1469
1470 /**
1471 * The current PTE pointer is incremented by the page ratio (ratio of
1472 * VM page size to kernel hardware page size) because one call to
1473 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
1474 * a PAGE_SIZE worth of hardware pages.
1475 */
1476 for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
1477 cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
1478 unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
1479 pvh_lock(pai);
1480 pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
1481 pvh_unlock(pai);
1482 }
1483
1484 va = tte_va_end;
1485 }
1486
1487 PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
1488 }
1489
1490 #endif /* XNU_MONITOR */
1491
1492 static inline void
1493 PMAP_ZINFO_PALLOC(
1494 pmap_t pmap, int bytes)
1495 {
1496 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1497 }
1498
1499 static inline void
1500 PMAP_ZINFO_PFREE(
1501 pmap_t pmap,
1502 int bytes)
1503 {
1504 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1505 }
1506
1507 void
1508 pmap_tt_ledger_credit(
1509 pmap_t pmap,
1510 vm_size_t size)
1511 {
1512 if (pmap != kernel_pmap) {
1513 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1514 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1515 }
1516 }
1517
1518 void
1519 pmap_tt_ledger_debit(
1520 pmap_t pmap,
1521 vm_size_t size)
1522 {
1523 if (pmap != kernel_pmap) {
1524 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1525 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1526 }
1527 }
1528
1529 static inline void
1530 pmap_update_plru(uint16_t asid_index)
1531 {
1532 if (__probable(pmap_asid_plru)) {
1533 unsigned plru_index = asid_index >> 6;
1534 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1535 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1536 asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1537 }
1538 }
1539 }
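/*
 * Illustrative example of the pseudo-LRU bookkeeping above: for
 * asid_index == 70 the word index is 70 >> 6 == 1 and the call clears bit
 * (70 & 63) == 6 in asid_plru_bitmap[1], marking that ASID as recently used.
 * Once every bit in the word has been cleared, the word is stamped with a
 * fresh generation count and refilled, so allocation (see alloc_asid below)
 * prefers the word whose ASIDs have gone longest without being reused.
 */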
1540
1541 static bool
1542 alloc_asid(pmap_t pmap)
1543 {
1544 int vasid = -1;
1545 uint16_t hw_asid;
1546
1547 pmap_simple_lock(&asid_lock);
1548
1549 if (__probable(pmap_asid_plru)) {
1550 unsigned plru_index = 0;
1551 uint64_t lowest_gen = asid_plru_generation[0];
1552 uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1553 for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1554 if (asid_plru_generation[i] < lowest_gen) {
1555 plru_index = i;
1556 lowest_gen = asid_plru_generation[i];
1557 lowest_gen_bitmap = asid_plru_bitmap[i];
1558 }
1559 }
1560
1561 for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
1562 uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1563 if (temp_plru) {
1564 vasid = (plru_index << 6) + lsb_first(temp_plru);
1565 #if DEVELOPMENT || DEBUG
1566 ++pmap_asid_hits;
1567 #endif
1568 break;
1569 }
1570 }
1571 }
1572 if (__improbable(vasid < 0)) {
1573 // bitmap_first() returns highest-order bits first, but a 0-based scheme works
1574 // slightly better with the collision detection scheme used by pmap_switch_internal().
1575 vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1576 #if DEVELOPMENT || DEBUG
1577 ++pmap_asid_misses;
1578 #endif
1579 }
1580 if (__improbable(vasid < 0)) {
1581 pmap_simple_unlock(&asid_lock);
1582 return false;
1583 }
1584 assert((uint32_t)vasid < pmap_max_asids);
1585 assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1586 bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1587 pmap_simple_unlock(&asid_lock);
1588 hw_asid = (uint16_t)(vasid % asid_chunk_size);
1589 pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
1590 if (__improbable(hw_asid == MAX_HW_ASIDS)) {
1591 /* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
1592 * reassign to a reserved VASID. */
1593 assert(pmap->sw_asid < UINT8_MAX);
1594 pmap->sw_asid = UINT8_MAX;
1595 /* Allocate from the high end of the hardware ASID range to reduce the likelihood of
1596 * aliasing with vital system processes, which are likely to have lower ASIDs. */
1597 hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
1598 assert(hw_asid < MAX_HW_ASIDS);
1599 }
1600 pmap_update_plru(hw_asid);
1601 hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
1602 #if __ARM_KERNEL_PROTECT__
1603 hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
1604 #endif
1605 pmap->hw_asid = hw_asid;
1606 return true;
1607 }
1608
1609 static void
1610 free_asid(pmap_t pmap)
1611 {
1612 unsigned int vasid;
1613 uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
1614 if (__improbable(hw_asid == 0)) {
1615 return;
1616 }
1617
1618 #if __ARM_KERNEL_PROTECT__
1619 hw_asid >>= 1;
1620 #endif
1621 hw_asid -= 1;
1622
1623 if (__improbable(pmap->sw_asid == UINT8_MAX)) {
1624 vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
1625 } else {
1626 vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
1627 }
1628
1629 if (__probable(pmap_asid_plru)) {
1630 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1631 }
1632 pmap_simple_lock(&asid_lock);
1633 assert(!bitmap_test(&asid_bitmap[0], vasid));
1634 bitmap_set(&asid_bitmap[0], vasid);
1635 pmap_simple_unlock(&asid_lock);
1636 }
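/*
 * A brief worked example of the virtual-ASID encoding used by alloc_asid() and
 * free_asid() above. The concrete value of MAX_HW_ASIDS is configuration
 * dependent; 255 is assumed here purely for illustration, and the
 * __ARM_KERNEL_PROTECT__ doubling is ignored:
 *
 *     asid_chunk_size = MAX_HW_ASIDS + 1 = 256        // PLRU enabled
 *     vasid    = 600
 *     hw_asid  = 600 % 256 = 88
 *     sw_asid  = 600 / 256 = 2
 *     pmap->hw_asid = 88 + 1 = 89                      // ASID 0 stays reserved for the kernel
 *
 * free_asid() reverses the encoding: 89 - 1 = 88, then
 * vasid = sw_asid * 256 + 88 = 600, so the bit that alloc_asid() cleared in
 * asid_bitmap is set again.
 */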
1637
1638
1639 boolean_t
1640 pmap_valid_address(
1641 pmap_paddr_t addr)
1642 {
1643 return pa_valid(addr);
1644 }
1645
1646
1647
1648
1649
1650
1651 /*
1652 * Map memory at initialization. The physical addresses being
1653 * mapped are not managed and are never unmapped.
1654 *
1655 * For now, VM is already on, we only need to map the
1656 * specified memory.
1657 */
1658 vm_map_address_t
1659 pmap_map(
1660 vm_map_address_t virt,
1661 vm_offset_t start,
1662 vm_offset_t end,
1663 vm_prot_t prot,
1664 unsigned int flags)
1665 {
1666 kern_return_t kr;
1667 vm_size_t ps;
1668
1669 ps = PAGE_SIZE;
1670 while (start < end) {
1671 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1672 prot, VM_PROT_NONE, flags, FALSE);
1673
1674 if (kr != KERN_SUCCESS) {
1675 panic("%s: failed pmap_enter, "
1676 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1677 __FUNCTION__,
1678 (void *) virt, (void *) start, (void *) end, prot, flags);
1679 }
1680
1681 virt += ps;
1682 start += ps;
1683 }
1684 return virt;
1685 }
1686
1687 vm_map_address_t
1688 pmap_map_bd_with_options(
1689 vm_map_address_t virt,
1690 vm_offset_t start,
1691 vm_offset_t end,
1692 vm_prot_t prot,
1693 int32_t options)
1694 {
1695 pt_entry_t tmplate;
1696 pt_entry_t *ptep;
1697 vm_map_address_t vaddr;
1698 vm_offset_t paddr;
1699 pt_entry_t mem_attr;
1700
1701 switch (options & PMAP_MAP_BD_MASK) {
1702 case PMAP_MAP_BD_WCOMB:
1703 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1704 #if (__ARM_VMSA__ > 7)
1705 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1706 #else
1707 mem_attr |= ARM_PTE_SH;
1708 #endif
1709 break;
1710 case PMAP_MAP_BD_POSTED:
1711 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1712 break;
1713 case PMAP_MAP_BD_POSTED_REORDERED:
1714 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1715 break;
1716 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1717 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1718 break;
1719 default:
1720 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1721 break;
1722 }
1723
1724 tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1725 mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1726 #if __ARM_KERNEL_PROTECT__
1727 tmplate |= ARM_PTE_NG;
1728 #endif /* __ARM_KERNEL_PROTECT__ */
1729
1730 vaddr = virt;
1731 paddr = start;
1732 while (paddr < end) {
1733 ptep = pmap_pte(kernel_pmap, vaddr);
1734 if (ptep == PT_ENTRY_NULL) {
1735 panic("%s: no PTE for vaddr=%p, "
1736 "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
1737 __FUNCTION__, (void*)vaddr,
1738 (void*)virt, (void*)start, (void*)end, prot, options);
1739 }
1740
1741 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1742 write_pte_strong(ptep, tmplate);
1743
1744 pte_increment_pa(tmplate);
1745 vaddr += PAGE_SIZE;
1746 paddr += PAGE_SIZE;
1747 }
1748
1749 if (end >= start) {
1750 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1751 }
1752
1753 return vaddr;
1754 }
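/*
 * A minimal usage sketch for pmap_map_bd_with_options(). The physical range
 * and virtual base below are hypothetical; the routine simply stamps one PTE
 * per page and returns the next unused virtual address:
 *
 *     vm_map_address_t va = some_reserved_kva;   // hypothetical, page aligned
 *     vm_offset_t      pa = 0x80000000;          // hypothetical device base
 *
 *     va = pmap_map_bd_with_options(va, pa, pa + PAGE_SIZE,
 *         VM_PROT_READ | VM_PROT_WRITE, PMAP_MAP_BD_WCOMB);
 */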
1755
1756 /*
1757 * Back-door routine for mapping kernel VM at initialization.
1758 * Useful for mapping memory outside the range
1759 * [vm_first_phys, vm_last_phys] (i.e., devices).
1760 * Otherwise like pmap_map.
1761 */
1762 vm_map_address_t
1763 pmap_map_bd(
1764 vm_map_address_t virt,
1765 vm_offset_t start,
1766 vm_offset_t end,
1767 vm_prot_t prot)
1768 {
1769 pt_entry_t tmplate;
1770 pt_entry_t *ptep;
1771 vm_map_address_t vaddr;
1772 vm_offset_t paddr;
1773
1774 /* not cacheable and not buffered */
1775 tmplate = pa_to_pte(start)
1776 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1777 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1778 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1779 #if __ARM_KERNEL_PROTECT__
1780 tmplate |= ARM_PTE_NG;
1781 #endif /* __ARM_KERNEL_PROTECT__ */
1782
1783 vaddr = virt;
1784 paddr = start;
1785 while (paddr < end) {
1786 ptep = pmap_pte(kernel_pmap, vaddr);
1787 if (ptep == PT_ENTRY_NULL) {
1788 panic("pmap_map_bd");
1789 }
1790 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1791 write_pte_strong(ptep, tmplate);
1792
1793 pte_increment_pa(tmplate);
1794 vaddr += PAGE_SIZE;
1795 paddr += PAGE_SIZE;
1796 }
1797
1798 if (end >= start) {
1799 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1800 }
1801
1802 return vaddr;
1803 }
1804
1805 /*
1806 * Back-door routine for mapping kernel VM at initialization.
1807 * Useful for mapping memory specific physical addresses in early
1808 * boot (i.e., before kernel_map is initialized).
1809 *
1810 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1811 */
1812
1813 vm_map_address_t
1814 pmap_map_high_window_bd(
1815 vm_offset_t pa_start,
1816 vm_size_t len,
1817 vm_prot_t prot)
1818 {
1819 pt_entry_t *ptep, pte;
1820 #if (__ARM_VMSA__ == 7)
1821 vm_map_address_t va_start = VM_HIGH_KERNEL_WINDOW;
1822 vm_map_address_t va_max = VM_MAX_KERNEL_ADDRESS;
1823 #else
1824 vm_map_address_t va_start = VREGION1_START;
1825 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1826 #endif
1827 vm_map_address_t va_end;
1828 vm_map_address_t va;
1829 vm_size_t offset;
1830
1831 offset = pa_start & PAGE_MASK;
1832 pa_start -= offset;
1833 len += offset;
1834
1835 if (len > (va_max - va_start)) {
1836 panic("%s: area too large, "
1837 "pa_start=%p, len=%p, prot=0x%x",
1838 __FUNCTION__,
1839 (void*)pa_start, (void*)len, prot);
1840 }
1841
1842 scan:
1843 for (; va_start < va_max; va_start += PAGE_SIZE) {
1844 ptep = pmap_pte(kernel_pmap, va_start);
1845 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1846 if (*ptep == ARM_PTE_TYPE_FAULT) {
1847 break;
1848 }
1849 }
1850 if (va_start > va_max) {
1851 panic("%s: insufficient pages, "
1852 "pa_start=%p, len=%p, prot=0x%x",
1853 __FUNCTION__,
1854 (void*)pa_start, (void*)len, prot);
1855 }
1856
1857 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1858 ptep = pmap_pte(kernel_pmap, va_end);
1859 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1860 if (*ptep != ARM_PTE_TYPE_FAULT) {
1861 va_start = va_end + PAGE_SIZE;
1862 goto scan;
1863 }
1864 }
1865
1866 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1867 ptep = pmap_pte(kernel_pmap, va);
1868 pte = pa_to_pte(pa_start)
1869 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1870 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1871 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1872 #if (__ARM_VMSA__ > 7)
1873 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1874 #else
1875 pte |= ARM_PTE_SH;
1876 #endif
1877 #if __ARM_KERNEL_PROTECT__
1878 pte |= ARM_PTE_NG;
1879 #endif /* __ARM_KERNEL_PROTECT__ */
1880 write_pte_strong(ptep, pte);
1881 }
1882 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1883 #if KASAN
1884 kasan_notify_address(va_start, len);
1885 #endif
1886 return va_start;
1887 }
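/*
 * The loops above implement a simple first-fit search: scan the high window
 * for an unmapped (ARM_PTE_TYPE_FAULT) page, verify that the pages needed to
 * cover `len` are also unmapped, and otherwise restart the scan past the
 * conflict. For example, a 3-page request that finds {free, free, mapped}
 * resumes scanning at the page after the mapped one.
 */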
1888
1889 static uint32_t
1890 pmap_compute_max_asids(void)
1891 {
1892 DTEntry entry;
1893 void const *prop = NULL;
1894 uint32_t max_asids;
1895 int err;
1896 unsigned int prop_size;
1897
1898 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1899 assert(err == kSuccess);
1900
1901 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
1902 /* TODO: consider allowing maxproc limits to be scaled earlier so that
1903 * we can choose a more flexible default value here. */
1904 return MAX_ASIDS;
1905 }
1906
1907 if (prop_size != sizeof(max_asids)) {
1908 panic("pmap-max-asids property is not a 32-bit integer");
1909 }
1910
1911 max_asids = *((uint32_t const *)prop);
1912 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
1913 max_asids = (max_asids + 63) & ~63UL;
1914
1915 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
1916 /* currently capped by size of pmap->sw_asid */
1917 panic("pmap-max-asids too large");
1918 }
1919 if (max_asids == 0) {
1920 panic("pmap-max-asids cannot be zero");
1921 }
1922 return max_asids;
1923 }
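/*
 * For illustration only: if the device tree supplied pmap-max-asids = 1000,
 * the rounding above yields (1000 + 63) & ~63 = 1024, keeping the virtual
 * ASID table a whole number of 64-bit bitmap words for the pseudo-LRU
 * allocator.
 */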
1924
1925 #if __arm64__
1926 /*
1927 * pmap_get_arm64_prot
1928 *
1929 * return effective armv8 VMSA block protections including
1930 * table AP/PXN/XN overrides of a pmap entry
1931 *
1932 */
1933
1934 uint64_t
1935 pmap_get_arm64_prot(
1936 pmap_t pmap,
1937 vm_offset_t addr)
1938 {
1939 tt_entry_t tte = 0;
1940 unsigned int level = 0;
1941 uint64_t tte_type = 0;
1942 uint64_t effective_prot_bits = 0;
1943 uint64_t aggregate_tte = 0;
1944 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1945 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1946
1947 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1948 tte = *pmap_ttne(pmap, level, addr);
1949
1950 if (!(tte & ARM_TTE_VALID)) {
1951 return 0;
1952 }
1953
1954 tte_type = tte & ARM_TTE_TYPE_MASK;
1955
1956 if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
1957 (level == pt_attr->pta_max_level)) {
1958 /* Block or page mapping; both have the same protection bit layout. */
1959 break;
1960 } else if (tte_type == ARM_TTE_TYPE_TABLE) {
1961 /* All of the table bits we care about are overrides, so just OR them together. */
1962 aggregate_tte |= tte;
1963 }
1964 }
1965
1966 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1967 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1968 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1969
1970 /* Start with the PTE bits. */
1971 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1972
1973 /* Table AP bits mask out block/page AP bits */
1974 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1975
1976 /* XN/PXN bits can be OR'd in. */
1977 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1978 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1979
1980 return effective_prot_bits;
1981 }
1982 #endif /* __arm64__ */
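/*
 * Worked example for pmap_get_arm64_prot(), using hypothetical values: if an
 * intermediate table descriptor on the walk has its XNTable bit set while the
 * leaf PTE itself is executable, the aggregation above ORs ARM_PTE_NX into the
 * result, so the returned protections reflect the effective (non-executable)
 * mapping rather than the leaf entry alone. APTable bits are handled the same
 * way, by masking the corresponding AP bits out of the leaf value.
 */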
1983
1984 static void
1985 pmap_set_srd_fusing()
1986 {
1987 DTEntry entry;
1988 uint32_t const *prop = NULL;
1989 int err;
1990 unsigned int prop_size = 0;
1991
1992 err = SecureDTLookupEntry(NULL, "/chosen", &entry);
1993 if (err != kSuccess) {
1994 panic("PMAP: no chosen DT node");
1995 }
1996
1997 if (kSuccess == SecureDTGetProperty(entry, "research-enabled", (const void**)&prop, &prop_size)) {
1998 if (prop_size == sizeof(uint32_t)) {
1999 srd_fused = *prop;
2000 }
2001 }
2002
2003 #if DEVELOPMENT || DEBUG
2004 PE_parse_boot_argn("srd_fusing", &srd_fused, sizeof(srd_fused));
2005 #endif
2006 }
2007
2008 /*
2009 * Bootstrap the system enough to run with virtual memory.
2010 *
2011 * The early VM initialization code has already allocated
2012 * the first CPU's translation table and made entries for
2013 * all the one-to-one mappings to be found there.
2014 *
2015 * We must set up the kernel pmap structures, the
2016 * physical-to-virtual translation lookup tables for the
2017 * physical memory to be managed (between avail_start and
2018 * avail_end).
2019 *
2020 * Map the kernel's code and data, and allocate the system page table.
2021 * Page_size must already be set.
2022 *
2023 * Parameters:
2024 * first_avail first available physical page -
2025 * after kernel page tables
2026 * avail_start PA of first managed physical page
2027 * avail_end PA of last managed physical page
2028 */
2029
2030 void
2031 pmap_bootstrap(
2032 vm_offset_t vstart)
2033 {
2034 vm_map_offset_t maxoffset;
2035
2036 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2037
2038 pmap_set_srd_fusing();
2039
2040 #if XNU_MONITOR
2041
2042 #if DEVELOPMENT || DEBUG
2043 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2044 #endif
2045
2046 #if CONFIG_CSR_FROM_DT
2047 if (csr_unsafe_kernel_text) {
2048 pmap_ppl_disable = true;
2049 }
2050 #endif /* CONFIG_CSR_FROM_DT */
2051
2052 #endif /* XNU_MONITOR */
2053
2054 #if DEVELOPMENT || DEBUG
2055 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2056 kprintf("Kernel traces for pmap operations enabled\n");
2057 }
2058 #endif
2059
2060 /*
2061 * Initialize the kernel pmap.
2062 */
2063 pmap_stamp = 1;
2064 #if ARM_PARAMETERIZED_PMAP
2065 kernel_pmap->pmap_pt_attr = native_pt_attr;
2066 #endif /* ARM_PARAMETERIZED_PMAP */
2067 #if HAS_APPLE_PAC
2068 kernel_pmap->disable_jop = 0;
2069 #endif /* HAS_APPLE_PAC */
2070 kernel_pmap->tte = cpu_tte;
2071 kernel_pmap->ttep = cpu_ttep;
2072 #if (__ARM_VMSA__ > 7)
2073 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2074 #else
2075 kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
2076 #endif
2077 kernel_pmap->max = UINTPTR_MAX;
2078 os_atomic_init(&kernel_pmap->ref_count, 1);
2079 #if XNU_MONITOR
2080 os_atomic_init(&kernel_pmap->nested_count, 0);
2081 #endif
2082 kernel_pmap->gc_status = 0;
2083 kernel_pmap->nx_enabled = TRUE;
2084 #ifdef __arm64__
2085 kernel_pmap->is_64bit = TRUE;
2086 #else
2087 kernel_pmap->is_64bit = FALSE;
2088 #endif
2089 kernel_pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2090
2091 #if ARM_PARAMETERIZED_PMAP
2092 kernel_pmap->pmap_pt_attr = native_pt_attr;
2093 #endif /* ARM_PARAMETERIZED_PMAP */
2094
2095 kernel_pmap->nested_region_addr = 0x0ULL;
2096 kernel_pmap->nested_region_size = 0x0ULL;
2097 kernel_pmap->nested_region_asid_bitmap = NULL;
2098 kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2099 kernel_pmap->type = PMAP_TYPE_KERNEL;
2100
2101 #if (__ARM_VMSA__ == 7)
2102 kernel_pmap->tte_index_max = 4 * (ARM_PGBYTES / sizeof(tt_entry_t));
2103 #endif
2104 kernel_pmap->hw_asid = 0;
2105 kernel_pmap->sw_asid = 0;
2106
2107 pmap_lock_init(kernel_pmap);
2108
2109 pmap_max_asids = pmap_compute_max_asids();
2110 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2111 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2112 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2113 * masking used by the PLRU scheme. This means we must handle the case in which
2114 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2115 _Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
2116 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2117 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2118
2119 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2120
2121 /**
2122 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2123 * pp_attr_table, etc). This function will use `avail_start` to allocate
2124 * space for these data structures.
2125 * */
2126 pmap_data_bootstrap();
2127
2128 /**
2129 * Don't make any assumptions about the alignment of avail_start before this
2130 * point (i.e., pmap_data_bootstrap() performs allocations).
2131 */
2132 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2133
2134 const pmap_paddr_t pmap_struct_start = avail_start;
2135
2136 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2137 avail_start = round_page(avail_start + asid_table_size);
2138
2139 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2140
2141 vm_first_phys = gPhysBase;
2142 vm_last_phys = trunc_page(avail_end);
2143
2144 queue_init(&map_pmap_list);
2145 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2146 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2147 free_page_size_tt_count = 0;
2148 free_page_size_tt_max = 0;
2149 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2150 free_two_page_size_tt_count = 0;
2151 free_two_page_size_tt_max = 0;
2152 free_tt_list = TT_FREE_ENTRY_NULL;
2153 free_tt_count = 0;
2154 free_tt_max = 0;
2155
2156 virtual_space_start = vstart;
2157 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2158
2159 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2160 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2161 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2162 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2163
2164
2165
2166 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2167 maxoffset = trunc_page(maxoffset);
2168 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2169 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2170 arm_pmap_max_offset_default = maxoffset;
2171 }
2172 }
2173 #if defined(__arm64__)
2174 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2175 maxoffset = trunc_page(maxoffset);
2176 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2177 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2178 arm64_pmap_max_offset_default = maxoffset;
2179 }
2180 }
2181 #endif
2182
2183 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2184
2185
2186 #if MACH_ASSERT
2187 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2188 &vm_footprint_suspend_allowed,
2189 sizeof(vm_footprint_suspend_allowed));
2190 #endif /* MACH_ASSERT */
2191
2192 #if KASAN
2193 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2194 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2195 #endif /* KASAN */
2196
2197 /**
2198 * Ensure that avail_start is always left on a page boundary. The calling
2199 * code might not perform any alignment before allocating page tables so
2200 * this is important.
2201 */
2202 avail_start = round_page(avail_start);
2203 }
2204
2205 #if XNU_MONITOR
2206
2207 static inline void
2208 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2209 {
2210 pmap_paddr_t cur_pa;
2211 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2212 assert(pa_valid(cur_pa));
2213 ppattr_pa_set_monitor(cur_pa);
2214 }
2215 }
2216
2217 void
2218 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2219 pmap_paddr_t end_pa,
2220 unsigned int expected_perm,
2221 unsigned int new_perm)
2222 {
2223 vm_offset_t start_va = phystokv(start_pa);
2224 vm_offset_t end_va = start_va + (end_pa - start_pa);
2225
2226 pa_set_range_monitor(start_pa, end_pa);
2227 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2228 }
2229
2230 static void
2231 pmap_lockdown_kc(void)
2232 {
2233 extern vm_offset_t vm_kernelcache_base;
2234 extern vm_offset_t vm_kernelcache_top;
2235 pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
2236 pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
2237 pmap_paddr_t cur_pa = start_pa;
2238 vm_offset_t cur_va = vm_kernelcache_base;
2239 while (cur_pa < end_pa) {
2240 vm_size_t range_size = end_pa - cur_pa;
2241 vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
2242 if (ptov_va != cur_va) {
2243 /*
2244 * If the physical address maps back to a virtual address that is non-linear
2245 * w.r.t. the kernelcache, that means it corresponds to memory that will be
2246 * reclaimed by the OS and should therefore not be locked down.
2247 */
2248 cur_pa += range_size;
2249 cur_va += range_size;
2250 continue;
2251 }
2252 unsigned int pai = pa_index(cur_pa);
2253 pv_entry_t **pv_h = pai_to_pvh(pai);
2254
2255 vm_offset_t pvh_flags = pvh_get_flags(pv_h);
2256
2257 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2258 panic("pai %d already locked down", pai);
2259 }
2260 pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
2261 cur_pa += ARM_PGBYTES;
2262 cur_va += ARM_PGBYTES;
2263 }
2264 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
2265 extern uint64_t ctrr_ro_test;
2266 extern uint64_t ctrr_nx_test;
2267 pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
2268 for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
2269 pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
2270 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
2271 }
2272 #endif
2273 }
2274
2275 void
2276 pmap_static_allocations_done(void)
2277 {
2278 pmap_paddr_t monitor_start_pa;
2279 pmap_paddr_t monitor_end_pa;
2280
2281 /*
2282 * Protect the bootstrap (V=P and V->P) page tables.
2283 *
2284 * These bootstrap allocations will be used primarily for page tables.
2285 * If we wish to secure the page tables, we need to start by marking
2286 * these bootstrap allocations as pages that we want to protect.
2287 */
2288 monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2289 monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2290
2291 /* The bootstrap page tables are mapped RW at bootstrap. */
2292 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2293
2294 /*
2295 * We use avail_start as a pointer to the first address that has not
2296 * been reserved for bootstrap, so we know which pages to give to the
2297 * virtual memory layer.
2298 */
2299 monitor_start_pa = BootArgs->topOfKernelData;
2300 monitor_end_pa = avail_start;
2301
2302 /* The other bootstrap allocations are mapped RW at bootstrap. */
2303 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2304
2305 /*
2306 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2307 * to RO in arm_vm_prot_finalize(), which is called after this function.
2308 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2309 * they can't be allocated for other uses. We don't need a special xPRR
2310 * protection index, as there is no PPL_RO index, and these pages are ultimately
2311 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
2312 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2313 * lookup table index to USER_XO before APRR is applied, leading the hardware
2314 * to believe we are dealing with a user XO page upon performing a translation.
2315 */
2316 monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2317 monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2318 pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2319
2320 monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2321 monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2322
2323 /* PPL data is RW for the PPL, RO for the kernel. */
2324 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2325
2326 monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2327 monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2328
2329 /* PPL text is RX for the PPL, RO for the kernel. */
2330 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2331
2332
2333 /*
2334 * To support DTrace, the save areas for the PPL must be
2335 * writable, since DTrace will try to update register state.
2337 */
2338 if (pmap_ppl_disable) {
2339 vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2340 vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2341
2342 pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2343 }
2344
2345
2346 if (segSizePPLDATACONST > 0) {
2347 monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2348 monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2349
2350 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2351 }
2352
2353 /*
2354 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2355 * precaution. The real RW mappings are at a different location with guard pages.
2356 */
2357 pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2358
2359 /* Prevent remapping of the kernelcache */
2360 pmap_lockdown_kc();
2361 }
2362
2363 void
2364 pmap_lockdown_ppl(void)
2365 {
2366 /* Mark the PPL as being locked down. */
2367
2368 #error "XPRR configuration error"
2369 }
2370 #endif /* XNU_MONITOR */
2371
2372 void
2373 pmap_virtual_space(
2374 vm_offset_t *startp,
2375 vm_offset_t *endp
2376 )
2377 {
2378 *startp = virtual_space_start;
2379 *endp = virtual_space_end;
2380 }
2381
2382
2383 boolean_t
2384 pmap_virtual_region(
2385 unsigned int region_select,
2386 vm_map_offset_t *startp,
2387 vm_map_size_t *size
2388 )
2389 {
2390 boolean_t ret = FALSE;
2391 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2392 if (region_select == 0) {
2393 /*
2394 * In this config, the bootstrap mappings should occupy their own L2
2395 * TTs, as they should be immutable after boot. Having the associated
2396 * TTEs and PTEs in their own pages allows us to lock down those pages,
2397 * while allowing the rest of the kernel address range to be remapped.
2398 */
2399 #if (__ARM_VMSA__ > 7)
2400 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2401 #else
2402 #error Unsupported configuration
2403 #endif
2404 #if defined(ARM_LARGE_MEMORY)
2405 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2406 #else
2407 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2408 #endif
2409 ret = TRUE;
2410 }
2411
2412 #if defined(ARM_LARGE_MEMORY)
2413 if (region_select == 1) {
2414 *startp = VREGION1_START;
2415 *size = VREGION1_SIZE;
2416 ret = TRUE;
2417 }
2418 #endif
2419 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2420 #if defined(ARM_LARGE_MEMORY)
2421 /* For large memory systems with no KTRR/CTRR such as virtual machines */
2422 #if (__ARM_VMSA__ > 7)
2423 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2424 #else
2425 #error Unsupported configuration
2426 #endif
2427 if (region_select == 0) {
2428 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2429 ret = TRUE;
2430 }
2431 #else /* !defined(ARM_LARGE_MEMORY) */
2432 #if (__ARM_VMSA__ > 7)
2433 unsigned long low_global_vr_mask = 0;
2434 vm_map_size_t low_global_vr_size = 0;
2435 #endif
2436
2437 if (region_select == 0) {
2438 #if (__ARM_VMSA__ == 7)
2439 *startp = gVirtBase & 0xFFC00000;
2440 *size = ((virtual_space_start - (gVirtBase & 0xFFC00000)) + ~0xFFC00000) & 0xFFC00000;
2441 #else
2442 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2443 if (!TEST_PAGE_SIZE_4K) {
2444 *startp = gVirtBase & 0xFFFFFFFFFE000000;
2445 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2446 } else {
2447 *startp = gVirtBase & 0xFFFFFFFFFF800000;
2448 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2449 }
2450 #endif
2451 ret = TRUE;
2452 }
2453 if (region_select == 1) {
2454 *startp = VREGION1_START;
2455 *size = VREGION1_SIZE;
2456 ret = TRUE;
2457 }
2458 #if (__ARM_VMSA__ > 7)
2459 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2460 if (!TEST_PAGE_SIZE_4K) {
2461 low_global_vr_mask = 0xFFFFFFFFFE000000;
2462 low_global_vr_size = 0x2000000;
2463 } else {
2464 low_global_vr_mask = 0xFFFFFFFFFF800000;
2465 low_global_vr_size = 0x800000;
2466 }
2467
2468 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2469 *startp = LOW_GLOBAL_BASE_ADDRESS;
2470 *size = low_global_vr_size;
2471 ret = TRUE;
2472 }
2473
2474 if (region_select == 3) {
2475 /* In this config, we allow the bootstrap mappings to occupy the same
2476 * page table pages as the heap.
2477 */
2478 *startp = VM_MIN_KERNEL_ADDRESS;
2479 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2480 ret = TRUE;
2481 }
2482 #endif
2483 #endif /* defined(ARM_LARGE_MEMORY) */
2484 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2485 return ret;
2486 }
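/*
 * A hedged sketch of how a caller might consume pmap_virtual_region(): probe
 * increasing region_select values until the routine returns FALSE. The
 * variable names here are illustrative only.
 *
 *     vm_map_offset_t region_start;
 *     vm_map_size_t   region_size;
 *     for (unsigned int rgn = 0;
 *         pmap_virtual_region(rgn, &region_start, &region_size); rgn++) {
 *         // hand [region_start, region_start + region_size) to the VM layer
 *     }
 */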
2487
2488 /*
2489 * Routines to track and allocate physical pages during early boot.
2490 * On most systems that memory runs from first_avail through to avail_end
2491 * with no gaps.
2492 *
2493 * However if the system supports ECC and bad_ram_pages_count > 0, we
2494 * need to be careful and skip those pages.
2495 */
2496 static unsigned int avail_page_count = 0;
2497 static bool need_ram_ranges_init = true;
2498
2499 #if defined(__arm64__)
2500 pmap_paddr_t *bad_ram_pages = NULL;
2501 unsigned int bad_ram_pages_count = 0;
2502
2503 /*
2504 * We use this sub-range of bad_ram_pages for pmap_next_page()
2505 */
2506 static pmap_paddr_t *skip_pages;
2507 static unsigned int skip_pages_count = 0;
2508
2509 #define MAX_BAD_RAM_PAGE_COUNT 64
2510 static pmap_paddr_t bad_ram_pages_arr[MAX_BAD_RAM_PAGE_COUNT];
2511
2512 /*
2513 * XXX - temporary code to get the bad pages array from boot-args.
2514 * expects a comma separated list of offsets from the start
2515 * of physical memory to be considered bad.
2516 *
2517 * HERE JOE -- will eventually be replaced by data provided by iboot
2518 */
2519 static void
2520 parse_bad_ram_pages_boot_arg(void)
2521 {
2522 char buf[256] = {0};
2523 char *s = buf;
2524 char *end;
2525 int count = 0;
2526 pmap_paddr_t num;
2527 extern uint64_t strtouq(const char *, char **, int);
2528
2529 if (!PE_parse_boot_arg_str("bad_ram_pages", buf, sizeof(buf))) {
2530 goto done;
2531 }
2532
2533 while (*s && count < MAX_BAD_RAM_PAGE_COUNT) {
2534 num = (pmap_paddr_t)strtouq(s, &end, 0);
2535 if (num == 0) {
2536 break;
2537 }
2538 num &= ~PAGE_MASK;
2539
2540 bad_ram_pages_arr[count++] = gDramBase + num;
2541
2542 if (*end != ',') {
2543 break;
2544 }
2545
2546 s = end + 1;
2547 }
2548
2549 done:
2550 bad_ram_pages = bad_ram_pages_arr;
2551 bad_ram_pages_count = count;
2552 }
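/*
 * Example (hypothetical offsets): booting with
 *
 *     bad_ram_pages=0x4000,0x200000
 *
 * records two pages, gDramBase + 0x4000 and gDramBase + 0x200000, in
 * bad_ram_pages_arr. Each offset is truncated to a page boundary, and parsing
 * stops at the first entry that is zero or not followed by a comma.
 */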
2553
2554 /*
2555 * Comparison routine for qsort of array of physical addresses.
2556 */
2557 static int
2558 pmap_paddr_cmp(void *a, void *b)
2559 {
2560 pmap_paddr_t *x = a;
2561 pmap_paddr_t *y = b;
2562 if (*x < *y) {
2563 return -1;
2564 }
2565 return *x > *y;
2566 }
2567 #endif /* defined(__arm64__) */
2568
2569 /*
2570 * Look up ppn in the sorted bad_ram_pages array.
2571 */
2572 bool
2573 pmap_is_bad_ram(__unused ppnum_t ppn)
2574 {
2575 #if defined(__arm64__)
2576 pmap_paddr_t pa = ptoa(ppn);
2577 int low = 0;
2578 int high = bad_ram_pages_count - 1;
2579 int mid;
2580
2581 while (low <= high) {
2582 mid = (low + high) / 2;
2583 if (bad_ram_pages[mid] < pa) {
2584 low = mid + 1;
2585 } else if (bad_ram_pages[mid] > pa) {
2586 high = mid - 1;
2587 } else {
2588 return true;
2589 }
2590 }
2591 #endif /* defined(__arm64__) */
2592 return false;
2593 }
2594
2595 /*
2596 * Initialize the count of available pages. If we have bad_ram_pages, then sort the list of them.
2597 * No lock needed here, as this code is called while kernel boot up is single threaded.
2598 */
2599 static void
2600 initialize_ram_ranges(void)
2601 {
2602 pmap_paddr_t first = first_avail;
2603 pmap_paddr_t end = avail_end;
2604
2605 assert(first <= end);
2606 assert(first == (first & ~PAGE_MASK));
2607 assert(end == (end & ~PAGE_MASK));
2608 avail_page_count = atop(end - first);
2609
2610 #if defined(__arm64__)
2611 /*
2612 * XXX Temporary code for testing, until there is iboot support
2613 *
2614 * Parse a list of known bad pages from a boot-args.
2615 */
2616 parse_bad_ram_pages_boot_arg();
2617
2618 /*
2619 * Sort and filter the bad pages list and adjust avail_page_count.
2620 */
2621 if (bad_ram_pages_count != 0) {
2622 qsort(bad_ram_pages, bad_ram_pages_count, sizeof(*bad_ram_pages), (cmpfunc_t)pmap_paddr_cmp);
2623 skip_pages = bad_ram_pages;
2624 skip_pages_count = bad_ram_pages_count;
2625
2626 /* ignore any pages before first */
2627 while (skip_pages_count > 0 && skip_pages[0] < first) {
2628 --skip_pages_count;
2629 ++skip_pages;
2630 }
2631
2632 /* ignore any pages at or after end */
2633 while (skip_pages_count > 0 && skip_pages[skip_pages_count - 1] >= end) {
2634 --skip_pages_count;
2635 }
2636
2637 avail_page_count -= skip_pages_count;
2638 }
2639 #endif /* defined(__arm64__) */
2640 need_ram_ranges_init = false;
2641 }
2642
2643 unsigned int
2644 pmap_free_pages(
2645 void)
2646 {
2647 if (need_ram_ranges_init) {
2648 initialize_ram_ranges();
2649 }
2650 return avail_page_count;
2651 }
2652
2653 unsigned int
2654 pmap_free_pages_span(
2655 void)
2656 {
2657 if (need_ram_ranges_init) {
2658 initialize_ram_ranges();
2659 }
2660 return (unsigned int)atop(avail_end - first_avail);
2661 }
2662
2663
2664 boolean_t
2665 pmap_next_page_hi(
2666 ppnum_t * pnum,
2667 __unused boolean_t might_free)
2668 {
2669 return pmap_next_page(pnum);
2670 }
2671
2672
2673 boolean_t
2674 pmap_next_page(
2675 ppnum_t *pnum)
2676 {
2677 if (need_ram_ranges_init) {
2678 initialize_ram_ranges();
2679 }
2680
2681 #if defined(__arm64__)
2682 /*
2683 * Skip over any known bad pages.
2684 */
2685 while (skip_pages_count > 0 && first_avail == skip_pages[0]) {
2686 first_avail += PAGE_SIZE;
2687 ++skip_pages;
2688 --skip_pages_count;
2689 }
2690 #endif /* defined(__arm64__) */
2691
2692 if (first_avail != avail_end) {
2693 *pnum = (ppnum_t)atop(first_avail);
2694 first_avail += PAGE_SIZE;
2695 assert(avail_page_count > 0);
2696 --avail_page_count;
2697 return TRUE;
2698 }
2699 assert(avail_page_count == 0);
2700 return FALSE;
2701 }
2702
2703 void
2704 pmap_retire_page(
2705 __unused ppnum_t pnum)
2706 {
2707 /* XXX Justin TBD - mark the page as unusable in pmap data structures */
2708 }
2709
2710
2711 /*
2712 * Initialize the pmap module.
2713 * Called by vm_init, to initialize any structures that the pmap
2714 * system needs to map virtual memory.
2715 */
2716 void
2717 pmap_init(
2718 void)
2719 {
2720 /*
2721 * Protect page zero in the kernel map.
2722 * (can be overruled by permanent translation
2723 * table entries at page zero - see arm_vm_init).
2724 */
2725 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2726
2727 pmap_initialized = TRUE;
2728
2729 /*
2730 * Create the zone of physical maps
2731 * and the physical-to-virtual entries.
2732 */
2733 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2734 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2735
2736
2737 /*
2738 * Initialize the pmap object (for tracking the vm_page_t
2739 * structures for pages we allocate to be page tables in
2740 * pmap_expand().
2741 */
2742 _vm_object_allocate(mem_size, pmap_object);
2743 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2744
2745 /*
2746 * The values of [hard_]maxproc may have been scaled, make sure
2747 * they are still less than the value of pmap_max_asids.
2748 */
2749 if ((uint32_t)maxproc > pmap_max_asids) {
2750 maxproc = pmap_max_asids;
2751 }
2752 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2753 hard_maxproc = pmap_max_asids;
2754 }
2755 }
2756
2757 /**
2758 * Verify that a given physical page contains no mappings (outside of the
2759 * default physical aperture mapping).
2760 *
2761 * @param ppnum Physical page number to check there are no mappings to.
2762 *
2763 * @return True if there are no mappings, false otherwise or if the page is not
2764 * kernel-managed.
2765 */
2766 bool
2767 pmap_verify_free(ppnum_t ppnum)
2768 {
2769 const pmap_paddr_t pa = ptoa(ppnum);
2770
2771 assert(pa != vm_page_fictitious_addr);
2772
2773 /* Only mappings to kernel-managed physical memory are tracked. */
2774 if (!pa_valid(pa)) {
2775 return false;
2776 }
2777
2778 const unsigned int pai = pa_index(pa);
2779 pv_entry_t **pvh = pai_to_pvh(pai);
2780
2781 return pvh_test_type(pvh, PVH_TYPE_NULL);
2782 }
2783
2784 #if MACH_ASSERT
2785 /**
2786 * Verify that a given physical page contains no mappings (outside of the
2787 * default physical aperture mapping) and if it does, then panic.
2788 *
2789 * @note It's recommended to use pmap_verify_free() directly when operating in
2790 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2791 * normally being called from outside of the PPL, and the pv_head_table
2792 * can't be modified outside of the PPL).
2793 *
2794 * @param ppnum Physical page number to check there are no mappings to.
2795 */
2796 void
2797 pmap_assert_free(ppnum_t ppnum)
2798 {
2799 const pmap_paddr_t pa = ptoa(ppnum);
2800
2801 /* Only mappings to kernel-managed physical memory are tracked. */
2802 if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2803 return;
2804 }
2805
2806 const unsigned int pai = pa_index(pa);
2807 pv_entry_t **pvh = pai_to_pvh(pai);
2808
2809 /**
2810 * This function is always called from outside of the PPL. Because of this,
2811 * the PVH entry can't be locked. This function is generally only called
2812 * before the VM reclaims a physical page and shouldn't be creating new
2813 * mappings. Even if a new mapping is created while parsing the hierarchy,
2814 * the worst case is that the system will panic in another way, and we were
2815 * already about to panic anyway.
2816 */
2817
2818 /**
2819 * Since pmap_verify_free() returned false, that means there is at least one
2820 * mapping left. Let's get some extra info on the first mapping we find to
2821 * dump in the panic string (the common case is that there is one spare
2822 * mapping that was never unmapped).
2823 */
2824 pt_entry_t *first_ptep = PT_ENTRY_NULL;
2825
2826 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2827 first_ptep = pvh_ptep(pvh);
2828 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2829 pv_entry_t *pvep = pvh_pve_list(pvh);
2830
2831 /* Each PVE can contain multiple PTEs. Let's find the first one. */
2832 for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2833 first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2834 if (first_ptep != PT_ENTRY_NULL) {
2835 break;
2836 }
2837 }
2838
2839 /* The PVE should have at least one valid PTE. */
2840 assert(first_ptep != PT_ENTRY_NULL);
2841 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2842 panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2843 __func__, pvh, pai);
2844 } else {
2845 /**
2846 * The mapping disappeared between here and the pmap_verify_free() call.
2847 * The only way that can happen is if the VM was racing this call with
2848 * a call that unmaps PTEs. Operations on this page should not be
2849 * occurring at the same time as this check, and unfortunately we can't
2850 * lock the PVH entry to prevent it, so just panic instead.
2851 */
2852 panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2853 "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2854 __func__, pvh, pai);
2855 }
2856
2857 /* Panic with a unique string identifying the first bad mapping and owner. */
2858 {
2859 /* First PTE is mapped by the main CPUs. */
2860 pmap_t pmap = ptep_get_pmap(first_ptep);
2861 const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2862
2863 panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2864 "%s CPU mapping (pmap: %p)",
2865 __func__, (uint64_t)pa, first_ptep, type, pmap);
2866 }
2867 }
2868 #endif
2869
2870
2871 static vm_size_t
2872 pmap_root_alloc_size(pmap_t pmap)
2873 {
2874 #if (__ARM_VMSA__ > 7)
2875 #pragma unused(pmap)
2876 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2877 unsigned int root_level = pt_attr_root_level(pt_attr);
2878 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2879 #else
2880 (void)pmap;
2881 return PMAP_ROOT_ALLOC_SIZE;
2882 #endif
2883 }
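/*
 * For illustration: if the root level for a given pt_attr indexes 64 entries
 * (index mask >> shift == 63), the expression above evaluates to
 * (63 + 1) * sizeof(tt_entry_t) = 64 * 8 = 512 bytes, i.e. a sub-page root
 * table. The __ARM_VMSA__ == 7 path simply returns the fixed
 * PMAP_ROOT_ALLOC_SIZE.
 */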
2884
2885 /*
2886 * Create and return a physical map.
2887 *
2888 * If the size specified for the map
2889 * is zero, the map is an actual physical
2890 * map, and may be referenced by the
2891 * hardware.
2892 *
2893 * If the size specified is non-zero,
2894 * the map will be used in software only, and
2895 * is bounded by that size.
2896 */
2897 MARK_AS_PMAP_TEXT pmap_t
2898 pmap_create_options_internal(
2899 ledger_t ledger,
2900 vm_map_size_t size,
2901 unsigned int flags,
2902 kern_return_t *kr)
2903 {
2904 unsigned i;
2905 unsigned tte_index_max;
2906 pmap_t p;
2907 bool is_64bit = flags & PMAP_CREATE_64BIT;
2908 #if defined(HAS_APPLE_PAC)
2909 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2910 #endif /* defined(HAS_APPLE_PAC) */
2911 kern_return_t local_kr = KERN_SUCCESS;
2912
2913 /*
2914 * A software use-only map doesn't even need a pmap.
2915 */
2916 if (size != 0) {
2917 return PMAP_NULL;
2918 }
2919
2920 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2921 return PMAP_NULL;
2922 }
2923
2924 #if XNU_MONITOR
2925 if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
2926 goto pmap_create_fail;
2927 }
2928
2929 assert(p != PMAP_NULL);
2930
2931 if (ledger) {
2932 pmap_ledger_validate(ledger);
2933 pmap_ledger_retain(ledger);
2934 }
2935 #else
2936 /*
2937 * Allocate a pmap struct from the pmap_zone. Then allocate
2938 * the translation table of the right size for the pmap.
2939 */
2940 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2941 local_kr = KERN_RESOURCE_SHORTAGE;
2942 goto pmap_create_fail;
2943 }
2944 #endif
2945
2946 p->ledger = ledger;
2947
2948
2949 p->pmap_vm_map_cs_enforced = false;
2950
2951 p->min = 0;
2952 if (flags & PMAP_CREATE_64BIT) {
2953 } else {
2954 }
2955
2956 #if defined(HAS_APPLE_PAC)
2957 p->disable_jop = disable_jop;
2958 #endif /* defined(HAS_APPLE_PAC) */
2959
2960 p->nested_region_true_start = 0;
2961 p->nested_region_true_end = ~0;
2962
2963 p->gc_status = 0;
2964 p->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2965 p->nx_enabled = true;
2966 p->is_64bit = is_64bit;
2967 p->nested_pmap = PMAP_NULL;
2968 p->type = PMAP_TYPE_USER;
2969
2970 #if ARM_PARAMETERIZED_PMAP
2971 /* Default to the native pt_attr */
2972 p->pmap_pt_attr = native_pt_attr;
2973 #endif /* ARM_PARAMETERIZED_PMAP */
2974 #if __ARM_MIXED_PAGE_SIZE__
2975 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2976 p->pmap_pt_attr = &pmap_pt_attr_4k;
2977 }
2978 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2979 p->max = pmap_user_va_size(p);
2980
2981 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2982 local_kr = KERN_NO_SPACE;
2983 goto id_alloc_fail;
2984 }
2985
2986 pmap_lock_init(p);
2987
2988 p->tt_entry_free = (tt_entry_t *)0;
2989 tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
2990
2991 #if (__ARM_VMSA__ == 7)
2992 p->tte_index_max = tte_index_max;
2993 #endif
2994
2995 #if XNU_MONITOR
2996 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
2997 #else
2998 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
2999 #endif
3000 if (!(p->tte)) {
3001 local_kr = KERN_RESOURCE_SHORTAGE;
3002 goto tt1_alloc_fail;
3003 }
3004
3005 p->ttep = ml_static_vtop((vm_offset_t)p->tte);
3006 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
3007
3008 /* nullify the translation table */
3009 for (i = 0; i < tte_index_max; i++) {
3010 p->tte[i] = ARM_TTE_TYPE_FAULT;
3011 }
3012
3013 FLUSH_PTE();
3014
3015 /*
3016 * initialize the rest of the structure
3017 */
3018 p->nested_region_addr = 0x0ULL;
3019 p->nested_region_size = 0x0ULL;
3020 p->nested_region_asid_bitmap = NULL;
3021 p->nested_region_asid_bitmap_size = 0x0UL;
3022
3023 p->nested_has_no_bounds_ref = false;
3024 p->nested_no_bounds_refcnt = 0;
3025 p->nested_bounds_set = false;
3026
3027
3028 #if MACH_ASSERT
3029 p->pmap_stats_assert = TRUE;
3030 p->pmap_pid = 0;
3031 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3032 #endif /* MACH_ASSERT */
3033 #if DEVELOPMENT || DEBUG
3034 p->footprint_was_suspended = FALSE;
3035 #endif /* DEVELOPMENT || DEBUG */
3036
3037 #if XNU_MONITOR
3038 os_atomic_init(&p->nested_count, 0);
3039 assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3040 /* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3041 os_atomic_thread_fence(release);
3042 #endif
3043 os_atomic_init(&p->ref_count, 1);
3044 pmap_simple_lock(&pmaps_lock);
3045 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3046 pmap_simple_unlock(&pmaps_lock);
3047
3048 return p;
3049
3050 tt1_alloc_fail:
3051 pmap_get_pt_ops(p)->free_id(p);
3052 id_alloc_fail:
3053 #if XNU_MONITOR
3054 pmap_free_pmap(p);
3055
3056 if (ledger) {
3057 pmap_ledger_release(ledger);
3058 }
3059 #else
3060 zfree(pmap_zone, p);
3061 #endif
3062 pmap_create_fail:
3063 #if XNU_MONITOR
3064 pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3065 #endif
3066 *kr = local_kr;
3067 #if XNU_MONITOR
3068 pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3069 #endif
3070 return PMAP_NULL;
3071 }
3072
3073 pmap_t
3074 pmap_create_options(
3075 ledger_t ledger,
3076 vm_map_size_t size,
3077 unsigned int flags)
3078 {
3079 pmap_t pmap;
3080 kern_return_t kr = KERN_SUCCESS;
3081
3082 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3083
3084 ledger_reference(ledger);
3085
3086 #if XNU_MONITOR
3087 for (;;) {
3088 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3089 if (kr != KERN_RESOURCE_SHORTAGE) {
3090 break;
3091 }
3092 assert(pmap == PMAP_NULL);
3093 pmap_alloc_page_for_ppl(0);
3094 kr = KERN_SUCCESS;
3095 }
3096 #else
3097 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3098 #endif
3099
3100 if (pmap == PMAP_NULL) {
3101 ledger_dereference(ledger);
3102 }
3103
3104 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3105
3106 return pmap;
3107 }
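/*
 * Sketch of the caller-visible behaviour of the XNU_MONITOR loop above: a
 * KERN_RESOURCE_SHORTAGE returned by the PPL is not treated as fatal; the
 * kernel donates a page with pmap_alloc_page_for_ppl() and retries, so callers
 * of pmap_create_options() only observe PMAP_NULL for genuine failures
 * (e.g. an invalid size/flags combination, or KERN_NO_SPACE on ASID
 * exhaustion).
 */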
3108
3109 #if XNU_MONITOR
3110 /*
3111 * This symbol remains in place when the PPL is enabled so that the dispatch
3112 * table does not change from development to release configurations.
3113 */
3114 #endif
3115 #if MACH_ASSERT || XNU_MONITOR
3116 MARK_AS_PMAP_TEXT void
3117 pmap_set_process_internal(
3118 __unused pmap_t pmap,
3119 __unused int pid,
3120 __unused char *procname)
3121 {
3122 #if MACH_ASSERT
3123 if (pmap == NULL) {
3124 return;
3125 }
3126
3127 validate_pmap_mutable(pmap);
3128
3129 pmap->pmap_pid = pid;
3130 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3131 if (pmap_ledgers_panic_leeway) {
3132 /*
3133 * XXX FBDP
3134 * Some processes somehow trigger some issues that make
3135 * the pmap stats and ledgers go off track, causing
3136 * some assertion failures and ledger panics.
3137 * Turn off the sanity checks if we allow some ledger leeway
3138 * because of that. We'll still do a final check in
3139 * pmap_check_ledgers() for discrepancies larger than the
3140 * allowed leeway after the address space has been fully
3141 * cleaned up.
3142 */
3143 pmap->pmap_stats_assert = FALSE;
3144 ledger_disable_panic_on_negative(pmap->ledger,
3145 task_ledgers.phys_footprint);
3146 ledger_disable_panic_on_negative(pmap->ledger,
3147 task_ledgers.internal);
3148 ledger_disable_panic_on_negative(pmap->ledger,
3149 task_ledgers.internal_compressed);
3150 ledger_disable_panic_on_negative(pmap->ledger,
3151 task_ledgers.iokit_mapped);
3152 ledger_disable_panic_on_negative(pmap->ledger,
3153 task_ledgers.alternate_accounting);
3154 ledger_disable_panic_on_negative(pmap->ledger,
3155 task_ledgers.alternate_accounting_compressed);
3156 }
3157 #endif /* MACH_ASSERT */
3158 }
3159 #endif /* MACH_ASSERT || XNU_MONITOR */
3160
3161 #if MACH_ASSERT
3162 void
3163 pmap_set_process(
3164 pmap_t pmap,
3165 int pid,
3166 char *procname)
3167 {
3168 #if XNU_MONITOR
3169 pmap_set_process_ppl(pmap, pid, procname);
3170 #else
3171 pmap_set_process_internal(pmap, pid, procname);
3172 #endif
3173 }
3174 #endif /* MACH_ASSERT */
3175
3176 #if (__ARM_VMSA__ > 7)
3177 /*
3178 * pmap_deallocate_all_leaf_tts:
3179 *
3180 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3181 * removing and deallocating all TTEs.
3182 */
3183 MARK_AS_PMAP_TEXT static void
3184 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3185 {
3186 tt_entry_t tte = ARM_TTE_EMPTY;
3187 tt_entry_t * ttep = NULL;
3188 tt_entry_t * last_ttep = NULL;
3189
3190 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3191
3192 assert(level < pt_attr_leaf_level(pt_attr));
3193
3194 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3195
3196 for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3197 tte = *ttep;
3198
3199 if (!(tte & ARM_TTE_VALID)) {
3200 continue;
3201 }
3202
3203 if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
3204 panic("%s: found block mapping, ttep=%p, tte=%p, "
3205 "pmap=%p, first_ttep=%p, level=%u",
3206 __FUNCTION__, ttep, (void *)tte,
3207 pmap, first_ttep, level);
3208 }
3209
3210 /* Must be valid, type table */
3211 if (level < pt_attr_twig_level(pt_attr)) {
3212 /* If we haven't reached the twig level, recurse to the next level. */
3213 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3214 }
3215
3216 /* Remove the TTE. */
3217 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3218 pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3219 }
3220 }
3221 #endif /* (__ARM_VMSA__ > 7) */
3222
3223 /*
3224 * We maintain stats and ledgers so that a task's physical footprint is:
3225 * phys_footprint = ((internal - alternate_accounting)
3226 * + (internal_compressed - alternate_accounting_compressed)
3227 * + iokit_mapped
3228 * + purgeable_nonvolatile
3229 * + purgeable_nonvolatile_compressed
3230 * + page_table)
3231 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3232 */
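/*
 * For example (made-up counts, expressed in pages): with internal = 100,
 * alternate_accounting = 10, internal_compressed = 40,
 * alternate_accounting_compressed = 5, iokit_mapped = 10,
 * purgeable_nonvolatile = 0, purgeable_nonvolatile_compressed = 0 and
 * page_table = 3, the formula gives
 * phys_footprint = (100 - 10) + (40 - 5) + 10 + 0 + 0 + 3 = 138 pages.
 */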
3233
3234 /*
3235 * Retire the given physical map from service.
3236 * Should only be called if the map contains
3237 * no valid mappings.
3238 */
3239 MARK_AS_PMAP_TEXT void
3240 pmap_destroy_internal(
3241 pmap_t pmap)
3242 {
3243 if (pmap == PMAP_NULL) {
3244 return;
3245 }
3246
3247 validate_pmap(pmap);
3248
3249 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3250
3251 int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
3252 if (ref_count > 0) {
3253 return;
3254 } else if (__improbable(ref_count < 0)) {
3255 panic("pmap %p: refcount underflow", pmap);
3256 } else if (__improbable(pmap == kernel_pmap)) {
3257 panic("pmap %p: attempt to destroy kernel pmap", pmap);
3258 } else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3259 panic("pmap %p: attempt to destroy commpage pmap", pmap);
3260 }
3261
3262 #if XNU_MONITOR
3263 /*
3264 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3265 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3266 * That ensures that if the pmap is currently in use elsewhere, this path will
3267 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3268 * ref_count of 0 and panic.
3269 */
3270 os_atomic_thread_fence(seq_cst);
3271 if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3272 panic("pmap %p: attempt to destroy while nested", pmap);
3273 }
3274 const int max_cpu = ml_get_max_cpu_number();
3275 for (unsigned int i = 0; i <= max_cpu; ++i) {
3276 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3277 if (cpu_data == NULL) {
3278 continue;
3279 }
3280 if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3281 panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3282 } else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3283 panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3284 }
3285 }
3286 #endif
3287 #if (__ARM_VMSA__ > 7)
3288 pmap_unmap_sharedpage(pmap);
3289 #endif /* (__ARM_VMSA__ > 7) */
3290
3291 pmap_simple_lock(&pmaps_lock);
3292 #if !XNU_MONITOR
3293 while (pmap->gc_status & PMAP_GC_INFLIGHT) {
3294 pmap->gc_status |= PMAP_GC_WAIT;
3295 assert_wait((event_t) &pmap->gc_status, THREAD_UNINT);
3296 pmap_simple_unlock(&pmaps_lock);
3297 (void) thread_block(THREAD_CONTINUE_NULL);
3298 pmap_simple_lock(&pmaps_lock);
3299 }
3300 #endif /* !XNU_MONITOR */
3301 queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3302 pmap_simple_unlock(&pmaps_lock);
3303
3304 pmap_trim_self(pmap);
3305
3306 /*
3307 * Free the memory maps, then the
3308 * pmap structure.
3309 */
3310 #if (__ARM_VMSA__ == 7)
3311 unsigned int i = 0;
3312 pt_entry_t *ttep;
3313
3314 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3315 for (i = 0; i < pmap->tte_index_max; i++) {
3316 ttep = &pmap->tte[i];
3317 if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
3318 pmap_tte_deallocate(pmap, 0, 0, false, ttep, PMAP_TT_L1_LEVEL);
3319 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3320 }
3321 }
3322 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3323 #else /* (__ARM_VMSA__ == 7) */
3324 pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3325 #endif /* (__ARM_VMSA__ == 7) */
3326
3327
3328
3329 if (pmap->tte) {
3330 #if (__ARM_VMSA__ == 7)
3331 pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max * sizeof(tt_entry_t), 0);
3332 pmap->tte_index_max = 0;
3333 #else /* (__ARM_VMSA__ == 7) */
3334 pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3335 #endif /* (__ARM_VMSA__ == 7) */
3336 pmap->tte = (tt_entry_t *) NULL;
3337 pmap->ttep = 0;
3338 }
3339
3340 assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3341
3342 if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
3343 pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
3344 sync_tlb_flush();
3345 } else {
3346 pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3347 sync_tlb_flush();
3348 /* return its asid to the pool */
3349 pmap_get_pt_ops(pmap)->free_id(pmap);
3350 if (pmap->nested_pmap != NULL) {
3351 #if XNU_MONITOR
3352 os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3353 #endif
3354 /* release the reference we hold on the nested pmap */
3355 pmap_destroy_internal(pmap->nested_pmap);
3356 }
3357 }
3358
3359 pmap_check_ledgers(pmap);
3360
3361 if (pmap->nested_region_asid_bitmap) {
3362 #if XNU_MONITOR
3363 pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
3364 #else
3365 kfree_data(pmap->nested_region_asid_bitmap,
3366 pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
3367 #endif
3368 }
3369
3370 #if XNU_MONITOR
3371 if (pmap->ledger) {
3372 pmap_ledger_release(pmap->ledger);
3373 }
3374
3375 pmap_lock_destroy(pmap);
3376 pmap_free_pmap(pmap);
3377 #else
3378 pmap_lock_destroy(pmap);
3379 zfree(pmap_zone, pmap);
3380 #endif
3381 }
3382
3383 void
3384 pmap_destroy(
3385 pmap_t pmap)
3386 {
3387 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3388
3389 ledger_t ledger = pmap->ledger;
3390
3391 #if XNU_MONITOR
3392 pmap_destroy_ppl(pmap);
3393
3394 pmap_ledger_check_balance(pmap);
3395 #else
3396 pmap_destroy_internal(pmap);
3397 #endif
3398
3399 ledger_dereference(ledger);
3400
3401 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3402 }
3403
3404
3405 /*
3406 * Add a reference to the specified pmap.
3407 */
3408 MARK_AS_PMAP_TEXT void
3409 pmap_reference_internal(
3410 pmap_t pmap)
3411 {
3412 if (pmap != PMAP_NULL) {
3413 validate_pmap_mutable(pmap);
3414 os_atomic_inc(&pmap->ref_count, relaxed);
3415 }
3416 }
3417
3418 void
3419 pmap_reference(
3420 pmap_t pmap)
3421 {
3422 #if XNU_MONITOR
3423 pmap_reference_ppl(pmap);
3424 #else
3425 pmap_reference_internal(pmap);
3426 #endif
3427 }
3428
3429 static tt_entry_t *
3430 pmap_tt1_allocate(
3431 pmap_t pmap,
3432 vm_size_t size,
3433 unsigned option)
3434 {
3435 tt_entry_t *tt1 = NULL;
3436 tt_free_entry_t *tt1_free;
3437 pmap_paddr_t pa;
3438 vm_address_t va;
3439 vm_address_t va_end;
3440 kern_return_t ret;
3441
3442 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3443 size = PAGE_SIZE;
3444 }
3445
3446 pmap_simple_lock(&tt1_lock);
3447 if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3448 free_page_size_tt_count--;
3449 tt1 = (tt_entry_t *)free_page_size_tt_list;
3450 free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3451 } else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
3452 free_two_page_size_tt_count--;
3453 tt1 = (tt_entry_t *)free_two_page_size_tt_list;
3454 free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3455 } else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3456 free_tt_count--;
3457 tt1 = (tt_entry_t *)free_tt_list;
3458 free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3459 }
3460
3461 pmap_simple_unlock(&tt1_lock);
3462
3463 if (tt1 != NULL) {
3464 pmap_tt_ledger_credit(pmap, size);
3465 return (tt_entry_t *)tt1;
3466 }
3467
3468 ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3469
3470 if (ret == KERN_RESOURCE_SHORTAGE) {
3471 return (tt_entry_t *)0;
3472 }
3473
3474 #if XNU_MONITOR
3475 assert(pa);
3476 #endif
3477
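/*
 * For sub-page root tables (size < PAGE_SIZE), only the first 'size' bytes of the
 * freshly allocated page are handed to the caller.  Carve the remainder of the page
 * into size-sized chunks, thread them onto a local list, and splice that list onto
 * the global free_tt_list so later root-table allocations can be satisfied without
 * allocating another page.
 */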
3478 if (size < PAGE_SIZE) {
3479 va = phystokv(pa) + size;
3480 tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3481 tt_free_entry_t *next_free = NULL;
3482 for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3483 tt1_free = (tt_free_entry_t *)va;
3484 tt1_free->next = next_free;
3485 next_free = tt1_free;
3486 }
3487 pmap_simple_lock(&tt1_lock);
3488 local_free_list->next = free_tt_list;
3489 free_tt_list = next_free;
3490 free_tt_count += ((PAGE_SIZE / size) - 1);
3491 if (free_tt_count > free_tt_max) {
3492 free_tt_max = free_tt_count;
3493 }
3494 pmap_simple_unlock(&tt1_lock);
3495 }
3496
3497 /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
3498 * Depending on the device, this can vary between 512b and 16K. */
3499 OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3500 OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3501 pmap_tt_ledger_credit(pmap, size);
3502
3503 return (tt_entry_t *) phystokv(pa);
3504 }
3505
3506 static void
3507 pmap_tt1_deallocate(
3508 pmap_t pmap,
3509 tt_entry_t *tt,
3510 vm_size_t size,
3511 unsigned option)
3512 {
3513 tt_free_entry_t *tt_entry;
3514
3515 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3516 size = PAGE_SIZE;
3517 }
3518
3519 tt_entry = (tt_free_entry_t *)tt;
3520 assert(not_in_kdp);
3521 pmap_simple_lock(&tt1_lock);
3522
3523 if (size < PAGE_SIZE) {
3524 free_tt_count++;
3525 if (free_tt_count > free_tt_max) {
3526 free_tt_max = free_tt_count;
3527 }
3528 tt_entry->next = free_tt_list;
3529 free_tt_list = tt_entry;
3530 }
3531
3532 if (size == PAGE_SIZE) {
3533 free_page_size_tt_count++;
3534 if (free_page_size_tt_count > free_page_size_tt_max) {
3535 free_page_size_tt_max = free_page_size_tt_count;
3536 }
3537 tt_entry->next = free_page_size_tt_list;
3538 free_page_size_tt_list = tt_entry;
3539 }
3540
3541 if (size == 2 * PAGE_SIZE) {
3542 free_two_page_size_tt_count++;
3543 if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
3544 free_two_page_size_tt_max = free_two_page_size_tt_count;
3545 }
3546 tt_entry->next = free_two_page_size_tt_list;
3547 free_two_page_size_tt_list = tt_entry;
3548 }
3549
3550 if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
3551 pmap_simple_unlock(&tt1_lock);
3552 pmap_tt_ledger_debit(pmap, size);
3553 return;
3554 }
3555
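/*
 * Unless the caller requested a non-blocking deallocation above, opportunistically
 * trim the cached single-page and two-page free lists back down to their
 * FREE_PAGE_SIZE_TT_MAX / FREE_TWO_PAGE_SIZE_TT_MAX limits.  tt1_lock is dropped
 * around each pmap_pages_free() call so the pages are returned without holding the
 * simple lock.
 */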
3556 while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
3557 free_page_size_tt_count--;
3558 tt = (tt_entry_t *)free_page_size_tt_list;
3559 free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3560
3561 pmap_simple_unlock(&tt1_lock);
3562
3563 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
3564
3565 OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3566
3567 pmap_simple_lock(&tt1_lock);
3568 }
3569
3570 while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
3571 free_two_page_size_tt_count--;
3572 tt = (tt_entry_t *)free_two_page_size_tt_list;
3573 free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3574
3575 pmap_simple_unlock(&tt1_lock);
3576
3577 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
3578
3579 OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3580
3581 pmap_simple_lock(&tt1_lock);
3582 }
3583 pmap_simple_unlock(&tt1_lock);
3584 pmap_tt_ledger_debit(pmap, size);
3585 }
3586
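/**
 * Allocate a page table for the given pmap at the given level.
 *
 * The allocation is first attempted from the pmap's own tt_entry_free list;
 * failing that, a fresh VM page and page table descriptor are allocated, the
 * relevant counters and ledgers are updated, and (for pmaps whose page size is
 * smaller than the VM page size) the unused chunks of the page are added to the
 * free list.
 *
 * @param pmap The pmap for which the page table is being allocated.
 * @param ttp Output parameter that receives the KVA of the new page table.
 * @param level The page table level at which the new table will be used.
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to fail with
 *                KERN_RESOURCE_SHORTAGE rather than block for memory.
 *
 * @return KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE if a non-blocking allocation
 *         could not be satisfied.
 */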
3587 MARK_AS_PMAP_TEXT static kern_return_t
3588 pmap_tt_allocate(
3589 pmap_t pmap,
3590 tt_entry_t **ttp,
3591 unsigned int level,
3592 unsigned int options)
3593 {
3594 pmap_paddr_t pa;
3595 *ttp = NULL;
3596
3597 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3598 if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
3599 tt_free_entry_t *tt_free_cur, *tt_free_next;
3600
3601 tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
3602 tt_free_next = tt_free_cur->next;
3603 tt_free_cur->next = NULL;
3604 *ttp = (tt_entry_t *)tt_free_cur;
3605 pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
3606 }
3607 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3608
3609 if (*ttp == NULL) {
3610 pt_desc_t *ptdp;
3611
3612 /*
3613 * Allocate a VM page for the level x page table entries.
3614 */
3615 while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
3616 if (options & PMAP_OPTIONS_NOWAIT) {
3617 return KERN_RESOURCE_SHORTAGE;
3618 }
3619 VM_PAGE_WAIT();
3620 }
3621
3622 while ((ptdp = ptd_alloc(pmap)) == NULL) {
3623 if (options & PMAP_OPTIONS_NOWAIT) {
3624 pmap_pages_free(pa, PAGE_SIZE);
3625 return KERN_RESOURCE_SHORTAGE;
3626 }
3627 VM_PAGE_WAIT();
3628 }
3629
3630 if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3631 OSAddAtomic64(1, &alloc_ttepages_count);
3632 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3633 } else {
3634 OSAddAtomic64(1, &alloc_ptepages_count);
3635 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3636 }
3637
3638 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3639
3640 PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3641
3642 pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
3643
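/*
 * If this pmap uses a smaller page size than the kernel VM page size (e.g. a 4K
 * user pmap on a 16K kernel), only the first chunk of the VM page is returned to
 * the caller; the remaining chunks are threaded onto the pmap's tt_entry_free
 * list for later allocations.
 */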
3644 uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
3645 if (PAGE_SIZE > pmap_page_size) {
3646 vm_address_t va;
3647 vm_address_t va_end;
3648
3649 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3650
3651 for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
3652 ((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3653 pmap->tt_entry_free = (tt_entry_t *)va;
3654 }
3655 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3656 }
3657
3658 *ttp = (tt_entry_t *)phystokv(pa);
3659 }
3660
3661 #if XNU_MONITOR
3662 assert(*ttp);
3663 #endif
3664
3665 return KERN_SUCCESS;
3666 }
3667
3668
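/**
 * Return a page table previously obtained from pmap_tt_allocate() to the pmap's
 * free list.
 *
 * The table's refcount must already be zero (leaf tables) or the sentinel
 * PT_DESC_REFCOUNT (non-leaf tables); otherwise this function panics.  If, after
 * this deallocation, every chunk of the containing VM page is on the pmap's
 * tt_entry_free list, the chunks are unlinked, the page table descriptor is
 * released, and the whole page is freed back to the pmap pages allocator.
 *
 * @param pmap The pmap that owns the page table being deallocated.
 * @param ttp KVA of the page table to deallocate.
 * @param level The page table level at which the table was used.
 */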
3669 static void
3670 pmap_tt_deallocate(
3671 pmap_t pmap,
3672 tt_entry_t *ttp,
3673 unsigned int level)
3674 {
3675 pt_desc_t *ptdp;
3676 ptd_info_t *ptd_info;
3677 unsigned pt_acc_cnt;
3678 unsigned i;
3679 vm_offset_t free_page = 0;
3680 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3681 unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);
3682
3683 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3684
3685 ptdp = ptep_get_ptd(ttp);
3686 ptd_info = ptd_get_info(ptdp, ttp);
3687
3688 ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;
3689
3690 if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
3691 ptd_info->refcnt = 0;
3692 }
3693
3694 if (ptd_info->refcnt != 0) {
3695 panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
3696 }
3697
3698 ptd_info->refcnt = 0;
3699
3700 for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
3701 pt_acc_cnt += ptdp->ptd_info[i].refcnt;
3702 }
3703
3704 if (pt_acc_cnt == 0) {
3705 tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3706 unsigned pt_free_entry_cnt = 1;
3707
3708 while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
3709 tt_free_entry_t *tt_free_list_next;
3710
3711 tt_free_list_next = tt_free_list->next;
3712 if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
3713 pt_free_entry_cnt++;
3714 }
3715 tt_free_list = tt_free_list_next;
3716 }
3717 if (pt_free_entry_cnt == max_pt_index) {
3718 tt_free_entry_t *tt_free_list_cur;
3719
3720 free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3721 tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3722 tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;
3723
3724 while (tt_free_list_cur) {
3725 tt_free_entry_t *tt_free_list_next;
3726
3727 tt_free_list_next = tt_free_list_cur->next;
3728 if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
3729 tt_free_list->next = tt_free_list_next->next;
3730 } else {
3731 tt_free_list = tt_free_list_next;
3732 }
3733 tt_free_list_cur = tt_free_list_next;
3734 }
3735 } else {
3736 ((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3737 pmap->tt_entry_free = ttp;
3738 }
3739 } else {
3740 ((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3741 pmap->tt_entry_free = ttp;
3742 }
3743
3744 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3745
3746 if (free_page != 0) {
3747 ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3748 *(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
3749 pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
3750 if (level < pt_attr_leaf_level(pt_attr)) {
3751 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3752 } else {
3753 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3754 }
3755 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3756 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3757 }
3758 }
3759
3760 /**
3761 * Safely clear out a translation table entry.
3762 *
3763 * @note If the TTE to clear out points to a leaf table, then that leaf table
3764 * must have a refcnt of zero before the TTE can be removed.
3765 * @note This function expects to be called with pmap locked exclusive, and will
3766 * return with pmap unlocked.
3767 *
3768 * @param pmap The pmap containing the page table whose TTE is being removed.
3769 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3770 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3771 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3772 * @param ttep Pointer to the TTE that should be cleared out.
3773 * @param level The level of the page table that contains the TTE to be removed.
3774 */
3775 static void
3776 pmap_tte_remove(
3777 pmap_t pmap,
3778 vm_offset_t va_start,
3779 vm_offset_t va_end,
3780 bool need_strong_sync,
3781 tt_entry_t *ttep,
3782 unsigned int level)
3783 {
3784 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3785
3786 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3787 const tt_entry_t tte = *ttep;
3788
3789 if (__improbable(tte == ARM_TTE_EMPTY)) {
3790 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3791 "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3792 }
3793
3794 #if (__ARM_VMSA__ == 7)
3795 {
3796 tt_entry_t *ttep_4M = (tt_entry_t *) ((vm_offset_t)ttep & 0xFFFFFFF0);
3797 unsigned i;
3798
3799 for (i = 0; i < 4; i++, ttep_4M++) {
3800 *ttep_4M = (tt_entry_t) 0;
3801 }
3802 FLUSH_PTE_STRONG();
3803 }
3804 #else
3805 *ttep = (tt_entry_t) 0;
3806 FLUSH_PTE_STRONG();
3807 #endif /* (__ARM_VMSA__ == 7) */
3808 // If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
3809 if (va_end > va_start) {
3810 #if (__ARM_VMSA__ == 7)
3811 // Ensure intermediate translations are flushed for each 1MB block
3812 flush_mmu_tlb_entry_async((va_start & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
3813 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3814 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3815 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3816 #endif
3817 PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
3818 }
3819
3820 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3821
3822 /**
3823 * Remember, the passed in "level" parameter refers to the level above the
3824 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
3825 * page table).
3826 */
3827 const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
3828
3829 /**
3830 * Non-leaf pagetables don't track active references in the PTD and instead
3831 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
3832 * the real refcount below.
3833 */
3834 unsigned short refcnt = PT_DESC_REFCOUNT;
3835
3836 /*
3837 * It's possible that a concurrent pmap_disconnect() operation may need to reference
3838 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
3839 * one or more PTEs on this page but not yet dropped the refcount, which would cause
3840 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
3841 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
3842 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
3843 * drop the pagetable refcount accordingly, without taking any PVH locks that could
3844 * synchronize it against the disconnect operation. If that removal caused the
3845 * refcount to reach zero, the pagetable page could be freed before the disconnect
3846 * operation is finished using the relevant pagetable descriptor.
3847 * Address these cases by waiting until all CPUs have been observed to not be
3848 * executing pmap_disconnect().
3849 */
3850 if (remove_leaf_table) {
3851 bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
3852 const int max_cpu = ml_get_max_cpu_number();
3853 bitmap_full(&active_disconnects[0], max_cpu + 1);
3854 bool inflight_disconnect;
3855
3856 /*
3857 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
3858 * ahead of any prior PTE load which may have observed the effect of a
3859 * concurrent disconnect operation. An acquire fence is required for this;
3860 * a load-acquire operation is insufficient.
3861 */
3862 os_atomic_thread_fence(acquire);
3863 do {
3864 inflight_disconnect = false;
3865 for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
3866 i >= 0;
3867 i = bitmap_next(&active_disconnects[0], i)) {
3868 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3869 if (cpu_data == NULL) {
3870 continue;
3871 }
3872 if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
3873 __builtin_arm_wfe();
3874 inflight_disconnect = true;
3875 continue;
3876 }
3877 os_atomic_clear_exclusive();
3878 bitmap_clear(&active_disconnects[0], (unsigned int)i);
3879 }
3880 } while (inflight_disconnect);
3881 /* Ensure the refcount is observed after any observation of inflight_disconnect */
3882 os_atomic_thread_fence(acquire);
3883 refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
3884 }
3885
3886 #if MACH_ASSERT
3887 /**
3888 * On internal devices, always do the page table consistency check
3889 * regardless of page table level or the actual refcnt value.
3890 */
3891 {
3892 #else /* MACH_ASSERT */
3893 /**
3894 * Only perform the page table consistency check when deleting leaf page
3895 * tables and it seems like there might be valid/compressed mappings
3896 * leftover.
3897 */
3898 if (__improbable(remove_leaf_table && refcnt != 0)) {
3899 #endif /* MACH_ASSERT */
3900
3901 /**
3902 * There are multiple problems that can manifest as a non-zero refcnt:
3903 * 1. A bug in the refcnt management logic.
3904 * 2. A memory stomper or hardware failure.
3905 * 3. The VM forgetting to unmap all of the valid mappings in an address
3906 * space before destroying a pmap.
3907 *
3908 * By looping over the page table and determining how many valid or
3909 * compressed entries there actually are, we can narrow down which of
3910 * these three cases is causing this panic. If the expected refcnt
3911 * (valid + compressed) and the actual refcnt don't match then the
3912 * problem is probably either a memory corruption issue (if the
3913 * non-empty entries don't match valid+compressed, that could also be a
3914 * sign of corruption) or refcnt management bug. Otherwise, there
3915 * actually are leftover mappings and the higher layers of xnu are
3916 * probably at fault.
3917 */
3918 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
3919 pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));
3920
3921 pt_entry_t *ptep = bpte;
3922 unsigned short non_empty = 0, valid = 0, comp = 0;
3923 for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
3924 /* Keep track of all non-empty entries to detect memory corruption. */
3925 if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3926 non_empty++;
3927 }
3928
3929 if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
3930 comp++;
3931 } else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
3932 valid++;
3933 }
3934 }
3935
3936 #if MACH_ASSERT
3937 /**
3938 * On internal machines, panic whenever a page table getting deleted has
3939 * leftover mappings (valid or otherwise) or a leaf page table has a
3940 * non-zero refcnt.
3941 */
3942 if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
3943 #else /* MACH_ASSERT */
3944 /* We already know the leaf page-table has a non-zero refcnt, so panic. */
3945 {
3946 #endif /* MACH_ASSERT */
3947 panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3948 "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3949 level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
3950 }
3951 }
3952 }
3953
3954 /**
3955 * Given a pointer to an entry within a `level` page table, delete the
3956 * page table at `level` + 1 that is represented by that entry. For instance,
3957 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3958 * contains the PA of the L3 table, and `level` would be "2".
3959 *
3960 * @note If the table getting deallocated is a leaf table, then that leaf table
3961 * must have a refcnt of zero before getting deallocated. All other levels
3962 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3963 * @note This function expects to be called with pmap locked exclusive and will
3964 * return with pmap unlocked.
3965 *
3966 * @param pmap The pmap that owns the page table to be deallocated.
3967 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3968 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3969 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3970 * @param ttep Pointer to the `level` TTE to remove.
3971 * @param level The level of the table that contains an entry pointing to the
3972 * table to be removed. The deallocated page table will be a
3973 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3974 * deleted).
3975 */
3976 void
3977 pmap_tte_deallocate(
3978 pmap_t pmap,
3979 vm_offset_t va_start,
3980 vm_offset_t va_end,
3981 bool need_strong_sync,
3982 tt_entry_t *ttep,
3983 unsigned int level)
3984 {
3985 pmap_paddr_t pa;
3986 tt_entry_t tte;
3987
3988 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3989
3990 tte = *ttep;
3991
3992 if (tte_get_ptd(tte)->pmap != pmap) {
3993 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3994 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3995 }
3996
3997 assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3998 __func__, ttep, (unsigned long long)tte);
3999 uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
4000
4001 /* pmap_tte_remove() will drop the pmap lock */
4002 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4003
4004 /* Clear any page offset: we mean to free the whole page, but armv7 TTEs may only be
4005 * aligned on 1K boundaries. We clear the surrounding "chunk" of 4 TTEs above. */
4006 pa = tte_to_pa(tte) & ~(pmap_page_size - 1);
4007 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(pa), level + 1);
4008 }
4009
4010 /*
4011 * Remove a range of hardware page-table entries.
4012 * The entries given are the first (inclusive)
4013 * and last (exclusive) entries for the VM pages.
4014 * The virtual address is the va for the first pte.
4015 *
4016 * The pmap must be locked.
4017 * If the pmap is not the kernel pmap, the range must lie
4018 * entirely within one pte-page. This is NOT checked.
4019 * Assumes that the pte-page exists.
4020 *
4021 * Returns the number of PTE changed
4022 */
4023 MARK_AS_PMAP_TEXT static int
4024 pmap_remove_range(
4025 pmap_t pmap,
4026 vm_map_address_t va,
4027 pt_entry_t *bpte,
4028 pt_entry_t *epte)
4029 {
4030 bool need_strong_sync = false;
4031 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4032 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4033 if (num_changed > 0) {
4034 PMAP_UPDATE_TLBS(pmap, va,
4035 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4036 }
4037 return num_changed;
4038 }
4039
4040
4041 #ifdef PVH_FLAG_EXEC
4042
4043 /*
4044 * Update the access protection bits of the physical aperture mapping for a page.
4045 * This is useful, for example, in guaranteeing that a verified executable page
4046 * has no writable mappings anywhere in the system, including the physical
4047 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4048 * synchronization overhead in cases where the call to this function is
4049 * guaranteed to be followed by other TLB operations.
4050 */
4051 void
4052 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4053 {
4054 #if __ARM_PTE_PHYSMAP__
4055 pvh_assert_locked(pai);
4056 vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4057 pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4058
4059 pt_entry_t tmplate = *pte_p;
4060 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4061 return;
4062 }
4063 tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4064 #if (__ARM_VMSA__ > 7)
4065 if (tmplate & ARM_PTE_HINT_MASK) {
4066 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4067 __func__, pte_p, (void *)kva, tmplate);
4068 }
4069 #endif
4070 write_pte_strong(pte_p, tmplate);
4071 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
4072 if (!flush_tlb_async) {
4073 sync_tlb_flush();
4074 }
4075 #endif
4076 }
4077
4078 #endif /* defined(PVH_FLAG_EXEC) */
4079
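/**
 * Remove the mappings for the PTE range [bpte, epte) of the given pmap, where
 * va is the virtual address mapped by bpte.  Accounting, ledgers and the
 * pagetable refcount are updated, but the TLB is not flushed; that is left to
 * the caller.
 *
 * If eva is non-NULL, the removal may stop early when preemption is pending;
 * the VA at which it stopped is stored through eva so the caller can resume.
 *
 * Returns the number of PTEs that were changed.
 */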
4080 MARK_AS_PMAP_TEXT int
4081 pmap_remove_range_options(
4082 pmap_t pmap,
4083 vm_map_address_t va,
4084 pt_entry_t *bpte,
4085 pt_entry_t *epte,
4086 vm_map_address_t *eva,
4087 bool *need_strong_sync __unused,
4088 int options)
4089 {
4090 pt_entry_t *cpte;
4091 size_t npages = 0;
4092 int num_removed, num_unwired;
4093 int num_pte_changed;
4094 unsigned int pai = 0;
4095 pmap_paddr_t pa;
4096 int num_external, num_internal, num_reusable;
4097 int num_alt_internal;
4098 uint64_t num_compressed, num_alt_compressed;
4099 int16_t refcnt = 0;
4100
4101 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4102
4103 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4104 uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4105
4106 if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4107 panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4108 }
4109
4110 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4111 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4112 }
4113
4114 num_removed = 0;
4115 num_unwired = 0;
4116 num_pte_changed = 0;
4117 num_external = 0;
4118 num_internal = 0;
4119 num_reusable = 0;
4120 num_compressed = 0;
4121 num_alt_internal = 0;
4122 num_alt_compressed = 0;
4123
4124 #if XNU_MONITOR
4125 bool ro_va = false;
4126 if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4127 ro_va = true;
4128 }
4129 #endif
4130 for (cpte = bpte; cpte < epte;
4131 cpte += PAGE_RATIO, va += pmap_page_size) {
4132 pt_entry_t spte;
4133 boolean_t managed = FALSE;
4134
4135 /*
4136 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4137 * so we need to be as aggressive as possible in checking for preemption when we can.
4138 */
4139 if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4140 *eva = va;
4141 break;
4142 }
4143
4144 spte = *((volatile pt_entry_t*)cpte);
4145
4146 while (!managed) {
4147 if (pmap != kernel_pmap &&
4148 (options & PMAP_OPTIONS_REMOVE) &&
4149 (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4150 /*
4151 * "pmap" must be locked at this point,
4152 * so this should not race with another
4153 * pmap_remove_range() or pmap_enter().
4154 */
4155
4156 /* one less "compressed"... */
4157 num_compressed++;
4158 if (spte & ARM_PTE_COMPRESSED_ALT) {
4159 /* ... but it used to be "ALTACCT" */
4160 num_alt_compressed++;
4161 }
4162
4163 /* clear marker */
4164 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4165 /*
4166 * "refcnt" also accounts for
4167 * our "compressed" markers,
4168 * so let's update it here.
4169 */
4170 --refcnt;
4171 spte = *((volatile pt_entry_t*)cpte);
4172 }
4173 /*
4174 * It may be possible for the pte to transition from managed
4175 * to unmanaged in this timeframe; for now, elide the assert.
4176 * We should break out as a consequence of checking pa_valid.
4177 */
4178 //assert(!ARM_PTE_IS_COMPRESSED(spte));
4179 pa = pte_to_pa(spte);
4180 if (!pa_valid(pa)) {
4181 #if XNU_MONITOR
4182 unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4183 #endif
4184 #if XNU_MONITOR
4185 if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4186 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4187 panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4188 __func__, (uint64_t)pa);
4189 }
4190 #endif
4191 break;
4192 }
4193 pai = pa_index(pa);
4194 pvh_lock(pai);
4195 spte = *((volatile pt_entry_t*)cpte);
4196 pa = pte_to_pa(spte);
4197 if (pai == pa_index(pa)) {
4198 managed = TRUE;
4199 break; // Leave pai locked as we will unlock it after we free the PV entry
4200 }
4201 pvh_unlock(pai);
4202 }
4203
4204 if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4205 /*
4206 * There used to be a valid mapping here but it
4207 * has already been removed when the page was
4208 * sent to the VM compressor, so nothing left to
4209 * remove now...
4210 */
4211 continue;
4212 }
4213
4214 /* remove the translation, do not flush the TLB */
4215 if (*cpte != ARM_PTE_TYPE_FAULT) {
4216 assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4217 assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4218 #if MACH_ASSERT
4219 if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4220 panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4221 cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4222 }
4223 #endif
4224 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4225 num_pte_changed++;
4226 }
4227
4228 if ((spte != ARM_PTE_TYPE_FAULT) &&
4229 (pmap != kernel_pmap)) {
4230 assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4231 assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4232 --refcnt;
4233 }
4234
4235 if (pte_is_wired(spte)) {
4236 pte_set_wired(pmap, cpte, 0);
4237 num_unwired++;
4238 }
4239 /*
4240 * if not managed, we're done
4241 */
4242 if (!managed) {
4243 continue;
4244 }
4245
4246 #if XNU_MONITOR
4247 if (__improbable(ro_va)) {
4248 pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4249 }
4250 #endif
4251
4252 /*
4253 * find and remove the mapping from the chain for this
4254 * physical address.
4255 */
4256 bool is_internal, is_altacct;
4257 pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4258
4259 if (is_altacct) {
4260 assert(is_internal);
4261 num_internal++;
4262 num_alt_internal++;
4263 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4264 ppattr_clear_altacct(pai);
4265 ppattr_clear_internal(pai);
4266 }
4267 } else if (is_internal) {
4268 if (ppattr_test_reusable(pai)) {
4269 num_reusable++;
4270 } else {
4271 num_internal++;
4272 }
4273 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4274 ppattr_clear_internal(pai);
4275 }
4276 } else {
4277 num_external++;
4278 }
4279 pvh_unlock(pai);
4280 num_removed++;
4281 }
4282
4283 /*
4284 * Update the counts
4285 */
4286 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4287
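/*
 * 'refcnt' holds the (negative) net change to the pagetable's reference count from
 * the valid and compressed entries removed above; apply the whole delta to the
 * pagetable descriptor in a single atomic update, and panic if the descriptor has
 * been over-released.
 */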
4288 if (pmap != kernel_pmap) {
4289 if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4290 panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4291 }
4292
4293 /* update ledgers */
4294 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4295 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4296 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4297 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4298 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4299 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4300 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4301 /* make needed adjustments to phys_footprint */
4302 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4303 ((num_internal -
4304 num_alt_internal) +
4305 (num_compressed -
4306 num_alt_compressed)) * pmap_page_size);
4307 }
4308
4309 /* flush the ptable entries we have written */
4310 if (num_pte_changed > 0) {
4311 FLUSH_PTE_STRONG();
4312 }
4313
4314 return num_pte_changed;
4315 }
4316
4317
4318 /*
4319 * Remove the given range of addresses
4320 * from the specified map.
4321 *
4322 * It is assumed that the start and end are properly
4323 * rounded to the hardware page size.
4324 */
4325 void
4326 pmap_remove(
4327 pmap_t pmap,
4328 vm_map_address_t start,
4329 vm_map_address_t end)
4330 {
4331 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4332 }
4333
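/**
 * Internal helper for pmap_remove_options() that removes mappings for one
 * twig-sized chunk of the address range [start, end).
 *
 * Returns the VA up to which removal actually completed; this may be less than
 * 'end' if the operation stopped early due to pending preemption, in which case
 * the caller is expected to resume from the returned address.
 */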
4334 MARK_AS_PMAP_TEXT vm_map_address_t
4335 pmap_remove_options_internal(
4336 pmap_t pmap,
4337 vm_map_address_t start,
4338 vm_map_address_t end,
4339 int options)
4340 {
4341 vm_map_address_t eva = end;
4342 pt_entry_t *bpte, *epte;
4343 pt_entry_t *pte_p;
4344 tt_entry_t *tte_p;
4345 int remove_count = 0;
4346 bool need_strong_sync = false;
4347 bool unlock = true;
4348
4349 if (__improbable(end < start)) {
4350 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4351 }
4352
4353 validate_pmap_mutable(pmap);
4354
4355 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4356
4357 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4358
4359 tte_p = pmap_tte(pmap, start);
4360
4361 if (tte_p == (tt_entry_t *) NULL) {
4362 goto done;
4363 }
4364
4365 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
4366 pte_p = (pt_entry_t *) ttetokv(*tte_p);
4367 bpte = &pte_p[pte_index(pt_attr, start)];
4368 epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
4369
4370 /*
4371 * This check is really intended to ensure that mappings in a nested pmap can't be removed
4372 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
4373 * Note that kernel page tables may not have PTDs, so we can't use the check there.
4374 */
4375 if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
4376 panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
4377 __func__, ptep_get_pmap(bpte), pmap, bpte);
4378 }
4379
4380 remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
4381 &need_strong_sync, options);
4382
4383 if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
4384 pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
4385 remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
4386 unlock = false; // pmap_tte_deallocate() has dropped the lock
4387 }
4388 }
4389
4390 done:
4391 if (unlock) {
4392 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
4393 }
4394
4395 if (remove_count > 0) {
4396 PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
4397 }
4398 return eva;
4399 }
4400
4401 void
4402 pmap_remove_options(
4403 pmap_t pmap,
4404 vm_map_address_t start,
4405 vm_map_address_t end,
4406 int options)
4407 {
4408 vm_map_address_t va;
4409
4410 if (pmap == PMAP_NULL) {
4411 return;
4412 }
4413
4414 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4415
4416 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4417 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4418 VM_KERNEL_ADDRHIDE(end));
4419
4420 #if MACH_ASSERT
4421 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4422 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4423 pmap, (uint64_t)start, (uint64_t)end);
4424 }
4425 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4426 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4427 pmap, (uint64_t)start, (uint64_t)end);
4428 }
4429 #endif
4430
4431 /*
4432 * We allow single-page requests to execute non-preemptibly,
4433 * as it doesn't make sense to sample AST_URGENT for a single-page
4434 * operation, and there are a couple of special use cases that
4435 * require a non-preemptible single-page operation.
4436 */
4437 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4438 pmap_verify_preemptible();
4439 }
4440
4441 /*
4442 * Remove the mappings one twig-sized chunk at a time, so that each call below
4443 * stays bounded and preemption can be checked between chunks.
4444 */
4444 va = start;
4445 while (va < end) {
4446 vm_map_address_t l;
4447
4448 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4449 if (l > end) {
4450 l = end;
4451 }
4452
4453 #if XNU_MONITOR
4454 va = pmap_remove_options_ppl(pmap, va, l, options);
4455
4456 pmap_ledger_check_balance(pmap);
4457 #else
4458 va = pmap_remove_options_internal(pmap, va, l, options);
4459 #endif
4460 }
4461
4462 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4463 }
4464
4465
4466 /*
4467 * Remove phys addr if mapped in specified map
4468 */
4469 void
4470 pmap_remove_some_phys(
4471 __unused pmap_t map,
4472 __unused ppnum_t pn)
4473 {
4474 /* Implement to support working set code */
4475 }
4476
4477 /*
4478 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4479 * switch a thread onto a new vm_map.
4480 */
4481 void
4482 pmap_switch_user(thread_t thread, vm_map_t new_map)
4483 {
4484 pmap_t new_pmap = new_map->pmap;
4485
4486
4487 thread->map = new_map;
4488 pmap_set_pmap(new_pmap, thread);
4489
4490 }
4491
4492 void
4493 pmap_set_pmap(
4494 pmap_t pmap,
4495 #if !__ARM_USER_PROTECT__
4496 __unused
4497 #endif
4498 thread_t thread)
4499 {
4500 pmap_switch(pmap);
4501 #if __ARM_USER_PROTECT__
4502 thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
4503 thread->machine.asid = pmap->hw_asid;
4504 #endif
4505 }
4506
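/*
 * Issue a local (current-core) TLB invalidation for all entries tagged with this
 * pmap's ASID, without waiting for completion.  On arm64 the TLBI operand carries
 * the ASID in its upper bits, hence the shift.
 */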
4507 static void
4508 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4509 {
4510 #if (__ARM_VMSA__ == 7)
4511 flush_core_tlb_asid_async(pmap->hw_asid);
4512 #else
4513 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4514 #endif
4515 }
4516
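/*
 * Returns true if no user translation table is currently installed, i.e. TTBR0
 * points at the invalid/null table (arm64) or at the kernel pmap's table (arm32).
 */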
4517 static inline bool
4518 pmap_user_ttb_is_clear(void)
4519 {
4520 #if (__ARM_VMSA__ > 7)
4521 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4522 #else
4523 return get_mmu_ttb() == kernel_pmap->ttep;
4524 #endif
4525 }
4526
4527 MARK_AS_PMAP_TEXT void
4528 pmap_switch_internal(
4529 pmap_t pmap)
4530 {
4531 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4532 #if XNU_MONITOR
4533 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4534 #endif
4535 validate_pmap_mutable(pmap);
4536 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4537 uint16_t asid_index = pmap->hw_asid;
4538 bool do_asid_flush = false;
4539 bool do_commpage_flush = false;
4540
4541 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4542 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4543 }
4544 #if __ARM_KERNEL_PROTECT__
4545 asid_index >>= 1;
4546 #endif
4547
4548 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4549 #if (__ARM_VMSA__ > 7)
4550 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4551 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4552 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4553 #endif
4554 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4555 bool break_before_make = do_shared_region_flush;
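/*
 * break_before_make indicates that the user TTBR must be pointed at the invalid
 * table (via pmap_clear_user_ttb_internal() below) before any TLB maintenance is
 * performed and the new translation table is installed, so that stale walks of
 * the outgoing table cannot race with the flushes below.
 */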
4556
4557 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4558 asid_index -= 1;
4559 pmap_update_plru(asid_index);
4560
4561 /* Paranoia. */
4562 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4563
4564 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4565 uint8_t new_sw_asid = pmap->sw_asid;
4566 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4567
4568 if (new_sw_asid != last_sw_asid) {
4569 /*
4570 * If the virtual ASID of the new pmap does not match the virtual ASID
4571 * last seen on this CPU for the physical ASID (that was a mouthful),
4572 * then this switch runs the risk of aliasing. We need to flush the
4573 * TLB for this physical ASID in this case.
4574 */
4575 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4576 do_asid_flush = true;
4577 break_before_make = true;
4578 }
4579 }
4580
4581 #if __ARM_MIXED_PAGE_SIZE__
4582 if (pt_attr->pta_tcr_value != get_tcr()) {
4583 break_before_make = true;
4584 }
4585 #endif
4586 #if __ARM_MIXED_PAGE_SIZE__
4587 /*
4588 * For mixed page size configurations, we need to flush the global commpage mappings from
4589 * the TLB when transitioning between address spaces with different page sizes. Otherwise
4590 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
4591 * partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4592 * conflict abort or other unpredictable behavior.
4593 */
4594 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4595 do_commpage_flush = true;
4596 }
4597 if (do_commpage_flush) {
4598 break_before_make = true;
4599 }
4600 #endif
4601 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4602 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4603 pmap_clear_user_ttb_internal();
4604 }
4605
4606 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4607 * to flush the userspace mappings for that region. Those mappings are global
4608 * and will not be protected by the ASID. It should also be cheaper to flush the
4609 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4610 if (__improbable(do_shared_region_flush)) {
4611 #if __ARM_RANGE_TLBI__
4612 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4613 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4614
4615 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4616 * There may still be non-global entries that overlap with the incoming pmap's
4617 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4618 * must necessarily belong to a different ASID than the incoming pmap, or they would
4619 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4620 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4621 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4622 * to consider additional invalidation here in the future. */
4623 if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
4624 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4625 } else {
4626 do_asid_flush = false;
4627 flush_core_tlb_async();
4628 }
4629 #else
4630 do_asid_flush = false;
4631 flush_core_tlb_async();
4632 #endif // __ARM_RANGE_TLBI__
4633 }
4634
4635 #if __ARM_MIXED_PAGE_SIZE__
4636 if (__improbable(do_commpage_flush)) {
4637 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4638 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4639 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4640 flush_core_tlb_allrange_async(rtlbi_param);
4641 }
4642 #endif
4643 if (__improbable(do_asid_flush)) {
4644 pmap_flush_core_tlb_asid_async(pmap);
4645 #if DEVELOPMENT || DEBUG
4646 os_atomic_inc(&pmap_asid_flushes, relaxed);
4647 #endif
4648 }
4649 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4650 sync_tlb_flush_local();
4651 }
4652
4653 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4654 }
4655
4656 void
4657 pmap_switch(
4658 pmap_t pmap)
4659 {
4660 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4661 #if XNU_MONITOR
4662 pmap_switch_ppl(pmap);
4663 #else
4664 pmap_switch_internal(pmap);
4665 #endif
4666 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4667 }
4668
4669 void
4670 pmap_page_protect(
4671 ppnum_t ppnum,
4672 vm_prot_t prot)
4673 {
4674 pmap_page_protect_options(ppnum, prot, 0, NULL);
4675 }
4676
4677 /*
4678 * Routine: pmap_page_protect_options
4679 *
4680 * Function:
4681 * Lower the permission for all mappings to a given
4682 * page.
4683 */
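/*
 * pmap_page_protect_options_with_flush_range() is the workhorse.  When
 * flush_range is non-NULL, the caller batches TLB invalidation for mappings of
 * flush_range->ptfr_pmap that fall within [ptfr_start, ptfr_end), so this
 * function only issues TLBIs itself for mappings outside that range or when
 * mappings are being removed outright.
 */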
4684 MARK_AS_PMAP_TEXT static void
4685 pmap_page_protect_options_with_flush_range(
4686 ppnum_t ppnum,
4687 vm_prot_t prot,
4688 unsigned int options,
4689 pmap_tlb_flush_range_t *flush_range)
4690 {
4691 pmap_paddr_t phys = ptoa(ppnum);
4692 pv_entry_t **pv_h;
4693 pv_entry_t *pve_p, *orig_pve_p;
4694 pv_entry_t *pveh_p;
4695 pv_entry_t *pvet_p;
4696 pt_entry_t *pte_p, *orig_pte_p;
4697 pv_entry_t *new_pve_p;
4698 pt_entry_t *new_pte_p;
4699 vm_offset_t pvh_flags;
4700 unsigned int pai;
4701 bool remove;
4702 bool set_NX;
4703 unsigned int pvh_cnt = 0;
4704 unsigned int pass1_updated = 0;
4705 unsigned int pass2_updated = 0;
4706
4707 assert(ppnum != vm_page_fictitious_addr);
4708
4709 /* Only work with managed pages. */
4710 if (!pa_valid(phys)) {
4711 return;
4712 }
4713
4714 /*
4715 * Determine the new protection.
4716 */
4717 switch (prot) {
4718 case VM_PROT_ALL:
4719 return; /* nothing to do */
4720 case VM_PROT_READ:
4721 case VM_PROT_READ | VM_PROT_EXECUTE:
4722 remove = false;
4723 break;
4724 default:
4725 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4726 options = options & ~PMAP_OPTIONS_NOFLUSH;
4727 remove = true;
4728 break;
4729 }
4730
4731 pmap_cpu_data_t *pmap_cpu_data = NULL;
4732 if (remove) {
4733 #if !XNU_MONITOR
4734 mp_disable_preemption();
4735 #endif
4736 pmap_cpu_data = pmap_get_cpu_data();
4737 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4738 /*
4739 * Ensure the store to inflight_disconnect will be observed before any of the
4740 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4741 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4742 * another CPU, in between this function's clearing a PTE and dropping the
4743 * corresponding pagetable refcount. That can lead to a panic if the
4744 * destroying thread observes a non-zero refcount. For this we need a store-
4745 * store barrier; a store-release operation would not be sufficient.
4746 */
4747 os_atomic_thread_fence(release);
4748 }
4749
4750 pai = pa_index(phys);
4751 pvh_lock(pai);
4752 pv_h = pai_to_pvh(pai);
4753 pvh_flags = pvh_get_flags(pv_h);
4754
4755 #if XNU_MONITOR
4756 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4757 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4758 }
4759 if (__improbable(ppattr_pa_test_monitor(phys))) {
4760 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4761 }
4762 #endif
4763
4764 orig_pte_p = pte_p = PT_ENTRY_NULL;
4765 orig_pve_p = pve_p = PV_ENTRY_NULL;
4766 pveh_p = PV_ENTRY_NULL;
4767 pvet_p = PV_ENTRY_NULL;
4768 new_pve_p = PV_ENTRY_NULL;
4769 new_pte_p = PT_ENTRY_NULL;
4770
4771
4772 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4773 orig_pte_p = pte_p = pvh_ptep(pv_h);
4774 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4775 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4776 pveh_p = pve_p;
4777 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4778 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4779 }
4780
4781 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4782 int pve_ptep_idx = 0;
4783
4784 /*
4785 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4786 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4787 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4788 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4789 * operation, TLB invalidation may be handled by the caller so it's possible for
4790 * tlb_flush_needed to be true while issue_tlbi is false.
4791 */
4792 bool issue_tlbi = false;
4793 bool tlb_flush_needed = false;
4794 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4795 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4796 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4797 bool update = false;
4798
4799 if (pve_p != PV_ENTRY_NULL) {
4800 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4801 if (pte_p == PT_ENTRY_NULL) {
4802 goto protect_skip_pve_pass1;
4803 }
4804 }
4805
4806 #ifdef PVH_FLAG_IOMMU
4807 if (pvh_ptep_is_iommu(pte_p)) {
4808 #if XNU_MONITOR
4809 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4810 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4811 ppnum, ptep_get_iommu(pte_p), pve_p);
4812 }
4813 #endif
4814 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4815 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4816 ppnum, ptep_get_iommu(pte_p), pve_p);
4817 }
4818 goto protect_skip_pve_pass1;
4819 }
4820 #endif
4821 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4822 const pmap_t pmap = ptdp->pmap;
4823 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4824
4825 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4826 #if MACH_ASSERT
4827 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4828 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4829 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4830 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4831
4832 pv_entry_t *check_pvep = pve_p;
4833
4834 do {
4835 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4836 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4837 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4838 }
4839 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4840
4841 /* Restore previous PTEP value. */
4842 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4843 }
4844 #endif
4845 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4846 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4847 }
4848
4849 #if DEVELOPMENT || DEBUG
4850 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4851 #else
4852 if ((prot & VM_PROT_EXECUTE))
4853 #endif
4854 {
4855 set_NX = false;
4856 } else {
4857 set_NX = true;
4858 }
4859
4860 /* Remove the mapping if new protection is NONE */
4861 if (remove) {
4862 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4863 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4864 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4865 pt_entry_t spte = *pte_p;
4866
4867 if (pte_is_wired(spte)) {
4868 pte_set_wired(pmap, pte_p, 0);
4869 spte = *pte_p;
4870 if (pmap != kernel_pmap) {
4871 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4872 }
4873 }
4874
4875 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4876 (uint64_t)spte, pte_p, ppnum);
4877
4878 if (compress && is_internal && (pmap != kernel_pmap)) {
4879 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4880 /* mark this PTE as having been "compressed" */
4881 tmplate = ARM_PTE_COMPRESSED;
4882 if (is_altacct) {
4883 tmplate |= ARM_PTE_COMPRESSED_ALT;
4884 }
4885 } else {
4886 tmplate = ARM_PTE_TYPE_FAULT;
4887 }
4888
4889 assert(spte != tmplate);
4890 write_pte_fast(pte_p, tmplate);
4891 update = true;
4892 ++pass1_updated;
4893
4894 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4895
4896 if (pmap != kernel_pmap) {
4897 if (ppattr_test_reusable(pai) &&
4898 is_internal &&
4899 !is_altacct) {
4900 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4901 } else if (!is_internal) {
4902 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4903 }
4904
4905 if (is_altacct) {
4906 assert(is_internal);
4907 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4908 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4909 if (options & PMAP_OPTIONS_COMPRESSOR) {
4910 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4911 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4912 }
4913 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4914 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4915 } else if (ppattr_test_reusable(pai)) {
4916 assert(is_internal);
4917 if (options & PMAP_OPTIONS_COMPRESSOR) {
4918 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4919 /* was not in footprint, but is now */
4920 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4921 }
4922 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4923 } else if (is_internal) {
4924 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4925
4926 /*
4927 * Update all stats related to physical footprint, which only
4928 * deals with internal pages.
4929 */
4930 if (options & PMAP_OPTIONS_COMPRESSOR) {
4931 /*
4932 * This removal is only being done so we can send this page to
4933 * the compressor; therefore it mustn't affect total task footprint.
4934 */
4935 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4936 } else {
4937 /*
4938 * This internal page isn't going to the compressor, so adjust stats to keep
4939 * phys_footprint up to date.
4940 */
4941 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4942 }
4943 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4944 } else {
4945 /* external page: no impact on ledgers */
4946 }
4947 }
4948 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4949 } else {
4950 pt_entry_t spte = *pte_p;
4951 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4952
4953 if (pmap == kernel_pmap) {
4954 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4955 } else {
4956 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4957 }
4958
4959 /*
4960 * While the naive implementation of this would serve to add execute
4961 * permission, this is not how the VM uses this interface, or how
4962 * x86_64 implements it. So ignore requests to add execute permissions.
4963 */
4964 if (set_NX) {
4965 tmplate |= pt_attr_leaf_xn(pt_attr);
4966 }
4967
4968
4969 assert(spte != ARM_PTE_TYPE_FAULT);
4970 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4971
4972 if (spte != tmplate) {
4973 /*
4974 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4975 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4976 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4977 * should always be cleared by this function.
4978 */
4979 pte_set_was_writeable(tmplate, true);
4980 write_pte_fast(pte_p, tmplate);
4981 update = true;
4982 ++pass1_updated;
4983 } else if (pte_was_writeable(tmplate)) {
4984 /*
4985 * We didn't change any of the relevant permission bits in the PTE, so we don't need
4986 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
4987 * write access to a page, this function should always at least clear that flag for
4988 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
4989 * these mappings go through vm_fault(). We therefore don't want those accesses to
4990 * be handled through arm_fast_fault().
4991 */
4992 pte_set_was_writeable(tmplate, false);
4993 write_pte_fast(pte_p, tmplate);
4994 }
4995 }
4996
4997 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
4998 tlb_flush_needed = true;
4999 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5000 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5001 issue_tlbi = true;
5002 }
5003 }
5004 protect_skip_pve_pass1:
5005 pte_p = PT_ENTRY_NULL;
5006 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5007 pve_ptep_idx = 0;
5008 pve_p = pve_next(pve_p);
5009 }
5010 }
5011
5012 if (tlb_flush_needed) {
5013 FLUSH_PTE_STRONG();
5014 }
5015
5016 if (!remove && !issue_tlbi) {
5017 goto protect_finish;
5018 }
5019
5020 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5021 pv_entry_t **pve_pp = pv_h;
5022 pve_p = orig_pve_p;
5023 pte_p = orig_pte_p;
5024 pve_ptep_idx = 0;
5025
5026 /*
5027 * We need to keep track of whether a particular PVE list contains IOMMU
5028 * mappings when removing entries, because we should only remove CPU
5029 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5030 * it around.
5031 */
5032 bool iommu_mapping_in_pve = false;
5033 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5034 if (pve_p != PV_ENTRY_NULL) {
5035 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5036 if (pte_p == PT_ENTRY_NULL) {
5037 goto protect_skip_pve_pass2;
5038 }
5039 }
5040
5041 #ifdef PVH_FLAG_IOMMU
5042 if (pvh_ptep_is_iommu(pte_p)) {
5043 iommu_mapping_in_pve = true;
5044 if (remove && (pve_p == PV_ENTRY_NULL)) {
5045 /*
5046 * We've found an IOMMU entry and it's the only entry in the PV list.
5047 * We don't discard IOMMU entries, so simply set up the new PV list to
5048 * contain the single IOMMU PTE and exit the loop.
5049 */
5050 new_pte_p = pte_p;
5051 break;
5052 }
5053 goto protect_skip_pve_pass2;
5054 }
5055 #endif
5056 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5057 const pmap_t pmap = ptdp->pmap;
5058 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5059
5060 if (remove) {
5061 if (!compress && (pmap != kernel_pmap)) {
5062 /*
5063 * We must wait to decrement the refcount until we're completely finished using the PTE
5064 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5065 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5066 * under us.
5067 */
5068 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5069 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5070 }
5071 }
5072 /* Remove this CPU mapping from PVE list. */
5073 if (pve_p != PV_ENTRY_NULL) {
5074 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5075 }
5076 } else {
5077 pt_entry_t spte = *pte_p;
5078 if (pte_was_writeable(spte)) {
5079 pte_set_was_writeable(spte, false);
5080 write_pte_fast(pte_p, spte);
5081 } else {
5082 goto protect_skip_pve_pass2;
5083 }
5084 }
5085 ++pass2_updated;
5086 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5087 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5088 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5089 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
5090 }
5091
5092 protect_skip_pve_pass2:
5093 pte_p = PT_ENTRY_NULL;
5094 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5095 pve_ptep_idx = 0;
5096
5097 if (remove) {
5098 /**
5099 * If there are any IOMMU mappings in the PVE list, preserve
5100 * those mappings in a new PVE list (new_pve_p) which will later
5101 * become the new PVH entry. Keep track of the CPU mappings in
5102 * pveh_p/pvet_p so they can be deallocated later.
5103 */
5104 if (iommu_mapping_in_pve) {
5105 iommu_mapping_in_pve = false;
5106 pv_entry_t *temp_pve_p = pve_next(pve_p);
5107 pve_remove(pv_h, pve_pp, pve_p);
5108 pveh_p = pvh_pve_list(pv_h);
5109 pve_p->pve_next = new_pve_p;
5110 new_pve_p = pve_p;
5111 pve_p = temp_pve_p;
5112 continue;
5113 } else {
5114 pvet_p = pve_p;
5115 pvh_cnt++;
5116 }
5117 }
5118
5119 pve_pp = pve_next_ptr(pve_p);
5120 pve_p = pve_next(pve_p);
5121 iommu_mapping_in_pve = false;
5122 }
5123 }
5124
5125 protect_finish:
5126
5127 #ifdef PVH_FLAG_EXEC
5128 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5129 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5130 }
5131 #endif
5132 if (__improbable(pass1_updated != pass2_updated)) {
5133 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5134 __func__, pass1_updated, pass2_updated);
5135 }
5136 /* if we removed a bunch of entries, take care of them now */
5137 if (remove) {
5138 if (new_pve_p != PV_ENTRY_NULL) {
5139 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5140 pvh_set_flags(pv_h, pvh_flags);
5141 } else if (new_pte_p != PT_ENTRY_NULL) {
5142 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5143 pvh_set_flags(pv_h, pvh_flags);
5144 } else {
5145 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5146 }
5147 }
5148
5149 if (flush_range && tlb_flush_needed) {
5150 if (!remove) {
5151 flush_range->ptfr_flush_needed = true;
5152 tlb_flush_needed = false;
5153 }
5154 }
5155
5156 /*
5157 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5158 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5159 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5160 * a page to be repurposed while it is still live in the TLBs.
5161 */
5162 if (remove && tlb_flush_needed) {
5163 sync_tlb_flush();
5164 }
5165
5166 pvh_unlock(pai);
5167
5168 if (remove) {
5169 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5170 #if !XNU_MONITOR
5171 mp_enable_preemption();
5172 #endif
5173 }
5174
5175 if (!remove && tlb_flush_needed) {
5176 sync_tlb_flush();
5177 }
5178
5179 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5180 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5181 }
5182 }
5183
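/*
 * Lower the protections on all mappings of the given physical page. This is the
 * implementation invoked by pmap_page_protect_options(); it forces the TLB flush
 * whenever the VM layer passes a non-NULL argument and then delegates to
 * pmap_page_protect_options_with_flush_range() with no flush range.
 */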
5184 MARK_AS_PMAP_TEXT void
5185 pmap_page_protect_options_internal(
5186 ppnum_t ppnum,
5187 vm_prot_t prot,
5188 unsigned int options,
5189 void *arg)
5190 {
5191 if (arg != NULL) {
5192 /*
5193 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5194 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5195 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5196 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5197 * In that case, force the flush to take place.
5198 */
5199 options &= ~PMAP_OPTIONS_NOFLUSH;
5200 }
5201 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5202 }
5203
5204 void
5205 pmap_page_protect_options(
5206 ppnum_t ppnum,
5207 vm_prot_t prot,
5208 unsigned int options,
5209 void *arg)
5210 {
5211 pmap_paddr_t phys = ptoa(ppnum);
5212
5213 assert(ppnum != vm_page_fictitious_addr);
5214
5215 /* Only work with managed pages. */
5216 if (!pa_valid(phys)) {
5217 return;
5218 }
5219
5220 /*
5221 * Determine the new protection.
5222 */
5223 if (prot == VM_PROT_ALL) {
5224 return; /* nothing to do */
5225 }
5226
5227 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5228
5229 #if XNU_MONITOR
5230 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5231 #else
5232 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5233 #endif
5234
5235 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5236 }
5237
5238
5239 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
5240 MARK_AS_PMAP_TEXT void
5241 pmap_disable_user_jop_internal(pmap_t pmap)
5242 {
5243 if (pmap == kernel_pmap) {
5244 panic("%s: called with kernel_pmap", __func__);
5245 }
5246 validate_pmap_mutable(pmap);
5247 pmap->disable_jop = true;
5248 }
5249
5250 void
5251 pmap_disable_user_jop(pmap_t pmap)
5252 {
5253 #if XNU_MONITOR
5254 pmap_disable_user_jop_ppl(pmap);
5255 #else
5256 pmap_disable_user_jop_internal(pmap);
5257 #endif
5258 }
5259 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5260
5261 /*
5262 * Indicates if the pmap layer enforces some additional restrictions on the
5263 * given set of protections.
5264 */
5265 bool
5266 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5267 {
5268 return false;
5269 }
5270
5271 /*
5272 * Set the physical protection on the
5273 * specified range of this map as requested.
5274 * VERY IMPORTANT: Will not increase permissions.
5275 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5276 */
5277 void
5278 pmap_protect(
5279 pmap_t pmap,
5280 vm_map_address_t b,
5281 vm_map_address_t e,
5282 vm_prot_t prot)
5283 {
5284 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5285 }
5286
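/*
 * Downgrade the protection on a VA range that lies within a single leaf page
 * table (callers split larger ranges at twig boundaries). Returns the VA at
 * which processing stopped, which may be before 'end' if a pending preemption
 * caused an early exit.
 */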
5287 MARK_AS_PMAP_TEXT vm_map_address_t
5288 pmap_protect_options_internal(
5289 pmap_t pmap,
5290 vm_map_address_t start,
5291 vm_map_address_t end,
5292 vm_prot_t prot,
5293 unsigned int options,
5294 __unused void *args)
5295 {
5296 tt_entry_t *tte_p;
5297 pt_entry_t *bpte_p, *epte_p;
5298 pt_entry_t *pte_p;
5299 boolean_t set_NX = TRUE;
5300 #if (__ARM_VMSA__ > 7)
5301 boolean_t set_XO = FALSE;
5302 #endif
5303 boolean_t should_have_removed = FALSE;
5304 bool need_strong_sync = false;
5305
5306 /* Validate the pmap input before accessing its data. */
5307 validate_pmap_mutable(pmap);
5308
5309 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5310
5311 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5312 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5313 }
5314
5315 #if DEVELOPMENT || DEBUG
5316 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5317 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5318 should_have_removed = TRUE;
5319 }
5320 } else
5321 #endif
5322 {
5323 /* Determine the new protection. */
5324 switch (prot) {
5325 #if (__ARM_VMSA__ > 7)
5326 case VM_PROT_EXECUTE:
5327 set_XO = TRUE;
5328 OS_FALLTHROUGH;
5329 #endif
5330 case VM_PROT_READ:
5331 case VM_PROT_READ | VM_PROT_EXECUTE:
5332 break;
5333 case VM_PROT_READ | VM_PROT_WRITE:
5334 case VM_PROT_ALL:
5335 return end; /* nothing to do */
5336 default:
5337 should_have_removed = TRUE;
5338 }
5339 }
5340
5341 if (should_have_removed) {
5342 panic("%s: should have been a remove operation, "
5343 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5344 __FUNCTION__,
5345 pmap, (void *)start, (void *)end, prot, options, args);
5346 }
5347
5348 #if DEVELOPMENT || DEBUG
5349 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5350 #else
5351 if ((prot & VM_PROT_EXECUTE))
5352 #endif
5353 {
5354 set_NX = FALSE;
5355 } else {
5356 set_NX = TRUE;
5357 }
5358
5359 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5360 vm_map_address_t va = start;
5361 unsigned int npages = 0;
5362
5363 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5364
5365 tte_p = pmap_tte(pmap, start);
5366
5367 if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5368 bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5369 bpte_p = &bpte_p[pte_index(pt_attr, start)];
5370 epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5371 pte_p = bpte_p;
5372
5373 for (pte_p = bpte_p;
5374 pte_p < epte_p;
5375 pte_p += PAGE_RATIO, va += pmap_page_size) {
5376 ++npages;
5377 if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5378 pmap_pending_preemption())) {
5379 break;
5380 }
5381 pt_entry_t spte;
5382 #if DEVELOPMENT || DEBUG
5383 boolean_t force_write = FALSE;
5384 #endif
5385
5386 spte = *((volatile pt_entry_t*)pte_p);
5387
5388 if ((spte == ARM_PTE_TYPE_FAULT) ||
5389 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5390 continue;
5391 }
5392
5393 pmap_paddr_t pa;
5394 unsigned int pai = 0;
5395 boolean_t managed = FALSE;
5396
5397 while (!managed) {
5398 /*
5399 * It may be possible for the pte to transition from managed
5400 * to unmanaged in this timeframe; for now, elide the assert.
5401 * We should break out as a consequence of checking pa_valid.
5402 */
5403 // assert(!ARM_PTE_IS_COMPRESSED(spte));
5404 pa = pte_to_pa(spte);
5405 if (!pa_valid(pa)) {
5406 break;
5407 }
5408 pai = pa_index(pa);
5409 pvh_lock(pai);
5410 spte = *((volatile pt_entry_t*)pte_p);
5411 pa = pte_to_pa(spte);
5412 if (pai == pa_index(pa)) {
5413 managed = TRUE;
5414 break; // Leave the PVH locked as we will unlock it after we free the PTE
5415 }
5416 pvh_unlock(pai);
5417 }
5418
5419 if ((spte == ARM_PTE_TYPE_FAULT) ||
5420 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5421 continue;
5422 }
5423
5424 pt_entry_t tmplate;
5425
5426 if (pmap == kernel_pmap) {
5427 #if DEVELOPMENT || DEBUG
5428 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5429 force_write = TRUE;
5430 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5431 } else
5432 #endif
5433 {
5434 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5435 }
5436 } else {
5437 #if DEVELOPMENT || DEBUG
5438 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5439 assert(pmap->type != PMAP_TYPE_NESTED);
5440 force_write = TRUE;
5441 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5442 } else
5443 #endif
5444 {
5445 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5446 }
5447 }
5448
5449 /*
5450 * XXX Removing "NX" would
5451 * grant "execute" access
5452 * immediately, bypassing any
5453 * checks VM might want to do
5454 * in its soft fault path.
5455 * pmap_protect() and co. are
5456 * not allowed to increase
5457 * access permissions.
5458 */
5459 if (set_NX) {
5460 tmplate |= pt_attr_leaf_xn(pt_attr);
5461 } else {
5462 #if (__ARM_VMSA__ > 7)
5463 if (pmap == kernel_pmap) {
5464 /* do NOT clear "PNX"! */
5465 tmplate |= ARM_PTE_NX;
5466 } else {
5467 /* do NOT clear "NX"! */
5468 tmplate |= pt_attr_leaf_x(pt_attr);
5469 if (set_XO) {
5470 tmplate &= ~ARM_PTE_APMASK;
5471 tmplate |= pt_attr_leaf_rona(pt_attr);
5472 }
5473 }
5474 #endif
5475 }
5476
5477 #if DEVELOPMENT || DEBUG
5478 if (force_write) {
5479 /*
5480 * TODO: Run CS/Monitor checks here.
5481 */
5482 if (managed) {
5483 /*
5484 * We are marking the page as writable,
5485 * so we consider it to be modified and
5486 * referenced.
5487 */
5488 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5489 tmplate |= ARM_PTE_AF;
5490
5491 if (ppattr_test_reffault(pai)) {
5492 ppattr_clear_reffault(pai);
5493 }
5494
5495 if (ppattr_test_modfault(pai)) {
5496 ppattr_clear_modfault(pai);
5497 }
5498 }
5499 } else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5500 /*
5501 * An immediate request for anything other than
5502 * write should still mark the page as
5503 * referenced if managed.
5504 */
5505 if (managed) {
5506 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5507 tmplate |= ARM_PTE_AF;
5508
5509 if (ppattr_test_reffault(pai)) {
5510 ppattr_clear_reffault(pai);
5511 }
5512 }
5513 }
5514 #endif
5515
5516 /* We do not expect to write fast fault the entry. */
5517 pte_set_was_writeable(tmplate, false);
5518
5519 write_pte_fast(pte_p, tmplate);
5520
5521 if (managed) {
5522 pvh_assert_locked(pai);
5523 pvh_unlock(pai);
5524 }
5525 }
5526 FLUSH_PTE_STRONG();
5527 PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5528 } else {
5529 va = end;
5530 }
5531
5532 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5533 return va;
5534 }
5535
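/*
 * Downgrade the protection on the given VA range, carving the range into chunks
 * at translation-table twig boundaries and handing each chunk to
 * pmap_protect_options_internal() (via the PPL on XNU_MONITOR configurations).
 */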
5536 void
5537 pmap_protect_options(
5538 pmap_t pmap,
5539 vm_map_address_t b,
5540 vm_map_address_t e,
5541 vm_prot_t prot,
5542 unsigned int options,
5543 __unused void *args)
5544 {
5545 vm_map_address_t l, beg;
5546
5547 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5548
5549 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5550 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5551 pmap, (uint64_t)b, (uint64_t)e);
5552 }
5553
5554 /*
5555 * We allow single-page requests to execute non-preemptibly,
5556 * as it doesn't make sense to sample AST_URGENT for a single-page
5557 * operation, and there are a couple of special use cases that
5558 * require a non-preemptible single-page operation.
5559 */
5560 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5561 pmap_verify_preemptible();
5562 }
5563
5564 #if DEVELOPMENT || DEBUG
5565 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5566 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5567 pmap_remove_options(pmap, b, e, options);
5568 return;
5569 }
5570 } else
5571 #endif
5572 {
5573 /* Determine the new protection. */
5574 switch (prot) {
5575 case VM_PROT_EXECUTE:
5576 case VM_PROT_READ:
5577 case VM_PROT_READ | VM_PROT_EXECUTE:
5578 break;
5579 case VM_PROT_READ | VM_PROT_WRITE:
5580 case VM_PROT_ALL:
5581 return; /* nothing to do */
5582 default:
5583 pmap_remove_options(pmap, b, e, options);
5584 return;
5585 }
5586 }
5587
5588 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5589 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5590 VM_KERNEL_ADDRHIDE(e));
5591
5592 beg = b;
5593
5594 while (beg < e) {
5595 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5596
5597 if (l > e) {
5598 l = e;
5599 }
5600
5601 #if XNU_MONITOR
5602 beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5603 #else
5604 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5605 #endif
5606 }
5607
5608 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5609 }
5610
5611 /**
5612 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5613 *
5614 * @param pmap pmap to insert the pages into.
5615 * @param va virtual address to map the pages into.
5616 * @param pa page number of the first physical page to map.
5617 * @param size block size, in number of pages.
5618 * @param prot mapping protection attributes.
5619 * @param attr flags to pass to pmap_enter().
5620 *
5621 * @return KERN_SUCCESS.
5622 */
5623 kern_return_t
5624 pmap_map_block(
5625 pmap_t pmap,
5626 addr64_t va,
5627 ppnum_t pa,
5628 uint32_t size,
5629 vm_prot_t prot,
5630 int attr,
5631 unsigned int flags)
5632 {
5633 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5634 }
5635
5636 /**
5637 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5638 * As opposed to pmap_map_block(), this function takes
5639 * a physical address as an input and operates using the
5640 * page size associated with the input pmap.
5641 *
5642 * @param pmap pmap to insert the pages into.
5643 * @param va virtual address to map the pages into.
5644 * @param pa physical address of the first physical page to map.
5645 * @param size block size, in number of pages.
5646 * @param prot mapping protection attributes.
5647 * @param attr flags to pass to pmap_enter().
5648 *
5649 * @return KERN_SUCCESS.
5650 */
5651 kern_return_t
5652 pmap_map_block_addr(
5653 pmap_t pmap,
5654 addr64_t va,
5655 pmap_paddr_t pa,
5656 uint32_t size,
5657 vm_prot_t prot,
5658 int attr,
5659 unsigned int flags)
5660 {
5661 #if __ARM_MIXED_PAGE_SIZE__
5662 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5663 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5664 #else
5665 const uint64_t pmap_page_size = PAGE_SIZE;
5666 #endif
5667
5668 for (ppnum_t page = 0; page < size; page++) {
5669 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5670 panic("%s: failed pmap_enter_addr, "
5671 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5672 __FUNCTION__,
5673 pmap, va, (uint64_t)pa, size, prot, flags);
5674 }
5675
5676 va += pmap_page_size;
5677 pa += pmap_page_size;
5678 }
5679
5680 return KERN_SUCCESS;
5681 }
5682
5683 kern_return_t
5684 pmap_enter_addr(
5685 pmap_t pmap,
5686 vm_map_address_t v,
5687 pmap_paddr_t pa,
5688 vm_prot_t prot,
5689 vm_prot_t fault_type,
5690 unsigned int flags,
5691 boolean_t wired)
5692 {
5693 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5694 }
5695
5696 /*
5697 * Insert the given physical page (p) at
5698 * the specified virtual address (v) in the
5699 * target physical map with the protection requested.
5700 *
5701 * If specified, the page will be wired down, meaning
5702 * that the related pte can not be reclaimed.
5703 *
5704 * NB: This is the only routine which MAY NOT lazy-evaluate
5705 * or lose information. That is, this routine must actually
5706 * insert this page into the given map eventually (it must make
5707 * forward progress).
5708 */
5709 kern_return_t
5710 pmap_enter(
5711 pmap_t pmap,
5712 vm_map_address_t v,
5713 ppnum_t pn,
5714 vm_prot_t prot,
5715 vm_prot_t fault_type,
5716 unsigned int flags,
5717 boolean_t wired)
5718 {
5719 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5720 }
5721
5722 /*
5723 * Attempt to commit the pte.
5724 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5725 * Performs no page table or accounting writes on failures.
5726 */
5727 static inline bool
5728 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5729 {
5730 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5731 bool success = false, changed_wiring = false;
5732
5733 __unreachable_ok_push
5734 if (TEST_PAGE_RATIO_4) {
5735 /*
5736 * 16K virtual pages w/ 4K hw pages.
5737 * We actually need to update 4 ptes here which can't easily be done atomically.
5738 * As a result we require the exclusive pmap lock.
5739 */
5740 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5741 *old_pte = *pte_p;
5742 if (*old_pte == new_pte) {
5743 /* Another thread completed this operation. Nothing to do here. */
5744 success = true;
5745 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5746 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5747 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5748 success = false;
5749 } else {
5750 write_pte_fast(pte_p, new_pte);
5751 success = true;
5752 }
5753 } else {
5754 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5755 }
5756 __unreachable_ok_pop
5757
5758 if (success && *old_pte != new_pte) {
5759 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5760 FLUSH_PTE_STRONG();
5761 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5762 } else {
5763 FLUSH_PTE();
5764 __builtin_arm_isb(ISB_SY);
5765 }
5766 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5767 (new_pte & ARM_PTE_WIRED) != 0 :
5768 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5769
5770 if (pmap != kernel_pmap && changed_wiring) {
5771 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5772 if (new_pte & ARM_PTE_WIRED) {
5773 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5774 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5775 } else {
5776 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5777 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5778 }
5779 }
5780
5781 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5782 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5783 }
5784 return success;
5785 }
5786
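/*
 * Translate VM_WIMG_* cacheability/ordering attributes into the corresponding
 * PTE memory-attribute index, shareability and, for device-type memory,
 * execute-never bits. The physical address is used to distinguish DRAM from
 * device regions for VM_WIMG_IO.
 */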
5787 MARK_AS_PMAP_TEXT static pt_entry_t
5788 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5789 {
5790 pt_entry_t pte;
5791
5792 switch (wimg & (VM_WIMG_MASK)) {
5793 case VM_WIMG_IO:
5794 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5795 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5796 // AP, while preserving the security benefits of using device
5797 // mapping against side-channel attacks. On pre-H14 platforms,
5798 // the accesses will still be strongly ordered.
5799 if (is_dram_addr(pa)) {
5800 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5801 } else {
5802 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5803 }
5804 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5805 break;
5806 case VM_WIMG_RT:
5807 #if HAS_UCNORMAL_MEM
5808 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5809 #else
5810 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5811 #endif
5812 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5813 break;
5814 case VM_WIMG_POSTED:
5815 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5816 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5817 break;
5818 case VM_WIMG_POSTED_REORDERED:
5819 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5820 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5821 break;
5822 case VM_WIMG_POSTED_COMBINED_REORDERED:
5823 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5824 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5825 break;
5826 case VM_WIMG_WCOMB:
5827 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5828 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5829 break;
5830 case VM_WIMG_WTHRU:
5831 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5832 #if (__ARM_VMSA__ > 7)
5833 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5834 #else
5835 pte |= ARM_PTE_SH;
5836 #endif
5837 break;
5838 case VM_WIMG_COPYBACK:
5839 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5840 #if (__ARM_VMSA__ > 7)
5841 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5842 #else
5843 pte |= ARM_PTE_SH;
5844 #endif
5845 break;
5846 case VM_WIMG_INNERWBACK:
5847 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5848 #if (__ARM_VMSA__ > 7)
5849 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5850 #else
5851 pte |= ARM_PTE_SH;
5852 #endif
5853 break;
5854 default:
5855 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5856 #if (__ARM_VMSA__ > 7)
5857 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5858 #else
5859 pte |= ARM_PTE_SH;
5860 #endif
5861 }
5862
5863 return pte;
5864 }
5865
5866
5867 /*
5868 * Construct a PTE (and the physical page attributes) for the given virtual to
5869 * physical mapping.
5870 *
5871 * This function has no side effects, so it is safe to call while attempting a
5872 * pmap_enter transaction.
5873 */
5874 MARK_AS_PMAP_TEXT static pt_entry_t
5875 pmap_construct_pte(
5876 const pmap_t pmap,
5877 vm_map_address_t va,
5878 pmap_paddr_t pa,
5879 vm_prot_t prot,
5880 vm_prot_t fault_type,
5881 boolean_t wired,
5882 const pt_attr_t* const pt_attr,
5883 uint16_t *pp_attr_bits /* OUTPUT */
5884 )
5885 {
5886 bool set_NX = false, set_XO = false;
5887 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5888 assert(pp_attr_bits != NULL);
5889 *pp_attr_bits = 0;
5890
5891 if (wired) {
5892 pte |= ARM_PTE_WIRED;
5893 }
5894
5895 #if DEVELOPMENT || DEBUG
5896 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5897 #else
5898 if ((prot & VM_PROT_EXECUTE))
5899 #endif
5900 {
5901 set_NX = false;
5902 } else {
5903 set_NX = true;
5904 }
5905
5906 #if (__ARM_VMSA__ > 7)
5907 if (prot == VM_PROT_EXECUTE) {
5908 set_XO = true;
5909 }
5910 #endif
5911
5912 if (set_NX) {
5913 pte |= pt_attr_leaf_xn(pt_attr);
5914 } else {
5915 #if (__ARM_VMSA__ > 7)
5916 if (pmap == kernel_pmap) {
5917 pte |= ARM_PTE_NX;
5918 } else {
5919 pte |= pt_attr_leaf_x(pt_attr);
5920 }
5921 #endif
5922 }
5923
5924 if (pmap == kernel_pmap) {
5925 #if __ARM_KERNEL_PROTECT__
5926 pte |= ARM_PTE_NG;
5927 #endif /* __ARM_KERNEL_PROTECT__ */
5928 if (prot & VM_PROT_WRITE) {
5929 pte |= ARM_PTE_AP(AP_RWNA);
5930 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5931 } else {
5932 pte |= ARM_PTE_AP(AP_RONA);
5933 *pp_attr_bits |= PP_ATTR_REFERENCED;
5934 }
5935 #if (__ARM_VMSA__ == 7)
5936 if ((_COMM_PAGE_BASE_ADDRESS <= va) && (va < _COMM_PAGE_BASE_ADDRESS + _COMM_PAGE_AREA_LENGTH)) {
5937 pte = (pte & ~(ARM_PTE_APMASK)) | ARM_PTE_AP(AP_RORO);
5938 }
5939 #endif
5940 } else {
5941 if (pmap->type != PMAP_TYPE_NESTED) {
5942 pte |= ARM_PTE_NG;
5943 } else if ((pmap->nested_region_asid_bitmap)
5944 && (va >= pmap->nested_region_addr)
5945 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5946 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5947
5948 if ((pmap->nested_region_asid_bitmap)
5949 && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5950 pte |= ARM_PTE_NG;
5951 }
5952 }
5953 if (prot & VM_PROT_WRITE) {
5954 assert(pmap->type != PMAP_TYPE_NESTED);
5955 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5956 if (fault_type & VM_PROT_WRITE) {
5957 if (set_XO) {
5958 pte |= pt_attr_leaf_rwna(pt_attr);
5959 } else {
5960 pte |= pt_attr_leaf_rw(pt_attr);
5961 }
5962 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5963 } else {
5964 if (set_XO) {
5965 pte |= pt_attr_leaf_rona(pt_attr);
5966 } else {
5967 pte |= pt_attr_leaf_ro(pt_attr);
5968 }
5969 /*
5970 * Mark the page as MODFAULT so that a subsequent write
5971 * may be handled through arm_fast_fault().
5972 */
5973 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5974 pte_set_was_writeable(pte, true);
5975 }
5976 } else {
5977 if (set_XO) {
5978 pte |= pt_attr_leaf_rwna(pt_attr);
5979 } else {
5980 pte |= pt_attr_leaf_rw(pt_attr);
5981 }
5982 *pp_attr_bits |= PP_ATTR_REFERENCED;
5983 }
5984 } else {
5985 if (set_XO) {
5986 pte |= pt_attr_leaf_rona(pt_attr);
5987 } else {
5988 pte |= pt_attr_leaf_ro(pt_attr);
5989 }
5990 *pp_attr_bits |= PP_ATTR_REFERENCED;
5991 }
5992 }
5993
5994 pte |= ARM_PTE_AF;
5995 return pte;
5996 }
5997
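/*
 * Create (or update) a mapping from the given VA to the given PA in the target
 * pmap, expanding the page table hierarchy as needed. The PTE is committed via
 * a cmpxchg-based transaction so that the pmap lock can usually be held in
 * shared mode; PV list entries, page attribute bits, and ledger accounting are
 * updated only once the commit succeeds. Returns KERN_RESOURCE_SHORTAGE if a
 * required PV entry cannot be allocated (e.g. with PMAP_OPTIONS_NOWAIT), and
 * KERN_FAILURE for an attempt to create an executable mapping of non-managed
 * memory.
 */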
5998 MARK_AS_PMAP_TEXT kern_return_t
5999 pmap_enter_options_internal(
6000 pmap_t pmap,
6001 vm_map_address_t v,
6002 pmap_paddr_t pa,
6003 vm_prot_t prot,
6004 vm_prot_t fault_type,
6005 unsigned int flags,
6006 boolean_t wired,
6007 unsigned int options)
6008 {
6009 ppnum_t pn = (ppnum_t)atop(pa);
6010 pt_entry_t pte;
6011 pt_entry_t spte;
6012 pt_entry_t *pte_p;
6013 bool refcnt_updated;
6014 bool wiredcnt_updated;
6015 bool ro_va = false;
6016 unsigned int wimg_bits;
6017 bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6018 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6019 kern_return_t kr = KERN_SUCCESS;
6020 uint16_t pp_attr_bits;
6021 volatile uint16_t *refcnt;
6022 volatile uint16_t *wiredcnt;
6023 pv_free_list_t *local_pv_free;
6024
6025 validate_pmap_mutable(pmap);
6026
6027 #if XNU_MONITOR
6028 if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6029 panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
6030 }
6031 #endif
6032
6033 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6034
6035 if ((v) & pt_attr_leaf_offmask(pt_attr)) {
6036 panic("pmap_enter_options() pmap %p v 0x%llx",
6037 pmap, (uint64_t)v);
6038 }
6039
6040 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6041 panic("pmap_enter_options() pmap %p pa 0x%llx",
6042 pmap, (uint64_t)pa);
6043 }
6044
6045 /* The PA should not extend beyond the architected physical address space */
6046 pa &= ARM_PTE_PAGE_MASK;
6047
6048 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6049 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6050 extern vm_offset_t ctrr_test_page;
6051 if (__probable(v != ctrr_test_page))
6052 #endif
6053 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6054 }
6055 if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6056 if (__improbable(prot != VM_PROT_READ)) {
6057 panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6058 __func__, (unsigned long long)v, prot);
6059 }
6060 ro_va = true;
6061 }
6062 assert(pn != vm_page_fictitious_addr);
6063
6064 refcnt_updated = false;
6065 wiredcnt_updated = false;
6066
6067 if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6068 /*
6069 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6070 *
6071 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6072 */
6073 lock_mode = PMAP_LOCK_EXCLUSIVE;
6074 }
6075 pmap_lock(pmap, lock_mode);
6076
6077 /*
6078 * Expand pmap to include this pte. Assume that
6079 * pmap is always expanded to include enough hardware
6080 * pages to map one VM page.
6081 */
6082 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6083 /* Must unlock to expand the pmap. */
6084 pmap_unlock(pmap, lock_mode);
6085
6086 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6087
6088 if (kr != KERN_SUCCESS) {
6089 return kr;
6090 }
6091
6092 pmap_lock(pmap, lock_mode);
6093 }
6094
6095 if (options & PMAP_OPTIONS_NOENTER) {
6096 pmap_unlock(pmap, lock_mode);
6097 return KERN_SUCCESS;
6098 }
6099
6100 /*
6101 * Since we may not hold the pmap lock exclusive, updating the pte is
6102 * done via a cmpxchg loop.
6103 * We need to be careful about modifying non-local data structures before committing
6104 * the new pte since we may need to re-do the transaction.
6105 */
6106 spte = os_atomic_load(pte_p, relaxed);
6107 while (!committed) {
6108 refcnt = NULL;
6109 wiredcnt = NULL;
6110 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6111 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6112
6113 if (pmap != kernel_pmap) {
6114 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6115 refcnt = &ptd_info->refcnt;
6116 wiredcnt = &ptd_info->wiredcnt;
6117 /*
6118 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6119 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6120 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6121 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6122 * have PTDs, so we can't use the check there.
6123 */
6124 if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6125 panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6126 __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6127 }
6128 /*
6129 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
6130 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate memory
6131 * or upgrade to holding the pmap lock exclusive.
6132 */
6133 if (!wiredcnt_updated) {
6134 OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6135 wiredcnt_updated = true;
6136 }
6137 if (!refcnt_updated) {
6138 OSAddAtomic16(1, (volatile int16_t*)refcnt);
6139 refcnt_updated = true;
6140 drop_refcnt = true;
6141 }
6142 }
6143
6144 if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6145 /*
6146 * There is already a mapping here & it's for a different physical page.
6147 * First remove that mapping.
6148 *
6149 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6150 */
6151 if (lock_mode == PMAP_LOCK_SHARED) {
6152 if (pmap_lock_shared_to_exclusive(pmap)) {
6153 lock_mode = PMAP_LOCK_EXCLUSIVE;
6154 } else {
6155 /*
6156 * We failed to upgrade to an exclusive lock.
6157 * As a result we no longer hold the lock at all,
6158 * so we need to re-acquire it and restart the transaction.
6159 */
6160 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6161 lock_mode = PMAP_LOCK_EXCLUSIVE;
6162 /* pmap might have changed after we dropped the lock. Try again. */
6163 spte = os_atomic_load(pte_p, relaxed);
6164 continue;
6165 }
6166 }
6167 pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6168 spte = ARM_PTE_TYPE_FAULT;
6169 assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6170 }
6171
6172 pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6173
6174 if (pa_valid(pa)) {
6175 unsigned int pai;
6176 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6177
6178 is_internal = FALSE;
6179 is_altacct = FALSE;
6180
6181 pai = pa_index(pa);
6182
6183 pvh_lock(pai);
6184
6185 /*
6186 * Make sure that the current per-cpu PV free list has
6187 * enough entries (2 in the worst-case scenario) to handle the pmap_enter_pv() call
6188 * if the transaction succeeds. We're either in the
6189 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6190 * Note that we can still be interrupted, but a primary
6191 * interrupt handler can never enter the pmap.
6192 */
6193 #if !XNU_MONITOR
6194 assert(get_preemption_level() > 0);
6195 #endif
6196 local_pv_free = &pmap_get_cpu_data()->pv_free;
6197 pv_entry_t **pv_h = pai_to_pvh(pai);
6198 const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6199 !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6200
6201 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6202 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6203 int new_allocated_pves = 0;
6204
6205 while (new_allocated_pves < 2) {
6206 local_pv_free = &pmap_get_cpu_data()->pv_free;
6207 pv_status = pv_alloc(pmap, pai, lock_mode, &new_pve_p[new_allocated_pves]);
6208 if (pv_status == PV_ALLOC_FAIL) {
6209 break;
6210 } else if (pv_status == PV_ALLOC_RETRY) {
6211 /*
6212 * In the case that pv_alloc() had to grab a new page of PVEs,
6213 * it will have dropped the pmap lock while doing so.
6214 * On non-PPL devices, dropping the lock re-enables preemption so we may
6215 * be on a different CPU now.
6216 */
6217 local_pv_free = &pmap_get_cpu_data()->pv_free;
6218 } else {
6219 /* If we've gotten this far then a node should've been allocated. */
6220 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6221
6222 new_allocated_pves++;
6223 }
6224 }
6225
6226 for (int i = 0; i < new_allocated_pves; i++) {
6227 pv_free(new_pve_p[i]);
6228 }
6229 }
6230
6231 if (pv_status == PV_ALLOC_FAIL) {
6232 pvh_unlock(pai);
6233 kr = KERN_RESOURCE_SHORTAGE;
6234 break;
6235 } else if (pv_status == PV_ALLOC_RETRY) {
6236 pvh_unlock(pai);
6237 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6238 spte = os_atomic_load(pte_p, relaxed);
6239 continue;
6240 }
6241
6242 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6243 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6244 } else {
6245 wimg_bits = pmap_cache_attributes(pn);
6246 }
6247
6248 /* We may be retrying this operation after dropping the PVH lock.
6249 * Cache attributes for the physical page may have changed while the lock
6250 * was dropped, so clear any cache attributes we may have previously set
6251 * in the PTE template. */
6252 pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6253 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6254
6255 #if XNU_MONITOR
6256 /* The regular old kernel is not allowed to remap PPL pages. */
6257 if (__improbable(ppattr_pa_test_monitor(pa))) {
6258 panic("%s: page belongs to PPL, "
6259 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6260 __FUNCTION__,
6261 pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6262 }
6263
6264 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6265 panic("%s: page locked down, "
6266 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6267 __FUNCTION__,
6268 pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6269 }
6270 #endif
6271
6272
6273 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6274 if (!committed) {
6275 pvh_unlock(pai);
6276 continue;
6277 }
6278 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6279 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6280
6281 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6282 /*
6283 * If there was already a valid pte here then we reuse its reference
6284 * on the ptd and drop the one that we took above.
6285 */
6286 drop_refcnt = had_valid_mapping;
6287
6288 if (!had_valid_mapping) {
6289 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6290 int pve_ptep_idx = 0;
6291 pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6292 /* We did all the allocations up top. So this shouldn't be able to fail. */
6293 if (pv_status != PV_ALLOC_SUCCESS) {
6294 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6295 __func__, pv_status, new_pve_p, pmap);
6296 }
6297
6298 if (pmap != kernel_pmap) {
6299 if (options & PMAP_OPTIONS_INTERNAL) {
6300 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6301 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6302 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6303 /*
6304 * Make a note to ourselves that this
6305 * mapping is using alternative
6306 * accounting. We'll need this in order
6307 * to know which ledger to debit when
6308 * the mapping is removed.
6309 *
6310 * The altacct bit must be set while
6311 * the pv head is locked. Defer the
6312 * ledger accounting until after we've
6313 * dropped the lock.
6314 */
6315 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6316 is_altacct = TRUE;
6317 }
6318 }
6319 if (ppattr_test_reusable(pai) &&
6320 !is_altacct) {
6321 is_reusable = TRUE;
6322 } else if (options & PMAP_OPTIONS_INTERNAL) {
6323 is_internal = TRUE;
6324 } else {
6325 is_external = TRUE;
6326 }
6327 }
6328 }
6329
6330 pvh_unlock(pai);
6331
6332 if (pp_attr_bits != 0) {
6333 ppattr_pa_set_bits(pa, pp_attr_bits);
6334 }
6335
6336 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6337 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6338
6339 if (is_internal) {
6340 /*
6341 * Make corresponding adjustments to
6342 * phys_footprint statistics.
6343 */
6344 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6345 if (is_altacct) {
6346 /*
6347 * If this page is internal and
6348 * in an IOKit region, credit
6349 * the task's total count of
6350 * dirty, internal IOKit pages.
6351 * It should *not* count towards
6352 * the task's total physical
6353 * memory footprint, because
6354 * this entire region was
6355 * already billed to the task
6356 * at the time the mapping was
6357 * created.
6358 *
6359 * Put another way, this is
6360 * internal++ and
6361 * alternate_accounting++, so
6362 * net effect on phys_footprint
6363 * is 0. That means: don't
6364 * touch phys_footprint here.
6365 */
6366 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6367 } else {
6368 if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6369 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6370 skip_footprint_debit = true;
6371 } else {
6372 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6373 }
6374 }
6375 }
6376 if (is_reusable) {
6377 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6378 } else if (is_external) {
6379 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6380 }
6381 }
6382 } else {
6383 if (prot & VM_PROT_EXECUTE) {
6384 kr = KERN_FAILURE;
6385 break;
6386 }
6387
6388 wimg_bits = pmap_cache_attributes(pn);
6389 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6390 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6391 }
6392
6393 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6394
6395 #if XNU_MONITOR
6396 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
6397 uint64_t xprr_perm = pte_to_xprr_perm(pte);
6398 switch (xprr_perm) {
6399 case XPRR_KERN_RO_PERM:
6400 break;
6401 case XPRR_KERN_RW_PERM:
6402 pte &= ~ARM_PTE_XPRR_MASK;
6403 pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
6404 break;
6405 default:
6406 panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
6407 }
6408 }
6409 #endif
6410 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6411 if (committed) {
6412 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6413 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6414
6415 /**
6416 * If there was already a valid pte here then we reuse its
6417 * reference on the ptd and drop the one that we took above.
6418 */
6419 drop_refcnt = had_valid_mapping;
6420 }
6421 }
6422 if (committed) {
6423 if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6424 assert(pmap != kernel_pmap);
6425
6426 /* One less "compressed" */
6427 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6428 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6429
6430 if (spte & ARM_PTE_COMPRESSED_ALT) {
6431 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6432 } else if (!skip_footprint_debit) {
6433 /* Was part of the footprint */
6434 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6435 }
6436 /* The old entry held a reference so drop the extra one that we took above. */
6437 drop_refcnt = true;
6438 }
6439 }
6440 }
6441
6442 if (drop_refcnt && refcnt != NULL) {
6443 assert(refcnt_updated);
6444 if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6445 panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6446 }
6447 }
6448
6449 if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6450 panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6451 }
6452
6453 pmap_unlock(pmap, lock_mode);
6454
6455 if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6456 pmap_phys_write_disable(v);
6457 }
6458
6459 return kr;
6460 }
6461
6462 kern_return_t
6463 pmap_enter_options_addr(
6464 pmap_t pmap,
6465 vm_map_address_t v,
6466 pmap_paddr_t pa,
6467 vm_prot_t prot,
6468 vm_prot_t fault_type,
6469 unsigned int flags,
6470 boolean_t wired,
6471 unsigned int options,
6472 __unused void *arg)
6473 {
6474 kern_return_t kr = KERN_FAILURE;
6475
6476
6477 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6478 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6479
6480
6481 #if XNU_MONITOR
6482 /*
6483 * If NOWAIT was not requested, loop until the enter does not
6484 * fail due to lack of resources.
6485 */
6486 while ((kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) {
6487 pmap_alloc_page_for_ppl((options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6488 if (options & PMAP_OPTIONS_NOWAIT) {
6489 break;
6490 }
6491 }
6492
6493 pmap_ledger_check_balance(pmap);
6494 #else
6495 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6496 #endif
6497
6498 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6499
6500 return kr;
6501 }
6502
6503 kern_return_t
6504 pmap_enter_options(
6505 pmap_t pmap,
6506 vm_map_address_t v,
6507 ppnum_t pn,
6508 vm_prot_t prot,
6509 vm_prot_t fault_type,
6510 unsigned int flags,
6511 boolean_t wired,
6512 unsigned int options,
6513 __unused void *arg)
6514 {
6515 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6516 }
6517
6518 /*
6519 * Routine: pmap_change_wiring
6520 * Function: Change the wiring attribute for a map/virtual-address
6521 * pair.
6522 * In/out conditions:
6523 * The mapping must already exist in the pmap.
6524 */
6525 MARK_AS_PMAP_TEXT void
6526 pmap_change_wiring_internal(
6527 pmap_t pmap,
6528 vm_map_address_t v,
6529 boolean_t wired)
6530 {
6531 pt_entry_t *pte_p;
6532 pmap_paddr_t pa;
6533
6534 validate_pmap_mutable(pmap);
6535
6536 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6537
6538 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6539
6540 pte_p = pmap_pte(pmap, v);
6541 if (pte_p == PT_ENTRY_NULL) {
6542 if (!wired) {
6543 /*
6544 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6545 * may have been freed by a remove operation.
6546 */
6547 goto pmap_change_wiring_return;
6548 } else {
6549 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6550 }
6551 }
6552 /*
6553 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6554 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6555 */
6556 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6557
6558 while (pa_valid(pa)) {
6559 pmap_paddr_t new_pa;
6560
6561 pvh_lock(pa_index(pa));
6562 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6563
6564 if (pa == new_pa) {
6565 break;
6566 }
6567
6568 pvh_unlock(pa_index(pa));
6569 pa = new_pa;
6570 }
6571
6572 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6573 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6574 if (!wired) {
6575 /* PTE cleared by prior remove/disconnect operation */
6576 goto pmap_change_wiring_cleanup;
6577 } else {
6578 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6579 __func__, pte_p, (uint64_t)*pte_p, pmap);
6580 }
6581 }
6582
6583 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6584 if (wired != pte_is_wired(*pte_p)) {
6585 pte_set_wired(pmap, pte_p, wired);
6586 if (pmap != kernel_pmap) {
6587 if (wired) {
6588 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6589 } else if (!wired) {
6590 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6591 }
6592 }
6593 }
6594
6595 pmap_change_wiring_cleanup:
6596 if (pa_valid(pa)) {
6597 pvh_unlock(pa_index(pa));
6598 }
6599
6600 pmap_change_wiring_return:
6601 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6602 }
6603
6604 void
6605 pmap_change_wiring(
6606 pmap_t pmap,
6607 vm_map_address_t v,
6608 boolean_t wired)
6609 {
6610 #if XNU_MONITOR
6611 pmap_change_wiring_ppl(pmap, v, wired);
6612
6613 pmap_ledger_check_balance(pmap);
6614 #else
6615 pmap_change_wiring_internal(pmap, v, wired);
6616 #endif
6617 }
6618
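/*
 * Translate a VA in the given pmap to a physical address by walking the page
 * tables in software, taking the pmap lock shared for user pmaps. Returns 0 if
 * no valid mapping exists.
 */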
6619 MARK_AS_PMAP_TEXT pmap_paddr_t
6620 pmap_find_pa_internal(
6621 pmap_t pmap,
6622 addr64_t va)
6623 {
6624 pmap_paddr_t pa = 0;
6625
6626 validate_pmap(pmap);
6627
6628 if (pmap != kernel_pmap) {
6629 pmap_lock(pmap, PMAP_LOCK_SHARED);
6630 }
6631
6632 pa = pmap_vtophys(pmap, va);
6633
6634 if (pmap != kernel_pmap) {
6635 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6636 }
6637
6638 return pa;
6639 }
6640
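/*
 * Attempt a lock-free translation of the VA using the hardware address
 * translation instruction, without faulting or walking the page tables in
 * software. Returns 0 if the MMU cannot translate the address, or if the pmap
 * is neither the kernel pmap nor the current thread's user pmap.
 */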
6641 pmap_paddr_t
6642 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6643 {
6644 pmap_paddr_t pa = 0;
6645
6646 if (pmap == kernel_pmap) {
6647 pa = mmu_kvtop(va);
6648 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6649 /*
6650 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6651 * translation even if PAN would prevent kernel access through the translation.
6652 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6653 */
6654 pa = mmu_uvtop(va);
6655 }
6656 return pa;
6657 }
6658
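/*
 * Translate a VA in the given pmap to a physical address, first attempting a
 * lock-free hardware translation and then falling back to a software page
 * table walk (performed in the PPL on XNU_MONITOR configurations, or directly
 * when called from the kernel debugger). Returns 0 if no valid mapping exists.
 */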
6659 pmap_paddr_t
6660 pmap_find_pa(
6661 pmap_t pmap,
6662 addr64_t va)
6663 {
6664 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6665
6666 if (pa != 0) {
6667 return pa;
6668 }
6669
6670 if (not_in_kdp) {
6671 #if XNU_MONITOR
6672 return pmap_find_pa_ppl(pmap, va);
6673 #else
6674 return pmap_find_pa_internal(pmap, va);
6675 #endif
6676 } else {
6677 return pmap_vtophys(pmap, va);
6678 }
6679 }
6680
6681 ppnum_t
6682 pmap_find_phys_nofault(
6683 pmap_t pmap,
6684 addr64_t va)
6685 {
6686 ppnum_t ppn;
6687 ppn = atop(pmap_find_pa_nofault(pmap, va));
6688 return ppn;
6689 }
6690
6691 ppnum_t
6692 pmap_find_phys(
6693 pmap_t pmap,
6694 addr64_t va)
6695 {
6696 ppnum_t ppn;
6697 ppn = atop(pmap_find_pa(pmap, va));
6698 return ppn;
6699 }
6700
6701 /**
6702 * Translate a kernel virtual address into a physical address.
6703 *
6704 * @param va The kernel virtual address to translate. Does not work on user
6705 * virtual addresses.
6706 *
6707 * @return The physical address if the translation was successful, or zero if
6708 * no valid mappings were found for the given virtual address.
6709 */
6710 pmap_paddr_t
6711 kvtophys(vm_offset_t va)
6712 {
6713 /**
6714 * Attempt to do the translation first in hardware using the AT (address
6715 * translation) instruction. This will attempt to use the MMU to do the
6716 * translation for us.
6717 */
6718 pmap_paddr_t pa = mmu_kvtop(va);
6719
6720 if (pa) {
6721 return pa;
6722 }
6723
6724 /* If the MMU can't find the mapping, then manually walk the page tables. */
6725 return pmap_vtophys(kernel_pmap, va);
6726 }
6727
6728 /**
6729 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6730 * points to a non-kernel-managed physical page, then this call will panic().
6731 *
6732 * @note The output of this function is guaranteed to be a kernel-managed
6733 * physical page, which means it's safe to pass the output directly to
6734 * pa_index() to create a physical address index for various pmap data
6735 * structures.
6736 *
6737 * @param va The kernel virtual address to translate. Does not work on user
6738 * virtual addresses.
6739 *
6740 * @return The translated physical address for the given virtual address.
6741 */
6742 pmap_paddr_t
6743 kvtophys_nofail(vm_offset_t va)
6744 {
6745 pmap_paddr_t pa = kvtophys(va);
6746
6747 if (!pa_valid(pa)) {
6748 panic("%s: Invalid or non-kernel-managed physical page returned, "
6749 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6750 }
6751
6752 return pa;
6753 }
6754
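/**
 * Translate a virtual address into a physical address by manually walking the
 * translation tables of the given pmap in software.
 *
 * @param pmap The pmap whose translation tables should be walked.
 * @param va The virtual address to translate.
 *
 * @return The physical address if a valid mapping exists, or zero otherwise.
 */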
6755 pmap_paddr_t
6756 pmap_vtophys(
6757 pmap_t pmap,
6758 addr64_t va)
6759 {
6760 if ((va < pmap->min) || (va >= pmap->max)) {
6761 return 0;
6762 }
6763
6764 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6765
6766 #if (__ARM_VMSA__ == 7)
6767 tt_entry_t *tte_p, tte;
6768 pt_entry_t *pte_p;
6769 pmap_paddr_t pa;
6770
6771 tte_p = pmap_tte(pmap, va);
6772 if (tte_p == (tt_entry_t *) NULL) {
6773 return (pmap_paddr_t) 0;
6774 }
6775
6776 tte = *tte_p;
6777 if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
6778 pte_p = (pt_entry_t *) ttetokv(tte) + pte_index(pt_attr, va);
6779 pa = pte_to_pa(*pte_p) | (va & ARM_PGMASK);
6781 #if DEVELOPMENT || DEBUG
6782 if (atop(pa) != 0 &&
6783 ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
6784 panic("pmap_vtophys(%p,0x%llx): compressed pte_p=%p 0x%llx with ppn=0x%x",
6785 pmap, va, pte_p, (uint64_t) (*pte_p), atop(pa));
6786 }
6787 #endif /* DEVELOPMENT || DEBUG */
6788 } else if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
6789 if ((tte & ARM_TTE_BLOCK_SUPER) == ARM_TTE_BLOCK_SUPER) {
6790 pa = suptte_to_pa(tte) | (va & ARM_TT_L1_SUPER_OFFMASK);
6791 } else {
6792 pa = sectte_to_pa(tte) | (va & ARM_TT_L1_BLOCK_OFFMASK);
6793 }
6794 } else {
6795 pa = 0;
6796 }
6797 #else
6798 tt_entry_t * ttp = NULL;
6799 tt_entry_t * ttep = NULL;
6800 tt_entry_t tte = ARM_TTE_EMPTY;
6801 pmap_paddr_t pa = 0;
6802 unsigned int cur_level;
6803
6804 ttp = pmap->tte;
6805
6806 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6807 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6808
6809 tte = *ttep;
6810
6811 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6812 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6813 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6814 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6815
6816 if ((tte & valid_mask) != valid_mask) {
6817 return (pmap_paddr_t) 0;
6818 }
6819
6820 /* This detects both leaf entries and intermediate block mappings. */
6821 if ((tte & type_mask) == type_block) {
6822 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6823 break;
6824 }
6825
6826 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6827 }
6828 #endif
6829
6830 return pa;
6831 }
6832
6833 /*
6834 * pmap_init_pte_page - Initialize a page table page.
6835 */
6836 MARK_AS_PMAP_TEXT void
6837 pmap_init_pte_page(
6838 pmap_t pmap,
6839 pt_entry_t *pte_p,
6840 vm_offset_t va,
6841 unsigned int ttlevel,
6842 boolean_t alloc_ptd)
6843 {
6844 pt_desc_t *ptdp = NULL;
6845 pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6846
6847 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6848 if (alloc_ptd) {
6849 /*
6850 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6851 * on 4KB hardware, we may already have allocated a page table descriptor for a
6852 * bootstrap request, so we check for an existing PTD here.
6853 */
6854 ptdp = ptd_alloc(pmap);
6855 if (ptdp == NULL) {
6856 panic("%s: unable to allocate PTD", __func__);
6857 }
6858 pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6859 } else {
6860 panic("pmap_init_pte_page(): pte_p %p", pte_p);
6861 }
6862 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6863 ptdp = pvh_ptd(pvh);
6864 } else {
6865 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6866 }
6867
	// The barrier below ensures that prior updates to the page table page are
	// visible to the page table walker before the page is linked into the
	// previous level's table entry.
6870 __builtin_arm_dmb(DMB_ISHST);
6871 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6872 }
6873
6874 /*
6875 * Routine: pmap_expand
6876 *
6877 * Expands a pmap to be able to map the specified virtual address.
6878 *
6879 * Allocates new memory for the default (COARSE) translation table
6880 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6881 * also allocates space for the corresponding pv entries.
6882 *
6883 * Nothing should be locked.
6884 */
6885 MARK_AS_PMAP_TEXT static kern_return_t
6886 pmap_expand(
6887 pmap_t pmap,
6888 vm_map_address_t v,
6889 unsigned int options,
6890 unsigned int level)
6891 {
6892 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6893
6894 if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6895 return KERN_INVALID_ADDRESS;
6896 }
6897 #if (__ARM_VMSA__ == 7)
6898 vm_offset_t pa;
6899 tt_entry_t *tte_p;
6900 tt_entry_t *tt_p;
6901 unsigned int i;
6902
6903 #if DEVELOPMENT || DEBUG
6904 /*
6905 * We no longer support root level expansion; panic in case something
6906 * still attempts to trigger it.
6907 */
6908 i = tte_index(pt_attr, v);
6909
6910 if (i >= pmap->tte_index_max) {
6911 panic("%s: index out of range, index=%u, max=%u, "
6912 "pmap=%p, addr=%p, options=%u, level=%u",
6913 __func__, i, pmap->tte_index_max,
6914 pmap, (void *)v, options, level);
6915 }
6916 #endif /* DEVELOPMENT || DEBUG */
6917
6918 if (level == 1) {
6919 return KERN_SUCCESS;
6920 }
6921
6922 {
6923 tt_entry_t *tte_next_p;
6924
6925 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6926 pa = 0;
6927 if (pmap_pte(pmap, v) != PT_ENTRY_NULL) {
6928 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6929 return KERN_SUCCESS;
6930 }
6931 tte_p = &pmap->tte[ttenum(v & ~ARM_TT_L1_PT_OFFMASK)];
6932 for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6933 if (tte_to_pa(*tte_next_p)) {
6934 pa = tte_to_pa(*tte_next_p);
6935 break;
6936 }
6937 tte_next_p++;
6938 }
6939 pa = pa & ~PAGE_MASK;
6940 if (pa) {
6941 tte_p = &pmap->tte[ttenum(v)];
6942 *tte_p = pa_to_tte(pa) | (((v >> ARM_TT_L1_SHIFT) & 0x3) << 10) | ARM_TTE_TYPE_TABLE;
6943 FLUSH_PTE();
6944 PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
6945 VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
6946 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6947 return KERN_SUCCESS;
6948 }
6949 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6950 }
6951 v = v & ~ARM_TT_L1_PT_OFFMASK;
6952
6953
6954 while (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6955 /*
6956 * Allocate a VM page for the level 2 page table entries.
6957 */
6958 while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L2_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
6959 if (options & PMAP_OPTIONS_NOWAIT) {
6960 return KERN_RESOURCE_SHORTAGE;
6961 }
6962 VM_PAGE_WAIT();
6963 }
6964
6965 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6966 /*
6967 * See if someone else expanded us first
6968 */
6969 if (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6970 tt_entry_t *tte_next_p;
6971
6972 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE);
6973 pa = kvtophys_nofail((vm_offset_t)tt_p);
6974 tte_p = &pmap->tte[ttenum(v)];
6975 for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6976 *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE;
6977 PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + (i * ARM_TT_L1_SIZE)),
6978 VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + ((i + 1) * ARM_TT_L1_SIZE)), *tte_p);
6979 tte_next_p++;
6980 pa = pa + 0x400;
6981 }
6982 FLUSH_PTE();
6983
6984 pa = 0x0ULL;
6985 tt_p = (tt_entry_t *)NULL;
6986 }
6987 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6988 if (tt_p != (tt_entry_t *)NULL) {
6989 pmap_tt_deallocate(pmap, tt_p, PMAP_TT_L2_LEVEL);
6990 tt_p = (tt_entry_t *)NULL;
6991 }
6992 }
6993 return KERN_SUCCESS;
6994 #else
6995 pmap_paddr_t pa;
6996 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6997 tt_entry_t *tte_p;
6998 tt_entry_t *tt_p;
6999
7000 pa = 0x0ULL;
7001 tt_p = (tt_entry_t *)NULL;
7002
7003 for (; ttlevel < level; ttlevel++) {
7004 pmap_lock(pmap, PMAP_LOCK_SHARED);
7005
7006 if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
7007 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7008 while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
7009 if (options & PMAP_OPTIONS_NOWAIT) {
7010 return KERN_RESOURCE_SHORTAGE;
7011 }
7012 #if XNU_MONITOR
7013 panic("%s: failed to allocate tt, "
7014 "pmap=%p, v=%p, options=0x%x, level=%u",
7015 __FUNCTION__,
7016 pmap, (void *)v, options, level);
7017 #else
7018 VM_PAGE_WAIT();
7019 #endif
7020 }
7021 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7022 if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
7023 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
7024 pa = kvtophys_nofail((vm_offset_t)tt_p);
7025 tte_p = pmap_ttne(pmap, ttlevel, v);
7026 *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7027 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7028 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
7029 pa = 0x0ULL;
7030 tt_p = (tt_entry_t *)NULL;
7031 }
7032 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7033 } else {
7034 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7035 }
7036
7037 if (tt_p != (tt_entry_t *)NULL) {
7038 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7039 tt_p = (tt_entry_t *)NULL;
7040 }
7041 }
7042
7043 return KERN_SUCCESS;
7044 #endif
7045 }
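/*
 * Illustrative caller sketch (hypothetical call site, not taken from this
 * file): with PMAP_OPTIONS_NOWAIT set, pmap_expand() returns
 * KERN_RESOURCE_SHORTAGE rather than blocking in VM_PAGE_WAIT(), so a
 * non-blocking caller is expected to handle that return, e.g.:
 *
 *     kern_return_t kr = pmap_expand(pmap, v, PMAP_OPTIONS_NOWAIT, level);
 *     if (kr == KERN_RESOURCE_SHORTAGE) {
 *             // no page was available; back off and retry later
 *     }
 */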
7046
7047 /*
7048 * Routine: pmap_collect
7049 * Function:
7050 * Garbage collects the physical map system for
7051 * pages which are no longer used.
 *	Success need not be guaranteed -- that is, some
 *	unreferenced pages may be left uncollected while
 *	others are collected.
7055 */
7056 void
7057 pmap_collect(pmap_t pmap)
7058 {
7059 if (pmap == PMAP_NULL) {
7060 return;
7061 }
7062
7063 #if 0
7064 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7065 if ((pmap->nested == FALSE) && (pmap != kernel_pmap)) {
7066 /* TODO: Scan for vm page assigned to top level page tables with no reference */
7067 }
7068 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7069 #endif
7070
7071 return;
7072 }
7073
7074 /*
7075 * Routine: pmap_gc
7076 * Function:
7077 * Pmap garbage collection
7078 * Called by the pageout daemon when pages are scarce.
7079 *
7080 */
7081 void
7082 pmap_gc(
7083 void)
7084 {
7085 #if XNU_MONITOR
7086 /*
7087 * We cannot invoke the scheduler from the PPL, so for now we elide the
7088 * GC logic if the PPL is enabled.
7089 */
7090 #endif
7091 #if !XNU_MONITOR
7092 pmap_t pmap, pmap_next;
7093 boolean_t gc_wait;
7094
7095 if (pmap_gc_allowed &&
7096 (pmap_gc_allowed_by_time_throttle ||
7097 pmap_gc_forced)) {
7098 pmap_gc_forced = FALSE;
7099 pmap_gc_allowed_by_time_throttle = FALSE;
7100 pmap_simple_lock(&pmaps_lock);
7101 pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&map_pmap_list));
7102 while (!queue_end(&map_pmap_list, (queue_entry_t)pmap)) {
7103 if (!(pmap->gc_status & PMAP_GC_INFLIGHT)) {
7104 pmap->gc_status |= PMAP_GC_INFLIGHT;
7105 }
7106 pmap_simple_unlock(&pmaps_lock);
7107
7108 pmap_collect(pmap);
7109
7110 pmap_simple_lock(&pmaps_lock);
7111 gc_wait = (pmap->gc_status & PMAP_GC_WAIT);
7112 pmap->gc_status &= ~(PMAP_GC_INFLIGHT | PMAP_GC_WAIT);
7113 pmap_next = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&pmap->pmaps));
7114 if (gc_wait) {
7115 if (!queue_end(&map_pmap_list, (queue_entry_t)pmap_next)) {
7116 pmap_next->gc_status |= PMAP_GC_INFLIGHT;
7117 }
7118 pmap_simple_unlock(&pmaps_lock);
7119 thread_wakeup((event_t) &pmap->gc_status);
7120 pmap_simple_lock(&pmaps_lock);
7121 }
7122 pmap = pmap_next;
7123 }
7124 pmap_simple_unlock(&pmaps_lock);
7125 }
7126 #endif
7127 }
7128
7129 /*
7130 * By default, don't attempt pmap GC more frequently
 * than once per minute.
7132 */
7133
7134 void
7135 compute_pmap_gc_throttle(
7136 void *arg __unused)
7137 {
7138 pmap_gc_allowed_by_time_throttle = TRUE;
7139 }
7140
7141 /*
 * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
 *
 * Invalidates the instruction cache for a physical page and pushes any
 * dirty data from the data cache for the same physical page.
7146 */
7147
7148 kern_return_t
7149 pmap_attribute_cache_sync(
7150 ppnum_t pp,
7151 vm_size_t size,
7152 __unused vm_machine_attribute_t attribute,
7153 __unused vm_machine_attribute_val_t * value)
7154 {
7155 if (size > PAGE_SIZE) {
7156 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7157 } else {
7158 cache_sync_page(pp);
7159 }
7160
7161 return KERN_SUCCESS;
7162 }
7163
7164 /*
7165 * pmap_sync_page_data_phys(ppnum_t pp)
7166 *
 * Invalidates the instruction cache for a physical page and pushes any
 * dirty data from the data cache for the same physical page.
7169 */
7170 void
7171 pmap_sync_page_data_phys(
7172 ppnum_t pp)
7173 {
7174 cache_sync_page(pp);
7175 }
7176
7177 /*
7178 * pmap_sync_page_attributes_phys(ppnum_t pp)
7179 *
7180 * Write back and invalidate all cachelines on a physical page.
7181 */
7182 void
7183 pmap_sync_page_attributes_phys(
7184 ppnum_t pp)
7185 {
7186 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7187 }
7188
7189 #if CONFIG_COREDUMP
7190 /* temporary workaround */
7191 boolean_t
7192 coredumpok(
7193 vm_map_t map,
7194 mach_vm_offset_t va)
7195 {
7196 pt_entry_t *pte_p;
7197 pt_entry_t spte;
7198
7199 pte_p = pmap_pte(map->pmap, va);
7200 if (0 == pte_p) {
7201 return FALSE;
7202 }
7203 spte = *pte_p;
7204 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7205 }
7206 #endif
7207
7208 void
7209 fillPage(
7210 ppnum_t pn,
7211 unsigned int fill)
7212 {
7213 unsigned int *addr;
7214 int count;
7215
7216 addr = (unsigned int *) phystokv(ptoa(pn));
7217 count = PAGE_SIZE / sizeof(unsigned int);
7218 while (count--) {
7219 *addr++ = fill;
7220 }
7221 }
7222
7223 extern void mapping_set_mod(ppnum_t pn);
7224
7225 void
7226 mapping_set_mod(
7227 ppnum_t pn)
7228 {
7229 pmap_set_modify(pn);
7230 }
7231
7232 extern void mapping_set_ref(ppnum_t pn);
7233
7234 void
7235 mapping_set_ref(
7236 ppnum_t pn)
7237 {
7238 pmap_set_reference(pn);
7239 }
7240
7241 /*
7242 * Clear specified attribute bits.
7243 *
7244 * Try to force an arm_fast_fault() for all mappings of
7245 * the page - to force attributes to be set again at fault time.
7246 * If the forcing succeeds, clear the cached bits at the head.
7247 * Otherwise, something must have been wired, so leave the cached
7248 * attributes alone.
7249 */
7250 MARK_AS_PMAP_TEXT static void
7251 phys_attribute_clear_with_flush_range(
7252 ppnum_t pn,
7253 unsigned int bits,
7254 int options,
7255 void *arg,
7256 pmap_tlb_flush_range_t *flush_range)
7257 {
7258 pmap_paddr_t pa = ptoa(pn);
7259 vm_prot_t allow_mode = VM_PROT_ALL;
7260
7261 #if XNU_MONITOR
7262 if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7263 panic("%s: illegal request, "
7264 "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7265 __FUNCTION__,
7266 pn, bits, options, arg, flush_range);
7267 }
7268 #endif
7269 if ((arg != NULL) || (flush_range != NULL)) {
7270 options = options & ~PMAP_OPTIONS_NOFLUSH;
7271 }
7272
7273 if (__improbable((bits & PP_ATTR_MODIFIED) &&
7274 (options & PMAP_OPTIONS_NOFLUSH))) {
7275 panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
7276 "should not clear 'modified' without flushing TLBs\n",
7277 pn, bits, options, arg, flush_range);
7278 }
7279
7280 assert(pn != vm_page_fictitious_addr);
7281
7282 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7283 assert(bits == PP_ATTR_MODIFIED);
7284
7285 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7286 /*
7287 * We short circuit this case; it should not need to
7288 * invoke arm_force_fast_fault, so just clear the modified bit.
7289 * pmap_page_protect has taken care of resetting
7290 * the state so that we'll see the next write as a fault to
7291 * the VM (i.e. we don't want a fast fault).
7292 */
7293 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7294 return;
7295 }
7296 if (bits & PP_ATTR_REFERENCED) {
7297 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7298 }
7299 if (bits & PP_ATTR_MODIFIED) {
7300 allow_mode &= ~VM_PROT_WRITE;
7301 }
7302
7303 if (bits == PP_ATTR_NOENCRYPT) {
7304 /*
7305 * We short circuit this case; it should not need to
7306 * invoke arm_force_fast_fault, so just clear and
7307 * return. On ARM, this bit is just a debugging aid.
7308 */
7309 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7310 return;
7311 }
7312
7313 if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7314 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7315 }
7316 }
7317
7318 MARK_AS_PMAP_TEXT void
7319 phys_attribute_clear_internal(
7320 ppnum_t pn,
7321 unsigned int bits,
7322 int options,
7323 void *arg)
7324 {
7325 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7326 }
7327
7328 #if __ARM_RANGE_TLBI__
7329 MARK_AS_PMAP_TEXT static vm_map_address_t
7330 phys_attribute_clear_twig_internal(
7331 pmap_t pmap,
7332 vm_map_address_t start,
7333 vm_map_address_t end,
7334 unsigned int bits,
7335 unsigned int options,
7336 pmap_tlb_flush_range_t *flush_range)
7337 {
7338 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7339 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7340 assert(end >= start);
7341 assert((end - start) <= pt_attr_twig_size(pt_attr));
7342 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7343 vm_map_address_t va = start;
7344 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7345 tt_entry_t *tte_p;
7346 tte_p = pmap_tte(pmap, start);
7347 unsigned int npages = 0;
7348
7349 if (tte_p == (tt_entry_t *) NULL) {
7350 return end;
7351 }
7352
7353 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
7354 pte_p = (pt_entry_t *) ttetokv(*tte_p);
7355
7356 start_pte_p = &pte_p[pte_index(pt_attr, start)];
7357 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7358 assert(end_pte_p >= start_pte_p);
7359 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7360 if (__improbable(npages++ && pmap_pending_preemption())) {
7361 return va;
7362 }
7363 pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
7364 if (pa_valid(pa)) {
7365 ppnum_t pn = (ppnum_t) atop(pa);
7366 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7367 }
7368 }
7369 }
7370 return end;
7371 }
7372
7373 MARK_AS_PMAP_TEXT vm_map_address_t
7374 phys_attribute_clear_range_internal(
7375 pmap_t pmap,
7376 vm_map_address_t start,
7377 vm_map_address_t end,
7378 unsigned int bits,
7379 unsigned int options)
7380 {
7381 if (__improbable(end < start)) {
7382 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7383 }
7384 validate_pmap_mutable(pmap);
7385
7386 vm_map_address_t va = start;
7387 pmap_tlb_flush_range_t flush_range = {
7388 .ptfr_pmap = pmap,
7389 .ptfr_start = start,
7390 .ptfr_end = end,
7391 .ptfr_flush_needed = false
7392 };
7393
7394 pmap_lock(pmap, PMAP_LOCK_SHARED);
7395 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7396
7397 while (va < end) {
7398 vm_map_address_t curr_end;
7399
7400 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7401 if (curr_end > end) {
7402 curr_end = end;
7403 }
7404
7405 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7406 if ((va < curr_end) || pmap_pending_preemption()) {
7407 break;
7408 }
7409 }
7410 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7411 if (flush_range.ptfr_flush_needed) {
7412 flush_range.ptfr_end = va;
7413 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7414 flush_range.ptfr_start,
7415 flush_range.ptfr_end - flush_range.ptfr_start,
7416 flush_range.ptfr_pmap,
7417 true);
7418 sync_tlb_flush();
7419 }
7420 return va;
7421 }
7422
7423 static void
7424 phys_attribute_clear_range(
7425 pmap_t pmap,
7426 vm_map_address_t start,
7427 vm_map_address_t end,
7428 unsigned int bits,
7429 unsigned int options)
7430 {
7431 /*
7432 * We allow single-page requests to execute non-preemptibly,
7433 * as it doesn't make sense to sample AST_URGENT for a single-page
7434 * operation, and there are a couple of special use cases that
7435 * require a non-preemptible single-page operation.
7436 */
7437 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7438 pmap_verify_preemptible();
7439 }
7440
7441 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7442
7443 while (start < end) {
7444 #if XNU_MONITOR
7445 start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
7446 #else
7447 start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7448 #endif
7449 }
7450
7451 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7452 }
7453 #endif /* __ARM_RANGE_TLBI__ */
7454
7455 static void
7456 phys_attribute_clear(
7457 ppnum_t pn,
7458 unsigned int bits,
7459 int options,
7460 void *arg)
7461 {
7462 /*
7463 * Do we really want this tracepoint? It will be extremely chatty.
7464 * Also, should we have a corresponding trace point for the set path?
7465 */
7466 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7467
7468 #if XNU_MONITOR
7469 phys_attribute_clear_ppl(pn, bits, options, arg);
7470 #else
7471 phys_attribute_clear_internal(pn, bits, options, arg);
7472 #endif
7473
7474 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7475 }
7476
7477 /*
7478 * Set specified attribute bits.
7479 *
 * Set the cached value in the pv head because we have
 * no per-mapping hardware support for referenced and
 * modified bits.
7483 */
7484 MARK_AS_PMAP_TEXT void
7485 phys_attribute_set_internal(
7486 ppnum_t pn,
7487 unsigned int bits)
7488 {
7489 pmap_paddr_t pa = ptoa(pn);
7490 assert(pn != vm_page_fictitious_addr);
7491
7492 #if XNU_MONITOR
7493 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7494 panic("%s: illegal request, "
7495 "pn=%u, bits=%#x",
7496 __FUNCTION__,
7497 pn, bits);
7498 }
7499 #endif
7500
7501 ppattr_pa_set_bits(pa, (uint16_t)bits);
7502
7503 return;
7504 }
7505
7506 static void
7507 phys_attribute_set(
7508 ppnum_t pn,
7509 unsigned int bits)
7510 {
7511 #if XNU_MONITOR
7512 phys_attribute_set_ppl(pn, bits);
7513 #else
7514 phys_attribute_set_internal(pn, bits);
7515 #endif
7516 }
7517
7518
7519 /*
7520 * Check specified attribute bits.
7521 *
 * Use the software-cached bits (since there is no hardware support).
7523 */
7524 static boolean_t
7525 phys_attribute_test(
7526 ppnum_t pn,
7527 unsigned int bits)
7528 {
7529 pmap_paddr_t pa = ptoa(pn);
7530 assert(pn != vm_page_fictitious_addr);
7531 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7532 }
7533
7534
7535 /*
7536 * Set the modify/reference bits on the specified physical page.
7537 */
7538 void
7539 pmap_set_modify(ppnum_t pn)
7540 {
7541 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7542 }
7543
7544
7545 /*
7546 * Clear the modify bits on the specified physical page.
7547 */
7548 void
7549 pmap_clear_modify(
7550 ppnum_t pn)
7551 {
7552 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7553 }
7554
7555
7556 /*
7557 * pmap_is_modified:
7558 *
7559 * Return whether or not the specified physical page is modified
7560 * by any physical maps.
7561 */
7562 boolean_t
7563 pmap_is_modified(
7564 ppnum_t pn)
7565 {
7566 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7567 }
7568
7569
7570 /*
7571 * Set the reference bit on the specified physical page.
7572 */
7573 static void
7574 pmap_set_reference(
7575 ppnum_t pn)
7576 {
7577 phys_attribute_set(pn, PP_ATTR_REFERENCED);
7578 }
7579
7580 /*
7581 * Clear the reference bits on the specified physical page.
7582 */
7583 void
7584 pmap_clear_reference(
7585 ppnum_t pn)
7586 {
7587 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7588 }
7589
7590
7591 /*
7592 * pmap_is_referenced:
7593 *
7594 * Return whether or not the specified physical page is referenced
7595 * by any physical maps.
7596 */
7597 boolean_t
7598 pmap_is_referenced(
7599 ppnum_t pn)
7600 {
7601 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7602 }
7603
7604 /*
7605 * pmap_get_refmod(phys)
7606 * returns the referenced and modified bits of the specified
7607 * physical page.
7608 */
7609 unsigned int
7610 pmap_get_refmod(
7611 ppnum_t pn)
7612 {
7613 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7614 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7615 }
7616
7617 static inline unsigned int
7618 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7619 {
7620 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7621 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7622 }
7623
7624 /*
7625 * pmap_clear_refmod(phys, mask)
 * clears the referenced and modified bits of the specified
 * physical page, as selected by the mask.
7628 */
7629 void
7630 pmap_clear_refmod_options(
7631 ppnum_t pn,
7632 unsigned int mask,
7633 unsigned int options,
7634 void *arg)
7635 {
7636 unsigned int bits;
7637
7638 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7639 phys_attribute_clear(pn, bits, options, arg);
7640 }
7641
7642 /*
7643 * Perform pmap_clear_refmod_options on a virtual address range.
 * The operation is performed in bulk, and TLB flushes are coalesced
 * where possible.
7646 *
7647 * Returns true if the operation is supported on this platform.
7648 * If this function returns false, the operation is not supported and
7649 * nothing has been modified in the pmap.
7650 */
7651 bool
7652 pmap_clear_refmod_range_options(
7653 pmap_t pmap __unused,
7654 vm_map_address_t start __unused,
7655 vm_map_address_t end __unused,
7656 unsigned int mask __unused,
7657 unsigned int options __unused)
7658 {
7659 #if __ARM_RANGE_TLBI__
7660 unsigned int bits;
7661 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7662 phys_attribute_clear_range(pmap, start, end, bits, options);
7663 return true;
7664 #else /* __ARM_RANGE_TLBI__ */
7665 #pragma unused(pmap, start, end, mask, options)
7666 /*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses, which is a large performance improvement
	 * on platforms that support ranged TLBI instructions. Older platforms can
	 * only flush per-page or the entire ASID, so we currently support this
	 * operation only on platforms with ranged TLBI; on other platforms the VM
	 * must modify the bits on a per-page basis.
7674 */
7675 return false;
7676 #endif /* __ARM_RANGE_TLBI__ */
7677 }
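/*
 * Illustrative fallback sketch (hypothetical; the VM layer's actual fallback
 * path may differ): since a false return guarantees nothing was modified, a
 * caller can retry the same request with the per-page interface, e.g.:
 *
 *     if (!pmap_clear_refmod_range_options(pmap, start, end, mask, options)) {
 *             for (vm_map_address_t va = start; va < end; va += PAGE_SIZE) {
 *                     pmap_paddr_t pa = pmap_vtophys(pmap, va);
 *                     if (pa != 0) {
 *                             pmap_clear_refmod_options((ppnum_t)atop(pa), mask, options, NULL);
 *                     }
 *             }
 *     }
 */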
7678
7679 void
7680 pmap_clear_refmod(
7681 ppnum_t pn,
7682 unsigned int mask)
7683 {
7684 pmap_clear_refmod_options(pn, mask, 0, NULL);
7685 }
7686
7687 unsigned int
7688 pmap_disconnect_options(
7689 ppnum_t pn,
7690 unsigned int options,
7691 void *arg)
7692 {
7693 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7694 /*
7695 * On ARM, the "modified" bit is managed by software, so
7696 * we know up-front if the physical page is "modified",
7697 * without having to scan all the PTEs pointing to it.
		 * The caller should have made the VM page "busy" so no one
7699 * should be able to establish any new mapping and "modify"
7700 * the page behind us.
7701 */
7702 if (pmap_is_modified(pn)) {
7703 /*
7704 * The page has been modified and will be sent to
7705 * the VM compressor.
7706 */
7707 options |= PMAP_OPTIONS_COMPRESSOR;
7708 } else {
7709 /*
7710 * The page hasn't been modified and will be freed
7711 * instead of compressed.
7712 */
7713 }
7714 }
7715
7716 /* disconnect the page */
7717 pmap_page_protect_options(pn, 0, options, arg);
7718
7719 /* return ref/chg status */
7720 return pmap_get_refmod(pn);
7721 }
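/*
 * Illustrative sketch: the return value packs the generic ref/mod state, so a
 * hypothetical caller can test the bits directly after the disconnect, e.g.:
 *
 *     unsigned int refmod = pmap_disconnect_options(pn, options, NULL);
 *     if (refmod & VM_MEM_MODIFIED) {
 *             // the page was dirtied while mapped
 *     }
 */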
7722
7723 /*
7724 * Routine:
7725 * pmap_disconnect
7726 *
7727 * Function:
7728 * Disconnect all mappings for this page and return reference and change status
7729 * in generic format.
7730 *
7731 */
7732 unsigned int
7733 pmap_disconnect(
7734 ppnum_t pn)
7735 {
7736 pmap_page_protect(pn, 0); /* disconnect the page */
7737 return pmap_get_refmod(pn); /* return ref/chg status */
7738 }
7739
7740 boolean_t
7741 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7742 {
7743 if (ptoa(first) >= vm_last_phys) {
7744 return FALSE;
7745 }
7746 if (ptoa(last) < vm_first_phys) {
7747 return FALSE;
7748 }
7749
7750 return TRUE;
7751 }
7752
7753 /*
7754 * The state maintained by the noencrypt functions is used as a
7755 * debugging aid on ARM. This incurs some overhead on the part
7756 * of the caller. A special case check in phys_attribute_clear
7757 * (the most expensive path) currently minimizes this overhead,
7758 * but stubbing these functions out on RELEASE kernels yields
7759 * further wins.
7760 */
7761 boolean_t
7762 pmap_is_noencrypt(
7763 ppnum_t pn)
7764 {
7765 #if DEVELOPMENT || DEBUG
7766 boolean_t result = FALSE;
7767
7768 if (!pa_valid(ptoa(pn))) {
7769 return FALSE;
7770 }
7771
7772 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7773
7774 return result;
7775 #else
7776 #pragma unused(pn)
7777 return FALSE;
7778 #endif
7779 }
7780
7781 void
7782 pmap_set_noencrypt(
7783 ppnum_t pn)
7784 {
7785 #if DEVELOPMENT || DEBUG
7786 if (!pa_valid(ptoa(pn))) {
7787 return;
7788 }
7789
7790 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7791 #else
7792 #pragma unused(pn)
7793 #endif
7794 }
7795
7796 void
7797 pmap_clear_noencrypt(
7798 ppnum_t pn)
7799 {
7800 #if DEVELOPMENT || DEBUG
7801 if (!pa_valid(ptoa(pn))) {
7802 return;
7803 }
7804
7805 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7806 #else
7807 #pragma unused(pn)
7808 #endif
7809 }
7810
7811 #if XNU_MONITOR
7812 boolean_t
7813 pmap_is_monitor(ppnum_t pn)
7814 {
7815 assert(pa_valid(ptoa(pn)));
7816 return phys_attribute_test(pn, PP_ATTR_MONITOR);
7817 }
7818 #endif
7819
7820 void
7821 pmap_lock_phys_page(ppnum_t pn)
7822 {
7823 #if !XNU_MONITOR
7824 unsigned int pai;
7825 pmap_paddr_t phys = ptoa(pn);
7826
7827 if (pa_valid(phys)) {
7828 pai = pa_index(phys);
7829 pvh_lock(pai);
7830 } else
7831 #else
7832 (void)pn;
7833 #endif
7834 { simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
7835 }
7836
7837
7838 void
7839 pmap_unlock_phys_page(ppnum_t pn)
7840 {
7841 #if !XNU_MONITOR
7842 unsigned int pai;
7843 pmap_paddr_t phys = ptoa(pn);
7844
7845 if (pa_valid(phys)) {
7846 pai = pa_index(phys);
7847 pvh_unlock(pai);
7848 } else
7849 #else
7850 (void)pn;
7851 #endif
7852 { simple_unlock(&phys_backup_lock);}
7853 }
7854
7855 MARK_AS_PMAP_TEXT static void
7856 pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
7857 {
7858 #if (__ARM_VMSA__ == 7)
7859 cpu_data_ptr->cpu_user_pmap = pmap;
7860 cpu_data_ptr->cpu_user_pmap_stamp = pmap->stamp;
7861 if (pmap != kernel_pmap) {
7862 cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7863 }
7864
7865 #if MACH_ASSERT && __ARM_USER_PROTECT__
7866 {
7867 unsigned int ttbr0_val, ttbr1_val;
7868 __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val));
7869 __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val));
7870 if (ttbr0_val != ttbr1_val) {
7871 panic("Misaligned ttbr0 %08X", ttbr0_val);
7872 }
7873 if (pmap->ttep & 0x1000) {
7874 panic("Misaligned ttbr0 %08X", pmap->ttep);
7875 }
7876 }
7877 #endif
7878 #if !__ARM_USER_PROTECT__
7879 set_mmu_ttb(pmap->ttep);
7880 set_context_id(pmap->hw_asid);
7881 #endif
7882
7883 #else /* (__ARM_VMSA__ == 7) */
7884
7885 if (pmap != kernel_pmap) {
7886 cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7887 cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
7888 NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
7889 cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
7890 cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
7891 #if __ARM_MIXED_PAGE_SIZE__
7892 cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
7893 #endif
7894 }
7895
7896
7897 #if __ARM_MIXED_PAGE_SIZE__
7898 if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
7899 set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
7900 }
7901 #endif /* __ARM_MIXED_PAGE_SIZE__ */
7902
7903
7904 if (pmap != kernel_pmap) {
7905 set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
7906 } else if (!pmap_user_ttb_is_clear()) {
7907 pmap_clear_user_ttb_internal();
7908 }
7909 #endif /* (__ARM_VMSA__ == 7) */
7910 }
7911
7912 MARK_AS_PMAP_TEXT void
7913 pmap_clear_user_ttb_internal(void)
7914 {
7915 #if (__ARM_VMSA__ > 7)
7916 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7917 #else
7918 set_mmu_ttb(kernel_pmap->ttep);
7919 #endif
7920 }
7921
7922 void
7923 pmap_clear_user_ttb(void)
7924 {
7925 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7926 #if XNU_MONITOR
7927 pmap_clear_user_ttb_ppl();
7928 #else
7929 pmap_clear_user_ttb_internal();
7930 #endif
7931 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7932 }
7933
7934
7935 #if defined(__arm64__)
7936 /*
7937 * Marker for use in multi-pass fast-fault PV list processing.
7938 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7939 * these functions, as compressed PTEs should never be present in PV lists.
7940 * Note that this only holds true for arm64; for arm32 we don't have enough
7941 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7942 * and WRITEABLE marker depending on whether the PTE is valid.
7943 */
7944 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7945 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7946 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7947 #endif
7948
7949
7950 MARK_AS_PMAP_TEXT static boolean_t
7951 arm_force_fast_fault_with_flush_range(
7952 ppnum_t ppnum,
7953 vm_prot_t allow_mode,
7954 int options,
7955 pmap_tlb_flush_range_t *flush_range)
7956 {
7957 pmap_paddr_t phys = ptoa(ppnum);
7958 pv_entry_t *pve_p;
7959 pt_entry_t *pte_p;
7960 unsigned int pai;
7961 unsigned int pass1_updated = 0;
7962 unsigned int pass2_updated = 0;
7963 boolean_t result;
7964 pv_entry_t **pv_h;
7965 bool is_reusable;
7966 bool ref_fault;
7967 bool mod_fault;
7968 bool clear_write_fault = false;
7969 bool ref_aliases_mod = false;
7970 bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
7971
7972 assert(ppnum != vm_page_fictitious_addr);
7973
7974 if (!pa_valid(phys)) {
7975 return FALSE; /* Not a managed page. */
7976 }
7977
7978 result = TRUE;
7979 ref_fault = false;
7980 mod_fault = false;
7981 pai = pa_index(phys);
7982 if (__probable(mustsynch)) {
7983 pvh_lock(pai);
7984 }
7985 pv_h = pai_to_pvh(pai);
7986
7987 #if XNU_MONITOR
7988 if (__improbable(ppattr_pa_test_monitor(phys))) {
7989 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
7990 }
7991 #endif
7992 pte_p = PT_ENTRY_NULL;
7993 pve_p = PV_ENTRY_NULL;
7994 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
7995 pte_p = pvh_ptep(pv_h);
7996 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
7997 pve_p = pvh_pve_list(pv_h);
7998 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
7999 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
8000 }
8001
8002 is_reusable = ppattr_test_reusable(pai);
8003
8004 /*
8005 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
8006 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
8007 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
8008 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
8009 * operation, TLB invalidation may be handled by the caller so it's possible for
8010 * tlb_flush_needed to be true while issue_tlbi is false.
8011 */
8012 bool issue_tlbi = false;
8013 bool tlb_flush_needed = false;
8014
8015 pv_entry_t *orig_pve_p = pve_p;
8016 pt_entry_t *orig_pte_p = pte_p;
8017 int pve_ptep_idx = 0;
8018
8019 /*
8020 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8021 * TLB invalidation in pass 2.
8022 */
8023 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8024 pt_entry_t spte;
8025 pt_entry_t tmplate;
8026
8027 if (pve_p != PV_ENTRY_NULL) {
8028 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8029 if (pte_p == PT_ENTRY_NULL) {
8030 goto fff_skip_pve_pass1;
8031 }
8032 }
8033
8034 #ifdef PVH_FLAG_IOMMU
8035 if (pvh_ptep_is_iommu(pte_p)) {
8036 goto fff_skip_pve_pass1;
8037 }
8038 #endif
8039 if (*pte_p == ARM_PTE_EMPTY) {
8040 panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8041 }
8042 if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
8043 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8044 }
8045
8046 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8047 const pmap_t pmap = ptdp->pmap;
8048 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8049 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8050
8051 assert(va >= pmap->min && va < pmap->max);
8052
8053 /* update pmap stats and ledgers */
8054 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8055 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8056 if (is_altacct) {
8057 /*
8058 * We do not track "reusable" status for
8059 * "alternate accounting" mappings.
8060 */
8061 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8062 is_reusable &&
8063 is_internal &&
8064 pmap != kernel_pmap) {
8065 /* one less "reusable" */
8066 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8067 /* one more "internal" */
8068 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8069 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8070
8071 /*
8072 * Since the page is being marked non-reusable, we assume that it will be
8073 * modified soon. Avoid the cost of another trap to handle the fast
8074 * fault when we next write to this page.
8075 */
8076 clear_write_fault = true;
8077 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8078 !is_reusable &&
8079 is_internal &&
8080 pmap != kernel_pmap) {
8081 /* one more "reusable" */
8082 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8083 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8084 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8085 }
8086
8087 bool wiredskip = pte_is_wired(*pte_p) &&
8088 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8089
8090 if (wiredskip) {
8091 result = FALSE;
8092 goto fff_skip_pve_pass1;
8093 }
8094
8095 spte = *pte_p;
8096 tmplate = spte;
8097
8098 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8099 /* read protection sets the pte to fault */
8100 tmplate = tmplate & ~ARM_PTE_AF;
8101 ref_fault = true;
8102 }
8103 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8104 /* take away write permission if set */
8105 if (pmap == kernel_pmap) {
8106 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8107 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8108 pte_set_was_writeable(tmplate, true);
8109 mod_fault = true;
8110 }
8111 } else {
8112 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8113 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8114 pte_set_was_writeable(tmplate, true);
8115 mod_fault = true;
8116 }
8117 }
8118 }
8119
8120 #if MACH_ASSERT && XNU_MONITOR
8121 if (is_pte_xprr_protected(pmap, spte)) {
8122 if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8123 panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8124 "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
8125 __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8126 ppnum, options, allow_mode);
8127 }
8128 }
8129 #endif /* MACH_ASSERT && XNU_MONITOR */
8130
8131 if (result && (tmplate != spte)) {
8132 if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
8133 !(options & PMAP_OPTIONS_NOFLUSH)) {
8134 tlb_flush_needed = true;
8135 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8136 va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
8137 #ifdef ARM_PTE_FF_MARKER
8138 assert(!(spte & ARM_PTE_FF_MARKER));
8139 tmplate |= ARM_PTE_FF_MARKER;
8140 ++pass1_updated;
8141 #endif
8142 issue_tlbi = true;
8143 }
8144 }
8145 write_pte_fast(pte_p, tmplate);
8146 }
8147
8148 fff_skip_pve_pass1:
8149 pte_p = PT_ENTRY_NULL;
8150 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8151 pve_ptep_idx = 0;
8152 pve_p = pve_next(pve_p);
8153 }
8154 }
8155
8156 if (tlb_flush_needed) {
8157 FLUSH_PTE_STRONG();
8158 }
8159
8160 if (!issue_tlbi) {
8161 goto fff_finish;
8162 }
8163
8164 /* Pass 2: Issue any required TLB invalidations */
8165 pve_p = orig_pve_p;
8166 pte_p = orig_pte_p;
8167 pve_ptep_idx = 0;
8168
8169 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8170 if (pve_p != PV_ENTRY_NULL) {
8171 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8172 if (pte_p == PT_ENTRY_NULL) {
8173 goto fff_skip_pve_pass2;
8174 }
8175 }
8176
8177 #ifdef PVH_FLAG_IOMMU
8178 if (pvh_ptep_is_iommu(pte_p)) {
8179 goto fff_skip_pve_pass2;
8180 }
8181 #endif
8182
8183 #ifdef ARM_PTE_FF_MARKER
8184 pt_entry_t spte = *pte_p;
8185
8186 if (!(spte & ARM_PTE_FF_MARKER)) {
8187 goto fff_skip_pve_pass2;
8188 } else {
8189 spte &= (~ARM_PTE_FF_MARKER);
8190 /* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8191 write_pte_fast(pte_p, spte);
8192 ++pass2_updated;
8193 }
8194 #endif
8195 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8196 const pmap_t pmap = ptdp->pmap;
8197 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8198
8199 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8200 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8201 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
8202 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8203 }
8204
8205 fff_skip_pve_pass2:
8206 pte_p = PT_ENTRY_NULL;
8207 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8208 pve_ptep_idx = 0;
8209 pve_p = pve_next(pve_p);
8210 }
8211 }
8212
8213 fff_finish:
8214 if (__improbable(pass1_updated != pass2_updated)) {
8215 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8216 __func__, pass1_updated, pass2_updated);
8217 }
8218
8219 /*
8220 * If we are using the same approach for ref and mod
8221 * faults on this PTE, do not clear the write fault;
8222 * this would cause both ref and mod to be set on the
8223 * page again, and prevent us from taking ANY read/write
8224 * fault on the mapping.
8225 */
8226 if (clear_write_fault && !ref_aliases_mod) {
8227 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
8228 }
8229 if (tlb_flush_needed) {
8230 if (flush_range) {
8231 /* Delayed flush. Signal to the caller that the flush is needed. */
8232 flush_range->ptfr_flush_needed = true;
8233 } else {
8234 sync_tlb_flush();
8235 }
8236 }
8237
8238 /* update global "reusable" status for this page */
8239 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8240 ppattr_clear_reusable(pai);
8241 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8242 ppattr_set_reusable(pai);
8243 }
8244
8245 if (mod_fault) {
8246 ppattr_set_modfault(pai);
8247 }
8248 if (ref_fault) {
8249 ppattr_set_reffault(pai);
8250 }
8251 if (__probable(mustsynch)) {
8252 pvh_unlock(pai);
8253 }
8254 return result;
8255 }
8256
8257 MARK_AS_PMAP_TEXT boolean_t
8258 arm_force_fast_fault_internal(
8259 ppnum_t ppnum,
8260 vm_prot_t allow_mode,
8261 int options)
8262 {
8263 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8264 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8265 }
8266 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8267 }
8268
8269 /*
8270 * Routine: arm_force_fast_fault
8271 *
8272 * Function:
8273 * Force all mappings for this page to fault according
8274 * to the access modes allowed, so we can gather ref/modify
8275 * bits again.
8276 */
8277
8278 boolean_t
8279 arm_force_fast_fault(
8280 ppnum_t ppnum,
8281 vm_prot_t allow_mode,
8282 int options,
8283 __unused void *arg)
8284 {
8285 pmap_paddr_t phys = ptoa(ppnum);
8286
8287 assert(ppnum != vm_page_fictitious_addr);
8288
8289 if (!pa_valid(phys)) {
8290 return FALSE; /* Not a managed page. */
8291 }
8292
8293 #if XNU_MONITOR
8294 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8295 #else
8296 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8297 #endif
8298 }
8299
8300 /*
8301 * Routine: arm_clear_fast_fault
8302 *
8303 * Function:
8304 * Clear pending force fault for all mappings for this page based on
8305 * the observed fault type, update ref/modify bits.
8306 */
8307 MARK_AS_PMAP_TEXT static boolean_t
8308 arm_clear_fast_fault(
8309 ppnum_t ppnum,
8310 vm_prot_t fault_type,
8311 pt_entry_t *pte_p)
8312 {
8313 pmap_paddr_t pa = ptoa(ppnum);
8314 pv_entry_t *pve_p;
8315 unsigned int pai;
8316 boolean_t result;
8317 bool tlb_flush_needed = false;
8318 pv_entry_t **pv_h;
8319 unsigned int npve = 0;
8320 unsigned int pass1_updated = 0;
8321 unsigned int pass2_updated = 0;
8322
8323 assert(ppnum != vm_page_fictitious_addr);
8324
8325 if (!pa_valid(pa)) {
8326 return FALSE; /* Not a managed page. */
8327 }
8328
8329 result = FALSE;
8330 pai = pa_index(pa);
8331 pvh_assert_locked(pai);
8332 pv_h = pai_to_pvh(pai);
8333
8334 pve_p = PV_ENTRY_NULL;
8335 if (pte_p == PT_ENTRY_NULL) {
8336 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8337 pte_p = pvh_ptep(pv_h);
8338 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8339 pve_p = pvh_pve_list(pv_h);
8340 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8341 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
8342 }
8343 }
8344
8345 pv_entry_t *orig_pve_p = pve_p;
8346 pt_entry_t *orig_pte_p = pte_p;
8347 int pve_ptep_idx = 0;
8348
8349 /*
8350 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8351 * TLB invalidation in pass 2.
8352 */
8353 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8354 pt_entry_t spte;
8355 pt_entry_t tmplate;
8356
8357 if (pve_p != PV_ENTRY_NULL) {
8358 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8359 if (pte_p == PT_ENTRY_NULL) {
8360 goto cff_skip_pve_pass1;
8361 }
8362 }
8363
8364 #ifdef PVH_FLAG_IOMMU
8365 if (pvh_ptep_is_iommu(pte_p)) {
8366 goto cff_skip_pve_pass1;
8367 }
8368 #endif
8369 if (*pte_p == ARM_PTE_EMPTY) {
8370 panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8371 }
8372
8373 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8374 const pmap_t pmap = ptdp->pmap;
8375 __assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8376
8377 assert(va >= pmap->min && va < pmap->max);
8378
8379 spte = *pte_p;
8380 tmplate = spte;
8381
8382 if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8383 {
8384 if (pmap == kernel_pmap) {
8385 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8386 } else {
8387 assert(pmap->type != PMAP_TYPE_NESTED);
8388 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8389 }
8390 }
8391
8392 tmplate |= ARM_PTE_AF;
8393
8394 pte_set_was_writeable(tmplate, false);
8395 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8396 } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8397 tmplate = spte | ARM_PTE_AF;
8398
8399 {
8400 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
8401 }
8402 }
8403
8404 #if MACH_ASSERT && XNU_MONITOR
8405 if (is_pte_xprr_protected(pmap, spte)) {
8406 if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8407 panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8408 "ppnum=0x%x, fault_type=0x%x",
8409 __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8410 ppnum, fault_type);
8411 }
8412 }
8413 #endif /* MACH_ASSERT && XNU_MONITOR */
8414
8415 assert(spte != ARM_PTE_TYPE_FAULT);
8416 if (spte != tmplate) {
8417 if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
8418 #ifdef ARM_PTE_FF_MARKER
8419 assert(!(spte & ARM_PTE_FF_MARKER));
8420 tmplate |= ARM_PTE_FF_MARKER;
8421 ++pass1_updated;
8422 #endif
8423 tlb_flush_needed = true;
8424 }
8425 write_pte_fast(pte_p, tmplate);
8426 result = TRUE;
8427 }
8428
8429 cff_skip_pve_pass1:
8430 pte_p = PT_ENTRY_NULL;
8431 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8432 pve_ptep_idx = 0;
8433 pve_p = pve_next(pve_p);
8434 ++npve;
8435 if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8436 break;
8437 }
8438 }
8439 }
8440
8441 if (!tlb_flush_needed) {
8442 goto cff_finish;
8443 }
8444
8445 FLUSH_PTE_STRONG();
8446
8447 /* Pass 2: Issue any required TLB invalidations */
8448 pve_p = orig_pve_p;
8449 pte_p = orig_pte_p;
8450 pve_ptep_idx = 0;
8451 npve = 0;
8452
8453 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8454 if (pve_p != PV_ENTRY_NULL) {
8455 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8456 if (pte_p == PT_ENTRY_NULL) {
8457 goto cff_skip_pve_pass2;
8458 }
8459 }
8460
8461 #ifdef PVH_FLAG_IOMMU
8462 if (pvh_ptep_is_iommu(pte_p)) {
8463 goto cff_skip_pve_pass2;
8464 }
8465 #endif
8466
8467 #ifdef ARM_PTE_FF_MARKER
8468 pt_entry_t spte = *pte_p;
8469
8470 if (!(spte & ARM_PTE_FF_MARKER)) {
8471 goto cff_skip_pve_pass2;
8472 } else {
8473 spte &= (~ARM_PTE_FF_MARKER);
8474 /* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8475 write_pte_fast(pte_p, spte);
8476 ++pass2_updated;
8477 }
8478 #endif
8479 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8480 const pmap_t pmap = ptdp->pmap;
8481 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8482
8483 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8484
8485 cff_skip_pve_pass2:
8486 pte_p = PT_ENTRY_NULL;
8487 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8488 pve_ptep_idx = 0;
8489 pve_p = pve_next(pve_p);
8490 ++npve;
8491 if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8492 break;
8493 }
8494 }
8495 }
8496
8497 cff_finish:
8498 if (__improbable(pass1_updated != pass2_updated)) {
8499 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8500 __func__, pass1_updated, pass2_updated);
8501 }
8502 if (tlb_flush_needed) {
8503 sync_tlb_flush();
8504 }
8505 return result;
8506 }
8507
8508 /*
8509 * Determine if the fault was induced by software tracking of
8510 * modify/reference bits. If so, re-enable the mapping (and set
8511 * the appropriate bits).
8512 *
8513 * Returns KERN_SUCCESS if the fault was induced and was
8514 * successfully handled.
8515 *
8516 * Returns KERN_FAILURE if the fault was not induced and
8517 * the function was unable to deal with it.
8518 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8520 * disallows this type of access.
8521 */
8522 MARK_AS_PMAP_TEXT kern_return_t
8523 arm_fast_fault_internal(
8524 pmap_t pmap,
8525 vm_map_address_t va,
8526 vm_prot_t fault_type,
8527 __unused bool was_af_fault,
8528 __unused bool from_user)
8529 {
8530 kern_return_t result = KERN_FAILURE;
8531 pt_entry_t *ptep;
8532 pt_entry_t spte = ARM_PTE_TYPE_FAULT;
8533 unsigned int pai;
8534 pmap_paddr_t pa;
8535 validate_pmap_mutable(pmap);
8536
8537 pmap_lock(pmap, PMAP_LOCK_SHARED);
8538
8539 /*
8540 * If the entry doesn't exist, is completely invalid, or is already
8541 * valid, we can't fix it here.
8542 */
8543
8544 const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8545 ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8546 if (ptep != PT_ENTRY_NULL) {
8547 while (true) {
8548 spte = *((volatile pt_entry_t*)ptep);
8549
8550 pa = pte_to_pa(spte);
8551
8552 if ((spte == ARM_PTE_TYPE_FAULT) ||
8553 ARM_PTE_IS_COMPRESSED(spte, ptep)) {
8554 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8555 return result;
8556 }
8557
8558 if (!pa_valid(pa)) {
8559 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8560 #if XNU_MONITOR
8561 if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
8562 return KERN_PROTECTION_FAILURE;
8563 } else
8564 #endif
8565 return result;
8566 }
8567 pai = pa_index(pa);
8568 pvh_lock(pai);
8569 if (*ptep == spte) {
8570 /*
8571 * Double-check the spte value, as we care about the AF bit.
8572 * It's also possible that pmap_page_protect() transitioned the
8573 * PTE to compressed/empty before we grabbed the PVH lock.
8574 */
8575 break;
8576 }
8577 pvh_unlock(pai);
8578 }
8579 } else {
8580 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8581 return result;
8582 }
8583
8584
8585 if ((result != KERN_SUCCESS) &&
8586 ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
8587 /*
8588 * An attempted access will always clear ref/mod fault state, as
8589 * appropriate for the fault type. arm_clear_fast_fault will
8590 * update the associated PTEs for the page as appropriate; if
8591 * any PTEs are updated, we redrive the access. If the mapping
8592 * does not actually allow for the attempted access, the
8593 * following fault will (hopefully) fail to update any PTEs, and
8594 * thus cause arm_fast_fault to decide that it failed to handle
8595 * the fault.
8596 */
8597 if (ppattr_test_reffault(pai)) {
8598 ppattr_clear_reffault(pai);
8599 }
8600 if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
8601 ppattr_clear_modfault(pai);
8602 }
8603
8604 if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
8605 /*
8606 * Should this preserve KERN_PROTECTION_FAILURE? The
			 * cost of not doing so is another fault in a case
8608 * that should already result in an exception.
8609 */
8610 result = KERN_SUCCESS;
8611 }
8612 }
8613
8614 /*
8615 * If the PTE already has sufficient permissions, we can report the fault as handled.
8616 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page.
8618 */
8619 if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8620 uintptr_t ap_ro, ap_rw, ap_x;
8621 if (pmap == kernel_pmap) {
8622 ap_ro = ARM_PTE_AP(AP_RONA);
8623 ap_rw = ARM_PTE_AP(AP_RWNA);
8624 ap_x = ARM_PTE_NX;
8625 } else {
8626 ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8627 ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8628 ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8629 }
8630 /*
8631 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8632 * hardware they may be xPRR-protected, in which case they'll be handled
8633 * by the is_pte_xprr_protected() case above. Additionally, the exception
8634 * handling path currently does not call arm_fast_fault() without at least
8635 * VM_PROT_READ in fault_type.
8636 */
8637 if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8638 (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8639 if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8640 result = KERN_SUCCESS;
8641 }
8642 }
8643 }
8644
8645 if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
8646 /*
8647 * A prior arm_clear_fast_fault() operation may have returned early due to
8648 * another pending PV list operation or an excessively large PV list.
8649 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8650 * taking a fault on the same mapping.
8651 */
8652 result = KERN_SUCCESS;
8653 }
8654
8655 pvh_unlock(pai);
8656 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8657 return result;
8658 }
8659
8660 kern_return_t
8661 arm_fast_fault(
8662 pmap_t pmap,
8663 vm_map_address_t va,
8664 vm_prot_t fault_type,
8665 bool was_af_fault,
8666 __unused bool from_user)
8667 {
8668 kern_return_t result = KERN_FAILURE;
8669
8670 if (va < pmap->min || va >= pmap->max) {
8671 return result;
8672 }
8673
8674 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8675 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8676 from_user);
8677
8678 #if (__ARM_VMSA__ == 7)
8679 if (pmap != kernel_pmap) {
8680 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
8681 pmap_t cur_pmap;
8682 pmap_t cur_user_pmap;
8683
8684 cur_pmap = current_pmap();
8685 cur_user_pmap = cpu_data_ptr->cpu_user_pmap;
8686
8687 if ((cur_user_pmap == cur_pmap) && (cur_pmap == pmap)) {
8688 if (cpu_data_ptr->cpu_user_pmap_stamp != pmap->stamp) {
8689 pmap_set_pmap(pmap, current_thread());
8690 result = KERN_SUCCESS;
8691 goto done;
8692 }
8693 }
8694 }
8695 #endif
8696
8697 #if XNU_MONITOR
8698 result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8699 #else
8700 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8701 #endif
8702
8703 #if (__ARM_VMSA__ == 7)
8704 done:
8705 #endif
8706
8707 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8708
8709 return result;
8710 }
8711
8712 void
8713 pmap_copy_page(
8714 ppnum_t psrc,
8715 ppnum_t pdst)
8716 {
8717 bcopy_phys((addr64_t) (ptoa(psrc)),
8718 (addr64_t) (ptoa(pdst)),
8719 PAGE_SIZE);
8720 }
8721
8722
8723 /*
 * pmap_copy_part_page copies part of the specified (machine independent) pages.
8725 */
8726 void
8727 pmap_copy_part_page(
8728 ppnum_t psrc,
8729 vm_offset_t src_offset,
8730 ppnum_t pdst,
8731 vm_offset_t dst_offset,
8732 vm_size_t len)
8733 {
8734 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8735 (addr64_t) (ptoa(pdst) + dst_offset),
8736 len);
8737 }
8738
8739
8740 /*
8741 * pmap_zero_page zeros the specified (machine independent) page.
8742 */
8743 void
8744 pmap_zero_page(
8745 ppnum_t pn)
8746 {
8747 assert(pn != vm_page_fictitious_addr);
8748 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8749 }
8750
8751 /*
8752 * pmap_zero_part_page
8753 * zeros the specified (machine independent) part of a page.
8754 */
8755 void
8756 pmap_zero_part_page(
8757 ppnum_t pn,
8758 vm_offset_t offset,
8759 vm_size_t len)
8760 {
8761 assert(pn != vm_page_fictitious_addr);
8762 assert(offset + len <= PAGE_SIZE);
8763 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8764 }
8765
8766 void
8767 pmap_map_globals(
8768 void)
8769 {
8770 pt_entry_t *ptep, pte;
8771
8772 ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
8773 assert(ptep != PT_ENTRY_NULL);
8774 assert(*ptep == ARM_PTE_EMPTY);
8775
8776 pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
8777 #if __ARM_KERNEL_PROTECT__
8778 pte |= ARM_PTE_NG;
8779 #endif /* __ARM_KERNEL_PROTECT__ */
8780 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8781 #if (__ARM_VMSA__ > 7)
8782 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8783 #else
8784 pte |= ARM_PTE_SH;
8785 #endif
8786 *ptep = pte;
8787 FLUSH_PTE();
8788 PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);
8789
8790 #if KASAN
8791 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8792 #endif
8793 }
8794
8795 vm_offset_t
8796 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8797 {
8798 if (__improbable(index >= CPUWINDOWS_MAX)) {
8799 panic("%s: invalid index %u", __func__, index);
8800 }
8801 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8802 }
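/*
 * Worked example (the CPUWINDOWS_MAX value here is hypothetical): with
 * CPUWINDOWS_MAX == 4, the window VA for cpu_num == 2, index == 1 is
 *
 *   CPUWINDOWS_BASE + PAGE_SIZE * (4 * 2 + 1)
 *
 * i.e. nine pages above CPUWINDOWS_BASE, so each CPU owns a disjoint,
 * contiguous block of CPUWINDOWS_MAX window pages.
 */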
8803
8804 MARK_AS_PMAP_TEXT unsigned int
8805 pmap_map_cpu_windows_copy_internal(
8806 ppnum_t pn,
8807 vm_prot_t prot,
8808 unsigned int wimg_bits)
8809 {
8810 pt_entry_t *ptep = NULL, pte;
8811 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8812 unsigned int cpu_num;
8813 unsigned int i;
8814 vm_offset_t cpu_copywindow_vaddr = 0;
8815 bool need_strong_sync = false;
8816
8817 #if XNU_MONITOR
8818 unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
8819 need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
8820 #endif
8821
8822 #if XNU_MONITOR
8823 #ifdef __ARM_COHERENT_IO__
8824 if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
8825 panic("%s: attempted to map a managed page, "
8826 "pn=%u, prot=0x%x, wimg_bits=0x%x",
8827 __FUNCTION__,
8828 pn, prot, wimg_bits);
8829 }
8830 if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
8831 panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
8832 }
8833
8834 #else /* __ARM_COHERENT_IO__ */
8835 #error CPU copy windows are not properly supported with both the PPL and incoherent IO
8836 #endif /* __ARM_COHERENT_IO__ */
8837 #endif /* XNU_MONITOR */
8838 cpu_num = pmap_cpu_data->cpu_number;
8839
8840 for (i = 0; i < CPUWINDOWS_MAX; i++) {
8841 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
8842 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8843 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
8844 if (*ptep == ARM_PTE_TYPE_FAULT) {
8845 break;
8846 }
8847 }
8848 if (i == CPUWINDOWS_MAX) {
8849 panic("pmap_map_cpu_windows_copy: out of window");
8850 }
8851
8852 pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8853 #if __ARM_KERNEL_PROTECT__
8854 pte |= ARM_PTE_NG;
8855 #endif /* __ARM_KERNEL_PROTECT__ */
8856
8857 pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8858
8859 if (prot & VM_PROT_WRITE) {
8860 pte |= ARM_PTE_AP(AP_RWNA);
8861 } else {
8862 pte |= ARM_PTE_AP(AP_RONA);
8863 }
8864
8865 write_pte_fast(ptep, pte);
/*
 * Invalidate the TLB. This also covers nested use of cpu_copywindow_vaddr when we have
 * interrupted a pmap_unmap_cpu_windows_copy() that had already cleared the PTE but had
 * not yet issued its own TLB invalidate.
 */
8870 FLUSH_PTE_STRONG();
8871 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
8872 pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;
8873
8874 return i;
8875 }
8876
8877 unsigned int
8878 pmap_map_cpu_windows_copy(
8879 ppnum_t pn,
8880 vm_prot_t prot,
8881 unsigned int wimg_bits)
8882 {
8883 #if XNU_MONITOR
8884 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8885 #else
8886 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8887 #endif
8888 }
8889
8890 MARK_AS_PMAP_TEXT void
8891 pmap_unmap_cpu_windows_copy_internal(
8892 unsigned int index)
8893 {
8894 pt_entry_t *ptep;
8895 unsigned int cpu_num;
8896 vm_offset_t cpu_copywindow_vaddr = 0;
8897 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8898
8899 cpu_num = pmap_cpu_data->cpu_number;
8900
8901 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8902 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8903 * (which are likely to have been on I/O memory) are complete before
8904 * tearing down the mapping. */
8905 __builtin_arm_dsb(DSB_SY);
8906 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8907 write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8908 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8909 }
8910
8911 void
8912 pmap_unmap_cpu_windows_copy(
8913 unsigned int index)
8914 {
8915 #if XNU_MONITOR
8916 return pmap_unmap_cpu_windows_copy_ppl(index);
8917 #else
8918 return pmap_unmap_cpu_windows_copy_internal(index);
8919 #endif
8920 }
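/*
 * Illustrative sketch of how a caller might use the copy windows to touch a
 * physical page without a physical-aperture mapping (the caller names and the
 * surrounding interrupt/preemption handling are assumptions, not taken from
 * this file):
 *
 *   unsigned int idx = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg_bits);
 *   vm_offset_t  win = pmap_cpu_windows_copy_addr(cpu_number(), idx);
 *   bcopy(src, (void *)win, PAGE_SIZE);
 *   pmap_unmap_cpu_windows_copy(idx);
 *
 * Because the window belongs to the current CPU, the caller must not migrate
 * between the map and unmap calls.
 */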
8921
8922 #if XNU_MONITOR
8923
8924 MARK_AS_PMAP_TEXT void
8925 pmap_invoke_with_page(
8926 ppnum_t page_number,
8927 void *ctx,
8928 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8929 {
8930 #pragma unused(page_number, ctx, callback)
8931 }
8932
8933 /*
8934 * Loop over every pmap_io_range (I/O ranges marked as owned by
8935 * the PPL in the device tree) and conditionally call callback() on each range
8936 * that needs to be included in the hibernation image.
8937 *
8938 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8939 * context is needed in the callback.
* @param callback Callback function invoked on each range whose wimg flags include
*                 PMAP_IO_RANGE_NEEDS_HIBERNATING.
8941 */
8942 MARK_AS_PMAP_TEXT void
8943 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8944 {
8945 extern const pmap_io_range_t* io_attr_table;
8946 extern const unsigned int num_io_rgns;
8947 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8948 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8949 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8950 }
8951 }
8952 }
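/*
 * Minimal sketch of a pmap_hibernate_invoke() callback (the accumulator and its
 * use are hypothetical): tally how many bytes of PPL-owned I/O ranges must be
 * written into the hibernation image.
 *
 *   static void
 *   hib_count_bytes(void *ctx, uint64_t addr __unused, uint64_t len)
 *   {
 *           *(uint64_t *)ctx += len;
 *   }
 *
 *   uint64_t total = 0;
 *   pmap_hibernate_invoke(&total, hib_count_bytes);
 */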
8953
8954 /**
8955 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8956 * PPL-owned page. Otherwise, do nothing.
8957 *
8958 * @param addr Physical address of the page to set the HASHED flag on.
8959 */
8960 MARK_AS_PMAP_TEXT void
8961 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8962 {
8963 /* Ignore non-managed kernel memory. */
8964 if (!pa_valid(addr)) {
8965 return;
8966 }
8967
8968 const unsigned int pai = pa_index(addr);
8969 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8970 pv_entry_t **pv_h = pai_to_pvh(pai);
8971
8972 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8973 pvh_lock(pai);
8974 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8975 pvh_unlock(pai);
8976 }
8977 }
8978
8979 /**
8980 * Loop through every physical page in the system and clear out the HASHED flag
8981 * on every PPL-owned page. That flag is used to keep track of which pages have
8982 * been hashed into the hibernation image during the hibernation entry process.
8983 *
8984 * The HASHED flag needs to be cleared out between hibernation cycles because the
* pv_head_table and pp_attr_table might have been copied into the hibernation
8986 * image with the HASHED flag set on certain pages. It's important to clear the
8987 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8988 * into the hibernation image can't be compromised across hibernation cycles.
8989 */
8990 MARK_AS_PMAP_TEXT void
8991 pmap_clear_ppl_hashed_flag_all(void)
8992 {
8993 const unsigned int last_index = pa_index(vm_last_phys);
8994 pv_entry_t **pv_h = NULL;
8995
8996 for (int pai = 0; pai < last_index; ++pai) {
8997 pv_h = pai_to_pvh(pai);
8998
/* Test for PPL-owned pages that have the HASHED flag set in their pv_head_table entries. */
9000 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
9001 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
9002 pvh_lock(pai);
9003 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
9004 pvh_unlock(pai);
9005 }
9006 }
9007 }
9008
9009 /**
9010 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9011 * ppl_hib driver will call this after all wired pages have been copied into the
9012 * hibernation image.
9013 */
9014 MARK_AS_PMAP_TEXT void
9015 pmap_check_ppl_hashed_flag_all(void)
9016 {
9017 const unsigned int last_index = pa_index(vm_last_phys);
9018 pv_entry_t **pv_h = NULL;
9019
9020 for (int pai = 0; pai < last_index; ++pai) {
9021 pv_h = pai_to_pvh(pai);
9022
9023 /**
9024 * The PMAP stacks are explicitly not saved into the image so skip checking
9025 * the pages that contain the PMAP stacks.
9026 */
9027 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9028 (pai < pa_index(pmap_stacks_end_pa));
9029
9030 if (!is_pmap_stack &&
9031 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9032 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9033 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9034 }
9035 }
9036 }
9037
9038 #endif /* XNU_MONITOR */
9039
9040 /*
9041 * Indicate that a pmap is intended to be used as a nested pmap
9042 * within one or more larger address spaces. This must be set
9043 * before pmap_nest() is called with this pmap as the 'subordinate'.
9044 */
9045 MARK_AS_PMAP_TEXT void
9046 pmap_set_nested_internal(
9047 pmap_t pmap)
9048 {
9049 validate_pmap_mutable(pmap);
9050 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
9051 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
9052 __func__, pmap, pmap->type);
9053 }
9054 pmap->type = PMAP_TYPE_NESTED;
9055 pmap_get_pt_ops(pmap)->free_id(pmap);
9056 }
9057
9058 void
9059 pmap_set_nested(
9060 pmap_t pmap)
9061 {
9062 #if XNU_MONITOR
9063 pmap_set_nested_ppl(pmap);
9064 #else
9065 pmap_set_nested_internal(pmap);
9066 #endif
9067 }
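/*
 * Ordering sketch (the caller names and flags below are assumptions): a pmap
 * must be marked nested before it is ever passed as the 'subordinate' argument
 * of pmap_nest().
 *
 *   pmap_t sub = pmap_create_options(ledger, 0, PMAP_CREATE_64BIT);
 *   pmap_set_nested(sub);                      // convert to PMAP_TYPE_NESTED
 *   kr = pmap_nest(grand, sub, region_base, region_size);
 */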
9068
9069 /*
9070 * pmap_trim_range(pmap, start, end)
9071 *
9072 * pmap = pmap to operate on
9073 * start = start of the range
9074 * end = end of the range
9075 *
* Attempts to deallocate TTEs for the given range within the pmap's nested region.
9077 */
9078 MARK_AS_PMAP_TEXT static void
9079 pmap_trim_range(
9080 pmap_t pmap,
9081 addr64_t start,
9082 addr64_t end)
9083 {
9084 addr64_t cur;
9085 addr64_t nested_region_start;
9086 addr64_t nested_region_end;
9087 addr64_t adjusted_start;
9088 addr64_t adjusted_end;
9089 addr64_t adjust_offmask;
9090 tt_entry_t * tte_p;
9091 pt_entry_t * pte_p;
9092 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9093
9094 if (__improbable(end < start)) {
9095 panic("%s: invalid address range, "
9096 "pmap=%p, start=%p, end=%p",
9097 __func__,
9098 pmap, (void*)start, (void*)end);
9099 }
9100
9101 nested_region_start = pmap->nested_region_addr;
9102 nested_region_end = nested_region_start + pmap->nested_region_size;
9103
9104 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9105 panic("%s: range outside nested region %p-%p, "
9106 "pmap=%p, start=%p, end=%p",
9107 __func__, (void *)nested_region_start, (void *)nested_region_end,
9108 pmap, (void*)start, (void*)end);
9109 }
9110
9111 /* Contract the range to TT page boundaries. */
9112 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9113 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9114 adjusted_end = end & ~adjust_offmask;
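/*
 * Worked example (assuming a 4K translation granule where each leaf table
 * maps 2MB, i.e. adjust_offmask == 0x1FFFFF):
 *   start = 0x1001000 -> adjusted_start = 0x1200000 (rounded up)
 *   end   = 0x1FFF000 -> adjusted_end   = 0x1E00000 (rounded down)
 * Only leaf tables wholly contained in [start, end) are candidates for
 * removal; partially covered tables at either edge are left in place.
 */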
9115
9116 /* Iterate over the range, trying to remove TTEs. */
9117 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
9118 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9119
9120 tte_p = pmap_tte(pmap, cur);
9121
9122 if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
9123 pte_p = (pt_entry_t *) ttetokv(*tte_p);
9124
9125 /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
9126 if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
9127 /* Deallocate for the nested map. */
9128 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9129 } else if (pmap->type == PMAP_TYPE_USER) {
9130 /**
9131 * Just remove for the parent map. If the leaf table pointed
9132 * to by the TTE being removed (owned by the nested pmap)
9133 * has any mappings, then this call will panic. This
9134 * enforces the policy that tables being trimmed must be
9135 * empty to prevent possible use-after-free attacks.
9136 */
9137 pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9138 } else {
9139 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9140 }
9141 } else {
9142 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9143 }
9144 }
9145
9146 #if (__ARM_VMSA__ > 7)
9147 /* Remove empty L2 TTs. */
9148 adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9149 adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
9150
9151 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
9152 /* For each L1 entry in our range... */
9153 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9154
9155 bool remove_tt1e = true;
9156 tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9157 tt_entry_t * tt2e_start;
9158 tt_entry_t * tt2e_end;
9159 tt_entry_t * tt2e_p;
9160 tt_entry_t tt1e;
9161
9162 if (tt1e_p == NULL) {
9163 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9164 continue;
9165 }
9166
9167 tt1e = *tt1e_p;
9168
9169 if (tt1e == ARM_TTE_TYPE_FAULT) {
9170 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9171 continue;
9172 }
9173
9174 tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9175 tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9176
9177 for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9178 if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9179 /*
9180 * If any TTEs are populated, don't remove the
9181 * L1 TT.
9182 */
9183 remove_tt1e = false;
9184 }
9185 }
9186
9187 if (remove_tt1e) {
9188 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9189 } else {
9190 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9191 }
9192 }
9193 #endif /* (__ARM_VMSA__ > 7) */
9194 }
9195
9196 /*
9197 * pmap_trim_internal(grand, subord, vstart, size)
9198 *
9199 * grand = pmap subord is nested in
9200 * subord = nested pmap
9201 * vstart = start of the used range in grand
9202 * size = size of the used range
9203 *
9204 * Attempts to trim the shared region page tables down to only cover the given
9205 * range in subord and grand.
9206 */
9207 MARK_AS_PMAP_TEXT void
9208 pmap_trim_internal(
9209 pmap_t grand,
9210 pmap_t subord,
9211 addr64_t vstart,
9212 uint64_t size)
9213 {
9214 addr64_t vend;
9215 addr64_t adjust_offmask;
9216
9217 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9218 panic("%s: grand addr wraps around, "
9219 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9220 __func__, grand, subord, (void*)vstart, size);
9221 }
9222
9223 validate_pmap_mutable(grand);
9224 validate_pmap(subord);
9225
9226 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9227
9228 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9229
9230 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9231 panic("%s: subord is of non-nestable type 0x%hhx, "
9232 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9233 __func__, subord->type, grand, subord, (void*)vstart, size);
9234 }
9235
9236 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9237 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9238 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9239 __func__, grand->type, grand, subord, (void*)vstart, size);
9240 }
9241
9242 if (__improbable(grand->nested_pmap != subord)) {
9243 panic("%s: grand->nested != subord, "
9244 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9245 __func__, grand, subord, (void*)vstart, size);
9246 }
9247
9248 if (__improbable((size != 0) &&
9249 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9250 panic("%s: grand range not in nested region, "
9251 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9252 __func__, grand, subord, (void*)vstart, size);
9253 }
9254
9255
9256 if (!grand->nested_has_no_bounds_ref) {
9257 assert(subord->nested_bounds_set);
9258
9259 if (!grand->nested_bounds_set) {
9260 /* Inherit the bounds from subord. */
9261 grand->nested_region_true_start = subord->nested_region_true_start;
9262 grand->nested_region_true_end = subord->nested_region_true_end;
9263 grand->nested_bounds_set = true;
9264 }
9265
9266 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9267 return;
9268 }
9269
9270 if ((!subord->nested_bounds_set) && size) {
9271 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9272
9273 subord->nested_region_true_start = vstart;
9274 subord->nested_region_true_end = vend;
9275 subord->nested_region_true_start &= ~adjust_offmask;
9276
9277 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9278 panic("%s: padded true end wraps around, "
9279 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9280 __func__, grand, subord, (void*)vstart, size);
9281 }
9282
9283 subord->nested_region_true_end &= ~adjust_offmask;
9284 subord->nested_bounds_set = true;
9285 }
9286
9287 if (subord->nested_bounds_set) {
9288 /* Inherit the bounds from subord. */
9289 grand->nested_region_true_start = subord->nested_region_true_start;
9290 grand->nested_region_true_end = subord->nested_region_true_end;
9291 grand->nested_bounds_set = true;
9292
9293 /* If we know the bounds, we can trim the pmap. */
9294 grand->nested_has_no_bounds_ref = false;
9295 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9296 } else {
9297 /* Don't trim if we don't know the bounds. */
9298 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9299 return;
9300 }
9301
9302 /* Trim grand to only cover the given range. */
9303 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9304 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9305
9306 /* Try to trim subord. */
9307 pmap_trim_subord(subord);
9308 }
9309
9310 MARK_AS_PMAP_TEXT static void
9311 pmap_trim_self(pmap_t pmap)
9312 {
9313 if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
9314 /* If we have a no bounds ref, we need to drop it. */
9315 pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9316 pmap->nested_has_no_bounds_ref = false;
9317 boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9318 vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9319 vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9320 pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9321
9322 if (nested_bounds_set) {
9323 pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9324 pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9325 }
9326 /*
9327 * Try trimming the nested pmap, in case we had the
9328 * last reference.
9329 */
9330 pmap_trim_subord(pmap->nested_pmap);
9331 }
9332 }
9333
/*
 * pmap_trim_subord(subord)
 *
 * subord = nested pmap we are attempting to trim
 *
 * Drops one no-bounds reference on subord and trims it down to its true
 * bounds if that was the last such reference and the bounds are known.
 */
9342 MARK_AS_PMAP_TEXT static void
9343 pmap_trim_subord(pmap_t subord)
9344 {
9345 bool contract_subord = false;
9346
9347 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9348
9349 subord->nested_no_bounds_refcnt--;
9350
9351 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9352 /* If this was the last no bounds reference, trim subord. */
9353 contract_subord = true;
9354 }
9355
9356 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9357
9358 if (contract_subord) {
9359 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9360 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9361 }
9362 }
9363
9364 void
9365 pmap_trim(
9366 pmap_t grand,
9367 pmap_t subord,
9368 addr64_t vstart,
9369 uint64_t size)
9370 {
9371 #if XNU_MONITOR
9372 pmap_trim_ppl(grand, subord, vstart, size);
9373
9374 pmap_ledger_check_balance(grand);
9375 pmap_ledger_check_balance(subord);
9376 #else
9377 pmap_trim_internal(grand, subord, vstart, size);
9378 #endif
9379 }
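/*
 * Illustrative sketch of a pmap_trim() caller (names hypothetical): once the
 * VM layer knows the sub-range of the shared region that is actually in use,
 * it can shrink both the task pmap and the shared-region pmap so that twig
 * tables outside that range are released.
 *
 *   pmap_trim(task_pmap, shared_region_pmap, used_start, used_size);
 */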
9380
9381 #if HAS_APPLE_PAC
9382 void *
9383 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9384 {
9385 void *res = NULL;
9386 uint64_t current_intr_state = pmap_interrupts_disable();
9387
9388 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9389 switch (key) {
9390 case ptrauth_key_asia:
9391 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9392 break;
9393 case ptrauth_key_asda:
9394 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9395 break;
9396 default:
9397 panic("attempt to sign user pointer without process independent key");
9398 }
9399 ml_disable_user_jop_key(jop_key, saved_jop_state);
9400
9401 pmap_interrupts_restore(current_intr_state);
9402
9403 return res;
9404 }
9405
9406 void *
9407 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9408 {
9409 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9410 }
9411
9412 void *
9413 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9414 {
9415 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9416 panic("attempt to auth user pointer without process independent key");
9417 }
9418
9419 void *res = NULL;
9420 uint64_t current_intr_state = pmap_interrupts_disable();
9421
9422 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9423 res = ml_auth_ptr_unchecked(value, key, discriminator);
9424 ml_disable_user_jop_key(jop_key, saved_jop_state);
9425
9426 pmap_interrupts_restore(current_intr_state);
9427
9428 return res;
9429 }
9430
9431 void *
9432 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9433 {
9434 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9435 }
9436 #endif /* HAS_APPLE_PAC */
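/*
 * Illustrative sketch (the discriminator, slot, and JOP-key values are
 * hypothetical): sign a user-space pointer with the process-independent IA key
 * under the task's JOP key, then authenticate it again with the same key and
 * discriminator.
 *
 *   uint64_t disc = ptrauth_blend_discriminator(slot_addr, 0x1234);
 *   void *signed_ptr  = pmap_sign_user_ptr(raw_ptr, ptrauth_key_asia, disc, task_jop_key);
 *   void *checked_ptr = pmap_auth_user_ptr(signed_ptr, ptrauth_key_asia, disc, task_jop_key);
 */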
9437
9438 /*
9439 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9440 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9441 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9442 * return value, to indicate where a preempted [un]nest operation should resume.
9443 * When the return value contains the ending address of the nested region with
9444 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9445 */
9446 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
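/*
 * Example of the restart encoding (addresses hypothetical): nesting the range
 * [0x180000000, 0x1c0000000) might first return 0x190000000 (resume in the
 * subordinate-expansion phase at that VA), later return
 * 0x1a0000000 | PMAP_NEST_GRAND (resume the TTE copy into 'grand' at
 * 0x1a0000000), and finally return 0x1c0000000 | PMAP_NEST_GRAND, which marks
 * completion. Twig alignment of the addresses keeps bit 0 free for the flag.
 */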
9447
9448 /*
9449 * kern_return_t pmap_nest(grand, subord, vstart, size)
9450 *
9451 * grand = the pmap that we will nest subord into
9452 * subord = the pmap that goes into the grand
9453 * vstart = start of range in pmap to be inserted
9454 * size = Size of nest area (up to 16TB)
9455 *
9456 * Inserts a pmap into another. This is used to implement shared segments.
9457 *
9458 */
9459
9460 /**
9461 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9462 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9463 * This function operates in 3 main phases:
9464 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9465 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9466 * the mapping range are present in subord.
9467 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9468 * contains pointers to subord's leaf-level pagetable pages for the specified
9469 * VA range.
9470 *
9471 * This function may return early due to pending AST_URGENT preemption; if so
9472 * it will indicate the need to be re-entered.
9473 *
9474 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9475 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9476 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9477 * @param size twig-aligned size of the nesting range
9478 * @param vrestart the twig-aligned starting address of the current call. May contain
9479 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9480 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9481 * KERN_RESOURCE_SHORTAGE on allocation failure.
9482 *
9483 * @return the virtual address at which to restart the operation, possibly including
9484 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9485 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9486 */
9487 MARK_AS_PMAP_TEXT vm_map_offset_t
9488 pmap_nest_internal(
9489 pmap_t grand,
9490 pmap_t subord,
9491 addr64_t vstart,
9492 uint64_t size,
9493 vm_map_offset_t vrestart,
9494 kern_return_t *krp)
9495 {
9496 kern_return_t kr = KERN_FAILURE;
9497 vm_map_offset_t vaddr;
9498 tt_entry_t *stte_p;
9499 tt_entry_t *gtte_p;
9500 unsigned int nested_region_asid_bitmap_size;
9501 unsigned int* nested_region_asid_bitmap;
9502 int expand_options = 0;
9503 bool deref_subord = true;
9504
9505 addr64_t vend;
9506 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9507 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9508 }
9509 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9510 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9511 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9512 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9513 }
9514
9515 assert(krp != NULL);
9516 validate_pmap_mutable(grand);
9517 validate_pmap(subord);
9518 #if XNU_MONITOR
9519 /*
9520 * Ordering is important here. validate_pmap() has already ensured subord is a
9521 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9522 * be in the process of being destroyed. If destruction is already committed,
9523 * then the check of ref_count below will cover us. If destruction is initiated
9524 * during or after this call, then pmap_destroy() will catch the non-zero
9525 * nested_count.
9526 */
9527 os_atomic_inc(&subord->nested_count, relaxed);
9528 os_atomic_thread_fence(seq_cst);
9529 #endif
9530 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9531 panic("%s: invalid subordinate pmap %p", __func__, subord);
9532 }
9533
9534 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9535 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9536 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9537 }
9538
9539 #if XNU_MONITOR
9540 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9541 #endif
9542
9543 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9544 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9545 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9546 grand, vstart, size, (unsigned long long)vrestart);
9547 }
9548
9549 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9550 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9551 }
9552
9553 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9554 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9555 }
9556
9557 if (subord->nested_region_asid_bitmap == NULL) {
9558 nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
9559
9560 #if XNU_MONITOR
9561 pmap_paddr_t pa = 0;
9562
9563 if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9564 panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
9565 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9566 __FUNCTION__, nested_region_asid_bitmap_size,
9567 grand, subord, vstart, size);
9568 }
9569
9570 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9571
9572 if (kr != KERN_SUCCESS) {
9573 goto nest_cleanup;
9574 }
9575
9576 assert(pa);
9577
9578 nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9579 #else
9580 nested_region_asid_bitmap = kalloc_data(
9581 nested_region_asid_bitmap_size * sizeof(unsigned int),
9582 Z_WAITOK | Z_ZERO);
9583 #endif
9584
9585 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9586 if (subord->nested_region_asid_bitmap == NULL) {
9587 subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
9588 subord->nested_region_addr = vstart;
9589 subord->nested_region_size = (mach_vm_offset_t) size;
9590
9591 /**
9592 * Ensure that the rest of the subord->nested_region_* fields are
9593 * initialized and visible before setting the nested_region_asid_bitmap
9594 * field (which is used as the flag to say that the rest are initialized).
9595 */
9596 __builtin_arm_dmb(DMB_ISHST);
9597 subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
9598 nested_region_asid_bitmap = NULL;
9599 }
9600 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9601 if (nested_region_asid_bitmap != NULL) {
9602 #if XNU_MONITOR
9603 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9604 #else
9605 kfree_data(nested_region_asid_bitmap,
9606 nested_region_asid_bitmap_size * sizeof(unsigned int));
9607 #endif
9608 }
9609 }
9610
9611 /**
9612 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9613 * speculated before their initialization.
9614 */
9615 __builtin_arm_dmb(DMB_ISHLD);
9616
9617 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9618 uint64_t new_size;
9619 unsigned int new_nested_region_asid_bitmap_size;
9620 unsigned int* new_nested_region_asid_bitmap;
9621
9622 nested_region_asid_bitmap = NULL;
9623 nested_region_asid_bitmap_size = 0;
9624 new_size = vend - subord->nested_region_addr;
9625
9626 /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
9627 new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
9628
9629 #if XNU_MONITOR
9630 pmap_paddr_t pa = 0;
9631
9632 if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9633 panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
9634 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9635 __FUNCTION__, new_nested_region_asid_bitmap_size,
9636 grand, subord, vstart, new_size);
9637 }
9638
9639 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9640
9641 if (kr != KERN_SUCCESS) {
9642 goto nest_cleanup;
9643 }
9644
9645 assert(pa);
9646
9647 new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9648 #else
9649 new_nested_region_asid_bitmap = kalloc_data(
9650 new_nested_region_asid_bitmap_size * sizeof(unsigned int),
9651 Z_WAITOK | Z_ZERO);
9652 #endif
9653 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9654 if (subord->nested_region_size < new_size) {
9655 bcopy(subord->nested_region_asid_bitmap,
9656 new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
9657 nested_region_asid_bitmap_size = subord->nested_region_asid_bitmap_size;
9658 nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
9659 subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
9660 subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
9661 subord->nested_region_size = new_size;
9662 new_nested_region_asid_bitmap = NULL;
9663 }
9664 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9665 if (nested_region_asid_bitmap != NULL) {
9666 #if XNU_MONITOR
9667 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9668 #else
9669 kfree_data(nested_region_asid_bitmap,
9670 nested_region_asid_bitmap_size * sizeof(unsigned int));
9671 #endif
9672 }
9673 if (new_nested_region_asid_bitmap != NULL) {
9674 #if XNU_MONITOR
9675 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9676 #else
9677 kfree_data(new_nested_region_asid_bitmap,
9678 new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9679 #endif
9680 }
9681 }
9682
9683 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9684
9685 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9686 /*
9687 * If this is grand's first nesting operation, keep the reference on subord.
9688 * It will be released by pmap_destroy_internal() when grand is destroyed.
9689 */
9690 deref_subord = false;
9691
9692 if (!subord->nested_bounds_set) {
9693 /*
9694 * We are nesting without the shared regions bounds
9695 * being known. We'll have to trim the pmap later.
9696 */
9697 grand->nested_has_no_bounds_ref = true;
9698 subord->nested_no_bounds_refcnt++;
9699 }
9700
9701 grand->nested_region_addr = vstart;
9702 grand->nested_region_size = (mach_vm_offset_t) size;
9703 } else {
9704 if (__improbable(grand->nested_pmap != subord)) {
9705 panic("pmap_nest() pmap %p has a nested pmap", grand);
9706 } else if (__improbable(grand->nested_region_addr > vstart)) {
9707 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9708 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9709 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9710 }
9711 }
9712
9713 vaddr = vrestart & ~PMAP_NEST_GRAND;
9714 if (vaddr < subord->nested_region_true_start) {
9715 vaddr = subord->nested_region_true_start;
9716 }
9717
9718 addr64_t true_end = vend;
9719 if (true_end > subord->nested_region_true_end) {
9720 true_end = subord->nested_region_true_end;
9721 }
9722 __unused unsigned int ttecount = 0;
9723
9724 if (vrestart & PMAP_NEST_GRAND) {
9725 goto nest_grand;
9726 }
9727 #if (__ARM_VMSA__ == 7)
9728
9729 while (vaddr < true_end) {
9730 stte_p = pmap_tte(subord, vaddr);
9731 if ((stte_p == (tt_entry_t *)NULL) || (((*stte_p) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) {
9732 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9733 kr = pmap_expand(subord, vaddr, expand_options, PMAP_TT_L2_LEVEL);
9734
9735 if (kr != KERN_SUCCESS) {
9736 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9737 goto done;
9738 }
9739
9740 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9741 }
9742 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9743 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9744 stte_p = pmap_tte(grand, vaddr);
9745 if (stte_p == (tt_entry_t *)NULL) {
9746 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9747 kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_L1_LEVEL);
9748
9749 if (kr != KERN_SUCCESS) {
9750 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9751 goto done;
9752 }
9753 } else {
9754 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9755 kr = KERN_SUCCESS;
9756 }
9757 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9758 vaddr += ARM_TT_L1_SIZE;
9759 vrestart = vaddr;
9760 }
9761
9762 #else
9763 while (vaddr < true_end) {
9764 stte_p = pmap_tte(subord, vaddr);
9765 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9766 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9767 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9768
9769 if (kr != KERN_SUCCESS) {
9770 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9771 goto done;
9772 }
9773
9774 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9775 }
9776 vaddr += pt_attr_twig_size(pt_attr);
9777 vrestart = vaddr;
9778 ++ttecount;
9779 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9780 pmap_pending_preemption())) {
9781 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9782 kr = KERN_SUCCESS;
9783 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9784 goto done;
9785 }
9786 }
9787 #endif
9788 /*
9789 * copy TTEs from subord pmap into grand pmap
9790 */
9791
9792 vaddr = (vm_map_offset_t) vstart;
9793 if (vaddr < subord->nested_region_true_start) {
9794 vaddr = subord->nested_region_true_start;
9795 }
9796 vrestart = vaddr | PMAP_NEST_GRAND;
9797
9798 nest_grand:
9799 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9800 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9801 #if (__ARM_VMSA__ == 7)
9802 while (vaddr < true_end) {
9803 stte_p = pmap_tte(subord, vaddr);
9804 gtte_p = pmap_tte(grand, vaddr);
9805 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9806 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9807 __func__, gtte_p, grand);
9808 }
9809 *gtte_p = *stte_p;
9810 vaddr += ARM_TT_L1_SIZE;
9811 }
9812 vrestart = vaddr | PMAP_NEST_GRAND;
9813 #else
9814 while (vaddr < true_end) {
9815 stte_p = pmap_tte(subord, vaddr);
9816 gtte_p = pmap_tte(grand, vaddr);
9817 if (gtte_p == PT_ENTRY_NULL) {
9818 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9819 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9820 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9821
9822 if (kr != KERN_SUCCESS) {
9823 goto done;
9824 }
9825
9826 gtte_p = pmap_tt2e(grand, vaddr);
9827 }
9828 /* Don't leak a page table page. Don't violate break-before-make. */
9829 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9830 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9831 __func__, gtte_p, grand);
9832 }
9833 *gtte_p = *stte_p;
9834
9835 vaddr += pt_attr_twig_size(pt_attr);
9836 vrestart = vaddr | PMAP_NEST_GRAND;
9837 ++ttecount;
9838 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9839 pmap_pending_preemption())) {
9840 break;
9841 }
9842 }
9843 #endif
9844 if (vaddr >= true_end) {
9845 vrestart = vend | PMAP_NEST_GRAND;
9846 }
9847
9848 kr = KERN_SUCCESS;
9849 done:
9850
9851 FLUSH_PTE();
9852 __builtin_arm_isb(ISB_SY);
9853
9854 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9855 #if XNU_MONITOR
9856 nest_cleanup:
9857 if (kr != KERN_SUCCESS) {
9858 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9859 *krp = kr;
9860 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9861 }
9862 #else
9863 if (kr != KERN_SUCCESS) {
9864 *krp = kr;
9865 }
9866 #endif
9867 if (deref_subord) {
9868 #if XNU_MONITOR
9869 os_atomic_dec(&subord->nested_count, relaxed);
9870 #endif
9871 pmap_destroy_internal(subord);
9872 }
9873 return vrestart;
9874 }
9875
9876 kern_return_t
9877 pmap_nest(
9878 pmap_t grand,
9879 pmap_t subord,
9880 addr64_t vstart,
9881 uint64_t size)
9882 {
9883 kern_return_t kr = KERN_SUCCESS;
9884 vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
9885 vm_map_offset_t vend = vaddr + size;
9886 __unused vm_map_offset_t vlast = vaddr;
9887
9888 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9889 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9890 VM_KERNEL_ADDRHIDE(vstart));
9891
9892 pmap_verify_preemptible();
9893 #if XNU_MONITOR
9894 while (vaddr != (vend | PMAP_NEST_GRAND)) {
9895 vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
9896 if (kr == KERN_RESOURCE_SHORTAGE) {
9897 pmap_alloc_page_for_ppl(0);
9898 kr = KERN_SUCCESS;
9899 } else if (kr != KERN_SUCCESS) {
9900 break;
9901 } else if (vaddr == vlast) {
9902 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9903 __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
9904 }
9905 vlast = vaddr;
9906 }
9907
9908 pmap_ledger_check_balance(grand);
9909 pmap_ledger_check_balance(subord);
9910 #else
9911 while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
9912 vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
9913 }
9914 #endif
9915
9916 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9917
9918 return kr;
9919 }
9920
/*
 * kern_return_t pmap_unnest(grand, vaddr, size)
 *
 * grand = the pmap that will have the virtual range unnested
 * vaddr = start of the range in grand to be unnested
 * size = size of the range in grand to be unnested
 */
9929
9930 kern_return_t
9931 pmap_unnest(
9932 pmap_t grand,
9933 addr64_t vaddr,
9934 uint64_t size)
9935 {
9936 return pmap_unnest_options(grand, vaddr, size, 0);
9937 }
9938
9939 /**
9940 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9941 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9942 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9943 * still have the region nested. The mappings in 'grand' will be left empty
9944 * with the assumption that they will be demand-filled by subsequent access faults.
9945 *
9946 * This function operates in 2 main phases:
9947 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9948 * them non-global.
9949 * 2. Clearing of the twig-level TTEs for the address range in grand.
9950 *
9951 * This function may return early due to pending AST_URGENT preemption; if so
9952 * it will indicate the need to be re-entered.
9953 *
9954 * @param grand pmap from which to unnest mappings
9955 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9956 * @param size twig-aligned size of the nested range
9957 * @param vrestart the page-aligned starting address of the current call. May contain
9958 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9959 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9960 * grand is being torn down and step 1) above is not needed.
9961 *
9962 * @return the virtual address at which to restart the operation, possibly including
9963 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9964 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9965 */
9966 MARK_AS_PMAP_TEXT vm_map_offset_t
9967 pmap_unnest_options_internal(
9968 pmap_t grand,
9969 addr64_t vaddr,
9970 uint64_t size,
9971 vm_map_offset_t vrestart,
9972 unsigned int option)
9973 {
9974 vm_map_offset_t start;
9975 vm_map_offset_t addr;
9976 tt_entry_t *tte_p;
9977 unsigned int current_index;
9978 unsigned int start_index;
9979 unsigned int max_index;
9980 unsigned int entry_count = 0;
9981
9982 addr64_t vend;
9983 addr64_t true_end;
9984 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9985 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9986 }
9987 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9988 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9989 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9990 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9991 }
9992
9993 validate_pmap_mutable(grand);
9994
9995 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9996
9997 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9998 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9999 (unsigned long long)vaddr, (unsigned long long)size);
10000 }
10001
10002 if (__improbable(grand->nested_pmap == NULL)) {
10003 panic("%s: %p has no nested pmap", __func__, grand);
10004 }
10005
10006 true_end = vend;
10007 if (true_end > grand->nested_pmap->nested_region_true_end) {
10008 true_end = grand->nested_pmap->nested_region_true_end;
10009 }
10010
10011 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10012 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10013 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10014 }
10015
10016 pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10017
10018 start = vrestart;
10019 if (start < grand->nested_pmap->nested_region_true_start) {
10020 start = grand->nested_pmap->nested_region_true_start;
10021 }
10022 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10023 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10024 bool flush_tlb = false;
10025
10026 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10027 pt_entry_t *bpte, *cpte;
10028
10029 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10030
10031 bpte = pmap_pte(grand->nested_pmap, addr);
10032
10033 /*
10034 * If we've re-entered this function partway through unnesting a leaf region, the
10035 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10036 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
10037 * address.
10038 */
10039 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
10040 (addr & pt_attr_twig_offmask(pt_attr))) {
10041 /*
10042 * Mark the 'twig' region as being unnested. Every mapping entered within
10043 * the nested pmap in this region will now be marked non-global. Do this
10044 * before marking any of the PTEs within the region as non-global to avoid
10045 * the possibility of pmap_enter() subsequently inserting a global mapping
10046 * in the region, which could lead to a TLB conflict if a non-global entry
10047 * is later inserted for the same VA in a pmap which has fully unnested this
10048 * region.
10049 */
10050 setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
10051 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10052 pmap_paddr_t pa;
10053 unsigned int pai = 0;
10054 boolean_t managed = FALSE;
10055 pt_entry_t spte;
10056
10057 if ((*cpte != ARM_PTE_TYPE_FAULT)
10058 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10059 spte = *((volatile pt_entry_t*)cpte);
10060 while (!managed) {
10061 pa = pte_to_pa(spte);
10062 if (!pa_valid(pa)) {
10063 break;
10064 }
10065 pai = pa_index(pa);
10066 pvh_lock(pai);
10067 spte = *((volatile pt_entry_t*)cpte);
10068 pa = pte_to_pa(spte);
10069 if (pai == pa_index(pa)) {
10070 managed = TRUE;
10071 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10072 }
10073 pvh_unlock(pai);
10074 }
10075
10076 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10077 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10078 flush_tlb = true;
10079 }
10080
10081 if (managed) {
10082 pvh_assert_locked(pai);
10083 pvh_unlock(pai);
10084 }
10085 }
10086
10087 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10088 vrestart = addr;
10089 ++entry_count;
10090 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10091 pmap_pending_preemption())) {
10092 goto unnest_subord_done;
10093 }
10094 }
10095 }
10096 addr = vlim;
10097 vrestart = addr;
10098 ++entry_count;
10099 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10100 pmap_pending_preemption())) {
10101 break;
10102 }
10103 }
10104
10105 unnest_subord_done:
10106 if (flush_tlb) {
10107 FLUSH_PTE_STRONG();
10108 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10109 }
10110
10111 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10112 if (current_index < max_index) {
10113 return vrestart;
10114 }
10115 }
10116
10117 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
10118
/*
 * Invalidate all twig-level TTEs for the segment at vaddr in pmap grand.
 */
10122 if (vrestart & PMAP_NEST_GRAND) {
10123 addr = vrestart & ~PMAP_NEST_GRAND;
if (__improbable((addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10125 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10126 }
10127 } else {
10128 addr = vaddr;
10129 vrestart = vaddr | PMAP_NEST_GRAND;
10130 }
10131
10132 if (addr < grand->nested_pmap->nested_region_true_start) {
10133 addr = grand->nested_pmap->nested_region_true_start;
10134 }
10135
10136 while (addr < true_end) {
10137 tte_p = pmap_tte(grand, addr);
10138 /*
10139 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10140 * so it's possible that a region we're trying to unnest may not have been
10141 * nested in the first place.
10142 */
10143 if (tte_p != NULL) {
10144 *tte_p = ARM_TTE_TYPE_FAULT;
10145 }
10146 addr += pt_attr_twig_size(pt_attr);
10147 vrestart = addr | PMAP_NEST_GRAND;
10148 ++entry_count;
10149 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10150 pmap_pending_preemption())) {
10151 break;
10152 }
10153 }
10154 if (addr >= true_end) {
10155 vrestart = vend | PMAP_NEST_GRAND;
10156 }
10157
10158 FLUSH_PTE_STRONG();
PMAP_UPDATE_TLBS(grand, vaddr, addr, false, false);
10160
10161 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10162
10163 return vrestart;
10164 }
10165
10166 kern_return_t
10167 pmap_unnest_options(
10168 pmap_t grand,
10169 addr64_t vaddr,
10170 uint64_t size,
10171 unsigned int option)
10172 {
10173 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10174 vm_map_offset_t vend = vaddr + size;
10175 __unused vm_map_offset_t vlast = vrestart;
10176
10177 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10178 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10179
10180 pmap_verify_preemptible();
10181 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10182 #if XNU_MONITOR
10183 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10184 if (vrestart == vlast) {
10185 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
10186 __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
10187 }
10188 vlast = vrestart;
10189 #else
10190 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10191 #endif
10192 }
10193
10194 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10195
10196 return KERN_SUCCESS;
10197 }
10198
10199 boolean_t
10200 pmap_adjust_unnest_parameters(
10201 __unused pmap_t p,
10202 __unused vm_map_offset_t *s,
10203 __unused vm_map_offset_t *e)
10204 {
10205 return TRUE; /* to get to log_unnest_badness()... */
10206 }
10207
10208 /*
10209 * disable no-execute capability on
10210 * the specified pmap
10211 */
10212 #if DEVELOPMENT || DEBUG
10213 void
10214 pmap_disable_NX(
10215 pmap_t pmap)
10216 {
10217 pmap->nx_enabled = FALSE;
10218 }
10219 #else
10220 void
10221 pmap_disable_NX(
10222 __unused pmap_t pmap)
10223 {
10224 }
10225 #endif
10226
10227 /*
10228 * flush a range of hardware TLB entries.
10229 * NOTE: assumes the smallest TLB entry in use will be for
10230 * an ARM small page (4K).
10231 */
10232
10233 #define ARM_FULL_TLB_FLUSH_THRESHOLD 64
10234
10235 #if __ARM_RANGE_TLBI__
10236 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
10237 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_PAGES
10238 #else
10239 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10240 #endif // __ARM_RANGE_TLBI__
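/*
 * Example of how the thresholds are applied (assuming 4K pages and no range
 * TLBI support, so ARM64_FULL_TLB_FLUSH_THRESHOLD == 256): a 1MB flush is 256
 * pages, which does not exceed the threshold, so individual per-page TLBI
 * operations are issued; a 2MB flush (512 pages) exceeds it and collapses to a
 * single by-ASID (or full) invalidation.
 */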
10241
10242 static void
10243 flush_mmu_tlb_region_asid_async(
10244 vm_offset_t va,
10245 size_t length,
10246 pmap_t pmap,
10247 bool last_level_only __unused)
10248 {
10249 #if (__ARM_VMSA__ == 7)
10250 vm_offset_t end = va + length;
10251 uint32_t asid;
10252
10253 asid = pmap->hw_asid;
10254
10255 if (length / ARM_SMALL_PAGE_SIZE > ARM_FULL_TLB_FLUSH_THRESHOLD) {
10256 boolean_t flush_all = FALSE;
10257
10258 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10259 flush_all = TRUE;
10260 }
10261 if (flush_all) {
10262 flush_mmu_tlb_async();
10263 } else {
10264 flush_mmu_tlb_asid_async(asid);
10265 }
10266
10267 return;
10268 }
10269 if (pmap->type == PMAP_TYPE_NESTED) {
10270 #if !__ARM_MP_EXT__
10271 flush_mmu_tlb();
10272 #else
10273 va = arm_trunc_page(va);
10274 while (va < end) {
10275 flush_mmu_tlb_mva_entries_async(va);
10276 va += ARM_SMALL_PAGE_SIZE;
10277 }
10278 #endif
10279 return;
10280 }
10281 va = arm_trunc_page(va) | (asid & 0xff);
10282 flush_mmu_tlb_entries_async(va, end);
10283
10284 #else
10285 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
10286 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
10287 ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
10288 uint32_t asid;
10289
10290 asid = pmap->hw_asid;
10291
10292 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
10293 boolean_t flush_all = FALSE;
10294
10295 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10296 flush_all = TRUE;
10297 }
10298 if (flush_all) {
10299 flush_mmu_tlb_async();
10300 } else {
10301 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
10302 }
10303 return;
10304 }
10305 #if __ARM_RANGE_TLBI__
10306 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
10307 va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
10308 if (pmap->type == PMAP_TYPE_NESTED) {
10309 flush_mmu_tlb_allrange_async(va, last_level_only);
10310 } else {
10311 flush_mmu_tlb_range_async(va, last_level_only);
10312 }
10313 return;
10314 }
10315 #endif
10316 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
10317 va = tlbi_asid(asid) | tlbi_addr(va);
10318
10319 if (pmap->type == PMAP_TYPE_NESTED) {
10320 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
10321 } else {
10322 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
10323 }
10324
10325 #endif
10326 }
10327
10328 MARK_AS_PMAP_TEXT static void
10329 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10330 {
10331 #if (__ARM_VMSA__ == 7)
10332 flush_mmu_tlb_asid_async(pmap->hw_asid);
10333 #else /* (__ARM_VMSA__ == 7) */
10334 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
10335 #endif /* (__ARM_VMSA__ == 7) */
10336 }
10337
10338 void
10339 flush_mmu_tlb_region(
10340 vm_offset_t va,
10341 unsigned length)
10342 {
10343 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
10344 sync_tlb_flush();
10345 }
10346
10347 unsigned int
10348 pmap_cache_attributes(
10349 ppnum_t pn)
10350 {
10351 pmap_paddr_t paddr;
10352 unsigned int pai;
10353 unsigned int result;
10354 pp_attr_t pp_attr_current;
10355
10356 paddr = ptoa(pn);
10357
10358 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10359
10360 if (!pa_valid(paddr)) {
10361 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10362 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10363 }
10364
10365 result = VM_WIMG_DEFAULT;
10366
10367 pai = pa_index(paddr);
10368
10369 pp_attr_current = pp_attr_table[pai];
10370 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10371 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10372 }
10373 return result;
10374 }
10375
10376 MARK_AS_PMAP_TEXT static void
10377 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10378 {
10379 if ((wimg_bits_prev != wimg_bits_new)
10380 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10381 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10382 && (wimg_bits_new != VM_WIMG_COPYBACK))
10383 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10384 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10385 pmap_sync_page_attributes_phys(pn);
10386 }
10387
10388 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10389 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10390 }
10391 }
10392
10393 MARK_AS_PMAP_TEXT __unused void
10394 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10395 {
10396 pmap_paddr_t paddr = ptoa(pn);
10397 const unsigned int pai = pa_index(paddr);
10398
10399 if (__improbable(!pa_valid(paddr))) {
10400 panic("%s called on non-managed page 0x%08x", __func__, pn);
10401 }
10402
10403 pvh_lock(pai);
10404
10405 #if XNU_MONITOR
10406 if (__improbable(ppattr_pa_test_monitor(paddr))) {
10407 panic("%s invoked on PPL page 0x%08x", __func__, pn);
10408 }
10409 #endif
10410
10411 pmap_update_cache_attributes_locked(pn, new_cacheattr);
10412
10413 pvh_unlock(pai);
10414
10415 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10416 }
10417
10418 void *
10419 pmap_map_compressor_page(ppnum_t pn)
10420 {
10421 #if __ARM_PTE_PHYSMAP__
10422 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10423 if (cacheattr != VM_WIMG_DEFAULT) {
10424 #if XNU_MONITOR
10425 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10426 #else
10427 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10428 #endif
10429 }
10430 #endif
10431 return (void*)phystokv(ptoa(pn));
10432 }
10433
10434 void
10435 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10436 {
10437 #if __ARM_PTE_PHYSMAP__
10438 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10439 if (cacheattr != VM_WIMG_DEFAULT) {
10440 #if XNU_MONITOR
10441 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10442 #else
10443 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10444 #endif
10445 }
10446 #endif
10447 }
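/*
 * Sketch of the intended map/unmap pairing (compress_page() is a hypothetical
 * consumer): the page is made temporarily cacheable through the physical
 * aperture, operated on, and then restored to its original cache attributes.
 *
 *   void *kva = pmap_map_compressor_page(pn);
 *   compress_page(kva, dst);
 *   pmap_unmap_compressor_page(pn, kva);
 */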
10448
10449 MARK_AS_PMAP_TEXT boolean_t
10450 pmap_batch_set_cache_attributes_internal(
10451 ppnum_t pn,
10452 unsigned int cacheattr,
10453 unsigned int page_cnt,
10454 unsigned int page_index,
10455 boolean_t doit,
10456 unsigned int *res)
10457 {
10458 pmap_paddr_t paddr;
10459 unsigned int pai;
10460 pp_attr_t pp_attr_current;
10461 pp_attr_t pp_attr_template;
10462 unsigned int wimg_bits_prev, wimg_bits_new;
10463
10464 if (cacheattr & VM_WIMG_USE_DEFAULT) {
10465 cacheattr = VM_WIMG_DEFAULT;
10466 }
10467
10468 if ((doit == FALSE) && (*res == 0)) {
10469 pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10470 *res = page_cnt;
10471 pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10472 if (platform_cache_batch_wimg(cacheattr & (VM_WIMG_MASK), page_cnt << PAGE_SHIFT) == FALSE) {
10473 return FALSE;
10474 }
10475 }
10476
10477 paddr = ptoa(pn);
10478
10479 if (!pa_valid(paddr)) {
10480 panic("pmap_batch_set_cache_attributes(): pn 0x%08x not managed", pn);
10481 }
10482
10483 pai = pa_index(paddr);
10484
10485 if (doit) {
10486 pvh_lock(pai);
10487 #if XNU_MONITOR
10488 if (ppattr_pa_test_monitor(paddr)) {
10489 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10490 }
10491 #endif
10492 }
10493
10494 do {
10495 pp_attr_current = pp_attr_table[pai];
10496 wimg_bits_prev = VM_WIMG_DEFAULT;
10497 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10498 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10499 }
10500
10501 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10502
10503 if (!doit) {
10504 break;
10505 }
10506
10507 /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10508 * to avoid losing simultaneous updates to other bits like refmod. */
10509 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10510
10511 wimg_bits_new = VM_WIMG_DEFAULT;
10512 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10513 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10514 }
10515
10516 if (doit) {
10517 if (wimg_bits_new != wimg_bits_prev) {
10518 pmap_update_cache_attributes_locked(pn, cacheattr);
10519 }
10520 pvh_unlock(pai);
10521 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10522 pmap_force_dcache_clean(phystokv(paddr), PAGE_SIZE);
10523 }
10524 } else {
10525 if (wimg_bits_new == VM_WIMG_COPYBACK) {
10526 return FALSE;
10527 }
10528 if (wimg_bits_prev == wimg_bits_new) {
10529 pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10530 *res = *res - 1;
10531 pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10532 if (!platform_cache_batch_wimg(wimg_bits_new, (*res) << PAGE_SHIFT)) {
10533 return FALSE;
10534 }
10535 }
10536 return TRUE;
10537 }
10538
10539 if (page_cnt == (page_index + 1)) {
10540 wimg_bits_prev = VM_WIMG_COPYBACK;
10541 if (((wimg_bits_prev != wimg_bits_new))
10542 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10543 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10544 && (wimg_bits_new != VM_WIMG_COPYBACK))
10545 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10546 && ((wimg_bits_new != VM_WIMG_COPYBACK) && (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10547 platform_cache_flush_wimg(wimg_bits_new);
10548 }
10549 }
10550
10551 return TRUE;
10552 }
10553
10554 boolean_t
10555 pmap_batch_set_cache_attributes(
10556 ppnum_t pn,
10557 unsigned int cacheattr,
10558 unsigned int page_cnt,
10559 unsigned int page_index,
10560 boolean_t doit,
10561 unsigned int *res)
10562 {
10563 #if XNU_MONITOR
10564 return pmap_batch_set_cache_attributes_ppl(pn, cacheattr, page_cnt, page_index, doit, res);
10565 #else
10566 return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res);
10567 #endif
10568 }
10569
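/**
 * Common helper to update a managed page's WIMG attributes: updates the
 * pp_attr_table entry, rewrites any existing mappings of the page, and
 * performs the required cache maintenance.
 *
 * @param pn Physical page number to update.
 * @param cacheattr The new cache attributes.
 * @param external TRUE when the request targets a regular kernel-managed
 *                 page, FALSE when it targets a PPL-owned page (only
 *                 relevant on XNU_MONITOR configurations).
 */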
10570 MARK_AS_PMAP_TEXT static void
10571 pmap_set_cache_attributes_priv(
10572 ppnum_t pn,
10573 unsigned int cacheattr,
10574 boolean_t external __unused)
10575 {
10576 pmap_paddr_t paddr;
10577 unsigned int pai;
10578 pp_attr_t pp_attr_current;
10579 pp_attr_t pp_attr_template;
10580 unsigned int wimg_bits_prev, wimg_bits_new;
10581
10582 paddr = ptoa(pn);
10583
10584 if (!pa_valid(paddr)) {
10585 return; /* Not a managed page. */
10586 }
10587
10588 if (cacheattr & VM_WIMG_USE_DEFAULT) {
10589 cacheattr = VM_WIMG_DEFAULT;
10590 }
10591
10592 pai = pa_index(paddr);
10593
10594 pvh_lock(pai);
10595
10596 #if XNU_MONITOR
10597 if (external && ppattr_pa_test_monitor(paddr)) {
10598 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10599 } else if (!external && !ppattr_pa_test_monitor(paddr)) {
10600 panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
10601 }
10602 #endif
10603
10604 do {
10605 pp_attr_current = pp_attr_table[pai];
10606 wimg_bits_prev = VM_WIMG_DEFAULT;
10607 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10608 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10609 }
10610
10611 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10612
10613 /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10614 * to avoid losing simultaneous updates to other bits like refmod. */
10615 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10616
10617 wimg_bits_new = VM_WIMG_DEFAULT;
10618 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10619 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10620 }
10621
10622 if (wimg_bits_new != wimg_bits_prev) {
10623 pmap_update_cache_attributes_locked(pn, cacheattr);
10624 }
10625
10626 pvh_unlock(pai);
10627
10628 pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
10629 }
10630
10631 MARK_AS_PMAP_TEXT void
10632 pmap_set_cache_attributes_internal(
10633 ppnum_t pn,
10634 unsigned int cacheattr)
10635 {
10636 pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10637 }
10638
10639 void
10640 pmap_set_cache_attributes(
10641 ppnum_t pn,
10642 unsigned int cacheattr)
10643 {
10644 #if XNU_MONITOR
10645 pmap_set_cache_attributes_ppl(pn, cacheattr);
10646 #else
10647 pmap_set_cache_attributes_internal(pn, cacheattr);
10648 #endif
10649 }
10650
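/**
 * Rewrite every mapping of a physical page (including the physical aperture
 * mapping, where applicable) to reflect new cache attributes, flushing the
 * TLB as needed. Expects the page's PVH lock to be held by the caller.
 *
 * @param ppnum Physical page number whose mappings are updated.
 * @param attributes The new WIMG cache attributes.
 */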
10651 MARK_AS_PMAP_TEXT void
10652 pmap_update_cache_attributes_locked(
10653 ppnum_t ppnum,
10654 unsigned attributes)
10655 {
10656 pmap_paddr_t phys = ptoa(ppnum);
10657 pv_entry_t *pve_p;
10658 pt_entry_t *pte_p;
10659 pv_entry_t **pv_h;
10660 pt_entry_t tmplate;
10661 unsigned int pai;
10662 boolean_t tlb_flush_needed = FALSE;
10663
10664 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10665
10666 if (pmap_panic_dev_wimg_on_managed) {
10667 switch (attributes & VM_WIMG_MASK) {
10668 case VM_WIMG_IO: // nGnRnE
10669 case VM_WIMG_POSTED: // nGnRE
10670 /* supported on DRAM, but slow, so we disallow */
10671
10672 case VM_WIMG_POSTED_REORDERED: // nGRE
10673 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10674 /* unsupported on DRAM */
10675
10676 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10677 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10678 break;
10679
10680 default:
10681 /* not device type memory, all good */
10682
10683 break;
10684 }
10685 }
10686
10687 #if __ARM_PTE_PHYSMAP__
10688 vm_offset_t kva = phystokv(phys);
10689 pte_p = pmap_pte(kernel_pmap, kva);
10690
10691 tmplate = *pte_p;
10692 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10693 #if XNU_MONITOR
10694 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10695 #else
10696 tmplate |= wimg_to_pte(attributes, phys);
10697 #endif
10698 #if (__ARM_VMSA__ > 7)
10699 if (tmplate & ARM_PTE_HINT_MASK) {
10700 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10701 __FUNCTION__, pte_p, (void *)kva, tmplate);
10702 }
10703 #endif
10704 write_pte_strong(pte_p, tmplate);
10705 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10706 tlb_flush_needed = TRUE;
10707 #endif
10708
10709 pai = pa_index(phys);
10710
10711 pv_h = pai_to_pvh(pai);
10712
10713 pte_p = PT_ENTRY_NULL;
10714 pve_p = PV_ENTRY_NULL;
10715 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10716 pte_p = pvh_ptep(pv_h);
10717 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10718 pve_p = pvh_pve_list(pv_h);
10719 pte_p = PT_ENTRY_NULL;
10720 }
10721
10722 int pve_ptep_idx = 0;
10723 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10724 vm_map_address_t va;
10725 pmap_t pmap;
10726
10727 if (pve_p != PV_ENTRY_NULL) {
10728 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10729 if (pte_p == PT_ENTRY_NULL) {
10730 goto cache_skip_pve;
10731 }
10732 }
10733
10734 #ifdef PVH_FLAG_IOMMU
10735 if (pvh_ptep_is_iommu(pte_p)) {
10736 goto cache_skip_pve;
10737 }
10738 #endif
10739 pmap = ptep_get_pmap(pte_p);
10740 va = ptep_get_va(pte_p);
10741
10742 tmplate = *pte_p;
10743 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10744 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
10745
10746 write_pte_strong(pte_p, tmplate);
10747 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10748 tlb_flush_needed = TRUE;
10749
10750 cache_skip_pve:
10751 pte_p = PT_ENTRY_NULL;
10752 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10753 pve_ptep_idx = 0;
10754 pve_p = pve_next(pve_p);
10755 }
10756 }
10757 if (tlb_flush_needed) {
10758 pmap_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
10759 }
10760
10761 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
10762 }
10763
10764 #if (__ARM_VMSA__ == 7)
10765 void
10766 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10767 vm_map_address_t *user_commpage_addr)
10768 {
10769 pmap_paddr_t pa;
10770 kern_return_t kr;
10771
10772 assert(kernel_data_addr != NULL);
10773 assert(kernel_text_addr != NULL);
10774 assert(user_commpage_addr != NULL);
10775
10776 (void) pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, 0);
10777
10778 kr = pmap_enter(kernel_pmap, _COMM_PAGE_BASE_ADDRESS, atop(pa), VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10779 assert(kr == KERN_SUCCESS);
10780
10781 *kernel_data_addr = phystokv(pa);
10782 // We don't have a PFZ for 32-bit ARM; these are always NULL
10783 *kernel_text_addr = 0;
10784 *user_commpage_addr = 0;
10785 }
10786
10787 #else /* __ARM_VMSA__ == 7 */
10788
10789 /**
10790 * Mark a pmap as being dedicated to use for a commpage mapping.
10791 * The pmap itself will never be activated on a CPU; its mappings will
10792 * only be embedded in userspace pmaps at a fixed virtual address.
10793 *
10794 * @param pmap the pmap to mark as belonging to a commpage.
10795 */
10796 static void
10797 pmap_set_commpage(pmap_t pmap)
10798 {
10799 #if XNU_MONITOR
10800 assert(!pmap_ppl_locked_down);
10801 #endif
10802 assert(pmap->type == PMAP_TYPE_USER);
10803 pmap->type = PMAP_TYPE_COMMPAGE;
10804 /*
10805 * Free the pmap's ASID. This pmap should not ever be directly
10806 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
10807 * ASID space contention but will also cause pmap_switch() to panic
10808 * if an attacker tries to activate this pmap. Disable preemption to
10809 * accommodate the *_nopreempt spinlock in free_asid().
10810 */
10811 mp_disable_preemption();
10812 pmap_get_pt_ops(pmap)->free_id(pmap);
10813 mp_enable_preemption();
10814 }
10815
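/**
 * Rewrite the L3 (leaf) entry for a commpage mapping, keeping the entry's
 * output address but replacing its attribute bits with the given template.
 *
 * @param pmap The commpage pmap containing the mapping.
 * @param address Virtual address whose leaf entry should be updated.
 * @param template PTE template providing the new attribute bits.
 */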
10816 static void
10817 pmap_update_tt3e(
10818 pmap_t pmap,
10819 vm_address_t address,
10820 tt_entry_t template)
10821 {
10822 tt_entry_t *ptep, pte;
10823
10824 ptep = pmap_tt3e(pmap, address);
10825 if (ptep == NULL) {
10826 panic("%s: no ptep?", __FUNCTION__);
10827 }
10828
10829 pte = *ptep;
10830 pte = tte_to_pa(pte) | template;
10831 write_pte_strong(ptep, pte);
10832 }
10833
10834 /* Note absence of non-global bit */
10835 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10836 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10837 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
10838 | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10839
10840 /* Note absence of non-global bit and no-execute bit. */
10841 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10842 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10843 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
10844 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10845
10846 void
10847 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10848 vm_map_address_t *user_text_addr)
10849 {
10850 kern_return_t kr;
10851 pmap_paddr_t data_pa = 0; // data address
10852 pmap_paddr_t text_pa = 0; // text address
10853
10854 *kernel_data_addr = 0;
10855 *kernel_text_addr = 0;
10856 *user_text_addr = 0;
10857
10858 #if XNU_MONITOR
10859 data_pa = pmap_alloc_page_for_kern(0);
10860 assert(data_pa);
10861 memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
10862 #if CONFIG_ARM_PFZ
10863 text_pa = pmap_alloc_page_for_kern(0);
10864 assert(text_pa);
10865 memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
10866 #endif
10867
10868 #else /* XNU_MONITOR */
10869 (void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
10870 #if CONFIG_ARM_PFZ
10871 (void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
10872 #endif
10873
10874 #endif /* XNU_MONITOR */
10875
10876 /*
10877 * In order to avoid burning extra pages on mapping the shared page, we
10878 * create a dedicated pmap for the shared page. We forcibly nest the
10879 * translation tables from this pmap into other pmaps. The level we
10880 * will nest at depends on the MMU configuration (page size, TTBR range,
10881 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
10882 *
10883 * Note that this is NOT "the nested pmap" (which is used to nest the
10884 * shared cache).
10885 *
10886 * Note that we update parameters of the entry for our unique needs (NG
10887 * entry, etc.).
10888 */
10889 sharedpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
10890 assert(sharedpage_pmap_default != NULL);
10891 pmap_set_commpage(sharedpage_pmap_default);
10892
10893 /* The user 64-bit mapping... */
10894 kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10895 assert(kr == KERN_SUCCESS);
10896 pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10897 #if CONFIG_ARM_PFZ
10898 /* User mapping of the comm page text section, for the 64-bit mapping only.
10899 *
10900 * We don't insert it into the 32-bit mapping because we don't want 32-bit
10901 * user processes to get this page mapped in; they should never call into
10902 * this page.
10903 *
10904 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10905 * is slid within the same L3 as the data commpage. It is either outside the
10906 * maximum user VA or is pre-reserved in vm_map_exec(). This means that
10907 * it is reserved and unavailable to the Mach VM for future mappings.
10908 */
10909 const pt_attr_t * const pt_attr = pmap_get_pt_attr(sharedpage_pmap_default);
10910 int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
10911
10912 vm_map_address_t commpage_text_va = 0;
10913
10914 do {
10915 int text_leaf_index = random() % num_ptes;
10916
10917 // Generate a VA for the commpage text with the same root and twig index as data
10918 // comm page, but with new leaf index we've just generated.
10919 commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
10920 commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
10921 } while (commpage_text_va == _COMM_PAGE64_BASE_ADDRESS); // Try again if we collide (should be unlikely)
10922
10923 // Assert that this is empty
10924 __assert_only pt_entry_t *ptep = pmap_pte(sharedpage_pmap_default, commpage_text_va);
10925 assert(ptep != PT_ENTRY_NULL);
10926 assert(*ptep == ARM_TTE_EMPTY);
10927
10928 // At this point, we've found the address we want to insert our comm page at
10929 kr = pmap_enter_addr(sharedpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10930 assert(kr == KERN_SUCCESS);
10931 // Mark it as global page R/X so that it doesn't get thrown out on tlb flush
10932 pmap_update_tt3e(sharedpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
10933
10934 *user_text_addr = commpage_text_va;
10935 #endif
10936
10937 /* ...and the user 32-bit mapping. */
10938 kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10939 assert(kr == KERN_SUCCESS);
10940 pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10941
10942 #if __ARM_MIXED_PAGE_SIZE__
10943 /**
10944 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
10945 * new set of page tables that point to the exact same 16K shared page as
10946 * before. Only the first 4K of the 16K shared page is mapped since that's
10947 * the only part that contains relevant data.
10948 */
10949 sharedpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
10950 assert(sharedpage_pmap_4k != NULL);
10951 pmap_set_commpage(sharedpage_pmap_4k);
10952
10953 /* The user 64-bit mapping... */
10954 kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10955 assert(kr == KERN_SUCCESS);
10956 pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10957
10958 /* ...and the user 32-bit mapping. */
10959 kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10960 assert(kr == KERN_SUCCESS);
10961 pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10962
10963 #endif
10964
10965 /* For manipulation in kernel, go straight to physical page */
10966 *kernel_data_addr = phystokv(data_pa);
10967 *kernel_text_addr = (text_pa) ? phystokv(text_pa) : 0;
10968 }
10969
10970
10971 /*
10972 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10973 * with user controlled TTEs for regions that aren't explicitly reserved by the
10974 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10975 */
10976 #if (ARM_PGSHIFT == 14)
10977 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10978 #elif (ARM_PGSHIFT == 12)
10979 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= VM_MAX_ADDRESS);
10980 #else
10981 #error Nested shared page mapping is unsupported on this config
10982 #endif
10983
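/**
 * Map the commpage into a user pmap by inserting a pointer to the commpage
 * pmap's pre-built page table at the appropriate level, so that every task
 * shares the same commpage page tables.
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success. On PPL-enabled systems, may instead return
 *         KERN_RESOURCE_SHORTAGE if a page table could not be allocated; the
 *         caller is expected to provide more pages to the PPL and retry.
 */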
10984 MARK_AS_PMAP_TEXT kern_return_t
10985 pmap_insert_sharedpage_internal(
10986 pmap_t pmap)
10987 {
10988 kern_return_t kr = KERN_SUCCESS;
10989 vm_offset_t sharedpage_vaddr;
10990 pt_entry_t *ttep, *src_ttep;
10991 int options = 0;
10992 pmap_t sharedpage_pmap = sharedpage_pmap_default;
10993
10994 /* Validate the pmap input before accessing its data. */
10995 validate_pmap_mutable(pmap);
10996
10997 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10998 const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
10999
11000 #if __ARM_MIXED_PAGE_SIZE__
11001 #if !__ARM_16K_PG__
11002 /* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
11003 #error "pmap_insert_sharedpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11004 #endif /* !__ARM_16K_PG__ */
11005
11006 /* Choose the correct shared page pmap to use. */
11007 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11008 if (pmap_page_size == 16384) {
11009 sharedpage_pmap = sharedpage_pmap_default;
11010 } else if (pmap_page_size == 4096) {
11011 sharedpage_pmap = sharedpage_pmap_4k;
11012 } else {
11013 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11014 }
11015 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11016
11017 #if XNU_MONITOR
11018 options |= PMAP_OPTIONS_NOWAIT;
11019 #endif /* XNU_MONITOR */
11020
11021 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11022 #error We assume a single page.
11023 #endif
11024
11025 if (pmap_is_64bit(pmap)) {
11026 sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11027 } else {
11028 sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11029 }
11030
11031
11032 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11033
11034 /*
11035 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
11036 * two (2MB) depending on the address space layout. For 16KB pages, each level
11037 * one entry is 64GB, so we must go to the second level entry (32MB) in order
11038 * to "nest".
11039 *
11040 * Note: This is not "nesting" in the shared cache sense. This definition of
11041 * nesting just means inserting pointers to pre-allocated tables inside of
11042 * the passed in pmap to allow us to share page tables (which map the shared
11043 * page) for every task. This saves at least one page of memory per process
11044 * compared to creating new page tables in every process for mapping the
11045 * shared page.
11046 */
11047
11048 /**
11049 * Allocate the twig page tables if needed, and slam a pointer to the shared
11050 * page's tables into place.
11051 */
11052 while ((ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr)) == TT_ENTRY_NULL) {
11053 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11054
11055 kr = pmap_expand(pmap, sharedpage_vaddr, options, sharedpage_level);
11056
11057 if (kr != KERN_SUCCESS) {
11058 #if XNU_MONITOR
11059 if (kr == KERN_RESOURCE_SHORTAGE) {
11060 return kr;
11061 } else
11062 #endif
11063 {
11064 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
11065 }
11066 }
11067
11068 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11069 }
11070
11071 if (*ttep != ARM_PTE_EMPTY) {
11072 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
11073 }
11074
11075 src_ttep = pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr);
11076
11077 *ttep = *src_ttep;
11078 FLUSH_PTE_STRONG();
11079
11080 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11081
11082 return kr;
11083 }
11084
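/**
 * Remove the commpage mapping from a pmap by clearing the table entry that
 * was shared from the commpage pmap and flushing the corresponding TLB
 * entries.
 *
 * @param pmap The pmap from which the commpage mapping should be removed.
 */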
11085 static void
11086 pmap_unmap_sharedpage(
11087 pmap_t pmap)
11088 {
11089 pt_entry_t *ttep;
11090 vm_offset_t sharedpage_vaddr;
11091 pmap_t sharedpage_pmap = sharedpage_pmap_default;
11092
11093 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11094 const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
11095
11096 #if __ARM_MIXED_PAGE_SIZE__
11097 #if !__ARM_16K_PG__
11098 /* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
11099 #error "pmap_unmap_sharedpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11100 #endif /* !__ARM_16K_PG__ */
11101
11102 /* Choose the correct shared page pmap to use. */
11103 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11104 if (pmap_page_size == 16384) {
11105 sharedpage_pmap = sharedpage_pmap_default;
11106 } else if (pmap_page_size == 4096) {
11107 sharedpage_pmap = sharedpage_pmap_4k;
11108 } else {
11109 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11110 }
11111 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11112
11113 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11114 #error We assume a single page.
11115 #endif
11116
11117 if (pmap_is_64bit(pmap)) {
11118 sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11119 } else {
11120 sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11121 }
11122
11123
11124 ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr);
11125
11126 if (ttep == NULL) {
11127 return;
11128 }
11129
11130 /* It had better be mapped to the shared page. */
11131 if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr)) {
11132 panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
11133 }
11134
11135 *ttep = ARM_TTE_EMPTY;
11136 FLUSH_PTE_STRONG();
11137
11138 flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, pmap, false);
11139 sync_tlb_flush();
11140 }
11141
11142 void
11143 pmap_insert_sharedpage(
11144 pmap_t pmap)
11145 {
11146 #if XNU_MONITOR
11147 kern_return_t kr = KERN_FAILURE;
11148
11149 while ((kr = pmap_insert_sharedpage_ppl(pmap)) == KERN_RESOURCE_SHORTAGE) {
11150 pmap_alloc_page_for_ppl(0);
11151 }
11152
11153 pmap_ledger_check_balance(pmap);
11154
11155 if (kr != KERN_SUCCESS) {
11156 panic("%s: failed to insert the shared page, kr=%d, "
11157 "pmap=%p",
11158 __FUNCTION__, kr,
11159 pmap);
11160 }
11161 #else
11162 pmap_insert_sharedpage_internal(pmap);
11163 #endif
11164 }
11165
11166 static boolean_t
11167 pmap_is_64bit(
11168 pmap_t pmap)
11169 {
11170 return pmap->is_64bit;
11171 }
11172
11173 bool
11174 pmap_is_exotic(
11175 pmap_t pmap __unused)
11176 {
11177 return false;
11178 }
11179
11180 #endif
11181
11182 /* ARMTODO -- provide an implementation that accounts for
11183 * holes in the physical map, if any.
11184 */
11185 boolean_t
11186 pmap_valid_page(
11187 ppnum_t pn)
11188 {
11189 return pa_valid(ptoa(pn));
11190 }
11191
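/**
 * Return whether a physical page belongs to a bootloader carveout region
 * (i.e. it is not kernel-managed and falls within an I/O range marked
 * PMAP_IO_RANGE_CARVEOUT).
 */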
11192 boolean_t
11193 pmap_bootloader_page(
11194 ppnum_t pn)
11195 {
11196 pmap_paddr_t paddr = ptoa(pn);
11197
11198 if (pa_valid(paddr)) {
11199 return FALSE;
11200 }
11201 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11202 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11203 }
11204
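/**
 * Determine whether a pmap has any page table entries in use within a
 * virtual address range.
 *
 * @param pmap The pmap to scan; NULL is treated as empty.
 * @param va_start Start of the virtual address range.
 * @param va_end End of the virtual address range.
 *
 * @return TRUE if no page table entries are in use in the range, FALSE
 *         otherwise.
 */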
11205 MARK_AS_PMAP_TEXT boolean_t
11206 pmap_is_empty_internal(
11207 pmap_t pmap,
11208 vm_map_offset_t va_start,
11209 vm_map_offset_t va_end)
11210 {
11211 vm_map_offset_t block_start, block_end;
11212 tt_entry_t *tte_p;
11213
11214 if (pmap == NULL) {
11215 return TRUE;
11216 }
11217
11218 validate_pmap(pmap);
11219
11220 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11221 unsigned int initial_not_in_kdp = not_in_kdp;
11222
11223 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11224 pmap_lock(pmap, PMAP_LOCK_SHARED);
11225 }
11226
11227 #if (__ARM_VMSA__ == 7)
11228 if (tte_index(pt_attr, va_end) >= pmap->tte_index_max) {
11229 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11230 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11231 }
11232 return TRUE;
11233 }
11234 #endif
11235
11236 /* TODO: This will be faster if we increment ttep at each level. */
11237 block_start = va_start;
11238
11239 while (block_start < va_end) {
11240 pt_entry_t *bpte_p, *epte_p;
11241 pt_entry_t *pte_p;
11242
11243 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11244 if (block_end > va_end) {
11245 block_end = va_end;
11246 }
11247
11248 tte_p = pmap_tte(pmap, block_start);
11249 if ((tte_p != PT_ENTRY_NULL)
11250 && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
11251 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11252 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11253 epte_p = &pte_p[pte_index(pt_attr, block_end)];
11254
11255 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11256 if (*pte_p != ARM_PTE_EMPTY) {
11257 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11258 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11259 }
11260 return FALSE;
11261 }
11262 }
11263 }
11264 block_start = block_end;
11265 }
11266
11267 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11268 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11269 }
11270
11271 return TRUE;
11272 }
11273
11274 boolean_t
11275 pmap_is_empty(
11276 pmap_t pmap,
11277 vm_map_offset_t va_start,
11278 vm_map_offset_t va_end)
11279 {
11280 #if XNU_MONITOR
11281 return pmap_is_empty_ppl(pmap, va_start, va_end);
11282 #else
11283 return pmap_is_empty_internal(pmap, va_start, va_end);
11284 #endif
11285 }
11286
11287 vm_map_offset_t
11288 pmap_max_offset(
11289 boolean_t is64,
11290 unsigned int option)
11291 {
11292 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11293 }
11294
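/**
 * Return the maximum user virtual address for a 64-bit address space under
 * the requested sizing policy.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* options (default, min, max,
 *               device-tuned, or jumbo).
 *
 * @return The maximum user VA for the selected option.
 */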
11295 vm_map_offset_t
11296 pmap_max_64bit_offset(
11297 __unused unsigned int option)
11298 {
11299 vm_map_offset_t max_offset_ret = 0;
11300
11301 #if defined(__arm64__)
11302 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000) // end of shared region + 512MB for various purposes
11303 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
11304 "Minimum address space size outside allowable range");
11305 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11306 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11307 max_offset_ret = arm64_pmap_max_offset_default;
11308 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11309 max_offset_ret = min_max_offset;
11310 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11311 max_offset_ret = MACH_VM_MAX_ADDRESS;
11312 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11313 if (arm64_pmap_max_offset_default) {
11314 max_offset_ret = arm64_pmap_max_offset_default;
11315 } else if (max_mem > 0xC0000000) {
11316 max_offset_ret = min_max_offset + 0x138000000; // Max offset is 13.375GB for devices with > 3GB of memory
11317 } else if (max_mem > 0x40000000) {
11318 max_offset_ret = min_max_offset + 0x38000000; // Max offset is 9.375GB for devices with > 1GB and <= 3GB of memory
11319 } else {
11320 max_offset_ret = min_max_offset;
11321 }
11322 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11323 if (arm64_pmap_max_offset_default) {
11324 // Allow the boot-arg to override jumbo size
11325 max_offset_ret = arm64_pmap_max_offset_default;
11326 } else {
11327 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11328 }
11329 } else {
11330 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11331 }
11332
11333 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11334 assert(max_offset_ret >= min_max_offset);
11335 #else
11336 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11337 #endif
11338
11339 return max_offset_ret;
11340 }
11341
11342 vm_map_offset_t
11343 pmap_max_32bit_offset(
11344 unsigned int option)
11345 {
11346 vm_map_offset_t max_offset_ret = 0;
11347
11348 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11349 max_offset_ret = arm_pmap_max_offset_default;
11350 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11351 max_offset_ret = 0x80000000;
11352 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11353 max_offset_ret = VM_MAX_ADDRESS;
11354 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11355 if (arm_pmap_max_offset_default) {
11356 max_offset_ret = arm_pmap_max_offset_default;
11357 } else if (max_mem > 0x20000000) {
11358 max_offset_ret = 0x80000000;
11359 } else {
11360 max_offset_ret = 0x80000000;
11361 }
11362 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11363 max_offset_ret = 0x80000000;
11364 } else {
11365 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11366 }
11367
11368 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11369 return max_offset_ret;
11370 }
11371
11372 #if CONFIG_DTRACE
11373 /*
11374 * Constrain DTrace copyin/copyout actions
11375 */
11376 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11377 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11378
11379 kern_return_t
11380 dtrace_copyio_preflight(
11381 __unused addr64_t va)
11382 {
11383 if (current_map() == kernel_map) {
11384 return KERN_FAILURE;
11385 } else {
11386 return KERN_SUCCESS;
11387 }
11388 }
11389
11390 kern_return_t
11391 dtrace_copyio_postflight(
11392 __unused addr64_t va)
11393 {
11394 return KERN_SUCCESS;
11395 }
11396 #endif /* CONFIG_DTRACE */
11397
11398
11399 void
11400 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11401 {
11402 }
11403
11404
11405 void
11406 pmap_flush(
11407 __unused pmap_flush_context *cpus_to_flush)
11408 {
11409 /* not implemented yet */
11410 return;
11411 }
11412
11413 #if XNU_MONITOR
11414
11415 /*
11416 * Enforce that the address range described by kva and nbytes is not currently
11417 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11418 * unintentionally writing to PPL-owned memory.
11419 */
11420 static void
11421 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11422 {
11423 vm_offset_t end;
11424 if (os_add_overflow(kva, nbytes, &end)) {
11425 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11426 }
11427 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11428 pmap_paddr_t pa = kvtophys_nofail(ckva);
11429 pp_attr_t attr;
11430 unsigned int pai = pa_index(pa);
11431 if (ckva == phystokv(pa)) {
11432 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11433 }
11434 do {
11435 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11436 if (attr & PP_ATTR_MONITOR) {
11437 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11438 }
11439 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11440 }
11441 }
11442
11443 static void
11444 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
11445 {
11446 vm_offset_t end;
11447 if (os_add_overflow(kva, nbytes, &end)) {
11448 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11449 }
11450 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11451 pmap_paddr_t pa = kvtophys_nofail(ckva);
11452
11453 if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
11454 panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
11455 }
11456 assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
11457 ppattr_pa_clear_no_monitor(pa);
11458 }
11459 }
11460
11461 /**
11462 * Lock down a page, making all mappings read-only, and preventing further
11463 * mappings or removal of this particular kva's mapping. Effectively, it makes
11464 * the physical page at kva immutable (see the ppl_writable parameter for an
11465 * exception to this).
11466 *
11467 * @param kva Valid address to any mapping of the physical page to lockdown.
11468 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11469 * @param ppl_writable True if the PPL should still be able to write to the page
11470 * using the physical aperture mapping. False will make the
11471 * page read-only for both the kernel and PPL in the
11472 * physical aperture.
11473 */
11474 MARK_AS_PMAP_TEXT static void
11475 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11476 {
11477 const pmap_paddr_t pa = kvtophys_nofail(kva);
11478 const unsigned int pai = pa_index(pa);
11479
11480 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11481 pvh_lock(pai);
11482 pv_entry_t **pvh = pai_to_pvh(pai);
11483 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11484
11485 if (__improbable(ppattr_pa_test_monitor(pa))) {
11486 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11487 }
11488
11489 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11490 panic("%s: %#lx already locked down/executable (%#llx)",
11491 __func__, kva, (uint64_t)pvh_flags);
11492 }
11493
11494 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11495
11496 /* Update the physical aperture mapping to prevent kernel write access. */
11497 const unsigned int new_xprr_perm =
11498 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11499 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11500
11501 pvh_unlock(pai);
11502
11503 pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0, NULL);
11504
11505 /**
11506 * Double-check that the mapping didn't change physical addresses before the
11507 * LOCKDOWN flag was set (there is a brief window between the above
11508 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11509 *
11510 * This doesn't solve the ABA problem, but this doesn't have to since once
11511 * the pvh_lock() is grabbed no new mappings can be created on this physical
11512 * page without the LOCKDOWN flag already set (so any future mappings can
11513 * only be RO, and no existing mappings can be removed).
11514 */
11515 if (kvtophys_nofail(kva) != pa) {
11516 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11517 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11518 }
11519 }
11520
11521 /**
11522 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11523 * kernel once again.
11524 *
11525 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11526 * to unlockdown a page that was never locked down will panic.
11527 *
11528 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11529 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11530 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11531 * passed to the paired pmap_ppl_lockdown_page() call. Any
11532 * deviation will result in a panic.
11533 */
11534 MARK_AS_PMAP_TEXT static void
11535 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11536 {
11537 pvh_assert_locked(pai);
11538 pv_entry_t **pvh = pai_to_pvh(pai);
11539 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11540
11541 if (__improbable(!(pvh_flags & lockdown_flag))) {
11542 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11543 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11544 }
11545
11546 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11547
11548 /* Restore the pre-lockdown physical aperture mapping permissions. */
11549 const unsigned int old_xprr_perm =
11550 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11551 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11552 }
11553
11554 /**
11555 * Release a page from being locked down to the PPL, making it writable to the
11556 * kernel once again.
11557 *
11558 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11559 * to unlockdown a page that was never locked down will panic.
11560 *
11561 * @param kva Valid address to any mapping of the physical page to unlockdown.
11562 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11563 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11564 * passed to the paired pmap_ppl_lockdown_page() call. Any
11565 * deviation will result in a panic.
11566 */
11567 MARK_AS_PMAP_TEXT static void
11568 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11569 {
11570 const pmap_paddr_t pa = kvtophys_nofail(kva);
11571 const unsigned int pai = pa_index(pa);
11572
11573 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11574 pvh_lock(pai);
11575 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11576 pvh_unlock(pai);
11577 }
11578
11579 #else /* XNU_MONITOR */
11580
11581 static void __unused
11582 pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11583 {
11584 }
11585
11586 static void __unused
11587 pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11588 {
11589 }
11590
11591 #endif /* !XNU_MONITOR */
11592
11593
11594 MARK_AS_PMAP_TEXT static inline void
11595 pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11596 {
11597 #if XNU_MONITOR
11598 pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11599 #else
11600 pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
11601 #endif
11602 }
11603
11604 MARK_AS_PMAP_TEXT static inline void
11605 pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11606 {
11607 #if XNU_MONITOR
11608 pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11609 #else
11610 pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
11611 #endif
11612 }
11613
11614 /**
11615 * Perform basic validation checks on the destination (but not the source)
11616 * and its corresponding offset/size prior to writing to a read-only allocation.
11617 *
11618 * @note Should be called before writing to an allocation from the read
11619 * only allocator.
11620 *
11621 * @param zid The ID of the zone the allocation belongs to.
11622 * @param va VA of element being modified (destination).
11623 * @param offset Offset being written to, in the element.
11624 * @param new_data_size Size of modification.
11625 *
11626 */
11627
11628 MARK_AS_PMAP_TEXT static void
11629 pmap_ro_zone_validate_element_dst(
11630 zone_id_t zid,
11631 vm_offset_t va,
11632 vm_offset_t offset,
11633 vm_size_t new_data_size)
11634 {
11635 vm_size_t elem_size = zone_elem_size_ro(zid);
11636 vm_offset_t sum = 0, page = trunc_page(va);
11637
11638 if (__improbable(new_data_size > (elem_size - offset))) {
11639 panic("%s: New data size %lu too large for elem size %lu at addr %p",
11640 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11641 }
11642 if (__improbable(offset >= elem_size)) {
11643 panic("%s: Offset %lu too large for elem size %lu at addr %p",
11644 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11645 }
11646 if (__improbable(os_add3_overflow(va, offset, new_data_size, &sum))) {
11647 panic("%s: Integer addition overflow %p + %lu + %lu = %lu",
11648 __func__, (void*)va, (uintptr_t)offset, (uintptr_t) new_data_size,
11649 (uintptr_t)sum);
11650 }
11651 if (__improbable((va - page) % elem_size)) {
11652 panic("%s: Start of element %p is not aligned to element size %lu",
11653 __func__, (void *)va, (uintptr_t)elem_size);
11654 }
11655
11656 /* Check element is from correct zone */
11657 zone_require_ro(zid, elem_size, (void*)va);
11658 }
11659
11660
11661 /**
11662 * Perform basic validation checks on the source, destination, and
11663 * corresponding offset/size prior to writing to a read-only allocation.
11664 *
11665 * @note Should be called before writing to an allocation from the read
11666 * only allocator.
11667 *
11668 * @param zid The ID of the zone the allocation belongs to.
11669 * @param va VA of element being modified (destination).
11670 * @param offset Offset being written to, in the element.
11671 * @param new_data Pointer to new data (source).
11672 * @param new_data_size Size of modification.
11673 *
11674 */
11675
11676 MARK_AS_PMAP_TEXT static void
11677 pmap_ro_zone_validate_element(
11678 zone_id_t zid,
11679 vm_offset_t va,
11680 vm_offset_t offset,
11681 const vm_offset_t new_data,
11682 vm_size_t new_data_size)
11683 {
11684 vm_offset_t sum = 0;
11685
11686 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11687 panic("%s: Integer addition overflow %p + %lu = %lu",
11688 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11689 }
11690
11691 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11692 }
11693
11694 /**
11695 * Ensure that the physical page is locked down and pinned before writing to it.
11696 *
11697 * @note Should be called before writing to an allocation from the read
11698 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page();
11699 * ensure that it is called after the modification.
11700 *
11701 *
11702 * @param pa Physical address of the element being modified.
11703 * @param va Virtual address of element being modified.
11704 * @param size Size of the modification.
11705 *
11706 */
11707
11708 MARK_AS_PMAP_TEXT static void
11709 pmap_ro_zone_lock_phy_page(
11710 const pmap_paddr_t pa,
11711 vm_offset_t va,
11712 vm_size_t size)
11713 {
11714 const unsigned int pai = pa_index(pa);
11715 pvh_lock(pai);
11716
11717 /* Ensure that the physical page is locked down */
11718 #if XNU_MONITOR
11719 pv_entry_t **pvh = pai_to_pvh(pai);
11720 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
11721 panic("%s: Physical page not locked down %llx", __func__, pa);
11722 }
11723 #endif /* XNU_MONITOR */
11724
11725 /* Ensure page can't become PPL-owned memory before the memcpy occurs */
11726 pmap_pin_kernel_pages(va, size);
11727 }
11728
11729 /**
11730 * Unlock and unpin physical page after writing to it.
11731 *
11732 * @note Should be called after writing to an allocation from the read
11733 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page();
11734 * ensure that it has been called prior to the modification.
11735 *
11736 * @param pa Physical address of the element that was modified.
11737 * @param va Virtual address of element that was modified.
11738 * @param size Size of the modification.
11739 *
11740 */
11741
11742 MARK_AS_PMAP_TEXT static void
11743 pmap_ro_zone_unlock_phy_page(
11744 const pmap_paddr_t pa,
11745 vm_offset_t va,
11746 vm_size_t size)
11747 {
11748 const unsigned int pai = pa_index(pa);
11749 pmap_unpin_kernel_pages(va, size);
11750 pvh_unlock(pai);
11751 }
11752
11753 /**
11754 * Function to copy kauth_cred from new_data to kv.
11755 * Function defined in "kern_prot.c"
11756 *
11757 * @note Will be removed upon completion of
11758 * <rdar://problem/72635194> Compiler PAC support for memcpy.
11759 *
11760 * @param kv Address to copy new data to.
11761 * @param new_data Pointer to new data.
11762 *
11763 */
11764
11765 extern void
11766 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11767
11768 /**
11769 * Zalloc-specific memcpy that writes through the physical aperture
11770 * and ensures the element being modified is from a read-only zone.
11771 *
11772 * @note Designed to work only with the zone allocator's read-only submap.
11773 *
11774 * @param zid The ID of the zone the element belongs to.
11775 * @param va VA of element to be modified.
11776 * @param offset Offset from element.
11777 * @param new_data Pointer to new data.
11778 * @param new_data_size Size of modification.
11779 *
11780 */
11781
11782 void
11783 pmap_ro_zone_memcpy(
11784 zone_id_t zid,
11785 vm_offset_t va,
11786 vm_offset_t offset,
11787 const vm_offset_t new_data,
11788 vm_size_t new_data_size)
11789 {
11790 #if XNU_MONITOR
11791 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
11792 #else /* XNU_MONITOR */
11793 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11794 #endif /* XNU_MONITOR */
11795 }
11796
11797 MARK_AS_PMAP_TEXT void
11798 pmap_ro_zone_memcpy_internal(
11799 zone_id_t zid,
11800 vm_offset_t va,
11801 vm_offset_t offset,
11802 const vm_offset_t new_data,
11803 vm_size_t new_data_size)
11804 {
11805 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11806
11807 if (!new_data || new_data_size == 0) {
11808 return;
11809 }
11810
11811 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11812 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11813 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11814 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11815 }
11816
11817 /**
11818 * Zalloc-specific function to atomically mutate fields of an element that
11819 * belongs to a read-only zone, via the physical aperture.
11820 *
11821 * @note Designed to work only with the zone allocator's read-only submap.
11822 *
11823 * @param zid The ID of the zone the element belongs to.
11824 * @param va VA of element to be modified.
11825 * @param offset Offset in element.
11826 * @param op Atomic operation to perform.
11827 * @param value Mutation value.
11828 *
11829 */
11830
11831 uint64_t
11832 pmap_ro_zone_atomic_op(
11833 zone_id_t zid,
11834 vm_offset_t va,
11835 vm_offset_t offset,
11836 zro_atomic_op_t op,
11837 uint64_t value)
11838 {
11839 #if XNU_MONITOR
11840 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
11841 #else /* XNU_MONITOR */
11842 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11843 #endif /* XNU_MONITOR */
11844 }
11845
11846 MARK_AS_PMAP_TEXT uint64_t
11847 pmap_ro_zone_atomic_op_internal(
11848 zone_id_t zid,
11849 vm_offset_t va,
11850 vm_offset_t offset,
11851 zro_atomic_op_t op,
11852 uint64_t value)
11853 {
11854 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
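/* The low nibble of the atomic op encodes the size (in bytes) of the field being mutated. */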
11855 vm_size_t value_size = op & 0xf;
11856
11857 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11858 pmap_ro_zone_lock_phy_page(pa, va, value_size);
11859 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11860 pmap_ro_zone_unlock_phy_page(pa, va, value_size);
11861
11862 return value;
11863 }
11864
11865 /**
11866 * bzero for allocations from read-only zones that writes through the
11867 * physical aperture.
11868 *
11869 * @note This is called by the zfree path of all allocations from read
11870 * only zones.
11871 *
11872 * @param zid The ID of the zone the allocation belongs to.
11873 * @param va VA of element to be zeroed.
11874 * @param offset Offset in the element.
11875 * @param size Size of allocation.
11876 *
11877 */
11878
11879 void
11880 pmap_ro_zone_bzero(
11881 zone_id_t zid,
11882 vm_offset_t va,
11883 vm_offset_t offset,
11884 vm_size_t size)
11885 {
11886 #if XNU_MONITOR
11887 pmap_ro_zone_bzero_ppl(zid, va, offset, size);
11888 #else /* XNU_MONITOR */
11889 pmap_ro_zone_bzero_internal(zid, va, offset, size);
11890 #endif /* XNU_MONITOR */
11891 }
11892
11893 MARK_AS_PMAP_TEXT void
11894 pmap_ro_zone_bzero_internal(
11895 zone_id_t zid,
11896 vm_offset_t va,
11897 vm_offset_t offset,
11898 vm_size_t size)
11899 {
11900 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11901 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11902 pmap_ro_zone_lock_phy_page(pa, va, size);
11903 bzero((void*)phystokv(pa), size);
11904 pmap_ro_zone_unlock_phy_page(pa, va, size);
11905 }
11906
11907 /**
11908 * Removes write access from the Physical Aperture.
11909 *
11910 * @note For non-PPL devices, it simply makes all virtual mappings RO.
11911 * @note Designed to work only with the zone allocator's read-only submap.
11912 *
11913 * @param va VA of the page to remove write access from.
11914 *
11915 */
11916 MARK_AS_PMAP_TEXT static void
11917 pmap_phys_write_disable(vm_address_t va)
11918 {
11919 #if XNU_MONITOR
11920 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
11921 #else /* XNU_MONITOR */
11922 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
11923 #endif /* XNU_MONITOR */
11924 }
11925
11926 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
11927
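/**
 * Count the resident and compressed bytes mapped by a pmap within a range
 * that must be page-aligned and contained within a single twig-level entry.
 *
 * @param pmap The pmap to query; NULL returns PMAP_RESIDENT_INVALID.
 * @param start Start of the virtual address range.
 * @param end End of the virtual address range.
 * @param compressed_bytes_p If non-NULL, incremented by the number of bytes
 *                           found in compressed mappings.
 *
 * @return The number of resident bytes, or PMAP_RESIDENT_INVALID if the
 *         range could not be examined.
 */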
11928 MARK_AS_PMAP_TEXT mach_vm_size_t
11929 pmap_query_resident_internal(
11930 pmap_t pmap,
11931 vm_map_address_t start,
11932 vm_map_address_t end,
11933 mach_vm_size_t *compressed_bytes_p)
11934 {
11935 mach_vm_size_t resident_bytes = 0;
11936 mach_vm_size_t compressed_bytes = 0;
11937
11938 pt_entry_t *bpte, *epte;
11939 pt_entry_t *pte_p;
11940 tt_entry_t *tte_p;
11941
11942 if (pmap == NULL) {
11943 return PMAP_RESIDENT_INVALID;
11944 }
11945
11946 validate_pmap(pmap);
11947
11948 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11949
11950 /* Ensure that this request is valid, and addresses exactly one TTE. */
11951 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11952 (end % pt_attr_page_size(pt_attr)))) {
11953 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11954 }
11955
11956 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11957 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11958 }
11959
11960 pmap_lock(pmap, PMAP_LOCK_SHARED);
11961 tte_p = pmap_tte(pmap, start);
11962 if (tte_p == (tt_entry_t *) NULL) {
11963 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11964 return PMAP_RESIDENT_INVALID;
11965 }
11966 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
11967 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11968 bpte = &pte_p[pte_index(pt_attr, start)];
11969 epte = &pte_p[pte_index(pt_attr, end)];
11970
11971 for (; bpte < epte; bpte++) {
11972 if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
11973 compressed_bytes += pt_attr_page_size(pt_attr);
11974 } else if (pa_valid(pte_to_pa(*bpte))) {
11975 resident_bytes += pt_attr_page_size(pt_attr);
11976 }
11977 }
11978 }
11979 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11980
11981 if (compressed_bytes_p) {
11982 pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11983 *compressed_bytes_p += compressed_bytes;
11984 pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11985 }
11986
11987 return resident_bytes;
11988 }
11989
11990 mach_vm_size_t
11991 pmap_query_resident(
11992 pmap_t pmap,
11993 vm_map_address_t start,
11994 vm_map_address_t end,
11995 mach_vm_size_t *compressed_bytes_p)
11996 {
11997 mach_vm_size_t total_resident_bytes;
11998 mach_vm_size_t compressed_bytes;
11999 vm_map_address_t va;
12000
12001
12002 if (pmap == PMAP_NULL) {
12003 if (compressed_bytes_p) {
12004 *compressed_bytes_p = 0;
12005 }
12006 return 0;
12007 }
12008
12009 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12010
12011 total_resident_bytes = 0;
12012 compressed_bytes = 0;
12013
12014 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12015 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12016 VM_KERNEL_ADDRHIDE(end));
12017
12018 va = start;
12019 while (va < end) {
12020 vm_map_address_t l;
12021 mach_vm_size_t resident_bytes;
12022
12023 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12024
12025 if (l > end) {
12026 l = end;
12027 }
12028 #if XNU_MONITOR
12029 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12030 #else
12031 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12032 #endif
12033 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12034 break;
12035 }
12036
12037 total_resident_bytes += resident_bytes;
12038
12039 va = l;
12040 }
12041
12042 if (compressed_bytes_p) {
12043 *compressed_bytes_p = compressed_bytes;
12044 }
12045
12046 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12047 total_resident_bytes);
12048
12049 return total_resident_bytes;
12050 }
12051
12052 #if MACH_ASSERT
12053 static void
12054 pmap_check_ledgers(
12055 pmap_t pmap)
12056 {
12057 int pid;
12058 char *procname;
12059
12060 if (pmap->pmap_pid == 0) {
12061 /*
12062 * This pmap was not or is no longer fully associated
12063 * with a task (e.g. the old pmap after a fork()/exec() or
12064 * spawn()). Its "ledger" still points at a task that is
12065 * now using a different (and active) address space, so
12066 * we can't check that all the pmap ledgers are balanced here.
12067 *
12068 * If the "pid" is set, that means that we went through
12069 * pmap_set_process() in task_terminate_internal(), so
12070 * this task's ledger should not have been re-used and
12071 * all the pmap ledgers should be back to 0.
12072 */
12073 return;
12074 }
12075
12076 pid = pmap->pmap_pid;
12077 procname = pmap->pmap_procname;
12078
12079 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12080 }
12081 #endif /* MACH_ASSERT */
12082
12083 void
12084 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
12085 {
12086 }
12087
12088 /**
12089 * The minimum shared region nesting size is used by the VM to determine when to
12090 * break up large mappings to nested regions. The smallest size that these
12091 * mappings can be broken into is determined by what page table level those
12092 * regions are being nested in at and the size of the page tables.
12093 *
12094 * For instance, if a nested region is nesting at L2 for a process utilizing
12095 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12096 * block entry).
12097 *
12098 * @param pmap The target pmap to determine the block size based on whether it's
12099 * using 16KB or 4KB page tables.
12100 */
12101 uint64_t
12102 pmap_shared_region_size_min(__unused pmap_t pmap)
12103 {
12104 #if (__ARM_VMSA__ > 7)
12105 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12106
12107 /**
12108 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12109 * 4KB pages). This means that a target pmap will contain L2 entries that
12110 * point to shared L3 page tables in the shared region pmap.
12111 */
12112 return pt_attr_twig_size(pt_attr);
12113
12114 #else
12115 return ARM_NESTING_SIZE_MIN;
12116 #endif
12117 }
12118
12119 boolean_t
12120 pmap_enforces_execute_only(
12121 #if (__ARM_VMSA__ == 7)
12122 __unused
12123 #endif
12124 pmap_t pmap)
12125 {
12126 #if (__ARM_VMSA__ > 7)
12127 return pmap != kernel_pmap;
12128 #else
12129 return FALSE;
12130 #endif
12131 }
12132
12133 MARK_AS_PMAP_TEXT void
12134 pmap_set_vm_map_cs_enforced_internal(
12135 pmap_t pmap,
12136 bool new_value)
12137 {
12138 validate_pmap_mutable(pmap);
12139 pmap->pmap_vm_map_cs_enforced = new_value;
12140 }
12141
12142 void
12143 pmap_set_vm_map_cs_enforced(
12144 pmap_t pmap,
12145 bool new_value)
12146 {
12147 #if XNU_MONITOR
12148 pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
12149 #else
12150 pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
12151 #endif
12152 }
12153
12154 extern int cs_process_enforcement_enable;
12155 bool
12156 pmap_get_vm_map_cs_enforced(
12157 pmap_t pmap)
12158 {
12159 if (cs_process_enforcement_enable) {
12160 return true;
12161 }
12162 return pmap->pmap_vm_map_cs_enforced;
12163 }
12164
12165 MARK_AS_PMAP_TEXT void
12166 pmap_set_jit_entitled_internal(
12167 __unused pmap_t pmap)
12168 {
12169 return;
12170 }
12171
12172 void
12173 pmap_set_jit_entitled(
12174 pmap_t pmap)
12175 {
12176 #if XNU_MONITOR
12177 pmap_set_jit_entitled_ppl(pmap);
12178 #else
12179 pmap_set_jit_entitled_internal(pmap);
12180 #endif
12181 }
12182
12183 bool
12184 pmap_get_jit_entitled(
12185 __unused pmap_t pmap)
12186 {
12187 return false;
12188 }
12189
12190 MARK_AS_PMAP_TEXT kern_return_t
12191 pmap_query_page_info_internal(
12192 pmap_t pmap,
12193 vm_map_offset_t va,
12194 int *disp_p)
12195 {
12196 pmap_paddr_t pa;
12197 int disp;
12198 unsigned int pai;
12199 pt_entry_t *pte;
12200 pv_entry_t **pv_h, *pve_p;
12201
12202 if (pmap == PMAP_NULL || pmap == kernel_pmap) {
12203 pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12204 *disp_p = 0;
12205 pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12206 return KERN_INVALID_ARGUMENT;
12207 }
12208
12209 disp = 0;
12210
12211 validate_pmap(pmap);
12212 pmap_lock(pmap, PMAP_LOCK_SHARED);
12213
12214 pte = pmap_pte(pmap, va);
12215 if (pte == PT_ENTRY_NULL) {
12216 goto done;
12217 }
12218
12219 pa = pte_to_pa(*((volatile pt_entry_t*)pte));
12220 if (pa == 0) {
12221 if (ARM_PTE_IS_COMPRESSED(*pte, pte)) {
12222 disp |= PMAP_QUERY_PAGE_COMPRESSED;
12223 if (*pte & ARM_PTE_COMPRESSED_ALT) {
12224 disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
12225 }
12226 }
12227 } else {
12228 disp |= PMAP_QUERY_PAGE_PRESENT;
12229 pai = pa_index(pa);
12230 if (!pa_valid(pa)) {
12231 goto done;
12232 }
12233 pvh_lock(pai);
12234 pv_h = pai_to_pvh(pai);
12235 pve_p = PV_ENTRY_NULL;
12236 int pve_ptep_idx = 0;
12237 if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
12238 pve_p = pvh_pve_list(pv_h);
12239 while (pve_p != PV_ENTRY_NULL &&
12240 (pve_ptep_idx = pve_find_ptep_index(pve_p, pte)) == -1) {
12241 pve_p = pve_next(pve_p);
12242 }
12243 }
12244
12245 if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
12246 disp |= PMAP_QUERY_PAGE_ALTACCT;
12247 } else if (ppattr_test_reusable(pai)) {
12248 disp |= PMAP_QUERY_PAGE_REUSABLE;
12249 } else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
12250 disp |= PMAP_QUERY_PAGE_INTERNAL;
12251 }
12252 pvh_unlock(pai);
12253 }
12254
12255 done:
12256 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12257 pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12258 *disp_p = disp;
12259 pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12260 return KERN_SUCCESS;
12261 }
12262
12263 kern_return_t
12264 pmap_query_page_info(
12265 pmap_t pmap,
12266 vm_map_offset_t va,
12267 int *disp_p)
12268 {
12269 #if XNU_MONITOR
12270 return pmap_query_page_info_ppl(pmap, va, disp_p);
12271 #else
12272 return pmap_query_page_info_internal(pmap, va, disp_p);
12273 #endif
12274 }
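/*
 * Illustrative sketch (not part of the build): how a caller might interpret
 * the disposition returned in *disp_p. The surrounding caller code is
 * hypothetical; only the PMAP_QUERY_PAGE_* flags come from the routine above.
 *
 *     int disp = 0;
 *     if (pmap_query_page_info(map->pmap, va, &disp) == KERN_SUCCESS) {
 *         if (disp & PMAP_QUERY_PAGE_PRESENT) {
 *             // A mapping exists; PMAP_QUERY_PAGE_INTERNAL, _REUSABLE and
 *             // _ALTACCT describe how the page is being accounted.
 *         } else if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
 *             // The page was sent to the compressor; _COMPRESSED_ALTACCT
 *             // indicates alternate accounting for the compressed copy.
 *         }
 *     }
 */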
12275
12276
12277
12278 static vm_map_size_t
12279 pmap_user_va_size(pmap_t pmap __unused)
12280 {
12281 #if (__ARM_VMSA__ == 7)
12282 return VM_MAX_ADDRESS;
12283 #else
12284 #if __ARM_MIXED_PAGE_SIZE__
12285 uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12286 return 1ULL << (64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK));
12287 #else
12288 return 1ULL << (64 - T0SZ_BOOT);
12289 #endif
12290 #endif /* (__ARM_VMSA__ == 7) */
12291 }
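/*
 * Worked example (illustrative only): the shift above converts TCR.T0SZ into
 * the size of the user VA space, i.e. 1 << (64 - T0SZ) bytes. For instance, a
 * T0SZ of 25 yields 1ULL << 39 = 512GB, while a T0SZ of 17 yields
 * 1ULL << 47 = 128TB. The actual T0SZ value is configuration dependent.
 */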
12292
12293
12294
12295 kern_return_t
12296 pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
12297 const vm_size_t __unused trust_cache_len)
12298 {
12299 // Unsupported
12300 return KERN_NOT_SUPPORTED;
12301 }
12302
12303 pmap_tc_ret_t
12304 pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
12305 const vm_size_t __unused trust_cache_len,
12306 uint8_t const * __unused img4_manifest,
12307 const vm_size_t __unused img4_manifest_buffer_len,
12308 const vm_size_t __unused img4_manifest_actual_len,
12309 bool __unused dry_run)
12310 {
12311 // Unsupported
12312 return PMAP_TC_UNKNOWN_FORMAT;
12313 }
12314
12315 bool
12316 pmap_in_ppl(void)
12317 {
12318 // Unsupported
12319 return false;
12320 }
12321
12322 bool
12323 pmap_has_ppl(void)
12324 {
12325 // Unsupported
12326 return false;
12327 }
12328
12329 void
12330 pmap_lockdown_image4_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12331 {
12332 // Unsupported
12333 }
12334
12335 void
12336 pmap_lockdown_image4_late_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12337 {
12338 // Unsupported
12339 }
12340
12341 void *
12342 pmap_claim_reserved_ppl_page(void)
12343 {
12344 // Unsupported
12345 return NULL;
12346 }
12347
12348 void
12349 pmap_free_reserved_ppl_page(void __unused *kva)
12350 {
12351 // Unsupported
12352 }
12353
12354
12355 MARK_AS_PMAP_TEXT bool
12356 pmap_is_trust_cache_loaded_internal(const uuid_t uuid)
12357 {
12358 bool found = false;
12359
12360 pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12361
12362 for (struct pmap_image4_trust_cache const *c = pmap_image4_trust_caches; c != NULL; c = c->next) {
12363 if (bcmp(uuid, c->module->uuid, sizeof(uuid_t)) == 0) {
12364 found = true;
12365 goto done;
12366 }
12367 }
12368
12369 #ifdef PLATFORM_BridgeOS
12370 for (struct pmap_legacy_trust_cache const *c = pmap_legacy_trust_caches; c != NULL; c = c->next) {
12371 if (bcmp(uuid, c->uuid, sizeof(uuid_t)) == 0) {
12372 found = true;
12373 goto done;
12374 }
12375 }
12376 #endif
12377
12378 done:
12379 pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12380 return found;
12381 }
12382
12383 bool
12384 pmap_is_trust_cache_loaded(const uuid_t uuid)
12385 {
12386 #if XNU_MONITOR
12387 return pmap_is_trust_cache_loaded_ppl(uuid);
12388 #else
12389 return pmap_is_trust_cache_loaded_internal(uuid);
12390 #endif
12391 }
12392
12393 MARK_AS_PMAP_TEXT bool
12394 pmap_lookup_in_loaded_trust_caches_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12395 {
12396 struct pmap_image4_trust_cache const *cache = NULL;
12397 #ifdef PLATFORM_BridgeOS
12398 struct pmap_legacy_trust_cache const *legacy = NULL;
12399 #endif
12400
12401 pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12402
12403 for (cache = pmap_image4_trust_caches; cache != NULL; cache = cache->next) {
12404 uint8_t hash_type = 0, flags = 0;
12405
12406 if (lookup_in_trust_cache_module(cache->module, cdhash, &hash_type, &flags)) {
12407 goto done;
12408 }
12409 }
12410
12411 #ifdef PLATFORM_BridgeOS
12412 for (legacy = pmap_legacy_trust_caches; legacy != NULL; legacy = legacy->next) {
12413 for (uint32_t i = 0; i < legacy->num_hashes; i++) {
12414 if (bcmp(legacy->hashes[i], cdhash, CS_CDHASH_LEN) == 0) {
12415 goto done;
12416 }
12417 }
12418 }
12419 #endif
12420
12421 done:
12422 pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12423
12424 if (cache != NULL) {
12425 return true;
12426 #ifdef PLATFORM_BridgeOS
12427 } else if (legacy != NULL) {
12428 return true;
12429 #endif
12430 }
12431
12432 return false;
12433 }
12434
12435 bool
12436 pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN])
12437 {
12438 #if XNU_MONITOR
12439 return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
12440 #else
12441 return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
12442 #endif
12443 }
12444
12445 MARK_AS_PMAP_TEXT uint32_t
12446 pmap_lookup_in_static_trust_cache_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12447 {
12448 // Awkward indirection, because the PPL macros currently force their functions to be static.
12449 return lookup_in_static_trust_cache(cdhash);
12450 }
12451
12452 uint32_t
12453 pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
12454 {
12455 #if XNU_MONITOR
12456 return pmap_lookup_in_static_trust_cache_ppl(cdhash);
12457 #else
12458 return pmap_lookup_in_static_trust_cache_internal(cdhash);
12459 #endif
12460 }
12461
12462 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
12463 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
12464
12465 MARK_AS_PMAP_TEXT void
12466 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12467 {
12468
12469 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12470 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
12471 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12472
12473 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X", cdhash[0], cdhash[1], cdhash[2], cdhash[3]);
12474 }
12475
12476 MARK_AS_PMAP_TEXT bool
12477 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12478 {
12479 bool match = false;
12480
12481 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12482 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
12483 match = true;
12484 }
12485 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12486
12487 if (match) {
12488 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
12489 }
12490
12491 return match;
12492 }
12493
12494 void
12495 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12496 {
12497 #if XNU_MONITOR
12498 pmap_set_compilation_service_cdhash_ppl(cdhash);
12499 #else
12500 pmap_set_compilation_service_cdhash_internal(cdhash);
12501 #endif
12502 }
12503
12504 bool
12505 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12506 {
12507 #if XNU_MONITOR
12508 return pmap_match_compilation_service_cdhash_ppl(cdhash);
12509 #else
12510 return pmap_match_compilation_service_cdhash_internal(cdhash);
12511 #endif
12512 }
12513
12514 /*
12515 * As part of supporting local signing on the device, we need the PMAP layer
12516 * to store the local signing key so that PMAP_CS can validate with it. We
12517 * store it at the PMAP layer such that it is accessible to both AMFI and
12518 * PMAP_CS should they need it.
12519 */
12520 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
12521 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
12522
12523 MARK_AS_PMAP_TEXT void
12524 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12525 {
12526 bool key_set = false;
12527
12528 /*
12529 * os_atomic_cmpxchg returns true if the exchange was successful. For us, a
12530 * successful exchange means that the local signing public key had _not_ yet
12531 * been set. If the key has already been set, we panic, since the kernel
12532 * should never attempt to set the key more than once.
12533 */
12534 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
12535
12536 if (key_set) {
12537 panic("attempted to set the local signing public key multiple times");
12538 }
12539
12540 memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
12541 pmap_cs_log_info("set local signing public key");
12542 }
12543
12544 void
12545 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12546 {
12547 #if XNU_MONITOR
12548 return pmap_set_local_signing_public_key_ppl(public_key);
12549 #else
12550 return pmap_set_local_signing_public_key_internal(public_key);
12551 #endif
12552 }
12553
12554 uint8_t*
12555 pmap_get_local_signing_public_key(void)
12556 {
12557 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
12558
12559 if (key_set) {
12560 return pmap_local_signing_public_key;
12561 }
12562
12563 return NULL;
12564 }
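/*
 * Illustrative sketch (not part of the build): a consumer such as AMFI might
 * fetch the key and degrade gracefully if it has not been provisioned. The
 * handling shown here is hypothetical.
 *
 *     const uint8_t *key = pmap_get_local_signing_public_key();
 *     if (key == NULL) {
 *         // No local signing key provisioned; treat locally signed code
 *         // as unverified.
 *     } else {
 *         // key points at PMAP_ECC_P384_PUBLIC_KEY_SIZE bytes of P-384
 *         // public key material to verify local signatures against.
 *     }
 */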
12565
12566 /*
12567 * Locally signed applications need to be explicitly authorized by an entitled application
12568 * before we allow them to run.
12569 */
12570 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
12571 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
12572
12573 MARK_AS_PMAP_TEXT void
12574 pmap_unrestrict_local_signing_internal(
12575 const uint8_t cdhash[CS_CDHASH_LEN])
12576 {
12577
12578 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12579 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12580 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12581
12582 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
12583 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
12584 }
12585
12586 void
12587 pmap_unrestrict_local_signing(
12588 const uint8_t cdhash[CS_CDHASH_LEN])
12589 {
12590 #if XNU_MONITOR
12591 return pmap_unrestrict_local_signing_ppl(cdhash);
12592 #else
12593 return pmap_unrestrict_local_signing_internal(cdhash);
12594 #endif
12595 }
12596
12597 #if PMAP_CS
12598 MARK_AS_PMAP_TEXT static void
12599 pmap_restrict_local_signing(void)
12600 {
12601 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12602 memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
12603 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12604 }
12605
12606 MARK_AS_PMAP_TEXT static bool
12607 pmap_local_signing_restricted(
12608 const uint8_t cdhash[CS_CDHASH_LEN])
12609 {
12610 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12611 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12612 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12613
12614 return ret != 0;
12615 }
12616
12617 MARK_AS_PMAP_TEXT bool
12618 pmap_cs_query_entitlements_internal(
12619 pmap_t pmap,
12620 CEQuery_t query,
12621 size_t queryLength,
12622 CEQueryContext_t finalContext)
12623 {
12624 struct pmap_cs_code_directory *cd_entry = NULL;
12625 bool ret = false;
12626
12627 if (!pmap_cs) {
12628 panic("PMAP_CS: cannot query for entitlements as pmap_cs is turned off");
12629 }
12630
12631 /*
12632 * When a pmap has not been passed in, we assume the caller wants to check the
12633 * entitlements on the current user space process.
12634 */
12635 if (pmap == NULL) {
12636 pmap = current_pmap();
12637 }
12638
12639 if (pmap == kernel_pmap) {
12640 /*
12641 * Entitlement queries don't apply to the kernel pmap; instead of panicking, we just return false.
12642 */
12643 return false;
12644 }
12645
12646 if (query == NULL || queryLength > 64) {
12647 panic("PMAP_CS: bogus entitlements query");
12648 } else {
12649 pmap_cs_assert_addr((vm_address_t)query, sizeof(CEQueryOperation_t) * queryLength, false, true);
12650 }
12651
12652 if (finalContext != NULL) {
12653 pmap_cs_assert_addr((vm_address_t)finalContext, sizeof(*finalContext), false, false);
12654 }
12655
12656 validate_pmap(pmap);
12657 pmap_lock(pmap, PMAP_LOCK_SHARED);
12658
12659 cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
12660 if (cd_entry == NULL) {
12661 pmap_cs_log_error("attempted to query entitlements from an invalid pmap or a retired code directory");
12662 goto out;
12663 }
12664
12665 if (cd_entry->ce_ctx == NULL) {
12666 pmap_cs_log_debug("%s: code signature doesn't have any entitlements", cd_entry->identifier);
12667 goto out;
12668 }
12669
12670 der_vm_context_t executionContext = cd_entry->ce_ctx->der_context;
12671
12672 for (size_t op = 0; op < queryLength; op++) {
12673 executionContext = amfi->CoreEntitlements.der_vm_execute(executionContext, query[op]);
12674 }
12675
12676 if (amfi->CoreEntitlements.der_vm_context_is_valid(executionContext)) {
12677 ret = true;
12678 if (finalContext != NULL) {
12679 pmap_pin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12680 finalContext->der_context = executionContext;
12681 pmap_unpin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12682 }
12683 } else {
12684 ret = false;
12685 }
12686
12687 out:
12688 if (cd_entry) {
12689 lck_rw_unlock_shared(&cd_entry->rwlock);
12690 cd_entry = NULL;
12691 }
12692 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12693
12694 return ret;
12695 }
12696 #endif
12697
12698 bool
12699 pmap_query_entitlements(
12700 __unused pmap_t pmap,
12701 __unused CEQuery_t query,
12702 __unused size_t queryLength,
12703 __unused CEQueryContext_t finalContext)
12704 {
12705 #if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
12706 panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
12707 #else
12708
12709 #if XNU_MONITOR
12710 return pmap_cs_query_entitlements_ppl(pmap, query, queryLength, finalContext);
12711 #else
12712 return pmap_cs_query_entitlements_internal(pmap, query, queryLength, finalContext);
12713 #endif
12714
12715 #endif /* !PMAP_SUPPORTS_ENTITLEMENT_CHECKS */
12716 }
12717
12718 MARK_AS_PMAP_TEXT void
12719 pmap_footprint_suspend_internal(
12720 vm_map_t map,
12721 boolean_t suspend)
12722 {
12723 #if DEVELOPMENT || DEBUG
12724 if (suspend) {
12725 current_thread()->pmap_footprint_suspended = TRUE;
12726 map->pmap->footprint_was_suspended = TRUE;
12727 } else {
12728 current_thread()->pmap_footprint_suspended = FALSE;
12729 }
12730 #else /* DEVELOPMENT || DEBUG */
12731 (void) map;
12732 (void) suspend;
12733 #endif /* DEVELOPMENT || DEBUG */
12734 }
12735
12736 void
12737 pmap_footprint_suspend(
12738 vm_map_t map,
12739 boolean_t suspend)
12740 {
12741 #if XNU_MONITOR
12742 pmap_footprint_suspend_ppl(map, suspend);
12743 #else
12744 pmap_footprint_suspend_internal(map, suspend);
12745 #endif
12746 }
12747
12748 MARK_AS_PMAP_TEXT void
12749 pmap_nop_internal(pmap_t pmap __unused)
12750 {
12751 validate_pmap_mutable(pmap);
12752 }
12753
12754 void
12755 pmap_nop(pmap_t pmap)
12756 {
12757 #if XNU_MONITOR
12758 pmap_nop_ppl(pmap);
12759 #else
12760 pmap_nop_internal(pmap);
12761 #endif
12762 }
12763
12764 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
12765
12766 struct page_table_dump_header {
12767 uint64_t pa;
12768 uint64_t num_entries;
12769 uint64_t start_va;
12770 uint64_t end_va;
12771 };
12772
12773 static kern_return_t
12774 pmap_dump_page_tables_recurse(pmap_t pmap,
12775 const tt_entry_t *ttp,
12776 unsigned int cur_level,
12777 unsigned int level_mask,
12778 uint64_t start_va,
12779 void *buf_start,
12780 void *buf_end,
12781 size_t *bytes_copied)
12782 {
12783 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12784 uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
12785
12786 uint64_t size = pt_attr->pta_level_info[cur_level].size;
12787 uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
12788 uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
12789 uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
12790
12791 void *bufp = (uint8_t*)buf_start + *bytes_copied;
12792
12793 if (cur_level == pt_attr_root_level(pt_attr)) {
12794 num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
12795 }
12796
12797 uint64_t tt_size = num_entries * sizeof(tt_entry_t);
12798 const tt_entry_t *tt_end = &ttp[num_entries];
12799
12800 if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
12801 return KERN_INSUFFICIENT_BUFFER_SIZE;
12802 }
12803
12804 if (level_mask & (1U << cur_level)) {
12805 struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
12806 header->pa = ml_static_vtop((vm_offset_t)ttp);
12807 header->num_entries = num_entries;
12808 header->start_va = start_va;
12809 header->end_va = start_va + (num_entries * size);
12810
12811 bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
12812 *bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
12813 }
12814 uint64_t current_va = start_va;
12815
12816 for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
12817 tt_entry_t tte = *ttep;
12818
12819 if (!(tte & valid_mask)) {
12820 continue;
12821 }
12822
12823 if ((tte & type_mask) == type_block) {
12824 continue;
12825 } else {
12826 if (cur_level >= pt_attr_leaf_level(pt_attr)) {
12827 panic("%s: corrupt entry %#llx at %p, "
12828 "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
12829 __FUNCTION__, tte, ttep,
12830 ttp, cur_level, bufp, buf_end);
12831 }
12832
12833 const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
12834
12835 kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
12836 level_mask, current_va, buf_start, buf_end, bytes_copied);
12837
12838 if (recurse_result != KERN_SUCCESS) {
12839 return recurse_result;
12840 }
12841 }
12842 }
12843
12844 return KERN_SUCCESS;
12845 }
12846
12847 kern_return_t
12848 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12849 {
12850 if (not_in_kdp) {
12851 panic("pmap_dump_page_tables must only be called from kernel debugger context");
12852 }
12853 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12854 level_mask, pmap->min, bufp, buf_end, bytes_copied);
12855 }
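/*
 * Illustrative sketch (not part of the build): how a debugger-side consumer
 * might walk the buffer produced by pmap_dump_page_tables(). Every table at a
 * level selected by level_mask is emitted as a page_table_dump_header followed
 * immediately by a raw copy of that table (num_entries tt_entry_t values). The
 * cursor arithmetic below is an assumption about consumer code, not kernel API.
 *
 *     const uint8_t *cur = buf;
 *     const uint8_t *end = buf + bytes_copied;
 *     while ((size_t)(end - cur) >= sizeof(struct page_table_dump_header)) {
 *         const struct page_table_dump_header *h = (const void *)cur;
 *         const tt_entry_t *entries = (const tt_entry_t *)(h + 1);
 *         // This table covered VAs [h->start_va, h->end_va) and lived at h->pa.
 *         cur = (const uint8_t *)(entries + h->num_entries);
 *     }
 */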
12856
12857 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12858
12859 kern_return_t
12860 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
12861 unsigned int level_mask __unused, size_t *bytes_copied __unused)
12862 {
12863 return KERN_NOT_SUPPORTED;
12864 }
12865 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12866
12867
12868 #ifdef CONFIG_XNUPOST
12869 #ifdef __arm64__
12870 static volatile bool pmap_test_took_fault = false;
12871
12872 static bool
12873 pmap_test_fault_handler(arm_saved_state_t * state)
12874 {
12875 bool retval = false;
12876 uint32_t esr = get_saved_state_esr(state);
12877 esr_exception_class_t class = ESR_EC(esr);
12878 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
12879
12880 if ((class == ESR_EC_DABORT_EL1) &&
12881 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
12882 pmap_test_took_fault = true;
12883 /* Skip the faulting instruction and resume at the one immediately after it. */
12884 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12885 retval = true;
12886 }
12887
12888 return retval;
12889 }
12890
12891 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
12892 static NOKASAN bool
12893 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
12894 {
12895 pmap_t old_pmap = NULL;
12896
12897 pmap_test_took_fault = false;
12898
12899 /*
12900 * We're potentially switching pmaps without using the normal thread
12901 * mechanism; disable interrupts and preemption to avoid any unexpected
12902 * memory accesses.
12903 */
12904 uint64_t old_int_state = pmap_interrupts_disable();
12905 mp_disable_preemption();
12906
12907 if (pmap != NULL) {
12908 old_pmap = current_pmap();
12909 pmap_switch(pmap);
12910
12911 /* Disable PAN; pmap shouldn't be the kernel pmap. */
12912 #if __ARM_PAN_AVAILABLE__
12913 __builtin_arm_wsr("pan", 0);
12914 #endif /* __ARM_PAN_AVAILABLE__ */
12915 }
12916
12917 ml_expect_fault_begin(pmap_test_fault_handler, va);
12918
12919 if (is_write) {
12920 *((volatile uint64_t*)(va)) = 0xdec0de;
12921 } else {
12922 volatile uint64_t tmp = *((volatile uint64_t*)(va));
12923 (void)tmp;
12924 }
12925
12926 /* Save the fault bool, and undo the gross stuff we did. */
12927 bool took_fault = pmap_test_took_fault;
12928 ml_expect_fault_end();
12929
12930 if (pmap != NULL) {
12931 #if __ARM_PAN_AVAILABLE__
12932 __builtin_arm_wsr("pan", 1);
12933 #endif /* __ARM_PAN_AVAILABLE__ */
12934
12935 pmap_switch(old_pmap);
12936 }
12937
12938 mp_enable_preemption();
12939 pmap_interrupts_restore(old_int_state);
12940 bool retval = (took_fault == should_fault);
12941 return retval;
12942 }
12943
12944 static bool
12945 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12946 {
12947 bool retval = pmap_test_access(pmap, va, should_fault, false);
12948
12949 if (!retval) {
12950 T_FAIL("%s: %s, "
12951 "pmap=%p, va=%p, should_fault=%u",
12952 __func__, should_fault ? "did not fault" : "faulted",
12953 pmap, (void*)va, (unsigned)should_fault);
12954 }
12955
12956 return retval;
12957 }
12958
12959 static bool
12960 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12961 {
12962 bool retval = pmap_test_access(pmap, va, should_fault, true);
12963
12964 if (!retval) {
12965 T_FAIL("%s: %s, "
12966 "pmap=%p, va=%p, should_fault=%u",
12967 __func__, should_fault ? "did not fault" : "faulted",
12968 pmap, (void*)va, (unsigned)should_fault);
12969 }
12970
12971 return retval;
12972 }
12973
12974 static bool
12975 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12976 {
12977 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12978 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12979
12980 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12981
12982 if (!retval) {
12983 T_FAIL("%s: bits=%u, "
12984 "pa=%p, should_be_set=%u",
12985 __func__, bits,
12986 (void*)pa, should_be_set);
12987 }
12988
12989 return retval;
12990 }
12991
12992 static __attribute__((noinline)) bool
12993 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12994 {
12995 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12996 return retval;
12997 }
12998
12999 static int
13000 pmap_test_test_config(unsigned int flags)
13001 {
13002 T_LOG("running pmap_test_test_config flags=0x%X", flags);
13003 unsigned int map_count = 0;
13004 unsigned long page_ratio = 0;
13005 pmap_t pmap = pmap_create_options(NULL, 0, flags);
13006
13007 if (!pmap) {
13008 panic("Failed to allocate pmap");
13009 }
13010
13011 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
13012 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
13013 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
13014 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
13015
13016 if (pmap_page_size <= native_page_size) {
13017 page_ratio = native_page_size / pmap_page_size;
13018 } else {
13019 /*
13020 * The pmap page size is larger than the native page size, which would
13021 * imply a page_ratio of less than 1; the pmap layer does not support that, so panic.
13022 */
13023 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
13024 "flags=%u",
13025 __func__, native_page_size, pmap_page_size,
13026 flags);
13027 }
13028
13029 if (PAGE_RATIO > 1) {
13030 /*
13031 * The kernel is deliberately pretending to have 16KB pages.
13032 * The pmap layer has code that supports this, so pretend the
13033 * page size is larger than it is.
13034 */
13035 pmap_page_size = PAGE_SIZE;
13036 native_page_size = PAGE_SIZE;
13037 }
13038
13039 /*
13040 * Get two pages from the VM; one to be mapped wired, and one to be
13041 * mapped nonwired.
13042 */
13043 vm_page_t unwired_vm_page = vm_page_grab();
13044 vm_page_t wired_vm_page = vm_page_grab();
13045
13046 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
13047 panic("Failed to grab VM pages");
13048 }
13049
13050 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
13051 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
13052
13053 pmap_paddr_t pa = ptoa(pn);
13054 pmap_paddr_t wired_pa = ptoa(wired_pn);
13055
13056 /*
13057 * We'll start mappings at the second twig TT. This keeps us from only
13058 * using the first entry in each TT, which would trivially be address
13059 * 0; one of the things we will need to test is retrieving the VA for
13060 * a given PTE.
13061 */
13062 vm_map_address_t va_base = pmap_twig_size;
13063 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
13064
13065 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
13066 /*
13067 * Not exactly a functional failure, but this test relies on
13068 * there being a spare PTE slot we can use to pin the TT.
13069 */
13070 panic("Cannot pin translation table");
13071 }
13072
13073 /*
13074 * Create the wired mapping; this will prevent the pmap layer from
13075 * reclaiming our test TTs, which would interfere with this test
13076 * ("interfere" -> "make it panic").
13077 */
13078 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
13079
13080 #if XNU_MONITOR
13081 /*
13082 * If the PPL is enabled, make sure that the kernel cannot write
13083 * to PPL memory.
13084 */
13085 if (!pmap_ppl_disable) {
13086 T_LOG("Validate that kernel cannot write to PPL memory.");
13087 pt_entry_t * ptep = pmap_pte(pmap, va_base);
13088 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
13089 }
13090 #endif
13091
13092 /*
13093 * Create read-only mappings of the nonwired page; if the pmap does
13094 * not use the same page size as the kernel, create multiple mappings
13095 * so that the kernel page is fully mapped.
13096 */
13097 for (map_count = 0; map_count < page_ratio; map_count++) {
13098 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
13099 }
13100
13101 /* Validate that all the PTEs have the expected PA and VA. */
13102 for (map_count = 0; map_count < page_ratio; map_count++) {
13103 pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
13104
13105 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
13106 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
13107 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
13108 }
13109
13110 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
13111 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
13112 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
13113 }
13114 }
13115
13116 T_LOG("Validate that reads to our mapping do not fault.");
13117 pmap_test_read(pmap, va_base, false);
13118
13119 T_LOG("Validate that writes to our mapping fault.");
13120 pmap_test_write(pmap, va_base, true);
13121
13122 T_LOG("Make the first mapping writable.");
13123 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13124
13125 T_LOG("Validate that writes to our mapping do not fault.");
13126 pmap_test_write(pmap, va_base, false);
13127
13128
13129 T_LOG("Make the first mapping XO.");
13130 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
13131
13132 T_LOG("Validate that reads to our mapping do not fault.");
13133 pmap_test_read(pmap, va_base, false);
13134
13135 T_LOG("Validate that writes to our mapping fault.");
13136 pmap_test_write(pmap, va_base, true);
13137
13138
13139 /*
13140 * For page ratios of greater than 1: validate that writes to the other
13141 * mappings still fault. Remove the mappings afterwards (we're done
13142 * with page ratio testing).
13143 */
13144 for (map_count = 1; map_count < page_ratio; map_count++) {
13145 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
13146 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
13147 }
13148
13149 T_LOG("Mark the page unreferenced and unmodified.");
13150 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13151 pmap_test_check_refmod(pa, 0);
13152
13153 /*
13154 * Begin testing the ref/mod state machine. Re-enter the mapping with
13155 * different protection/fault_type settings, and confirm that the
13156 * ref/mod state matches our expectations at each step.
13157 */
13158 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
13159 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
13160 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13161
13162 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
13163 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13164 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13165 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13166
13167 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
13168 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13169 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
13170 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13171
13172 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
13173 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
13174 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13175
13176 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
13177 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13178 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13179 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13180
13181 /*
13182 * Shared memory testing; we'll have two mappings; one read-only,
13183 * one read-write.
13184 */
13185 vm_map_address_t rw_base = va_base;
13186 vm_map_address_t ro_base = va_base + pmap_page_size;
13187
13188 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13189 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13190
13191 /*
13192 * Test that we take faults as expected for unreferenced/unmodified
13193 * pages. Also test the arm_fast_fault interface, to ensure that
13194 * mapping permissions change as expected.
13195 */
13196 T_LOG("!ref/!mod: expect no access");
13197 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13198 pmap_test_read_write(pmap, ro_base, false, false);
13199 pmap_test_read_write(pmap, rw_base, false, false);
13200
13201 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
13202 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
13203 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13204 pmap_test_read_write(pmap, ro_base, true, false);
13205 pmap_test_read_write(pmap, rw_base, true, false);
13206
13207 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
13208 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13209 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13210 pmap_test_read_write(pmap, ro_base, true, false);
13211 pmap_test_read_write(pmap, rw_base, true, true);
13212
13213 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
13214 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13215 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13216 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13217 pmap_test_read_write(pmap, ro_base, true, false);
13218 pmap_test_read_write(pmap, rw_base, true, true);
13219
13220 T_LOG("RW protect both mappings; should not change protections.");
13221 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13222 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13223 pmap_test_read_write(pmap, ro_base, true, false);
13224 pmap_test_read_write(pmap, rw_base, true, true);
13225
13226 T_LOG("Read protect both mappings; RW mapping should become RO.");
13227 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
13228 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
13229 pmap_test_read_write(pmap, ro_base, true, false);
13230 pmap_test_read_write(pmap, rw_base, true, false);
13231
13232 T_LOG("RW protect the page; mappings should not change protections.");
13233 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13234 pmap_page_protect(pn, VM_PROT_ALL);
13235 pmap_test_read_write(pmap, ro_base, true, false);
13236 pmap_test_read_write(pmap, rw_base, true, true);
13237
13238 T_LOG("Read protect the page; RW mapping should become RO.");
13239 pmap_page_protect(pn, VM_PROT_READ);
13240 pmap_test_read_write(pmap, ro_base, true, false);
13241 pmap_test_read_write(pmap, rw_base, true, false);
13242
13243 T_LOG("Validate that disconnect removes all known mappings of the page.");
13244 pmap_disconnect(pn);
13245 if (!pmap_verify_free(pn)) {
13246 T_FAIL("Page still has mappings");
13247 }
13248
13249 T_LOG("Remove the wired mapping, so we can tear down the test map.");
13250 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
13251 pmap_destroy(pmap);
13252
13253 T_LOG("Release the pages back to the VM.");
13254 vm_page_lock_queues();
13255 vm_page_free(unwired_vm_page);
13256 vm_page_free(wired_vm_page);
13257 vm_page_unlock_queues();
13258
13259 T_LOG("Testing successful!");
13260 return 0;
13261 }
13262 #endif /* __arm64__ */
13263
13264 kern_return_t
13265 pmap_test(void)
13266 {
13267 T_LOG("Starting pmap_tests");
13268 #ifdef __arm64__
13269 int flags = 0;
13270 flags |= PMAP_CREATE_64BIT;
13271
13272 #if __ARM_MIXED_PAGE_SIZE__
13273 T_LOG("Testing VM_PAGE_SIZE_4KB");
13274 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
13275 T_LOG("Testing VM_PAGE_SIZE_16KB");
13276 pmap_test_test_config(flags);
13277 #else /* __ARM_MIXED_PAGE_SIZE__ */
13278 pmap_test_test_config(flags);
13279 #endif /* __ARM_MIXED_PAGE_SIZE__ */
13280
13281 #endif /* __arm64__ */
13282 T_PASS("completed pmap_test successfully");
13283 return KERN_SUCCESS;
13284 }
13285 #endif /* CONFIG_XNUPOST */
13286
13287 /*
13288 * The following function should never make it to RELEASE code, since
13289 * it provides a way to get the PPL to modify text pages.
13290 */
13291 #if DEVELOPMENT || DEBUG
13292
13293 #define ARM_UNDEFINED_INSN 0xe7f000f0
13294 #define ARM_UNDEFINED_INSN_THUMB 0xde00
13295
13296 /**
13297 * Forcibly overwrite executable text with an illegal instruction.
13298 *
13299 * @note Only used for xnu unit testing.
13300 *
13301 * @param pa The physical address to corrupt.
13302 *
13303 * @return KERN_SUCCESS on success.
13304 */
13305 kern_return_t
13306 pmap_test_text_corruption(pmap_paddr_t pa)
13307 {
13308 #if XNU_MONITOR
13309 return pmap_test_text_corruption_ppl(pa);
13310 #else /* XNU_MONITOR */
13311 return pmap_test_text_corruption_internal(pa);
13312 #endif /* XNU_MONITOR */
13313 }
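/*
 * Illustrative sketch (not part of the build): a unit test might use this to
 * confirm that executing corrupted text raises an undefined-instruction
 * exception. The target symbol and expectation harness are hypothetical.
 *
 *     extern void test_target_function(void);
 *     pmap_paddr_t pa = kvtophys((vm_offset_t)&test_target_function);
 *     if (pmap_test_text_corruption(pa) == KERN_SUCCESS) {
 *         // Calling test_target_function() should now trap with an
 *         // undefined instruction, which the test can expect and recover from.
 *     }
 */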
13314
13315 MARK_AS_PMAP_TEXT kern_return_t
13316 pmap_test_text_corruption_internal(pmap_paddr_t pa)
13317 {
13318 vm_offset_t va = phystokv(pa);
13319 unsigned int pai = pa_index(pa);
13320
13321 assert(pa_valid(pa));
13322
13323 pvh_lock(pai);
13324
13325 pv_entry_t **pv_h = pai_to_pvh(pai);
13326 assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
13327 #if defined(PVH_FLAG_EXEC)
13328 const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
13329
13330 if (need_ap_twiddle) {
13331 pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
13332 }
13333 #endif /* defined(PVH_FLAG_EXEC) */
13334
13335 /*
13336 * The low bit in an instruction address indicates a THUMB instruction.
13337 */
13338 if (va & 1) {
13339 va &= ~(vm_offset_t)1;
13340 *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
13341 } else {
13342 *(uint32_t *)va = ARM_UNDEFINED_INSN;
13343 }
13344
13345 #if defined(PVH_FLAG_EXEC)
13346 if (need_ap_twiddle) {
13347 pmap_set_ptov_ap(pai, AP_RONA, FALSE);
13348 }
13349 #endif /* defined(PVH_FLAG_EXEC) */
13350
13351 InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
13352
13353 pvh_unlock(pai);
13354
13355 return KERN_SUCCESS;
13356 }
13357
13358 #endif /* DEVELOPMENT || DEBUG */
13359