1 /*
2 * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <mach_assert.h>
30 #include <mach_ldebug.h>
31
32 #include <mach/shared_region.h>
33 #include <mach/vm_param.h>
34 #include <mach/vm_prot.h>
35 #include <mach/vm_map.h>
36 #include <mach/machine/vm_param.h>
37 #include <mach/machine/vm_types.h>
38
39 #include <mach/boolean.h>
40 #include <kern/bits.h>
41 #include <kern/thread.h>
42 #include <kern/sched.h>
43 #include <kern/zalloc.h>
44 #include <kern/zalloc_internal.h>
45 #include <kern/kalloc.h>
46 #include <kern/spl.h>
47 #include <kern/startup.h>
48 #include <kern/trustcache.h>
49
50 #include <os/overflow.h>
51
52 #include <vm/pmap.h>
53 #include <vm/pmap_cs.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_protos.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/cpm.h>
61
62 #include <libkern/img4/interface.h>
63 #include <libkern/section_keywords.h>
64 #include <sys/errno.h>
65
66 #include <machine/atomic.h>
67 #include <machine/thread.h>
68 #include <machine/lowglobals.h>
69
70 #include <arm/caches_internal.h>
71 #include <arm/cpu_data.h>
72 #include <arm/cpu_data_internal.h>
73 #include <arm/cpu_capabilities.h>
74 #include <arm/cpu_number.h>
75 #include <arm/machine_cpu.h>
76 #include <arm/misc_protos.h>
77 #include <arm/pmap/pmap_internal.h>
78 #include <arm/trap.h>
79
80 #if (__ARM_VMSA__ > 7)
81 #include <arm64/proc_reg.h>
82 #include <pexpert/arm64/boot.h>
83 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
84 #include <arm64/amcc_rorgn.h>
85 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
86 #endif
87
88 #include <pexpert/device_tree.h>
89
90 #include <san/kasan.h>
91 #include <sys/cdefs.h>
92
93 #if defined(HAS_APPLE_PAC)
94 #include <ptrauth.h>
95 #endif
96
97 #ifdef CONFIG_XNUPOST
98 #include <tests/xnupost.h>
99 #endif
100
101
102 #if HIBERNATION
103 #include <IOKit/IOHibernatePrivate.h>
104 #endif /* HIBERNATION */
105
106 #ifdef __ARM64_PMAP_SUBPAGE_L1__
107 #if (__ARM_VMSA__ <= 7)
108 #error This is not supported for old-style page tables
109 #endif
110 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
111 #else
112 #if (__ARM_VMSA__ <= 7)
113 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES * 2)
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 #endif
118
119 extern u_int32_t random(void); /* from <libkern/libkern.h> */
120
121 static bool alloc_asid(pmap_t pmap);
122 static void free_asid(pmap_t pmap);
123 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
124 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
125 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
126
127 static const struct page_table_ops native_pt_ops =
128 {
129 .alloc_id = alloc_asid,
130 .free_id = free_asid,
131 .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
132 .flush_tlb_async = flush_mmu_tlb_full_asid_async,
133 .wimg_to_pte = wimg_to_pte,
134 };
135
136 #if (__ARM_VMSA__ > 7)
137 const struct page_table_level_info pmap_table_level_info_16k[] =
138 {
139 [0] = {
140 .size = ARM_16K_TT_L0_SIZE,
141 .offmask = ARM_16K_TT_L0_OFFMASK,
142 .shift = ARM_16K_TT_L0_SHIFT,
143 .index_mask = ARM_16K_TT_L0_INDEX_MASK,
144 .valid_mask = ARM_TTE_VALID,
145 .type_mask = ARM_TTE_TYPE_MASK,
146 .type_block = ARM_TTE_TYPE_BLOCK
147 },
148 [1] = {
149 .size = ARM_16K_TT_L1_SIZE,
150 .offmask = ARM_16K_TT_L1_OFFMASK,
151 .shift = ARM_16K_TT_L1_SHIFT,
152 .index_mask = ARM_16K_TT_L1_INDEX_MASK,
153 .valid_mask = ARM_TTE_VALID,
154 .type_mask = ARM_TTE_TYPE_MASK,
155 .type_block = ARM_TTE_TYPE_BLOCK
156 },
157 [2] = {
158 .size = ARM_16K_TT_L2_SIZE,
159 .offmask = ARM_16K_TT_L2_OFFMASK,
160 .shift = ARM_16K_TT_L2_SHIFT,
161 .index_mask = ARM_16K_TT_L2_INDEX_MASK,
162 .valid_mask = ARM_TTE_VALID,
163 .type_mask = ARM_TTE_TYPE_MASK,
164 .type_block = ARM_TTE_TYPE_BLOCK
165 },
166 [3] = {
167 .size = ARM_16K_TT_L3_SIZE,
168 .offmask = ARM_16K_TT_L3_OFFMASK,
169 .shift = ARM_16K_TT_L3_SHIFT,
170 .index_mask = ARM_16K_TT_L3_INDEX_MASK,
171 .valid_mask = ARM_PTE_TYPE_VALID,
172 .type_mask = ARM_PTE_TYPE_MASK,
173 .type_block = ARM_TTE_TYPE_L3BLOCK
174 }
175 };
176
177 const struct page_table_level_info pmap_table_level_info_4k[] =
178 {
179 [0] = {
180 .size = ARM_4K_TT_L0_SIZE,
181 .offmask = ARM_4K_TT_L0_OFFMASK,
182 .shift = ARM_4K_TT_L0_SHIFT,
183 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
184 .valid_mask = ARM_TTE_VALID,
185 .type_mask = ARM_TTE_TYPE_MASK,
186 .type_block = ARM_TTE_TYPE_BLOCK
187 },
188 [1] = {
189 .size = ARM_4K_TT_L1_SIZE,
190 .offmask = ARM_4K_TT_L1_OFFMASK,
191 .shift = ARM_4K_TT_L1_SHIFT,
192 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
193 .valid_mask = ARM_TTE_VALID,
194 .type_mask = ARM_TTE_TYPE_MASK,
195 .type_block = ARM_TTE_TYPE_BLOCK
196 },
197 [2] = {
198 .size = ARM_4K_TT_L2_SIZE,
199 .offmask = ARM_4K_TT_L2_OFFMASK,
200 .shift = ARM_4K_TT_L2_SHIFT,
201 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
202 .valid_mask = ARM_TTE_VALID,
203 .type_mask = ARM_TTE_TYPE_MASK,
204 .type_block = ARM_TTE_TYPE_BLOCK
205 },
206 [3] = {
207 .size = ARM_4K_TT_L3_SIZE,
208 .offmask = ARM_4K_TT_L3_OFFMASK,
209 .shift = ARM_4K_TT_L3_SHIFT,
210 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
211 .valid_mask = ARM_PTE_TYPE_VALID,
212 .type_mask = ARM_PTE_TYPE_MASK,
213 .type_block = ARM_TTE_TYPE_L3BLOCK
214 }
215 };
216
217 const struct page_table_attr pmap_pt_attr_4k = {
218 .pta_level_info = pmap_table_level_info_4k,
219 .pta_root_level = (T0SZ_BOOT - 16) / 9,
220 #if __ARM_MIXED_PAGE_SIZE__
221 .pta_commpage_level = PMAP_TT_L2_LEVEL,
222 #else /* __ARM_MIXED_PAGE_SIZE__ */
223 #if __ARM_16K_PG__
224 .pta_commpage_level = PMAP_TT_L2_LEVEL,
225 #else /* __ARM_16K_PG__ */
226 .pta_commpage_level = PMAP_TT_L1_LEVEL,
227 #endif /* __ARM_16K_PG__ */
228 #endif /* __ARM_MIXED_PAGE_SIZE__ */
229 .pta_max_level = PMAP_TT_L3_LEVEL,
230 .pta_ops = &native_pt_ops,
231 .ap_ro = ARM_PTE_AP(AP_RORO),
232 .ap_rw = ARM_PTE_AP(AP_RWRW),
233 .ap_rona = ARM_PTE_AP(AP_RONA),
234 .ap_rwna = ARM_PTE_AP(AP_RWNA),
235 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
236 .ap_x = ARM_PTE_PNX,
237 #if __ARM_MIXED_PAGE_SIZE__
238 .pta_tcr_value = TCR_EL1_4KB,
239 #endif /* __ARM_MIXED_PAGE_SIZE__ */
240 .pta_page_size = 4096,
241 .pta_page_shift = 12,
242 };
243
244 const struct page_table_attr pmap_pt_attr_16k = {
245 .pta_level_info = pmap_table_level_info_16k,
246 .pta_root_level = PMAP_TT_L1_LEVEL,
247 .pta_commpage_level = PMAP_TT_L2_LEVEL,
248 .pta_max_level = PMAP_TT_L3_LEVEL,
249 .pta_ops = &native_pt_ops,
250 .ap_ro = ARM_PTE_AP(AP_RORO),
251 .ap_rw = ARM_PTE_AP(AP_RWRW),
252 .ap_rona = ARM_PTE_AP(AP_RONA),
253 .ap_rwna = ARM_PTE_AP(AP_RWNA),
254 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
255 .ap_x = ARM_PTE_PNX,
256 #if __ARM_MIXED_PAGE_SIZE__
257 .pta_tcr_value = TCR_EL1_16KB,
258 #endif /* __ARM_MIXED_PAGE_SIZE__ */
259 .pta_page_size = 16384,
260 .pta_page_shift = 14,
261 };
262
263 #if __ARM_16K_PG__
264 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
265 #else /* !__ARM_16K_PG__ */
266 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
267 #endif /* !__ARM_16K_PG__ */
268
269
270 #else /* (__ARM_VMSA__ > 7) */
271 /*
272 * We don't support pmap parameterization for VMSA7, so use an opaque
273 * page_table_attr structure.
274 */
275 const struct page_table_attr * const native_pt_attr = NULL;
276 #endif /* (__ARM_VMSA__ > 7) */
277
278
279 static inline void
280 pmap_sync_tlb(bool strong __unused)
281 {
282 sync_tlb_flush();
283 }
284
285 #if MACH_ASSERT
286 int vm_footprint_suspend_allowed = 1;
287
288 extern int pmap_ledgers_panic;
289 extern int pmap_ledgers_panic_leeway;
290
291 #endif /* MACH_ASSERT */
292
293 #if DEVELOPMENT || DEBUG
294 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
295 (current_thread()->pmap_footprint_suspended)
296 #else /* DEVELOPMENT || DEBUG */
297 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
298 #endif /* DEVELOPMENT || DEBUG */
299
300
301 #ifdef PLATFORM_BridgeOS
302 static struct pmap_legacy_trust_cache *pmap_legacy_trust_caches MARK_AS_PMAP_DATA = NULL;
303 #endif
304 static struct pmap_image4_trust_cache *pmap_image4_trust_caches MARK_AS_PMAP_DATA = NULL;
305
306 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_loaded_trust_caches_lock, 0);
307
308 SECURITY_READ_ONLY_LATE(int) srd_fused = 0;
309
310 /*
311  * Represents a TLB range that will be flushed before exiting
312  * the PPL.
313 * Used by phys_attribute_clear_range to defer flushing pages in
314 * this range until the end of the operation.
315 */
316 typedef struct pmap_tlb_flush_range {
317 pmap_t ptfr_pmap;
318 vm_map_address_t ptfr_start;
319 vm_map_address_t ptfr_end;
320 bool ptfr_flush_needed;
321 } pmap_tlb_flush_range_t;
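/*
 * Illustrative lifecycle (a sketch, not code from this file): a caller such as
 * phys_attribute_clear_range() would initialize a pmap_tlb_flush_range_t with the
 * target pmap, the [ptfr_start, ptfr_end) span, and ptfr_flush_needed = false;
 * helpers that actually modify PTEs within that span set ptfr_flush_needed instead
 * of flushing immediately, and a single ranged TLB invalidate is issued at the end
 * of the operation if the flag was set.
 */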
322
323 #if XNU_MONITOR
324 /*
325 * PPL External References.
326 */
327 extern vm_offset_t segPPLDATAB;
328 extern unsigned long segSizePPLDATA;
329 extern vm_offset_t segPPLTEXTB;
330 extern unsigned long segSizePPLTEXT;
331 extern vm_offset_t segPPLDATACONSTB;
332 extern unsigned long segSizePPLDATACONST;
333
334
335 /*
336 * PPL Global Variables
337 */
338
339 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
340 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
341 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
342 #else
343 const boolean_t pmap_ppl_disable = FALSE;
344 #endif
345
346 /*
347 * Indicates if the PPL has started applying APRR.
348 * This variable is accessed from various assembly trampolines, so be sure to change
349 * those if you change the size or layout of this variable.
350 */
351 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
352
353 extern void *pmap_stacks_start;
354 extern void *pmap_stacks_end;
355
356 #endif /* XNU_MONITOR */
357
358
359 /* Virtual memory region for early allocation */
360 #if (__ARM_VMSA__ == 7)
361 #define VREGION1_HIGH_WINDOW (0)
362 #else
363 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
364 #endif
365 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
366 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
367
368 extern uint8_t bootstrap_pagetables[];
369
370 extern unsigned int not_in_kdp;
371
372 extern vm_offset_t first_avail;
373
374 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
375 extern vm_offset_t virtual_space_end; /* End of kernel address space */
376 extern vm_offset_t static_memory_end;
377
378 extern const vm_map_address_t physmap_base;
379 extern const vm_map_address_t physmap_end;
380
381 extern int maxproc, hard_maxproc;
382
383 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
384 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
385
386 #if (__ARM_VMSA__ > 7)
387 /* The number of address bits one TTBR can cover. */
388 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
389
390 /*
391 * The bounds on our TTBRs. These are for sanity checking that
392 * an address is accessible by a TTBR before we attempt to map it.
393 */
394
395 /* The level of the root of a page table. */
396 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
397
398 /* The number of entries in the root TT of a page table. */
399 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
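/*
 * Worked example (illustrative; assumes a 39-bit VA space, i.e. T0SZ_BOOT == 25,
 * 4K pages so ARM_PGSHIFT == 12, and 8-byte entries so TTE_SHIFT == 3):
 *   PGTABLE_ADDR_BITS           = 64 - 25 = 39
 *   arm64_root_pgtable_level    = 3 - ((39 - 1 - 12) / (12 - 3)) = 3 - 2 = 1
 *   arm64_root_pgtable_num_ttes = 2 << ((39 - 1 - 12) % (12 - 3)) = 2 << 8 = 512
 * i.e. the root would be an L1 table with 512 entries.
 */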
400 #else
401 const uint64_t arm64_root_pgtable_level = 0;
402 const uint64_t arm64_root_pgtable_num_ttes = 0;
403 #endif
404
405 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
406 SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = &kernel_pmap_store;
407
408 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
409
410 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
411 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
412 unsigned int pmap_stamp MARK_AS_PMAP_DATA;
413 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
414
415 typedef struct tt_free_entry {
416 struct tt_free_entry *next;
417 } tt_free_entry_t;
418
419 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
420
421 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
422 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
423 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
424 #define FREE_PAGE_SIZE_TT_MAX 4
425 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
426 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
427 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
428 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
429 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
430 unsigned int free_tt_count MARK_AS_PMAP_DATA;
431 unsigned int free_tt_max MARK_AS_PMAP_DATA;
432
433 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
434
435 boolean_t pmap_gc_allowed MARK_AS_PMAP_DATA = TRUE;
436 boolean_t pmap_gc_forced MARK_AS_PMAP_DATA = FALSE;
437 boolean_t pmap_gc_allowed_by_time_throttle = TRUE;
438
439 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
440 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
441 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
442 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
443 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
444 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
445
446 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
447 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
448
449 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
450 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
451
452 /* Lock group used for all pmap object locks. */
453 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
454
455 #if DEVELOPMENT || DEBUG
456 int nx_enabled = 1; /* enable no-execute protection */
457 int allow_data_exec = 0; /* No apps may execute data */
458 int allow_stack_exec = 0; /* No apps may execute from the stack */
459 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
460 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
461 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
462 #else /* DEVELOPMENT || DEBUG */
463 const int nx_enabled = 1; /* enable no-execute protection */
464 const int allow_data_exec = 0; /* No apps may execute data */
465 const int allow_stack_exec = 0; /* No apps may execute from the stack */
466 #endif /* DEVELOPMENT || DEBUG */
467
468 /**
469 * This variable is set true during hibernation entry to protect pmap data structures
470 * during image copying, and reset false on hibernation exit.
471 */
472 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
473
474 #if MACH_ASSERT
475 static void pmap_check_ledgers(pmap_t pmap);
476 #else
477 static inline void
478 pmap_check_ledgers(__unused pmap_t pmap)
479 {
480 }
481 #endif /* MACH_ASSERT */
482
483 /**
484 * This helper function ensures that potentially-long-running batched PPL operations are
485 * called in preemptible context before entering the PPL, so that the PPL call may
486 * periodically exit to allow pending urgent ASTs to be taken.
487 */
488 static inline void
489 pmap_verify_preemptible(void)
490 {
491 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
492 }
493
494 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
495
496 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
497 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
498
499 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
500
501 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
502 #if defined(__arm64__)
503 # ifdef XNU_TARGET_OS_OSX
504 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
505 # else
506 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
507 # endif
508 #endif /* __arm64__ */
509
510 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
511 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
512 #else
513 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
514 #endif
515
516 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
517 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
518 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
519 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
520 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
521 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
522 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
523 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
524
525
526 #if (__ARM_VMSA__ > 7)
527 #if __ARM_MIXED_PAGE_SIZE__
528 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_4k;
529 #endif
530 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_default;
531 #endif
532
533 /* PTE Define Macros */
534
535 #define pte_is_wired(pte) \
536 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
537
538 #define pte_was_writeable(pte) \
539 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
540
541 #define pte_set_was_writeable(pte, was_writeable) \
542 do { \
543 if ((was_writeable)) { \
544 (pte) |= ARM_PTE_WRITEABLE; \
545 } else { \
546 (pte) &= ~ARM_PTE_WRITEABLE; \
547 } \
548 } while(0)
549
550 static inline void
551 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
552 {
553 if (wired) {
554 *ptep |= ARM_PTE_WIRED;
555 } else {
556 *ptep &= ~ARM_PTE_WIRED;
557 }
558 /*
559 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
560 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
561 * never reclaimed.
562 */
563 if (pmap == kernel_pmap) {
564 return;
565 }
566 unsigned short *ptd_wiredcnt_ptr;
567 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
568 if (wired) {
569 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
570 } else {
571 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
572 if (__improbable(prev_wired == 0)) {
573 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
574 }
575 }
576 }
577
578 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
579 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
580 pmap_sync_tlb(strong); \
581 }
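/*
 * Illustrative usage (a sketch, not a call site from this file): after changing
 * the PTEs that map [start, end) in a user pmap, a caller would issue
 *
 *     PMAP_UPDATE_TLBS(pmap, start, end, false, true);
 *
 * to flush only the last-level (leaf) TLB entries for that range without
 * requesting the stronger synchronization variant.
 */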
582
583 /*
584 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
585 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
586 * will observe the updated PTE.
587 */
588 #define FLUSH_PTE() \
589 __builtin_arm_dmb(DMB_ISH);
590
591 /*
592 * Synchronize updates to PTEs that were previously valid and thus may be cached in
593 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
594 * TLBI. This should only require a store-store barrier, as subsequent accesses in
595 * program order will not issue until the DSB completes. Prior loads may be reordered
596 * after the barrier, but their behavior should not be materially affected by the
597 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
598 * matter for loads until the access is re-driven well after the TLB update is
599 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
600 * we should be in a position to handle access faults. For "voluntary" PTE access
601 * restriction due to unmapping or protection, the decision to restrict access should
602 * have a data dependency on prior loads in order to avoid a data race.
603 */
604 #define FLUSH_PTE_STRONG() \
605 __builtin_arm_dsb(DSB_ISHST);
606
607 /**
608 * Write enough page table entries to map a single VM page. On systems where the
609 * VM page size does not match the hardware page size, multiple page table
610 * entries will need to be written.
611 *
612 * @note This function does not emit a barrier to ensure these page table writes
613  *       have completed before continuing. Such a barrier is commonly needed. In
614  *       the case where a DMB or DSB barrier is needed, use the write_pte() and
615  *       write_pte_strong() functions respectively instead of this one.
616 *
617 * @param ptep Pointer to the first page table entry to update.
618 * @param pte The value to write into each page table entry. In the case that
619 * multiple PTEs are updated to a non-empty value, then the address
620 * in this value will automatically be incremented for each PTE
621 * write.
622 */
623 static void
624 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
625 {
626 /**
627 	 * PAGE_SHIFT (and in turn, PAGE_RATIO) can vary at runtime on some
628 	 * systems, which is why it's checked at runtime instead of at compile time.
629 	 * The "unreachable" warning needs to be suppressed because it is still a
630 	 * compile-time constant on other systems.
631 */
632 __unreachable_ok_push
633 if (TEST_PAGE_RATIO_4) {
634 if (((uintptr_t)ptep) & 0x1f) {
635 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
636 __func__, ptep, (void*)pte);
637 }
638
639 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
640 /**
641 * If we're writing an empty/compressed PTE value, then don't
642 * auto-increment the address for each PTE write.
643 */
644 *ptep = pte;
645 *(ptep + 1) = pte;
646 *(ptep + 2) = pte;
647 *(ptep + 3) = pte;
648 } else {
649 *ptep = pte;
650 *(ptep + 1) = pte | 0x1000;
651 *(ptep + 2) = pte | 0x2000;
652 *(ptep + 3) = pte | 0x3000;
653 }
654 } else {
655 *ptep = pte;
656 }
657 __unreachable_ok_pop
658 }
659
660 /**
661 * Writes enough page table entries to map a single VM page and then ensures
662 * those writes complete by executing a Data Memory Barrier.
663 *
664 * @note The DMB issued by this function is not strong enough to protect against
665 * TLB invalidates from being reordered above the PTE writes. If a TLBI
666 * instruction is going to immediately be called after this write, it's
667 * recommended to call write_pte_strong() instead of this function.
668 *
669 * See the function header for write_pte_fast() for more details on the
670 * parameters.
671 */
672 void
673 write_pte(pt_entry_t *ptep, pt_entry_t pte)
674 {
675 write_pte_fast(ptep, pte);
676 FLUSH_PTE();
677 }
678
679 /**
680 * Writes enough page table entries to map a single VM page and then ensures
681 * those writes complete by executing a Data Synchronization Barrier. This
682 * barrier provides stronger guarantees than the DMB executed by write_pte().
683 *
684 * @note This function is useful if you're going to immediately flush the TLB
685 * after making the PTE write. A DSB is required to protect against the
686 * TLB invalidate being reordered before the PTE write.
687 *
688 * See the function header for write_pte_fast() for more details on the
689 * parameters.
690 */
691 static void
692 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
693 {
694 write_pte_fast(ptep, pte);
695 FLUSH_PTE_STRONG();
696 }
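/*
 * Summary of the PTE write variants above (guidance distilled from their comments):
 *   write_pte_fast()   - no barrier; the caller is responsible for synchronization.
 *   write_pte()        - DMB; sufficient when no TLB invalidate immediately follows.
 *   write_pte_strong() - DSB; required when a TLBI is issued right after the write.
 */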
697
698 /**
699 * Retrieve the pmap structure for the thread running on the current CPU.
700 */
701 pmap_t
702 current_pmap()
703 {
704 const pmap_t current = vm_map_pmap(current_thread()->map);
705
706 assert(current != NULL);
707
708 #if XNU_MONITOR
709 /**
710 * On PPL-enabled systems, it's important that PPL policy decisions aren't
711 * decided by kernel-writable memory. This function is used in various parts
712 * of the PPL, and besides validating that the pointer returned by this
713 * function is indeed a pmap structure, it's also important to ensure that
714 * it's actually the current thread's pmap. This is because different pmaps
715 * will have access to different entitlements based on the code signature of
716 * their loaded process. So if a different user pmap is set in the current
717 * thread structure (in an effort to bypass code signing restrictions), even
718 * though the structure would validate correctly as it is a real pmap
719 * structure, it should fail here.
720 *
721 * This only needs to occur for user pmaps because the kernel pmap's root
722 * page table is always the same as TTBR1 (it's set during bootstrap and not
723 * changed so it'd be redundant to check), and its code signing fields are
724 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
725 * it shouldn't be possible to set those fields. Due to that, an attacker
726 * setting the current thread's pmap to the kernel pmap as a way to bypass
727 * this check won't accomplish anything as it doesn't provide any extra code
728 * signing entitlements.
729 */
730 if ((current != kernel_pmap) &&
731 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
732 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
733 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
734 }
735 #endif /* XNU_MONITOR */
736
737 return current;
738 }
739
740 #if DEVELOPMENT || DEBUG
741
742 /*
743 * Trace levels are controlled by a bitmask in which each
744 * level can be enabled/disabled by the (1<<level) position
745 * in the boot arg
746 * Level 0: PPL extension functionality
747 * Level 1: pmap lifecycle (create/destroy/switch)
748 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
749 * Level 3: internal state management (attributes/fast-fault)
750 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
751 */
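/*
 * Example (illustrative): a trace mask of (1 << 1) | (1 << 2) == 0x6 enables only
 * the pmap lifecycle and mapping lifecycle trace levels; the mask is supplied via
 * a boot-arg (name not shown here) and stored in pmap_trace_mask below.
 */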
752
753 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
754
755 #define PMAP_TRACE(level, ...) \
756 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
757 KDBG_RELEASE(__VA_ARGS__); \
758 }
759 #else /* DEVELOPMENT || DEBUG */
760
761 #define PMAP_TRACE(level, ...)
762
763 #endif /* DEVELOPMENT || DEBUG */
764
765
766 /*
767 * Internal function prototypes (forward declarations).
768 */
769
770 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
771
772 static void pmap_set_reference(ppnum_t pn);
773
774 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
775
776 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
777
778 static kern_return_t pmap_expand(
779 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
780
781 static int pmap_remove_range(
782 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
783
784 static tt_entry_t *pmap_tt1_allocate(
785 pmap_t, vm_size_t, unsigned int);
786
787 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
788
789 static void pmap_tt1_deallocate(
790 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
791
792 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
793
794 static kern_return_t pmap_tt_allocate(
795 pmap_t, tt_entry_t **, unsigned int, unsigned int);
796
797 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
798
799 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
800 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
801 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
802
803 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
804
805 #if (__ARM_VMSA__ > 7)
806
807 static void pmap_unmap_sharedpage(
808 pmap_t pmap);
809
810 static boolean_t
811 pmap_is_64bit(pmap_t);
812
813
814 #endif /* (__ARM_VMSA__ > 7) */
815
816 static void pmap_update_cache_attributes_locked(
817 ppnum_t, unsigned);
818
819 static boolean_t arm_clear_fast_fault(
820 ppnum_t ppnum,
821 vm_prot_t fault_type,
822 pt_entry_t *pte_p);
823
824 static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes);
825
826 static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
827
828 static void pmap_trim_self(pmap_t pmap);
829 static void pmap_trim_subord(pmap_t subord);
830
831
832 /*
833 * Temporary prototypes, while we wait for pmap_enter to move to taking an
834 * address instead of a page number.
835 */
836 static kern_return_t
837 pmap_enter_addr(
838 pmap_t pmap,
839 vm_map_address_t v,
840 pmap_paddr_t pa,
841 vm_prot_t prot,
842 vm_prot_t fault_type,
843 unsigned int flags,
844 boolean_t wired);
845
846 kern_return_t
847 pmap_enter_options_addr(
848 pmap_t pmap,
849 vm_map_address_t v,
850 pmap_paddr_t pa,
851 vm_prot_t prot,
852 vm_prot_t fault_type,
853 unsigned int flags,
854 boolean_t wired,
855 unsigned int options,
856 __unused void *arg);
857
858 #ifdef CONFIG_XNUPOST
859 kern_return_t pmap_test(void);
860 #endif /* CONFIG_XNUPOST */
861
862 PMAP_SUPPORT_PROTOTYPES(
863 kern_return_t,
864 arm_fast_fault, (pmap_t pmap,
865 vm_map_address_t va,
866 vm_prot_t fault_type,
867 bool was_af_fault,
868 bool from_user), ARM_FAST_FAULT_INDEX);
869
870 PMAP_SUPPORT_PROTOTYPES(
871 boolean_t,
872 arm_force_fast_fault, (ppnum_t ppnum,
873 vm_prot_t allow_mode,
874 int options), ARM_FORCE_FAST_FAULT_INDEX);
875
876 MARK_AS_PMAP_TEXT static boolean_t
877 arm_force_fast_fault_with_flush_range(
878 ppnum_t ppnum,
879 vm_prot_t allow_mode,
880 int options,
881 pmap_tlb_flush_range_t *flush_range);
882
883 PMAP_SUPPORT_PROTOTYPES(
884 boolean_t,
885 pmap_batch_set_cache_attributes, (ppnum_t pn,
886 unsigned int cacheattr,
887 unsigned int page_cnt,
888 unsigned int page_index,
889 boolean_t doit,
890 unsigned int *res), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
891
892 PMAP_SUPPORT_PROTOTYPES(
893 void,
894 pmap_change_wiring, (pmap_t pmap,
895 vm_map_address_t v,
896 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
897
898 PMAP_SUPPORT_PROTOTYPES(
899 pmap_t,
900 pmap_create_options, (ledger_t ledger,
901 vm_map_size_t size,
902 unsigned int flags,
903 kern_return_t * kr), PMAP_CREATE_INDEX);
904
905 PMAP_SUPPORT_PROTOTYPES(
906 void,
907 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
908
909 PMAP_SUPPORT_PROTOTYPES(
910 kern_return_t,
911 pmap_enter_options, (pmap_t pmap,
912 vm_map_address_t v,
913 pmap_paddr_t pa,
914 vm_prot_t prot,
915 vm_prot_t fault_type,
916 unsigned int flags,
917 boolean_t wired,
918 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
919
920 PMAP_SUPPORT_PROTOTYPES(
921 pmap_paddr_t,
922 pmap_find_pa, (pmap_t pmap,
923 addr64_t va), PMAP_FIND_PA_INDEX);
924
925 #if (__ARM_VMSA__ > 7)
926 PMAP_SUPPORT_PROTOTYPES(
927 kern_return_t,
928 pmap_insert_sharedpage, (pmap_t pmap), PMAP_INSERT_SHAREDPAGE_INDEX);
929 #endif
930
931
932 PMAP_SUPPORT_PROTOTYPES(
933 boolean_t,
934 pmap_is_empty, (pmap_t pmap,
935 vm_map_offset_t va_start,
936 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
937
938
939 PMAP_SUPPORT_PROTOTYPES(
940 unsigned int,
941 pmap_map_cpu_windows_copy, (ppnum_t pn,
942 vm_prot_t prot,
943 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
944
945 PMAP_SUPPORT_PROTOTYPES(
946 void,
947 pmap_ro_zone_memcpy, (zone_id_t zid,
948 vm_offset_t va,
949 vm_offset_t offset,
950 const vm_offset_t new_data,
951 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
952
953 PMAP_SUPPORT_PROTOTYPES(
954 uint64_t,
955 pmap_ro_zone_atomic_op, (zone_id_t zid,
956 vm_offset_t va,
957 vm_offset_t offset,
958 zro_atomic_op_t op,
959 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
960
961 PMAP_SUPPORT_PROTOTYPES(
962 void,
963 pmap_ro_zone_bzero, (zone_id_t zid,
964 vm_offset_t va,
965 vm_offset_t offset,
966 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
967
968 PMAP_SUPPORT_PROTOTYPES(
969 vm_map_offset_t,
970 pmap_nest, (pmap_t grand,
971 pmap_t subord,
972 addr64_t vstart,
973 uint64_t size,
974 vm_map_offset_t vrestart,
975 kern_return_t * krp), PMAP_NEST_INDEX);
976
977 PMAP_SUPPORT_PROTOTYPES(
978 void,
979 pmap_page_protect_options, (ppnum_t ppnum,
980 vm_prot_t prot,
981 unsigned int options,
982 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
983
984 PMAP_SUPPORT_PROTOTYPES(
985 vm_map_address_t,
986 pmap_protect_options, (pmap_t pmap,
987 vm_map_address_t start,
988 vm_map_address_t end,
989 vm_prot_t prot,
990 unsigned int options,
991 void *args), PMAP_PROTECT_OPTIONS_INDEX);
992
993 PMAP_SUPPORT_PROTOTYPES(
994 kern_return_t,
995 pmap_query_page_info, (pmap_t pmap,
996 vm_map_offset_t va,
997 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
998
999 PMAP_SUPPORT_PROTOTYPES(
1000 mach_vm_size_t,
1001 pmap_query_resident, (pmap_t pmap,
1002 vm_map_address_t start,
1003 vm_map_address_t end,
1004 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1005
1006 PMAP_SUPPORT_PROTOTYPES(
1007 void,
1008 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1009
1010 PMAP_SUPPORT_PROTOTYPES(
1011 vm_map_address_t,
1012 pmap_remove_options, (pmap_t pmap,
1013 vm_map_address_t start,
1014 vm_map_address_t end,
1015 int options), PMAP_REMOVE_OPTIONS_INDEX);
1016
1017
1018 PMAP_SUPPORT_PROTOTYPES(
1019 void,
1020 pmap_set_cache_attributes, (ppnum_t pn,
1021 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1022
1023 PMAP_SUPPORT_PROTOTYPES(
1024 void,
1025 pmap_update_compressor_page, (ppnum_t pn,
1026 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1027
1028 PMAP_SUPPORT_PROTOTYPES(
1029 void,
1030 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1031
1032 #if MACH_ASSERT || XNU_MONITOR
1033 PMAP_SUPPORT_PROTOTYPES(
1034 void,
1035 pmap_set_process, (pmap_t pmap,
1036 int pid,
1037 char *procname), PMAP_SET_PROCESS_INDEX);
1038 #endif
1039
1040 PMAP_SUPPORT_PROTOTYPES(
1041 void,
1042 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1043
1044 PMAP_SUPPORT_PROTOTYPES(
1045 vm_map_offset_t,
1046 pmap_unnest_options, (pmap_t grand,
1047 addr64_t vaddr,
1048 uint64_t size,
1049 vm_map_offset_t vrestart,
1050 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1051
1052 PMAP_SUPPORT_PROTOTYPES(
1053 void,
1054 phys_attribute_set, (ppnum_t pn,
1055 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1056
1057 PMAP_SUPPORT_PROTOTYPES(
1058 void,
1059 phys_attribute_clear, (ppnum_t pn,
1060 unsigned int bits,
1061 int options,
1062 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1063
1064 #if __ARM_RANGE_TLBI__
1065 PMAP_SUPPORT_PROTOTYPES(
1066 vm_map_address_t,
1067 phys_attribute_clear_range, (pmap_t pmap,
1068 vm_map_address_t start,
1069 vm_map_address_t end,
1070 unsigned int bits,
1071 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1072 #endif /* __ARM_RANGE_TLBI__ */
1073
1074
1075 PMAP_SUPPORT_PROTOTYPES(
1076 void,
1077 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1078
1079 PMAP_SUPPORT_PROTOTYPES(
1080 void,
1081 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1082
1083 PMAP_SUPPORT_PROTOTYPES(
1084 void,
1085 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1086
1087 PMAP_SUPPORT_PROTOTYPES(
1088 void,
1089 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1090
1091 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1092 PMAP_SUPPORT_PROTOTYPES(
1093 void,
1094 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1095 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1096
1097 PMAP_SUPPORT_PROTOTYPES(
1098 void,
1099 pmap_trim, (pmap_t grand,
1100 pmap_t subord,
1101 addr64_t vstart,
1102 uint64_t size), PMAP_TRIM_INDEX);
1103
1104 #if HAS_APPLE_PAC
1105 PMAP_SUPPORT_PROTOTYPES(
1106 void *,
1107 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1108 PMAP_SUPPORT_PROTOTYPES(
1109 void *,
1110 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1111 #endif /* HAS_APPLE_PAC */
1112
1113
1114
1115
1116 PMAP_SUPPORT_PROTOTYPES(
1117 bool,
1118 pmap_is_trust_cache_loaded, (const uuid_t uuid), PMAP_IS_TRUST_CACHE_LOADED_INDEX);
1119
1120 PMAP_SUPPORT_PROTOTYPES(
1121 uint32_t,
1122 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1123
1124 PMAP_SUPPORT_PROTOTYPES(
1125 bool,
1126 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1127
1128 PMAP_SUPPORT_PROTOTYPES(
1129 void,
1130 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1131 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1132
1133 PMAP_SUPPORT_PROTOTYPES(
1134 bool,
1135 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1136 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1137
1138 PMAP_SUPPORT_PROTOTYPES(
1139 void,
1140 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE]),
1141 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1142
1143 PMAP_SUPPORT_PROTOTYPES(
1144 void,
1145 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1146 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1147
1148 PMAP_SUPPORT_PROTOTYPES(
1149 void,
1150 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1151
1152 void pmap_footprint_suspend(vm_map_t map,
1153 boolean_t suspend);
1154 PMAP_SUPPORT_PROTOTYPES(
1155 void,
1156 pmap_footprint_suspend, (vm_map_t map,
1157 boolean_t suspend),
1158 PMAP_FOOTPRINT_SUSPEND_INDEX);
1159
1160
1161
1162
1163 #if DEVELOPMENT || DEBUG
1164 PMAP_SUPPORT_PROTOTYPES(
1165 kern_return_t,
1166 pmap_test_text_corruption, (pmap_paddr_t),
1167 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1168 #endif /* DEVELOPMENT || DEBUG */
1169
1170 #if (__ARM_VMSA__ > 7)
1171 /*
1172 * The low global vector page is mapped at a fixed alias.
1173  * Since the page size is 16k for H8 and newer, we map the globals to a
1174  * 16k-aligned address. Readers of the globals (e.g. lldb, panic server) need
1175 * to check both addresses anyway for backward compatibility. So for now
1176 * we leave H6 and H7 where they were.
1177 */
1178 #if (ARM_PGSHIFT == 14)
1179 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1180 #else
1181 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1182 #endif
1183
1184 #else
1185 #define LOWGLOBAL_ALIAS (0xFFFF1000)
1186 #endif
1187
1188 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1189 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1190 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1191
1192 #if XNU_MONITOR
1193
1194 #if __has_feature(ptrauth_calls)
1195 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1196 #else
1197 #define __ptrauth_ppl_handler
1198 #endif
1199
1200 /*
1201 * Table of function pointers used for PPL dispatch.
1202 */
1203 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1204 [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1205 [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1206 [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1207 [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1208 [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1209 [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1210 [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1211 [PMAP_CREATE_INDEX] = pmap_create_options_internal,
1212 [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1213 [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1214 [PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1215 [PMAP_INSERT_SHAREDPAGE_INDEX] = pmap_insert_sharedpage_internal,
1216 [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1217 [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1218 [PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1219 [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1220 [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1221 [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1222 [PMAP_NEST_INDEX] = pmap_nest_internal,
1223 [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1224 [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1225 [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1226 [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1227 [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1228 [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1229 [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1230 [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1231 [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1232 [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1233 [PMAP_SWITCH_INDEX] = pmap_switch_internal,
1234 [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1235 [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1236 [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1237 [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1238 [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1239 [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1240 [PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1241 [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1242 [PMAP_IS_TRUST_CACHE_LOADED_INDEX] = pmap_is_trust_cache_loaded_internal,
1243 [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1244 [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1245 [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1246 [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1247 [PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1248 [PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1249 [PMAP_TRIM_INDEX] = pmap_trim_internal,
1250 [PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1251 [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1252 [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1253 #if HAS_APPLE_PAC
1254 [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1255 [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1256 #endif /* HAS_APPLE_PAC */
1257 #if __ARM_RANGE_TLBI__
1258 [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1259 #endif /* __ARM_RANGE_TLBI__ */
1260 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1261 [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1262 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1263 [PMAP_NOP_INDEX] = pmap_nop_internal,
1264
1265 #if DEVELOPMENT || DEBUG
1266 [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1267 #endif /* DEVELOPMENT || DEBUG */
1268 };
1269 #endif
1270
1271 #if XNU_MONITOR
1272 /**
1273 * A convenience function for setting protections on a single physical
1274 * aperture or static region mapping without invalidating the TLB.
1275 *
1276 * @note This function does not perform any TLB invalidations. That must be done
1277 * separately to be able to safely use the updated mapping.
1278 *
1279 * @note This function understands the difference between the VM page size and
1280 * the kernel page size and will update multiple PTEs if the sizes differ.
1281 * In other words, enough PTEs will always get updated to change the
1282 * permissions on a PAGE_SIZE amount of memory.
1283 *
1284 * @note The PVH lock for the physical page represented by this mapping must
1285 * already be locked.
1286 *
1287 * @note This function assumes the caller has already verified that the PTE
1288 * pointer does indeed point to a physical aperture or static region page
1289 * table. Please validate your inputs before passing it along to this
1290 * function.
1291 *
1292 * @param ptep Pointer to the physical aperture or static region page table to
1293 * update with a new XPRR index.
1294 * @param expected_perm The XPRR index that is expected to already exist at the
1295 * current mapping. If the current index doesn't match this
1296 * then the system will panic.
1297 * @param new_perm The new XPRR index to update the mapping with.
1298 */
1299 MARK_AS_PMAP_TEXT static void
1300 pmap_set_pte_xprr_perm(
1301 pt_entry_t * const ptep,
1302 unsigned int expected_perm,
1303 unsigned int new_perm)
1304 {
1305 assert(ptep != NULL);
1306
1307 pt_entry_t spte = *ptep;
1308 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1309
1310 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1311 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1312 __func__, ptep, new_perm, expected_perm);
1313 }
1314
1315 /**
1316 * The PTE involved should be valid, should not have the hint bit set, and
1317 * should have the expected XPRR index.
1318 */
1319 if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1320 panic_plain("%s: physical aperture or static region PTE is invalid, "
1321 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1322 __func__, ptep, spte, new_perm, expected_perm);
1323 }
1324
1325 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1326 panic_plain("%s: physical aperture or static region PTE has hint bit "
1327 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1328 __func__, ptep, spte, new_perm, expected_perm);
1329 }
1330
1331 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1332 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1333 "ptep=%p, new_perm=%u, expected_perm=%u",
1334 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1335 }
1336
1337 pt_entry_t template = spte;
1338 template &= ~ARM_PTE_XPRR_MASK;
1339 template |= xprr_perm_to_pte(new_perm);
1340
1341 write_pte_strong(ptep, template);
1342 }
1343
1344 /**
1345 * Update the protections on a single physical aperture mapping and invalidate
1346 * the TLB so the mapping can be used.
1347 *
1348 * @note The PVH lock for the physical page must already be locked.
1349 *
1350 * @param pai The physical address index of the page whose physical aperture
1351 * mapping will be updated with new permissions.
1352 * @param expected_perm The XPRR index that is expected to already exist at the
1353 * current mapping. If the current index doesn't match this
1354 * then the system will panic.
1355 * @param new_perm The new XPRR index to update the mapping with.
1356 */
1357 MARK_AS_PMAP_TEXT void
1358 pmap_set_xprr_perm(
1359 unsigned int pai,
1360 unsigned int expected_perm,
1361 unsigned int new_perm)
1362 {
1363 pvh_assert_locked(pai);
1364
1365 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1366 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1367
1368 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1369
1370 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1371 sync_tlb_flush();
1372 }
1373
1374 /**
1375 * Update the protections on a range of physical aperture or static region
1376 * mappings and invalidate the TLB so the mappings can be used.
1377 *
1378 * @note Static region mappings can only be updated before machine_lockdown().
1379 * Physical aperture mappings can be updated at any time.
1380 *
1381 * @param start The starting virtual address of the static region or physical
1382 * aperture range whose permissions will be updated.
1383 * @param end The final (inclusive) virtual address of the static region or
1384 * physical aperture range whose permissions will be updated.
1385 * @param expected_perm The XPRR index that is expected to already exist at the
1386 * current mappings. If the current indices don't match
1387 * this then the system will panic.
1388 * @param new_perm The new XPRR index to update the mappings with.
1389 */
1390 MARK_AS_PMAP_TEXT static void
1391 pmap_set_range_xprr_perm(
1392 vm_address_t start,
1393 vm_address_t end,
1394 unsigned int expected_perm,
1395 unsigned int new_perm)
1396 {
1397 #if (__ARM_VMSA__ == 7)
1398 #error This function is not supported on older ARM hardware.
1399 #endif /* (__ARM_VMSA__ == 7) */
1400
1401 /**
1402 * Validate our arguments; any invalid argument will be grounds for a panic.
1403 */
1404 if (__improbable((start | end) & ARM_PGMASK)) {
1405 panic_plain("%s: start or end not page aligned, "
1406 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1407 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1408 }
1409
1410 if (__improbable(start > end)) {
1411 panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
1412 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1413 }
1414
1415 const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
1416 const bool in_static = (start >= gVirtBase) && (end < static_memory_end);
1417
1418 if (__improbable(!(in_physmap || in_static))) {
1419 panic_plain("%s: address not in static region or physical aperture, "
1420 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1421 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1422 }
1423
1424 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1425 panic_plain("%s: invalid XPRR index, "
1426 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1427 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1428 }
1429
1430 /*
1431 * Walk over the PTEs for the given range, and set the protections on those
1432 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
1433 * one twig entry (whichever twig entry currently maps "va").
1434 */
1435 vm_address_t va = start;
1436 while (va < end) {
1437 /**
1438 * Get the last VA that the twig entry for "va" maps. All of the leaf
1439 * PTEs from va to tte_va_end will have their permissions updated.
1440 */
1441 vm_address_t tte_va_end =
1442 (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);
1443
1444 if (tte_va_end > end) {
1445 tte_va_end = end;
1446 }
1447
1448 tt_entry_t *ttep = pmap_tte(kernel_pmap, va);
1449
1450 if (ttep == NULL) {
1451 panic_plain("%s: physical aperture or static region tte is NULL, "
1452 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1453 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1454 }
1455
1456 tt_entry_t tte = *ttep;
1457
1458 if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1459 panic_plain("%s: tte=0x%llx is not a table type entry, "
1460 "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
1461 tte, (void *)start, (void *)end, new_perm, expected_perm);
1462 }
1463
1464 /* Walk over the given L3 page table page and update the PTEs. */
1465 pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
1466 pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
1467 const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
1468 pt_entry_t * const end_ptep = begin_ptep + num_ptes;
1469
1470 /**
1471 * The current PTE pointer is incremented by the page ratio (ratio of
1472 * VM page size to kernel hardware page size) because one call to
1473 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
1474 * a PAGE_SIZE worth of hardware pages.
1475 */
1476 for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
1477 cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
1478 unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
1479 pvh_lock(pai);
1480 pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
1481 pvh_unlock(pai);
1482 }
1483
1484 va = tte_va_end;
1485 }
1486
1487 PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
1488 }
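/*
 * Illustrative usage (a sketch; the permission index names are placeholders, not
 * taken from this file): during PPL lockdown, the PPL's own data segment could be
 * handed over to the PPL with a call shaped like
 *
 *     pmap_set_range_xprr_perm(segPPLDATAB, segPPLDATAB + segSizePPLDATA,
 *                              <kernel-RW XPRR index>, <PPL-RW XPRR index>);
 *
 * The required TLB flush is already performed internally via PMAP_UPDATE_TLBS().
 */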
1489
1490 #endif /* XNU_MONITOR */
1491
1492 static inline void
1493 PMAP_ZINFO_PALLOC(
1494 pmap_t pmap, int bytes)
1495 {
1496 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1497 }
1498
1499 static inline void
1500 PMAP_ZINFO_PFREE(
1501 pmap_t pmap,
1502 int bytes)
1503 {
1504 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1505 }
1506
1507 void
1508 pmap_tt_ledger_credit(
1509 pmap_t pmap,
1510 vm_size_t size)
1511 {
1512 if (pmap != kernel_pmap) {
1513 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1514 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1515 }
1516 }
1517
1518 void
1519 pmap_tt_ledger_debit(
1520 pmap_t pmap,
1521 vm_size_t size)
1522 {
1523 if (pmap != kernel_pmap) {
1524 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1525 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1526 }
1527 }
1528
1529 static inline void
1530 pmap_update_plru(uint16_t asid_index)
1531 {
1532 if (__probable(pmap_asid_plru)) {
1533 unsigned plru_index = asid_index >> 6;
1534 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1535 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1536 asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1537 }
1538 }
1539 }
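/*
 * Example (illustrative arithmetic): for asid_index == 150, the word index is
 * 150 >> 6 == 2 and the bit within that word is 150 & 63 == 22; clearing that bit
 * marks ASID 150 as recently used. When a word's bits are all cleared, it is
 * refilled and its generation count bumped, so alloc_asid() prefers words with the
 * lowest generation (i.e. those least recently exhausted).
 */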
1540
1541 static bool
1542 alloc_asid(pmap_t pmap)
1543 {
1544 int vasid = -1;
1545 uint16_t hw_asid;
1546
1547 pmap_simple_lock(&asid_lock);
1548
1549 if (__probable(pmap_asid_plru)) {
1550 unsigned plru_index = 0;
1551 uint64_t lowest_gen = asid_plru_generation[0];
1552 uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1553 for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1554 if (asid_plru_generation[i] < lowest_gen) {
1555 plru_index = i;
1556 lowest_gen = asid_plru_generation[i];
1557 lowest_gen_bitmap = asid_plru_bitmap[i];
1558 }
1559 }
1560
1561 for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
1562 uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1563 if (temp_plru) {
1564 vasid = (plru_index << 6) + lsb_first(temp_plru);
1565 #if DEVELOPMENT || DEBUG
1566 ++pmap_asid_hits;
1567 #endif
1568 break;
1569 }
1570 }
1571 }
1572 if (__improbable(vasid < 0)) {
1573 // bitmap_first() returns highest-order bits first, but a 0-based scheme works
1574 // slightly better with the collision detection scheme used by pmap_switch_internal().
1575 vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1576 #if DEVELOPMENT || DEBUG
1577 ++pmap_asid_misses;
1578 #endif
1579 }
1580 if (__improbable(vasid < 0)) {
1581 pmap_simple_unlock(&asid_lock);
1582 return false;
1583 }
1584 assert((uint32_t)vasid < pmap_max_asids);
1585 assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1586 bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1587 pmap_simple_unlock(&asid_lock);
1588 hw_asid = (uint16_t)(vasid % asid_chunk_size);
1589 pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
1590 if (__improbable(hw_asid == MAX_HW_ASIDS)) {
1591 /* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
1592 * reassign to a reserved VASID. */
1593 assert(pmap->sw_asid < UINT8_MAX);
1594 pmap->sw_asid = UINT8_MAX;
1595 /* Allocate from the high end of the hardware ASID range to reduce the likelihood of
1596 * aliasing with vital system processes, which are likely to have lower ASIDs. */
1597 hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
1598 assert(hw_asid < MAX_HW_ASIDS);
1599 }
1600 pmap_update_plru(hw_asid);
1601 hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
1602 #if __ARM_KERNEL_PROTECT__
1603 hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
1604 #endif
1605 pmap->hw_asid = hw_asid;
1606 return true;
1607 }
1608
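/*
 * Release a pmap's ASID: reconstruct the VASID from the hardware ASID and the
 * software epoch, mark the hardware ASID as available again in the pseudo-LRU
 * bitmap, and set the VASID bit in the global allocation bitmap.
 */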
1609 static void
1610 free_asid(pmap_t pmap)
1611 {
1612 unsigned int vasid;
1613 uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
1614 if (__improbable(hw_asid == 0)) {
1615 return;
1616 }
1617
1618 #if __ARM_KERNEL_PROTECT__
1619 hw_asid >>= 1;
1620 #endif
1621 hw_asid -= 1;
1622
1623 if (__improbable(pmap->sw_asid == UINT8_MAX)) {
1624 vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
1625 } else {
1626 vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
1627 }
1628
1629 if (__probable(pmap_asid_plru)) {
1630 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1631 }
1632 pmap_simple_lock(&asid_lock);
1633 assert(!bitmap_test(&asid_bitmap[0], vasid));
1634 bitmap_set(&asid_bitmap[0], vasid);
1635 pmap_simple_unlock(&asid_lock);
1636 }
1637
1638
1639 boolean_t
1640 pmap_valid_address(
1641 pmap_paddr_t addr)
1642 {
1643 return pa_valid(addr);
1644 }
1645
1646
1647
1648
1649
1650
1651 /*
1652 * Map memory at initialization. The physical addresses being
1653 * mapped are not managed and are never unmapped.
1654 *
1655 * For now, VM is already on, we only need to map the
1656 * specified memory.
1657 */
1658 vm_map_address_t
1659 pmap_map(
1660 vm_map_address_t virt,
1661 vm_offset_t start,
1662 vm_offset_t end,
1663 vm_prot_t prot,
1664 unsigned int flags)
1665 {
1666 kern_return_t kr;
1667 vm_size_t ps;
1668
1669 ps = PAGE_SIZE;
1670 while (start < end) {
1671 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1672 prot, VM_PROT_NONE, flags, FALSE);
1673
1674 if (kr != KERN_SUCCESS) {
1675 panic("%s: failed pmap_enter, "
1676 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1677 __FUNCTION__,
1678 (void *) virt, (void *) start, (void *) end, prot, flags);
1679 }
1680
1681 virt += ps;
1682 start += ps;
1683 }
1684 return virt;
1685 }
1686
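/*
 * Variant of pmap_map_bd() that lets the caller choose the memory attributes
 * for the mapping (write-combined, posted, posted-reordered, etc.) via the
 * PMAP_MAP_BD_* options. The PTEs are written directly into the kernel page
 * tables, so these mappings are not tracked in the PV lists.
 */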
1687 vm_map_address_t
1688 pmap_map_bd_with_options(
1689 vm_map_address_t virt,
1690 vm_offset_t start,
1691 vm_offset_t end,
1692 vm_prot_t prot,
1693 int32_t options)
1694 {
1695 pt_entry_t tmplate;
1696 pt_entry_t *ptep;
1697 vm_map_address_t vaddr;
1698 vm_offset_t paddr;
1699 pt_entry_t mem_attr;
1700
1701 switch (options & PMAP_MAP_BD_MASK) {
1702 case PMAP_MAP_BD_WCOMB:
1703 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1704 #if (__ARM_VMSA__ > 7)
1705 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1706 #else
1707 mem_attr |= ARM_PTE_SH;
1708 #endif
1709 break;
1710 case PMAP_MAP_BD_POSTED:
1711 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1712 break;
1713 case PMAP_MAP_BD_POSTED_REORDERED:
1714 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1715 break;
1716 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1717 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1718 break;
1719 default:
1720 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1721 break;
1722 }
1723
1724 tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1725 mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1726 #if __ARM_KERNEL_PROTECT__
1727 tmplate |= ARM_PTE_NG;
1728 #endif /* __ARM_KERNEL_PROTECT__ */
1729
1730 vaddr = virt;
1731 paddr = start;
1732 while (paddr < end) {
1733 ptep = pmap_pte(kernel_pmap, vaddr);
1734 if (ptep == PT_ENTRY_NULL) {
1735 panic("%s: no PTE for vaddr=%p, "
1736 "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
1737 __FUNCTION__, (void*)vaddr,
1738 (void*)virt, (void*)start, (void*)end, prot, options);
1739 }
1740
1741 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1742 write_pte_strong(ptep, tmplate);
1743
1744 pte_increment_pa(tmplate);
1745 vaddr += PAGE_SIZE;
1746 paddr += PAGE_SIZE;
1747 }
1748
1749 if (end >= start) {
1750 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1751 }
1752
1753 return vaddr;
1754 }
1755
1756 /*
1757 * Back-door routine for mapping kernel VM at initialization.
1758 * Useful for mapping memory outside the range
1759 * [vm_first_phys, vm_last_phys] (i.e., devices).
1760 * Otherwise like pmap_map.
1761 */
1762 vm_map_address_t
1763 pmap_map_bd(
1764 vm_map_address_t virt,
1765 vm_offset_t start,
1766 vm_offset_t end,
1767 vm_prot_t prot)
1768 {
1769 pt_entry_t tmplate;
1770 pt_entry_t *ptep;
1771 vm_map_address_t vaddr;
1772 vm_offset_t paddr;
1773
1774 /* not cacheable and not buffered */
1775 tmplate = pa_to_pte(start)
1776 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1777 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1778 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1779 #if __ARM_KERNEL_PROTECT__
1780 tmplate |= ARM_PTE_NG;
1781 #endif /* __ARM_KERNEL_PROTECT__ */
1782
1783 vaddr = virt;
1784 paddr = start;
1785 while (paddr < end) {
1786 ptep = pmap_pte(kernel_pmap, vaddr);
1787 if (ptep == PT_ENTRY_NULL) {
1788 panic("pmap_map_bd");
1789 }
1790 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1791 write_pte_strong(ptep, tmplate);
1792
1793 pte_increment_pa(tmplate);
1794 vaddr += PAGE_SIZE;
1795 paddr += PAGE_SIZE;
1796 }
1797
1798 if (end >= start) {
1799 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1800 }
1801
1802 return vaddr;
1803 }
1804
1805 /*
1806 * Back-door routine for mapping kernel VM at initialization.
1807 * Useful for mapping memory specific physical addresses in early
1808 * boot (i.e., before kernel_map is initialized).
1809 *
1810 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1811 */
1812
1813 vm_map_address_t
1814 pmap_map_high_window_bd(
1815 vm_offset_t pa_start,
1816 vm_size_t len,
1817 vm_prot_t prot)
1818 {
1819 pt_entry_t *ptep, pte;
1820 #if (__ARM_VMSA__ == 7)
1821 vm_map_address_t va_start = VM_HIGH_KERNEL_WINDOW;
1822 vm_map_address_t va_max = VM_MAX_KERNEL_ADDRESS;
1823 #else
1824 vm_map_address_t va_start = VREGION1_START;
1825 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1826 #endif
1827 vm_map_address_t va_end;
1828 vm_map_address_t va;
1829 vm_size_t offset;
1830
1831 offset = pa_start & PAGE_MASK;
1832 pa_start -= offset;
1833 len += offset;
1834
1835 if (len > (va_max - va_start)) {
1836 panic("%s: area too large, "
1837 "pa_start=%p, len=%p, prot=0x%x",
1838 __FUNCTION__,
1839 (void*)pa_start, (void*)len, prot);
1840 }
1841
1842 scan:
1843 for (; va_start < va_max; va_start += PAGE_SIZE) {
1844 ptep = pmap_pte(kernel_pmap, va_start);
1845 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1846 if (*ptep == ARM_PTE_TYPE_FAULT) {
1847 break;
1848 }
1849 }
1850 if (va_start > va_max) {
1851 panic("%s: insufficient pages, "
1852 "pa_start=%p, len=%p, prot=0x%x",
1853 __FUNCTION__,
1854 (void*)pa_start, (void*)len, prot);
1855 }
1856
1857 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1858 ptep = pmap_pte(kernel_pmap, va_end);
1859 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1860 if (*ptep != ARM_PTE_TYPE_FAULT) {
1861 va_start = va_end + PAGE_SIZE;
1862 goto scan;
1863 }
1864 }
1865
1866 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1867 ptep = pmap_pte(kernel_pmap, va);
1868 pte = pa_to_pte(pa_start)
1869 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1870 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1871 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1872 #if (__ARM_VMSA__ > 7)
1873 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1874 #else
1875 pte |= ARM_PTE_SH;
1876 #endif
1877 #if __ARM_KERNEL_PROTECT__
1878 pte |= ARM_PTE_NG;
1879 #endif /* __ARM_KERNEL_PROTECT__ */
1880 write_pte_strong(ptep, pte);
1881 }
1882 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1883 #if KASAN
1884 kasan_notify_address(va_start, len);
1885 #endif
1886 return va_start;
1887 }
1888
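/*
 * Determine how many virtual ASIDs to support. The /defaults device tree node
 * may override the default (MAX_ASIDS) via the "pmap-max-asids" property; the
 * result is rounded up to a multiple of 64 for the pseudo-LRU allocator and
 * bounded by what fits in pmap->sw_asid.
 */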
1889 static uint32_t
1890 pmap_compute_max_asids(void)
1891 {
1892 DTEntry entry;
1893 void const *prop = NULL;
1894 uint32_t max_asids;
1895 int err;
1896 unsigned int prop_size;
1897
1898 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1899 assert(err == kSuccess);
1900
1901 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
1902 /* TODO: consider allowing maxproc limits to be scaled earlier so that
1903 * we can choose a more flexible default value here. */
1904 return MAX_ASIDS;
1905 }
1906
1907 if (prop_size != sizeof(max_asids)) {
1908 panic("pmap-max-asids property is not a 32-bit integer");
1909 }
1910
1911 max_asids = *((uint32_t const *)prop);
1912 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
1913 max_asids = (max_asids + 63) & ~63UL;
1914
1915 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
1916 /* currently capped by size of pmap->sw_asid */
1917 panic("pmap-max-asids too large");
1918 }
1919 if (max_asids == 0) {
1920 panic("pmap-max-asids cannot be zero");
1921 }
1922 return max_asids;
1923 }
1924
1925 #if __arm64__
1926 /*
1927 * pmap_get_arm64_prot
1928 *
1929 * return effective armv8 VMSA block protections including
1930 * table AP/PXN/XN overrides of a pmap entry
1931 *
1932 */
1933
1934 uint64_t
1935 pmap_get_arm64_prot(
1936 pmap_t pmap,
1937 vm_offset_t addr)
1938 {
1939 tt_entry_t tte = 0;
1940 unsigned int level = 0;
1941 uint64_t tte_type = 0;
1942 uint64_t effective_prot_bits = 0;
1943 uint64_t aggregate_tte = 0;
1944 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1945 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1946
1947 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1948 tte = *pmap_ttne(pmap, level, addr);
1949
1950 if (!(tte & ARM_TTE_VALID)) {
1951 return 0;
1952 }
1953
1954 tte_type = tte & ARM_TTE_TYPE_MASK;
1955
1956 if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
1957 (level == pt_attr->pta_max_level)) {
1958 /* Block or page mapping; both have the same protection bit layout. */
1959 break;
1960 } else if (tte_type == ARM_TTE_TYPE_TABLE) {
1961 /* All of the table bits we care about are overrides, so just OR them together. */
1962 aggregate_tte |= tte;
1963 }
1964 }
1965
1966 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1967 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1968 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1969
1970 /* Start with the PTE bits. */
1971 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1972
1973 /* Table AP bits mask out block/page AP bits */
1974 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1975
1976 /* XN/PXN bits can be OR'd in. */
1977 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1978 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1979
1980 return effective_prot_bits;
1981 }
1982 #endif /* __arm64__ */
1983
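/*
 * Read the "research-enabled" property of the /chosen device tree node to
 * determine whether this device is fused for security research, with an
 * optional "srd_fusing" boot-arg override on DEVELOPMENT/DEBUG kernels.
 */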
1984 static void
1985 pmap_set_srd_fusing()
1986 {
1987 DTEntry entry;
1988 uint32_t const *prop = NULL;
1989 int err;
1990 unsigned int prop_size = 0;
1991
1992 err = SecureDTLookupEntry(NULL, "/chosen", &entry);
1993 if (err != kSuccess) {
1994 panic("PMAP: no chosen DT node");
1995 }
1996
1997 if (kSuccess == SecureDTGetProperty(entry, "research-enabled", (const void**)&prop, &prop_size)) {
1998 if (prop_size == sizeof(uint32_t)) {
1999 srd_fused = *prop;
2000 }
2001 }
2002
2003 #if DEVELOPMENT || DEBUG
2004 PE_parse_boot_argn("srd_fusing", &srd_fused, sizeof(srd_fused));
2005 #endif
2006 }
2007
2008 /*
2009 * Bootstrap the system enough to run with virtual memory.
2010 *
2011 * The early VM initialization code has already allocated
2012 * the first CPU's translation table and made entries for
2013 * all the one-to-one mappings to be found there.
2014 *
2015 * We must set up the kernel pmap structures, the
2016 * physical-to-virtual translation lookup tables for the
2017 * physical memory to be managed (between avail_start and
2018 * avail_end).
2019 *
2020 * Map the kernel's code and data, and allocate the system page table.
2021 * Page_size must already be set.
2022 *
2023 * Parameters:
2024 * first_avail first available physical page -
2025 * after kernel page tables
2026 * avail_start PA of first managed physical page
2027 * avail_end PA of last managed physical page
2028 */
2029
2030 void
2031 pmap_bootstrap(
2032 vm_offset_t vstart)
2033 {
2034 vm_map_offset_t maxoffset;
2035
2036 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2037
2038 pmap_set_srd_fusing();
2039
2040 #if XNU_MONITOR
2041
2042 #if DEVELOPMENT || DEBUG
2043 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2044 #endif
2045
2046 #if CONFIG_CSR_FROM_DT
2047 if (csr_unsafe_kernel_text) {
2048 pmap_ppl_disable = true;
2049 }
2050 #endif /* CONFIG_CSR_FROM_DT */
2051
2052 #endif /* XNU_MONITOR */
2053
2054 #if DEVELOPMENT || DEBUG
2055 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2056 kprintf("Kernel traces for pmap operations enabled\n");
2057 }
2058 #endif
2059
2060 /*
2061 * Initialize the kernel pmap.
2062 */
2063 pmap_stamp = 1;
2064 #if ARM_PARAMETERIZED_PMAP
2065 kernel_pmap->pmap_pt_attr = native_pt_attr;
2066 #endif /* ARM_PARAMETERIZED_PMAP */
2067 #if HAS_APPLE_PAC
2068 kernel_pmap->disable_jop = 0;
2069 #endif /* HAS_APPLE_PAC */
2070 kernel_pmap->tte = cpu_tte;
2071 kernel_pmap->ttep = cpu_ttep;
2072 #if (__ARM_VMSA__ > 7)
2073 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2074 #else
2075 kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
2076 #endif
2077 kernel_pmap->max = UINTPTR_MAX;
2078 os_atomic_init(&kernel_pmap->ref_count, 1);
2079 #if XNU_MONITOR
2080 os_atomic_init(&kernel_pmap->nested_count, 0);
2081 #endif
2082 kernel_pmap->gc_status = 0;
2083 kernel_pmap->nx_enabled = TRUE;
2084 #ifdef __arm64__
2085 kernel_pmap->is_64bit = TRUE;
2086 #else
2087 kernel_pmap->is_64bit = FALSE;
2088 #endif
2089 kernel_pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2090
2091 #if ARM_PARAMETERIZED_PMAP
2092 kernel_pmap->pmap_pt_attr = native_pt_attr;
2093 #endif /* ARM_PARAMETERIZED_PMAP */
2094
2095 kernel_pmap->nested_region_addr = 0x0ULL;
2096 kernel_pmap->nested_region_size = 0x0ULL;
2097 kernel_pmap->nested_region_asid_bitmap = NULL;
2098 kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2099 kernel_pmap->type = PMAP_TYPE_KERNEL;
2100
2101 #if (__ARM_VMSA__ == 7)
2102 kernel_pmap->tte_index_max = 4 * (ARM_PGBYTES / sizeof(tt_entry_t));
2103 #endif
2104 kernel_pmap->hw_asid = 0;
2105 kernel_pmap->sw_asid = 0;
2106
2107 pmap_lock_init(kernel_pmap);
2108
2109 pmap_max_asids = pmap_compute_max_asids();
2110 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2111 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2112 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2113 * masking used by the PLRU scheme. This means we must handle the case in which
2114 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2115 _Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
2116 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2117 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2118
2119 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2120
2121 /**
2122 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2123 * pp_attr_table, etc). This function will use `avail_start` to allocate
2124 * space for these data structures.
2125 * */
2126 pmap_data_bootstrap();
2127
2128 /**
2129 * Don't make any assumptions about the alignment of avail_start before this
2130 * point (i.e., pmap_data_bootstrap() performs allocations).
2131 */
2132 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2133
2134 const pmap_paddr_t pmap_struct_start = avail_start;
2135
2136 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2137 avail_start = round_page(avail_start + asid_table_size);
2138
2139 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2140
2141 vm_first_phys = gPhysBase;
2142 vm_last_phys = trunc_page(avail_end);
2143
2144 queue_init(&map_pmap_list);
2145 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2146 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2147 free_page_size_tt_count = 0;
2148 free_page_size_tt_max = 0;
2149 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2150 free_two_page_size_tt_count = 0;
2151 free_two_page_size_tt_max = 0;
2152 free_tt_list = TT_FREE_ENTRY_NULL;
2153 free_tt_count = 0;
2154 free_tt_max = 0;
2155
2156 virtual_space_start = vstart;
2157 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2158
2159 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2160 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2161 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2162 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2163
2164
2165
2166 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2167 maxoffset = trunc_page(maxoffset);
2168 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2169 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2170 arm_pmap_max_offset_default = maxoffset;
2171 }
2172 }
2173 #if defined(__arm64__)
2174 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2175 maxoffset = trunc_page(maxoffset);
2176 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2177 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2178 arm64_pmap_max_offset_default = maxoffset;
2179 }
2180 }
2181 #endif
2182
2183 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2184
2185
2186 #if MACH_ASSERT
2187 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2188 &vm_footprint_suspend_allowed,
2189 sizeof(vm_footprint_suspend_allowed));
2190 #endif /* MACH_ASSERT */
2191
2192 #if KASAN
2193 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2194 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2195 #endif /* KASAN */
2196
2197 /**
2198 * Ensure that avail_start is always left on a page boundary. The calling
2199 * code might not perform any alignment before allocating page tables so
2200 * this is important.
2201 */
2202 avail_start = round_page(avail_start);
2203 }
2204
2205 #if XNU_MONITOR
2206
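/* Mark every physical page in [start_pa, end_pa) as owned by the PPL. */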
2207 static inline void
2208 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2209 {
2210 pmap_paddr_t cur_pa;
2211 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2212 assert(pa_valid(cur_pa));
2213 ppattr_pa_set_monitor(cur_pa);
2214 }
2215 }
2216
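/*
 * Mark the physical range as PPL-owned and switch the xPRR permission index
 * of its physical-aperture mappings from expected_perm to new_perm.
 */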
2217 void
2218 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2219 pmap_paddr_t end_pa,
2220 unsigned int expected_perm,
2221 unsigned int new_perm)
2222 {
2223 vm_offset_t start_va = phystokv(start_pa);
2224 vm_offset_t end_va = start_va + (end_pa - start_pa);
2225
2226 pa_set_range_monitor(start_pa, end_pa);
2227 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2228 }
2229
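/*
 * Walk the kernelcache's physical pages and tag each one with
 * PVH_FLAG_LOCKDOWN_KC so that the kernelcache cannot be remapped later.
 * Pages whose physical-to-virtual translation is non-linear with respect to
 * the kernelcache are skipped, as they will be reclaimed by the OS.
 */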
2230 static void
2231 pmap_lockdown_kc(void)
2232 {
2233 extern vm_offset_t vm_kernelcache_base;
2234 extern vm_offset_t vm_kernelcache_top;
2235 pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
2236 pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
2237 pmap_paddr_t cur_pa = start_pa;
2238 vm_offset_t cur_va = vm_kernelcache_base;
2239 while (cur_pa < end_pa) {
2240 vm_size_t range_size = end_pa - cur_pa;
2241 vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
2242 if (ptov_va != cur_va) {
2243 /*
2244 * If the physical address maps back to a virtual address that is non-linear
2245 * w.r.t. the kernelcache, that means it corresponds to memory that will be
2246 * reclaimed by the OS and should therefore not be locked down.
2247 */
2248 cur_pa += range_size;
2249 cur_va += range_size;
2250 continue;
2251 }
2252 unsigned int pai = pa_index(cur_pa);
2253 pv_entry_t **pv_h = pai_to_pvh(pai);
2254
2255 vm_offset_t pvh_flags = pvh_get_flags(pv_h);
2256
2257 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2258 panic("pai %d already locked down", pai);
2259 }
2260 pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
2261 cur_pa += ARM_PGBYTES;
2262 cur_va += ARM_PGBYTES;
2263 }
2264 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
2265 extern uint64_t ctrr_ro_test;
2266 extern uint64_t ctrr_nx_test;
2267 pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
2268 for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
2269 pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
2270 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
2271 }
2272 #endif
2273 }
2274
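/*
 * Called once the boot-time (static) allocations are complete: hand the
 * bootstrap page tables, PPL data/text segments, and PPL stacks over to the
 * PPL with the appropriate xPRR permissions, then lock down the kernelcache.
 */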
2275 void
2276 pmap_static_allocations_done(void)
2277 {
2278 pmap_paddr_t monitor_start_pa;
2279 pmap_paddr_t monitor_end_pa;
2280
2281 /*
2282 * Protect the bootstrap (V=P and V->P) page tables.
2283 *
2284 * These bootstrap allocations will be used primarily for page tables.
2285 * If we wish to secure the page tables, we need to start by marking
2286 * these bootstrap allocations as pages that we want to protect.
2287 */
2288 monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2289 monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2290
2291 /* The bootstrap page tables are mapped RW at bootstrap. */
2292 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2293
2294 /*
2295 * We use avail_start as a pointer to the first address that has not
2296 * been reserved for bootstrap, so we know which pages to give to the
2297 * virtual memory layer.
2298 */
2299 monitor_start_pa = BootArgs->topOfKernelData;
2300 monitor_end_pa = avail_start;
2301
2302 /* The other bootstrap allocations are mapped RW at bootstrap. */
2303 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2304
2305 /*
2306 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2307 * to RO in arm_vm_prot_finalize(), which is called after this function.
2308 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2309 * they can't be allocated for other uses. We don't need a special xPRR
2310 * protection index, as there is no PPL_RO index, and these pages are ultimately
2311 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
2312 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2313 * lookup table index to USER_XO before APRR is applied, leading the hardware
2314 * to believe we are dealing with an user XO page upon performing a translation.
2315 */
2316 monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2317 monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2318 pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2319
2320 monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2321 monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2322
2323 /* PPL data is RW for the PPL, RO for the kernel. */
2324 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2325
2326 monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2327 monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2328
2329 /* PPL text is RX for the PPL, RO for the kernel. */
2330 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2331
2332
2333 /*
2334 * In order to support DTrace, the save areas for the PPL must be
2335 * writable. This is due to the fact that DTrace will try to update
2336 * register state.
2337 */
2338 if (pmap_ppl_disable) {
2339 vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2340 vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2341
2342 pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2343 }
2344
2345
2346 if (segSizePPLDATACONST > 0) {
2347 monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2348 monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2349
2350 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2351 }
2352
2353 /*
2354 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2355 * precaution. The real RW mappings are at a different location with guard pages.
2356 */
2357 pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2358
2359 /* Prevent remapping of the kernelcache */
2360 pmap_lockdown_kc();
2361 }
2362
2363 void
2364 pmap_lockdown_ppl(void)
2365 {
2366 /* Mark the PPL as being locked down. */
2367
2368 #error "XPRR configuration error"
2369 }
2370 #endif /* XNU_MONITOR */
2371
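/*
 * Report the kernel virtual address range left available for the VM layer
 * after pmap_bootstrap().
 */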
2372 void
2373 pmap_virtual_space(
2374 vm_offset_t *startp,
2375 vm_offset_t *endp
2376 )
2377 {
2378 *startp = virtual_space_start;
2379 *endp = virtual_space_end;
2380 }
2381
2382
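/*
 * Report additional kernel virtual regions available to the VM layer, one per
 * region_select value; returns FALSE once all regions have been enumerated.
 * The regions reported depend on the KTRR/CTRR and large-memory configuration.
 */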
2383 boolean_t
2384 pmap_virtual_region(
2385 unsigned int region_select,
2386 vm_map_offset_t *startp,
2387 vm_map_size_t *size
2388 )
2389 {
2390 boolean_t ret = FALSE;
2391 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2392 if (region_select == 0) {
2393 /*
2394 * In this config, the bootstrap mappings should occupy their own L2
2395 * TTs, as they should be immutable after boot. Having the associated
2396 * TTEs and PTEs in their own pages allows us to lock down those pages,
2397 * while allowing the rest of the kernel address range to be remapped.
2398 */
2399 #if (__ARM_VMSA__ > 7)
2400 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2401 #else
2402 #error Unsupported configuration
2403 #endif
2404 #if defined(ARM_LARGE_MEMORY)
2405 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2406 #else
2407 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2408 #endif
2409 ret = TRUE;
2410 }
2411 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2412 #if defined(ARM_LARGE_MEMORY)
2413 /* For large memory systems with no KTRR/CTRR such as virtual machines */
2414 #if (__ARM_VMSA__ > 7)
2415 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2416 #else
2417 #error Unsupported configuration
2418 #endif
2419 if (region_select == 0) {
2420 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2421 ret = TRUE;
2422 }
2423 #else /* !defined(ARM_LARGE_MEMORY) */
2424 #if (__ARM_VMSA__ > 7)
2425 unsigned long low_global_vr_mask = 0;
2426 vm_map_size_t low_global_vr_size = 0;
2427 #endif
2428
2429 if (region_select == 0) {
2430 #if (__ARM_VMSA__ == 7)
2431 *startp = gVirtBase & 0xFFC00000;
2432 *size = ((virtual_space_start - (gVirtBase & 0xFFC00000)) + ~0xFFC00000) & 0xFFC00000;
2433 #else
2434 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2435 if (!TEST_PAGE_SIZE_4K) {
2436 *startp = gVirtBase & 0xFFFFFFFFFE000000;
2437 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2438 } else {
2439 *startp = gVirtBase & 0xFFFFFFFFFF800000;
2440 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2441 }
2442 #endif
2443 ret = TRUE;
2444 }
2445 if (region_select == 1) {
2446 *startp = VREGION1_START;
2447 *size = VREGION1_SIZE;
2448 ret = TRUE;
2449 }
2450 #if (__ARM_VMSA__ > 7)
2451 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2452 if (!TEST_PAGE_SIZE_4K) {
2453 low_global_vr_mask = 0xFFFFFFFFFE000000;
2454 low_global_vr_size = 0x2000000;
2455 } else {
2456 low_global_vr_mask = 0xFFFFFFFFFF800000;
2457 low_global_vr_size = 0x800000;
2458 }
2459
2460 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2461 *startp = LOW_GLOBAL_BASE_ADDRESS;
2462 *size = low_global_vr_size;
2463 ret = TRUE;
2464 }
2465
2466 if (region_select == 3) {
2467 /* In this config, we allow the bootstrap mappings to occupy the same
2468 * page table pages as the heap.
2469 */
2470 *startp = VM_MIN_KERNEL_ADDRESS;
2471 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2472 ret = TRUE;
2473 }
2474 #endif
2475 #endif /* defined(ARM_LARGE_MEMORY) */
2476 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2477 return ret;
2478 }
2479
2480 /*
2481 * Routines to track and allocate physical pages during early boot.
2482 * On most systems that memory runs from first_avail through to avail_end
2483 * with no gaps.
2484 *
2485 * However if the system supports ECC and bad_ram_pages_count > 0, we
2486 * need to be careful and skip those pages.
2487 */
2488 static unsigned int avail_page_count = 0;
2489 static bool need_ram_ranges_init = true;
2490
2491 #if defined(__arm64__)
2492 pmap_paddr_t *bad_ram_pages = NULL;
2493 unsigned int bad_ram_pages_count = 0;
2494
2495 /*
2496 * We use this sub-range of bad_ram_pages for pmap_next_page()
2497 */
2498 static pmap_paddr_t *skip_pages;
2499 static unsigned int skip_pages_count = 0;
2500
2501 #define MAX_BAD_RAM_PAGE_COUNT 64
2502 static pmap_paddr_t bad_ram_pages_arr[MAX_BAD_RAM_PAGE_COUNT];
2503
2504 /*
2505 * XXX - temporary code to get the bad pages array from boot-args.
2506 * expects a comma separated list of offsets from the start
2507 * of physical memory to be considered bad.
2508 *
2509 * HERE JOE -- will eventually be replaced by data provided by iboot
2510 */
2511 static void
2512 parse_bad_ram_pages_boot_arg(void)
2513 {
2514 char buf[256] = {0};
2515 char *s = buf;
2516 char *end;
2517 int count = 0;
2518 pmap_paddr_t num;
2519 extern uint64_t strtouq(const char *, char **, int);
2520
2521 if (!PE_parse_boot_arg_str("bad_ram_pages", buf, sizeof(buf))) {
2522 goto done;
2523 }
2524
2525 while (*s && count < MAX_BAD_RAM_PAGE_COUNT) {
2526 num = (pmap_paddr_t)strtouq(s, &end, 0);
2527 if (num == 0) {
2528 break;
2529 }
2530 num &= ~PAGE_MASK;
2531
2532 bad_ram_pages_arr[count++] = gDramBase + num;
2533
2534 if (*end != ',') {
2535 break;
2536 }
2537
2538 s = end + 1;
2539 }
2540
2541 done:
2542 bad_ram_pages = bad_ram_pages_arr;
2543 bad_ram_pages_count = count;
2544 }
2545
2546 /*
2547 * Comparison routine for qsort of array of physical addresses.
2548 */
2549 static int
2550 pmap_paddr_cmp(void *a, void *b)
2551 {
2552 pmap_paddr_t *x = a;
2553 pmap_paddr_t *y = b;
2554 if (*x < *y) {
2555 return -1;
2556 }
2557 return *x > *y;
2558 }
2559 #endif /* defined(__arm64__) */
2560
2561 /*
2562 * Look up ppn in the sorted bad_ram_pages array.
2563 */
2564 bool
2565 pmap_is_bad_ram(__unused ppnum_t ppn)
2566 {
2567 #if defined(__arm64__)
2568 pmap_paddr_t pa = ptoa(ppn);
2569 int low = 0;
2570 int high = bad_ram_pages_count - 1;
2571 int mid;
2572
2573 while (low <= high) {
2574 mid = (low + high) / 2;
2575 if (bad_ram_pages[mid] < pa) {
2576 low = mid + 1;
2577 } else if (bad_ram_pages[mid] > pa) {
2578 high = mid - 1;
2579 } else {
2580 return true;
2581 }
2582 }
2583 #endif /* defined(__arm64__) */
2584 return false;
2585 }
2586
2587 /*
2588 * Initialize the count of available pages. If we have bad_ram_pages, then sort the list of them.
2589 * No lock needed here, as this code is called while kernel boot up is single threaded.
2590 */
2591 static void
2592 initialize_ram_ranges(void)
2593 {
2594 pmap_paddr_t first = first_avail;
2595 pmap_paddr_t end = avail_end;
2596
2597 assert(first <= end);
2598 assert(first == (first & ~PAGE_MASK));
2599 assert(end == (end & ~PAGE_MASK));
2600 avail_page_count = atop(end - first);
2601
2602 #if defined(__arm64__)
2603 /*
2604 * XXX Temporary code for testing, until there is iboot support
2605 *
2606 * Parse a list of known bad pages from a boot-arg.
2607 */
2608 parse_bad_ram_pages_boot_arg();
2609
2610 /*
2611 * Sort and filter the bad pages list and adjust avail_page_count.
2612 */
2613 if (bad_ram_pages_count != 0) {
2614 qsort(bad_ram_pages, bad_ram_pages_count, sizeof(*bad_ram_pages), (cmpfunc_t)pmap_paddr_cmp);
2615 skip_pages = bad_ram_pages;
2616 skip_pages_count = bad_ram_pages_count;
2617
2618 /* ignore any pages before first */
2619 while (skip_pages_count > 0 && skip_pages[0] < first) {
2620 --skip_pages_count;
2621 ++skip_pages;
2622 }
2623
2624 /* ignore any pages at or after end */
2625 while (skip_pages_count > 0 && skip_pages[skip_pages_count - 1] >= end) {
2626 --skip_pages_count;
2627 }
2628
2629 avail_page_count -= skip_pages_count;
2630 }
2631 #endif /* defined(__arm64__) */
2632 need_ram_ranges_init = false;
2633 }
2634
2635 unsigned int
2636 pmap_free_pages(
2637 void)
2638 {
2639 if (need_ram_ranges_init) {
2640 initialize_ram_ranges();
2641 }
2642 return avail_page_count;
2643 }
2644
2645 unsigned int
2646 pmap_free_pages_span(
2647 void)
2648 {
2649 if (need_ram_ranges_init) {
2650 initialize_ram_ranges();
2651 }
2652 return (unsigned int)atop(avail_end - first_avail);
2653 }
2654
2655
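/*
 * Hand out the next available physical page during early boot. On this
 * platform the "hi" variant simply defers to pmap_next_page().
 */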
2656 boolean_t
2657 pmap_next_page_hi(
2658 ppnum_t * pnum,
2659 __unused boolean_t might_free)
2660 {
2661 return pmap_next_page(pnum);
2662 }
2663
2664
2665 boolean_t
2666 pmap_next_page(
2667 ppnum_t *pnum)
2668 {
2669 if (need_ram_ranges_init) {
2670 initialize_ram_ranges();
2671 }
2672
2673 #if defined(__arm64__)
2674 /*
2675 * Skip over any known bad pages.
2676 */
2677 while (skip_pages_count > 0 && first_avail == skip_pages[0]) {
2678 first_avail += PAGE_SIZE;
2679 ++skip_pages;
2680 --skip_pages_count;
2681 }
2682 #endif /* defined(__arm64__) */
2683
2684 if (first_avail != avail_end) {
2685 *pnum = (ppnum_t)atop(first_avail);
2686 first_avail += PAGE_SIZE;
2687 assert(avail_page_count > 0);
2688 --avail_page_count;
2689 return TRUE;
2690 }
2691 assert(avail_page_count == 0);
2692 return FALSE;
2693 }
2694
2695 void
2696 pmap_retire_page(
2697 __unused ppnum_t pnum)
2698 {
2699 /* XXX Justin TBD - mark the page as unusable in pmap data structures */
2700 }
2701
2702
2703 /*
2704 * Initialize the pmap module.
2705 * Called by vm_init, to initialize any structures that the pmap
2706 * system needs to map virtual memory.
2707 */
2708 void
2709 pmap_init(
2710 void)
2711 {
2712 /*
2713 * Protect page zero in the kernel map.
2714 * (can be overruled by permanent translation
2715 * table entries at page zero - see arm_vm_init).
2716 */
2717 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2718
2719 pmap_initialized = TRUE;
2720
2721 /*
2722 * Create the zone of physical maps
2723 * and the physical-to-virtual entries.
2724 */
2725 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2726 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2727
2728
2729 /*
2730 * Initialize the pmap object (for tracking the vm_page_t
2731 * structures for pages we allocate to be page tables in
2732 * pmap_expand()).
2733 */
2734 _vm_object_allocate(mem_size, pmap_object);
2735 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2736
2737 /*
2738 * The values of [hard_]maxproc may have been scaled, make sure
2739 * they do not exceed the value of pmap_max_asids.
2740 */
2741 if ((uint32_t)maxproc > pmap_max_asids) {
2742 maxproc = pmap_max_asids;
2743 }
2744 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2745 hard_maxproc = pmap_max_asids;
2746 }
2747 }
2748
2749 /**
2750 * Verify that a given physical page contains no mappings (outside of the
2751 * default physical aperture mapping).
2752 *
2753 * @param ppnum Physical page number to check there are no mappings to.
2754 *
2755 * @return True if there are no mappings, false otherwise or if the page is not
2756 * kernel-managed.
2757 */
2758 bool
2759 pmap_verify_free(ppnum_t ppnum)
2760 {
2761 const pmap_paddr_t pa = ptoa(ppnum);
2762
2763 assert(pa != vm_page_fictitious_addr);
2764
2765 /* Only mappings to kernel-managed physical memory are tracked. */
2766 if (!pa_valid(pa)) {
2767 return false;
2768 }
2769
2770 const unsigned int pai = pa_index(pa);
2771 pv_entry_t **pvh = pai_to_pvh(pai);
2772
2773 return pvh_test_type(pvh, PVH_TYPE_NULL);
2774 }
2775
2776 #if MACH_ASSERT
2777 /**
2778 * Verify that a given physical page contains no mappings (outside of the
2779 * default physical aperture mapping) and if it does, then panic.
2780 *
2781 * @note It's recommended to use pmap_verify_free() directly when operating in
2782 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2783 * normally being called from outside of the PPL, and the pv_head_table
2784 * can't be modified outside of the PPL).
2785 *
2786 * @param ppnum Physical page number to check there are no mappings to.
2787 */
2788 void
2789 pmap_assert_free(ppnum_t ppnum)
2790 {
2791 const pmap_paddr_t pa = ptoa(ppnum);
2792
2793 /* Only mappings to kernel-managed physical memory are tracked. */
2794 if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2795 return;
2796 }
2797
2798 const unsigned int pai = pa_index(pa);
2799 pv_entry_t **pvh = pai_to_pvh(pai);
2800
2801 /**
2802 * This function is always called from outside of the PPL. Because of this,
2803 * the PVH entry can't be locked. This function is generally only called
2804 * before the VM reclaims a physical page and shouldn't be creating new
2805 * mappings. Even if a new mapping is created while parsing the hierarchy,
2806 * the worst case is that the system will panic in another way, and we were
2807 * already about to panic anyway.
2808 */
2809
2810 /**
2811 * Since pmap_verify_free() returned false, that means there is at least one
2812 * mapping left. Let's get some extra info on the first mapping we find to
2813 * dump in the panic string (the common case is that there is one spare
2814 * mapping that was never unmapped).
2815 */
2816 pt_entry_t *first_ptep = PT_ENTRY_NULL;
2817
2818 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2819 first_ptep = pvh_ptep(pvh);
2820 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2821 pv_entry_t *pvep = pvh_pve_list(pvh);
2822
2823 /* Each PVE can contain multiple PTEs. Let's find the first one. */
2824 for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2825 first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2826 if (first_ptep != PT_ENTRY_NULL) {
2827 break;
2828 }
2829 }
2830
2831 /* The PVE should have at least one valid PTE. */
2832 assert(first_ptep != PT_ENTRY_NULL);
2833 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2834 panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2835 __func__, pvh, pai);
2836 } else {
2837 /**
2838 * The mapping disappeared between here and the pmap_verify_free() call.
2839 * The only way that can happen is if the VM was racing this call with
2840 * a call that unmaps PTEs. Operations on this page should not be
2841 * occurring at the same time as this check, and unfortunately we can't
2842 * lock the PVH entry to prevent it, so just panic instead.
2843 */
2844 panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2845 "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2846 __func__, pvh, pai);
2847 }
2848
2849 /* Panic with a unique string identifying the first bad mapping and owner. */
2850 {
2851 /* First PTE is mapped by the main CPUs. */
2852 pmap_t pmap = ptep_get_pmap(first_ptep);
2853 const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2854
2855 panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2856 "%s CPU mapping (pmap: %p)",
2857 __func__, (uint64_t)pa, first_ptep, type, pmap);
2858 }
2859 }
2860 #endif
2861
2862
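/*
 * Return the size, in bytes, of the root translation table for the given
 * pmap: derived from the pmap's page table attributes on arm64, and fixed at
 * PMAP_ROOT_ALLOC_SIZE on older VMSA configurations.
 */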
2863 static vm_size_t
2864 pmap_root_alloc_size(pmap_t pmap)
2865 {
2866 #if (__ARM_VMSA__ > 7)
2867 #pragma unused(pmap)
2868 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2869 unsigned int root_level = pt_attr_root_level(pt_attr);
2870 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2871 #else
2872 (void)pmap;
2873 return PMAP_ROOT_ALLOC_SIZE;
2874 #endif
2875 }
2876
2877 /*
2878 * Create and return a physical map.
2879 *
2880 * If the size specified for the map
2881 * is zero, the map is an actual physical
2882 * map, and may be referenced by the
2883 * hardware.
2884 *
2885 * If the size specified is non-zero,
2886 * the map will be used in software only, and
2887 * is bounded by that size.
2888 */
2889 MARK_AS_PMAP_TEXT pmap_t
2890 pmap_create_options_internal(
2891 ledger_t ledger,
2892 vm_map_size_t size,
2893 unsigned int flags,
2894 kern_return_t *kr)
2895 {
2896 unsigned i;
2897 unsigned tte_index_max;
2898 pmap_t p;
2899 bool is_64bit = flags & PMAP_CREATE_64BIT;
2900 #if defined(HAS_APPLE_PAC)
2901 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2902 #endif /* defined(HAS_APPLE_PAC) */
2903 kern_return_t local_kr = KERN_SUCCESS;
2904
2905 /*
2906 * A software use-only map doesn't even need a pmap.
2907 */
2908 if (size != 0) {
2909 return PMAP_NULL;
2910 }
2911
2912 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2913 return PMAP_NULL;
2914 }
2915
2916 #if XNU_MONITOR
2917 if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
2918 goto pmap_create_fail;
2919 }
2920
2921 assert(p != PMAP_NULL);
2922
2923 if (ledger) {
2924 pmap_ledger_validate(ledger);
2925 pmap_ledger_retain(ledger);
2926 }
2927 #else
2928 /*
2929 * Allocate a pmap struct from the pmap_zone. Then allocate
2930 * the translation table of the right size for the pmap.
2931 */
2932 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2933 local_kr = KERN_RESOURCE_SHORTAGE;
2934 goto pmap_create_fail;
2935 }
2936 #endif
2937
2938 p->ledger = ledger;
2939
2940
2941 p->pmap_vm_map_cs_enforced = false;
2942
2943 p->min = 0;
2944 if (flags & PMAP_CREATE_64BIT) {
2945 } else {
2946 }
2947
2948 #if defined(HAS_APPLE_PAC)
2949 p->disable_jop = disable_jop;
2950 #endif /* defined(HAS_APPLE_PAC) */
2951
2952 p->nested_region_true_start = 0;
2953 p->nested_region_true_end = ~0;
2954
2955 p->gc_status = 0;
2956 p->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2957 p->nx_enabled = true;
2958 p->is_64bit = is_64bit;
2959 p->nested_pmap = PMAP_NULL;
2960 p->type = PMAP_TYPE_USER;
2961
2962 #if ARM_PARAMETERIZED_PMAP
2963 /* Default to the native pt_attr */
2964 p->pmap_pt_attr = native_pt_attr;
2965 #endif /* ARM_PARAMETERIZED_PMAP */
2966 #if __ARM_MIXED_PAGE_SIZE__
2967 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2968 p->pmap_pt_attr = &pmap_pt_attr_4k;
2969 }
2970 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2971 p->max = pmap_user_va_size(p);
2972
2973 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2974 local_kr = KERN_NO_SPACE;
2975 goto id_alloc_fail;
2976 }
2977
2978 pmap_lock_init(p);
2979
2980 p->tt_entry_free = (tt_entry_t *)0;
2981 tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
2982
2983 #if (__ARM_VMSA__ == 7)
2984 p->tte_index_max = tte_index_max;
2985 #endif
2986
2987 #if XNU_MONITOR
2988 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
2989 #else
2990 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
2991 #endif
2992 if (!(p->tte)) {
2993 local_kr = KERN_RESOURCE_SHORTAGE;
2994 goto tt1_alloc_fail;
2995 }
2996
2997 p->ttep = ml_static_vtop((vm_offset_t)p->tte);
2998 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2999
3000 /* nullify the translation table */
3001 for (i = 0; i < tte_index_max; i++) {
3002 p->tte[i] = ARM_TTE_TYPE_FAULT;
3003 }
3004
3005 FLUSH_PTE();
3006
3007 /*
3008 * initialize the rest of the structure
3009 */
3010 p->nested_region_addr = 0x0ULL;
3011 p->nested_region_size = 0x0ULL;
3012 p->nested_region_asid_bitmap = NULL;
3013 p->nested_region_asid_bitmap_size = 0x0UL;
3014
3015 p->nested_has_no_bounds_ref = false;
3016 p->nested_no_bounds_refcnt = 0;
3017 p->nested_bounds_set = false;
3018
3019
3020 #if MACH_ASSERT
3021 p->pmap_stats_assert = TRUE;
3022 p->pmap_pid = 0;
3023 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3024 #endif /* MACH_ASSERT */
3025 #if DEVELOPMENT || DEBUG
3026 p->footprint_was_suspended = FALSE;
3027 #endif /* DEVELOPMENT || DEBUG */
3028
3029 #if XNU_MONITOR
3030 os_atomic_init(&p->nested_count, 0);
3031 assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3032 /* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3033 os_atomic_thread_fence(release);
3034 #endif
3035 os_atomic_init(&p->ref_count, 1);
3036 pmap_simple_lock(&pmaps_lock);
3037 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3038 pmap_simple_unlock(&pmaps_lock);
3039
3040 return p;
3041
3042 tt1_alloc_fail:
3043 pmap_get_pt_ops(p)->free_id(p);
3044 id_alloc_fail:
3045 #if XNU_MONITOR
3046 pmap_free_pmap(p);
3047
3048 if (ledger) {
3049 pmap_ledger_release(ledger);
3050 }
3051 #else
3052 zfree(pmap_zone, p);
3053 #endif
3054 pmap_create_fail:
3055 #if XNU_MONITOR
3056 pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3057 #endif
3058 *kr = local_kr;
3059 #if XNU_MONITOR
3060 pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3061 #endif
3062 return PMAP_NULL;
3063 }
3064
3065 pmap_t
3066 pmap_create_options(
3067 ledger_t ledger,
3068 vm_map_size_t size,
3069 unsigned int flags)
3070 {
3071 pmap_t pmap;
3072 kern_return_t kr = KERN_SUCCESS;
3073
3074 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3075
3076 ledger_reference(ledger);
3077
3078 #if XNU_MONITOR
3079 for (;;) {
3080 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3081 if (kr != KERN_RESOURCE_SHORTAGE) {
3082 break;
3083 }
3084 assert(pmap == PMAP_NULL);
3085 pmap_alloc_page_for_ppl(0);
3086 kr = KERN_SUCCESS;
3087 }
3088 #else
3089 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3090 #endif
3091
3092 if (pmap == PMAP_NULL) {
3093 ledger_dereference(ledger);
3094 }
3095
3096 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3097
3098 return pmap;
3099 }
3100
3101 #if XNU_MONITOR
3102 /*
3103 * This symbol remains in place when the PPL is enabled so that the dispatch
3104 * table does not change from development to release configurations.
3105 */
3106 #endif
3107 #if MACH_ASSERT || XNU_MONITOR
3108 MARK_AS_PMAP_TEXT void
3109 pmap_set_process_internal(
3110 __unused pmap_t pmap,
3111 __unused int pid,
3112 __unused char *procname)
3113 {
3114 #if MACH_ASSERT
3115 if (pmap == NULL) {
3116 return;
3117 }
3118
3119 validate_pmap_mutable(pmap);
3120
3121 pmap->pmap_pid = pid;
3122 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3123 if (pmap_ledgers_panic_leeway) {
3124 /*
3125 * XXX FBDP
3126 * Some processes somehow trigger some issues that make
3127 * the pmap stats and ledgers go off track, causing
3128 * some assertion failures and ledger panics.
3129 * Turn off the sanity checks if we allow some ledger leeway
3130 * because of that. We'll still do a final check in
3131 * pmap_check_ledgers() for discrepancies larger than the
3132 * allowed leeway after the address space has been fully
3133 * cleaned up.
3134 */
3135 pmap->pmap_stats_assert = FALSE;
3136 ledger_disable_panic_on_negative(pmap->ledger,
3137 task_ledgers.phys_footprint);
3138 ledger_disable_panic_on_negative(pmap->ledger,
3139 task_ledgers.internal);
3140 ledger_disable_panic_on_negative(pmap->ledger,
3141 task_ledgers.internal_compressed);
3142 ledger_disable_panic_on_negative(pmap->ledger,
3143 task_ledgers.iokit_mapped);
3144 ledger_disable_panic_on_negative(pmap->ledger,
3145 task_ledgers.alternate_accounting);
3146 ledger_disable_panic_on_negative(pmap->ledger,
3147 task_ledgers.alternate_accounting_compressed);
3148 }
3149 #endif /* MACH_ASSERT */
3150 }
3151 #endif /* MACH_ASSERT || XNU_MONITOR */
3152
3153 #if MACH_ASSERT
3154 void
3155 pmap_set_process(
3156 pmap_t pmap,
3157 int pid,
3158 char *procname)
3159 {
3160 #if XNU_MONITOR
3161 pmap_set_process_ppl(pmap, pid, procname);
3162 #else
3163 pmap_set_process_internal(pmap, pid, procname);
3164 #endif
3165 }
3166 #endif /* MACH_ASSERT */
3167
3168 #if (__ARM_VMSA__ > 7)
3169 /*
3170 * pmap_deallocate_all_leaf_tts:
3171 *
3172 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3173 * removing and deallocating all TTEs.
3174 */
3175 MARK_AS_PMAP_TEXT static void
3176 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3177 {
3178 tt_entry_t tte = ARM_TTE_EMPTY;
3179 tt_entry_t * ttep = NULL;
3180 tt_entry_t * last_ttep = NULL;
3181
3182 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3183
3184 assert(level < pt_attr_leaf_level(pt_attr));
3185
3186 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3187
3188 for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3189 tte = *ttep;
3190
3191 if (!(tte & ARM_TTE_VALID)) {
3192 continue;
3193 }
3194
3195 if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
3196 panic("%s: found block mapping, ttep=%p, tte=%p, "
3197 "pmap=%p, first_ttep=%p, level=%u",
3198 __FUNCTION__, ttep, (void *)tte,
3199 pmap, first_ttep, level);
3200 }
3201
3202 /* Must be valid, type table */
3203 if (level < pt_attr_twig_level(pt_attr)) {
3204 /* If we haven't reached the twig level, recurse to the next level. */
3205 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3206 }
3207
3208 /* Remove the TTE. */
3209 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3210 pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3211 }
3212 }
3213 #endif /* (__ARM_VMSA__ > 7) */
3214
3215 /*
3216 * We maintain stats and ledgers so that a task's physical footprint is:
3217 * phys_footprint = ((internal - alternate_accounting)
3218 * + (internal_compressed - alternate_accounting_compressed)
3219 * + iokit_mapped
3220 * + purgeable_nonvolatile
3221 * + purgeable_nonvolatile_compressed
3222 * + page_table)
3223 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3224 */
3225
3226 /*
3227 * Retire the given physical map from service.
3228 * Should only be called if the map contains
3229 * no valid mappings.
3230 */
3231 MARK_AS_PMAP_TEXT void
3232 pmap_destroy_internal(
3233 pmap_t pmap)
3234 {
3235 if (pmap == PMAP_NULL) {
3236 return;
3237 }
3238
3239 validate_pmap(pmap);
3240
3241 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3242
3243 int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
3244 if (ref_count > 0) {
3245 return;
3246 } else if (__improbable(ref_count < 0)) {
3247 panic("pmap %p: refcount underflow", pmap);
3248 } else if (__improbable(pmap == kernel_pmap)) {
3249 panic("pmap %p: attempt to destroy kernel pmap", pmap);
3250 } else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3251 panic("pmap %p: attempt to destroy commpage pmap", pmap);
3252 }
3253
3254 #if XNU_MONITOR
3255 /*
3256 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3257 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3258 * That ensures that if the pmap is currently in use elsewhere, this path will
3259 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3260 * ref_count of 0 and panic.
3261 */
3262 os_atomic_thread_fence(seq_cst);
3263 if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3264 panic("pmap %p: attempt to destroy while nested", pmap);
3265 }
3266 const int max_cpu = ml_get_max_cpu_number();
3267 for (unsigned int i = 0; i <= max_cpu; ++i) {
3268 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3269 if (cpu_data == NULL) {
3270 continue;
3271 }
3272 if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3273 panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3274 } else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3275 panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3276 }
3277 }
3278 #endif
3279 #if (__ARM_VMSA__ > 7)
3280 pmap_unmap_sharedpage(pmap);
3281 #endif /* (__ARM_VMSA__ > 7) */
3282
3283 pmap_simple_lock(&pmaps_lock);
3284 #if !XNU_MONITOR
3285 while (pmap->gc_status & PMAP_GC_INFLIGHT) {
3286 pmap->gc_status |= PMAP_GC_WAIT;
3287 assert_wait((event_t) &pmap->gc_status, THREAD_UNINT);
3288 pmap_simple_unlock(&pmaps_lock);
3289 (void) thread_block(THREAD_CONTINUE_NULL);
3290 pmap_simple_lock(&pmaps_lock);
3291 }
3292 #endif /* !XNU_MONITOR */
3293 queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3294 pmap_simple_unlock(&pmaps_lock);
3295
3296 pmap_trim_self(pmap);
3297
3298 /*
3299 * Free the memory maps, then the
3300 * pmap structure.
3301 */
3302 #if (__ARM_VMSA__ == 7)
3303 unsigned int i = 0;
3304 pt_entry_t *ttep;
3305
3306 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3307 for (i = 0; i < pmap->tte_index_max; i++) {
3308 ttep = &pmap->tte[i];
3309 if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
3310 pmap_tte_deallocate(pmap, 0, 0, false, ttep, PMAP_TT_L1_LEVEL);
3311 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3312 }
3313 }
3314 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3315 #else /* (__ARM_VMSA__ == 7) */
3316 pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3317 #endif /* (__ARM_VMSA__ == 7) */
3318
3319
3320
3321 if (pmap->tte) {
3322 #if (__ARM_VMSA__ == 7)
3323 pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max * sizeof(tt_entry_t), 0);
3324 pmap->tte_index_max = 0;
3325 #else /* (__ARM_VMSA__ == 7) */
3326 pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3327 #endif /* (__ARM_VMSA__ == 7) */
3328 pmap->tte = (tt_entry_t *) NULL;
3329 pmap->ttep = 0;
3330 }
3331
3332 assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3333
3334 if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
3335 pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
3336 sync_tlb_flush();
3337 } else {
3338 pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3339 sync_tlb_flush();
3340 /* return its asid to the pool */
3341 pmap_get_pt_ops(pmap)->free_id(pmap);
3342 if (pmap->nested_pmap != NULL) {
3343 #if XNU_MONITOR
3344 os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3345 #endif
3346 /* release the reference we hold on the nested pmap */
3347 pmap_destroy_internal(pmap->nested_pmap);
3348 }
3349 }
3350
3351 pmap_check_ledgers(pmap);
3352
3353 if (pmap->nested_region_asid_bitmap) {
3354 #if XNU_MONITOR
3355 pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
3356 #else
3357 kfree_data(pmap->nested_region_asid_bitmap,
3358 pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
3359 #endif
3360 }
3361
3362 #if XNU_MONITOR
3363 if (pmap->ledger) {
3364 pmap_ledger_release(pmap->ledger);
3365 }
3366
3367 pmap_lock_destroy(pmap);
3368 pmap_free_pmap(pmap);
3369 #else
3370 pmap_lock_destroy(pmap);
3371 zfree(pmap_zone, pmap);
3372 #endif
3373 }
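/*
 * Illustrative sketch (not part of this file): the general shape of the
 * "drop the reference, then verify nobody is still using the object" pattern
 * used above. The seq_cst fence orders the ref_count decrement before the
 * subsequent in-use checks, so a racing user is either observed here (and we
 * panic) or it observes ref_count == 0 and panics itself. Names such as
 * obj_t, obj_in_use_anywhere() and obj_free() are hypothetical.
 *
 *	if (os_atomic_dec_orig(&obj->ref_count, relaxed) != 1) {
 *		return;                          // other references remain
 *	}
 *	os_atomic_thread_fence(seq_cst);         // order decrement vs. checks below
 *	if (__improbable(obj_in_use_anywhere(obj))) {
 *		panic("obj %p: destroyed while in use", obj);
 *	}
 *	obj_free(obj);
 */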
3374
3375 void
3376 pmap_destroy(
3377 pmap_t pmap)
3378 {
3379 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3380
3381 ledger_t ledger = pmap->ledger;
3382
3383 #if XNU_MONITOR
3384 pmap_destroy_ppl(pmap);
3385
3386 pmap_ledger_check_balance(pmap);
3387 #else
3388 pmap_destroy_internal(pmap);
3389 #endif
3390
3391 ledger_dereference(ledger);
3392
3393 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3394 }
3395
3396
3397 /*
3398 * Add a reference to the specified pmap.
3399 */
3400 MARK_AS_PMAP_TEXT void
3401 pmap_reference_internal(
3402 pmap_t pmap)
3403 {
3404 if (pmap != PMAP_NULL) {
3405 validate_pmap_mutable(pmap);
3406 os_atomic_inc(&pmap->ref_count, relaxed);
3407 }
3408 }
3409
3410 void
3411 pmap_reference(
3412 pmap_t pmap)
3413 {
3414 #if XNU_MONITOR
3415 pmap_reference_ppl(pmap);
3416 #else
3417 pmap_reference_internal(pmap);
3418 #endif
3419 }
3420
3421 static tt_entry_t *
3422 pmap_tt1_allocate(
3423 pmap_t pmap,
3424 vm_size_t size,
3425 unsigned option)
3426 {
3427 tt_entry_t *tt1 = NULL;
3428 tt_free_entry_t *tt1_free;
3429 pmap_paddr_t pa;
3430 vm_address_t va;
3431 vm_address_t va_end;
3432 kern_return_t ret;
3433
3434 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3435 size = PAGE_SIZE;
3436 }
3437
3438 pmap_simple_lock(&tt1_lock);
3439 if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3440 free_page_size_tt_count--;
3441 tt1 = (tt_entry_t *)free_page_size_tt_list;
3442 free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3443 } else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
3444 free_two_page_size_tt_count--;
3445 tt1 = (tt_entry_t *)free_two_page_size_tt_list;
3446 free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3447 } else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3448 free_tt_count--;
3449 tt1 = (tt_entry_t *)free_tt_list;
3450 free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3451 }
3452
3453 pmap_simple_unlock(&tt1_lock);
3454
3455 if (tt1 != NULL) {
3456 pmap_tt_ledger_credit(pmap, size);
3457 return (tt_entry_t *)tt1;
3458 }
3459
3460 ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3461
3462 if (ret == KERN_RESOURCE_SHORTAGE) {
3463 return (tt_entry_t *)0;
3464 }
3465
3466 #if XNU_MONITOR
3467 assert(pa);
3468 #endif
3469
3470 if (size < PAGE_SIZE) {
3471 va = phystokv(pa) + size;
3472 tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3473 tt_free_entry_t *next_free = NULL;
3474 for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3475 tt1_free = (tt_free_entry_t *)va;
3476 tt1_free->next = next_free;
3477 next_free = tt1_free;
3478 }
3479 pmap_simple_lock(&tt1_lock);
3480 local_free_list->next = free_tt_list;
3481 free_tt_list = next_free;
3482 free_tt_count += ((PAGE_SIZE / size) - 1);
3483 if (free_tt_count > free_tt_max) {
3484 free_tt_max = free_tt_count;
3485 }
3486 pmap_simple_unlock(&tt1_lock);
3487 }
3488
3489 /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained via the arm_pt_root_size sysctl.
3490 * Depending on the device, this can vary between 512 bytes and 16KB. */
3491 OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3492 OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3493 pmap_tt_ledger_credit(pmap, size);
3494
3495 return (tt_entry_t *) phystokv(pa);
3496 }
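/*
 * Illustrative sketch (assumptions noted, not part of the pmap implementation):
 * the sub-page carving done by pmap_tt1_allocate() when the root table is
 * smaller than a VM page. The first chunk is handed to the caller and the
 * remaining (PAGE_SIZE / size) - 1 chunks are chained together by overlaying
 * a tt_free_entry_t on each chunk, then spliced onto the global free list.
 *
 *	vm_address_t base = phystokv(pa);        // page backing the allocation
 *	tt_free_entry_t *head = NULL;
 *	for (vm_address_t va = base + size; va < base + PAGE_SIZE; va += size) {
 *		tt_free_entry_t *chunk = (tt_free_entry_t *)va;
 *		chunk->next = head;              // push chunk onto the local list
 *		head = chunk;
 *	}
 *	// 'head' is then linked onto free_tt_list under tt1_lock.
 */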
3497
3498 static void
3499 pmap_tt1_deallocate(
3500 pmap_t pmap,
3501 tt_entry_t *tt,
3502 vm_size_t size,
3503 unsigned option)
3504 {
3505 tt_free_entry_t *tt_entry;
3506
3507 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3508 size = PAGE_SIZE;
3509 }
3510
3511 tt_entry = (tt_free_entry_t *)tt;
3512 assert(not_in_kdp);
3513 pmap_simple_lock(&tt1_lock);
3514
3515 if (size < PAGE_SIZE) {
3516 free_tt_count++;
3517 if (free_tt_count > free_tt_max) {
3518 free_tt_max = free_tt_count;
3519 }
3520 tt_entry->next = free_tt_list;
3521 free_tt_list = tt_entry;
3522 }
3523
3524 if (size == PAGE_SIZE) {
3525 free_page_size_tt_count++;
3526 if (free_page_size_tt_count > free_page_size_tt_max) {
3527 free_page_size_tt_max = free_page_size_tt_count;
3528 }
3529 tt_entry->next = free_page_size_tt_list;
3530 free_page_size_tt_list = tt_entry;
3531 }
3532
3533 if (size == 2 * PAGE_SIZE) {
3534 free_two_page_size_tt_count++;
3535 if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
3536 free_two_page_size_tt_max = free_two_page_size_tt_count;
3537 }
3538 tt_entry->next = free_two_page_size_tt_list;
3539 free_two_page_size_tt_list = tt_entry;
3540 }
3541
3542 if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
3543 pmap_simple_unlock(&tt1_lock);
3544 pmap_tt_ledger_debit(pmap, size);
3545 return;
3546 }
3547
3548 while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
3549 free_page_size_tt_count--;
3550 tt = (tt_entry_t *)free_page_size_tt_list;
3551 free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3552
3553 pmap_simple_unlock(&tt1_lock);
3554
3555 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
3556
3557 OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3558
3559 pmap_simple_lock(&tt1_lock);
3560 }
3561
3562 while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
3563 free_two_page_size_tt_count--;
3564 tt = (tt_entry_t *)free_two_page_size_tt_list;
3565 free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3566
3567 pmap_simple_unlock(&tt1_lock);
3568
3569 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
3570
3571 OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3572
3573 pmap_simple_lock(&tt1_lock);
3574 }
3575 pmap_simple_unlock(&tt1_lock);
3576 pmap_tt_ledger_debit(pmap, size);
3577 }
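/*
 * Illustrative sketch (hypothetical names): the cache-trimming loop used by
 * pmap_tt1_deallocate(). Entries beyond a fixed high-water mark are popped
 * and returned to the VM, with the spinlock dropped around the page free and
 * retaken before the count is re-checked.
 *
 *	pmap_simple_lock(&cache_lock);
 *	while (cached_count > CACHE_MAX) {
 *		cached_count--;
 *		void *page = list_pop(&cached_list);
 *		pmap_simple_unlock(&cache_lock); // don't free pages under the lock
 *		free_page_to_vm(page);
 *		pmap_simple_lock(&cache_lock);   // retake before re-checking
 *	}
 *	pmap_simple_unlock(&cache_lock);
 */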
3578
3579 MARK_AS_PMAP_TEXT static kern_return_t
3580 pmap_tt_allocate(
3581 pmap_t pmap,
3582 tt_entry_t **ttp,
3583 unsigned int level,
3584 unsigned int options)
3585 {
3586 pmap_paddr_t pa;
3587 *ttp = NULL;
3588
3589 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3590 if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
3591 tt_free_entry_t *tt_free_cur, *tt_free_next;
3592
3593 tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
3594 tt_free_next = tt_free_cur->next;
3595 tt_free_cur->next = NULL;
3596 *ttp = (tt_entry_t *)tt_free_cur;
3597 pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
3598 }
3599 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3600
3601 if (*ttp == NULL) {
3602 pt_desc_t *ptdp;
3603
3604 /*
3605 * Allocate a VM page for the level x page table entries.
3606 */
3607 while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
3608 if (options & PMAP_OPTIONS_NOWAIT) {
3609 return KERN_RESOURCE_SHORTAGE;
3610 }
3611 VM_PAGE_WAIT();
3612 }
3613
3614 while ((ptdp = ptd_alloc(pmap)) == NULL) {
3615 if (options & PMAP_OPTIONS_NOWAIT) {
3616 pmap_pages_free(pa, PAGE_SIZE);
3617 return KERN_RESOURCE_SHORTAGE;
3618 }
3619 VM_PAGE_WAIT();
3620 }
3621
3622 if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3623 OSAddAtomic64(1, &alloc_ttepages_count);
3624 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3625 } else {
3626 OSAddAtomic64(1, &alloc_ptepages_count);
3627 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3628 }
3629
3630 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3631
3632 PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3633
3634 pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
3635
3636 uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
3637 if (PAGE_SIZE > pmap_page_size) {
3638 vm_address_t va;
3639 vm_address_t va_end;
3640
3641 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3642
3643 for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
3644 ((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3645 pmap->tt_entry_free = (tt_entry_t *)va;
3646 }
3647 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3648 }
3649
3650 *ttp = (tt_entry_t *)phystokv(pa);
3651 }
3652
3653 #if XNU_MONITOR
3654 assert(*ttp);
3655 #endif
3656
3657 return KERN_SUCCESS;
3658 }
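/*
 * Illustrative sketch (try_allocate() is a hypothetical stand-in): the
 * allocate-or-wait retry pattern used by pmap_tt_allocate() above. Callers
 * that pass a NOWAIT option get an immediate KERN_RESOURCE_SHORTAGE; all
 * others block in VM_PAGE_WAIT() until pages become available, then retry.
 *
 *	while (try_allocate(&pa) != KERN_SUCCESS) {
 *		if (options & PMAP_OPTIONS_NOWAIT) {
 *			return KERN_RESOURCE_SHORTAGE;  // caller handles the shortage
 *		}
 *		VM_PAGE_WAIT();                         // block until pages are freed
 *	}
 */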
3659
3660
3661 static void
3662 pmap_tt_deallocate(
3663 pmap_t pmap,
3664 tt_entry_t *ttp,
3665 unsigned int level)
3666 {
3667 pt_desc_t *ptdp;
3668 ptd_info_t *ptd_info;
3669 unsigned pt_acc_cnt;
3670 unsigned i;
3671 vm_offset_t free_page = 0;
3672 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3673 unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);
3674
3675 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3676
3677 ptdp = ptep_get_ptd(ttp);
3678 ptd_info = ptd_get_info(ptdp, ttp);
3679
3680 ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;
3681
3682 if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
3683 ptd_info->refcnt = 0;
3684 }
3685
3686 if (ptd_info->refcnt != 0) {
3687 panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
3688 }
3689
3690 ptd_info->refcnt = 0;
3691
3692 for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
3693 pt_acc_cnt += ptdp->ptd_info[i].refcnt;
3694 }
3695
3696 if (pt_acc_cnt == 0) {
3697 tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3698 unsigned pt_free_entry_cnt = 1;
3699
3700 while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
3701 tt_free_entry_t *tt_free_list_next;
3702
3703 tt_free_list_next = tt_free_list->next;
3704 if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
3705 pt_free_entry_cnt++;
3706 }
3707 tt_free_list = tt_free_list_next;
3708 }
3709 if (pt_free_entry_cnt == max_pt_index) {
3710 tt_free_entry_t *tt_free_list_cur;
3711
3712 free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3713 tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3714 tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;
3715
3716 while (tt_free_list_cur) {
3717 tt_free_entry_t *tt_free_list_next;
3718
3719 tt_free_list_next = tt_free_list_cur->next;
3720 if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
3721 tt_free_list->next = tt_free_list_next->next;
3722 } else {
3723 tt_free_list = tt_free_list_next;
3724 }
3725 tt_free_list_cur = tt_free_list_next;
3726 }
3727 } else {
3728 ((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3729 pmap->tt_entry_free = ttp;
3730 }
3731 } else {
3732 ((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3733 pmap->tt_entry_free = ttp;
3734 }
3735
3736 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3737
3738 if (free_page != 0) {
3739 ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3740 *(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
3741 pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
3742 if (level < pt_attr_leaf_level(pt_attr)) {
3743 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3744 } else {
3745 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3746 }
3747 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3748 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3749 }
3750 }
3751
3752 /**
3753 * Safely clear out a translation table entry.
3754 *
3755 * @note If the TTE to clear out points to a leaf table, then that leaf table
3756 * must have a refcnt of zero before the TTE can be removed.
3757 * @note This function expects to be called with pmap locked exclusive, and will
3758 * return with pmap unlocked.
3759 *
3760 * @param pmap The pmap containing the page table whose TTE is being removed.
3761 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3762 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3763 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3764 * @param ttep Pointer to the TTE that should be cleared out.
3765 * @param level The level of the page table that contains the TTE to be removed.
3766 */
3767 static void
3768 pmap_tte_remove(
3769 pmap_t pmap,
3770 vm_offset_t va_start,
3771 vm_offset_t va_end,
3772 bool need_strong_sync,
3773 tt_entry_t *ttep,
3774 unsigned int level)
3775 {
3776 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3777
3778 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3779 const tt_entry_t tte = *ttep;
3780
3781 if (__improbable(tte == ARM_TTE_EMPTY)) {
3782 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3783 "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3784 }
3785
3786 #if (__ARM_VMSA__ == 7)
3787 {
3788 tt_entry_t *ttep_4M = (tt_entry_t *) ((vm_offset_t)ttep & 0xFFFFFFF0);
3789 unsigned i;
3790
3791 for (i = 0; i < 4; i++, ttep_4M++) {
3792 *ttep_4M = (tt_entry_t) 0;
3793 }
3794 FLUSH_PTE_STRONG();
3795 }
3796 #else
3797 *ttep = (tt_entry_t) 0;
3798 FLUSH_PTE_STRONG();
3799 #endif /* (__ARM_VMSA__ == 7) */
3800 // If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
3801 if (va_end > va_start) {
3802 #if (__ARM_VMSA__ == 7)
3803 // Ensure intermediate translations are flushed for each 1MB block
3804 flush_mmu_tlb_entry_async((va_start & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
3805 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3806 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3807 flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3808 #endif
3809 PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
3810 }
3811
3812 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3813
3814 /**
3815 * Remember, the passed in "level" parameter refers to the level above the
3816 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
3817 * page table).
3818 */
3819 const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
3820
3821 /**
3822 * Non-leaf pagetables don't track active references in the PTD and instead
3823 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
3824 * the real refcount below.
3825 */
3826 unsigned short refcnt = PT_DESC_REFCOUNT;
3827
3828 /*
3829 * It's possible that a concurrent pmap_disconnect() operation may need to reference
3830 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
3831 * one or more PTEs on this page but not yet dropped the refcount, which would cause
3832 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
3833 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
3834 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
3835 * drop the pagetable refcount accordingly, without taking any PVH locks that could
3836 * synchronize it against the disconnect operation. If that removal caused the
3837 * refcount to reach zero, the pagetable page could be freed before the disconnect
3838 * operation is finished using the relevant pagetable descriptor.
3839 * Address these cases by waiting until all CPUs have been observed to not be
3840 * executing pmap_disconnect().
3841 */
3842 if (remove_leaf_table) {
3843 bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
3844 const int max_cpu = ml_get_max_cpu_number();
3845 bitmap_full(&active_disconnects[0], max_cpu + 1);
3846 bool inflight_disconnect;
3847
3848 /*
3849 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
3850 * ahead of any prior PTE load which may have observed the effect of a
3851 * concurrent disconnect operation. An acquire fence is required for this;
3852 * a load-acquire operation is insufficient.
3853 */
3854 os_atomic_thread_fence(acquire);
3855 do {
3856 inflight_disconnect = false;
3857 for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
3858 i >= 0;
3859 i = bitmap_next(&active_disconnects[0], i)) {
3860 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3861 if (cpu_data == NULL) {
3862 continue;
3863 }
3864 if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
3865 __builtin_arm_wfe();
3866 inflight_disconnect = true;
3867 continue;
3868 }
3869 os_atomic_clear_exclusive();
3870 bitmap_clear(&active_disconnects[0], (unsigned int)i);
3871 }
3872 } while (inflight_disconnect);
3873 /* Ensure the refcount is observed after any observation of inflight_disconnect */
3874 os_atomic_thread_fence(acquire);
3875 refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
3876 }
3877
3878 #if MACH_ASSERT
3879 /**
3880 * On internal devices, always do the page table consistency check
3881 * regardless of page table level or the actual refcnt value.
3882 */
3883 {
3884 #else /* MACH_ASSERT */
3885 /**
3886 * Only perform the page table consistency check when deleting leaf page
3887 * tables and it seems like there might be valid/compressed mappings
3888 * leftover.
3889 */
3890 if (__improbable(remove_leaf_table && refcnt != 0)) {
3891 #endif /* MACH_ASSERT */
3892
3893 /**
3894 * There are multiple problems that can arise as a non-zero refcnt:
3895 * 1. A bug in the refcnt management logic.
3896 * 2. A memory stomper or hardware failure.
3897 * 3. The VM forgetting to unmap all of the valid mappings in an address
3898 * space before destroying a pmap.
3899 *
3900 * By looping over the page table and determining how many valid or
3901 * compressed entries there actually are, we can narrow down which of
3902 * these three cases is causing this panic. If the expected refcnt
3903 * (valid + compressed) and the actual refcnt don't match then the
3904 * problem is probably either a memory corruption issue (if the
3905 * non-empty entries don't match valid+compressed, that could also be a
3906 * sign of corruption) or refcnt management bug. Otherwise, there
3907 * actually are leftover mappings and the higher layers of xnu are
3908 * probably at fault.
3909 */
3910 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
3911 pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));
3912
3913 pt_entry_t *ptep = bpte;
3914 unsigned short non_empty = 0, valid = 0, comp = 0;
3915 for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
3916 /* Keep track of all non-empty entries to detect memory corruption. */
3917 if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3918 non_empty++;
3919 }
3920
3921 if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
3922 comp++;
3923 } else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
3924 valid++;
3925 }
3926 }
3927
3928 #if MACH_ASSERT
3929 /**
3930 * On internal machines, panic whenever a page table getting deleted has
3931 * leftover mappings (valid or otherwise) or a leaf page table has a
3932 * non-zero refcnt.
3933 */
3934 if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
3935 #else /* MACH_ASSERT */
3936 /* We already know the leaf page-table has a non-zero refcnt, so panic. */
3937 {
3938 #endif /* MACH_ASSERT */
3939 panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3940 "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3941 level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
3942 }
3943 }
3944 }
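/*
 * Illustrative sketch (not part of this file): the WFE-based quiescing loop
 * used above to wait for every CPU to leave pmap_disconnect(). Each per-CPU
 * flag is read with a load-exclusive so a subsequent WFE wakes when the
 * owning CPU clears it, avoiding a pure spin; CPUs already observed idle are
 * dropped from the bitmap so they are not re-checked. for_each_cpu_still_set()
 * and cpu_flag() are hypothetical helpers.
 *
 *	bool busy;
 *	do {
 *		busy = false;
 *		for_each_cpu_still_set(i, bitmap) {
 *			if (os_atomic_load_exclusive(&cpu_flag(i), relaxed)) {
 *				__builtin_arm_wfe();     // sleep until the flag changes
 *				busy = true;
 *			} else {
 *				os_atomic_clear_exclusive();
 *				bitmap_clear(bitmap, i); // this CPU is quiesced
 *			}
 *		}
 *	} while (busy);
 */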
3945
3946 /**
3947 * Given a pointer to an entry within a `level` page table, delete the
3948 * page table at `level` + 1 that is represented by that entry. For instance,
3949 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3950 * contains the PA of the L3 table, and `level` would be "2".
3951 *
3952 * @note If the table getting deallocated is a leaf table, then that leaf table
3953 * must have a refcnt of zero before getting deallocated. All other levels
3954 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3955 * @note This function expects to be called with pmap locked exclusive and will
3956 * return with pmap unlocked.
3957 *
3958 * @param pmap The pmap that owns the page table to be deallocated.
3959 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3960 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3961 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3962 * @param ttep Pointer to the `level` TTE to remove.
3963 * @param level The level of the table that contains an entry pointing to the
3964 * table to be removed. The deallocated page table will be a
3965 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3966 * deleted).
3967 */
3968 void
3969 pmap_tte_deallocate(
3970 pmap_t pmap,
3971 vm_offset_t va_start,
3972 vm_offset_t va_end,
3973 bool need_strong_sync,
3974 tt_entry_t *ttep,
3975 unsigned int level)
3976 {
3977 pmap_paddr_t pa;
3978 tt_entry_t tte;
3979
3980 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3981
3982 tte = *ttep;
3983
3984 if (tte_get_ptd(tte)->pmap != pmap) {
3985 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3986 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3987 }
3988
3989 assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3990 __func__, ttep, (unsigned long long)tte);
3991 uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
3992
3993 /* pmap_tte_remove() will drop the pmap lock */
3994 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3995
3996 /* Clear any page offset: we mean to free the whole page, but armv7 TTEs may only be
3997 * aligned on 1K boundaries. We clear the surrounding "chunk" of 4 TTEs above. */
3998 pa = tte_to_pa(tte) & ~(pmap_page_size - 1);
3999 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(pa), level + 1);
4000 }
4001
4002 /*
4003 * Remove a range of hardware page-table entries.
4004 * The entries given are the first (inclusive)
4005 * and last (exclusive) entries for the VM pages.
4006 * The virtual address is the va for the first pte.
4007 *
4008 * The pmap must be locked.
4009 * If the pmap is not the kernel pmap, the range must lie
4010 * entirely within one pte-page. This is NOT checked.
4011 * Assumes that the pte-page exists.
4012 *
4013 * Returns the number of PTE changed
4014 */
4015 MARK_AS_PMAP_TEXT static int
4016 pmap_remove_range(
4017 pmap_t pmap,
4018 vm_map_address_t va,
4019 pt_entry_t *bpte,
4020 pt_entry_t *epte)
4021 {
4022 bool need_strong_sync = false;
4023 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4024 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4025 if (num_changed > 0) {
4026 PMAP_UPDATE_TLBS(pmap, va,
4027 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4028 }
4029 return num_changed;
4030 }
4031
4032
4033 #ifdef PVH_FLAG_EXEC
4034
4035 /*
4036 * Update the access protection bits of the physical aperture mapping for a page.
4037 * This is useful, for example, in guaranteeing that a verified executable page
4038 * has no writable mappings anywhere in the system, including the physical
4039 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4040 * synchronization overhead in cases where the call to this function is
4041 * guaranteed to be followed by other TLB operations.
4042 */
4043 void
4044 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4045 {
4046 #if __ARM_PTE_PHYSMAP__
4047 pvh_assert_locked(pai);
4048 vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4049 pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4050
4051 pt_entry_t tmplate = *pte_p;
4052 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4053 return;
4054 }
4055 tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4056 #if (__ARM_VMSA__ > 7)
4057 if (tmplate & ARM_PTE_HINT_MASK) {
4058 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4059 __func__, pte_p, (void *)kva, tmplate);
4060 }
4061 #endif
4062 write_pte_strong(pte_p, tmplate);
4063 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
4064 if (!flush_tlb_async) {
4065 sync_tlb_flush();
4066 }
4067 #endif
4068 }
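/*
 * Illustrative usage sketch (hypothetical caller): deferring TLB
 * synchronization across several physical-aperture permission updates.
 * Each call issues its invalidation asynchronously, and a single
 * sync_tlb_flush() at the end waits for all of them.
 *
 *	for (unsigned int i = 0; i < npages; i++) {
 *		pvh_lock(pai[i]);
 *		pmap_set_ptov_ap(pai[i], AP_RONA, TRUE);  // flush_tlb_async == TRUE
 *		pvh_unlock(pai[i]);
 *	}
 *	sync_tlb_flush();                                 // one barrier for all pages
 */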
4069
4070 #endif /* defined(PVH_FLAG_EXEC) */
4071
4072 MARK_AS_PMAP_TEXT int
4073 pmap_remove_range_options(
4074 pmap_t pmap,
4075 vm_map_address_t va,
4076 pt_entry_t *bpte,
4077 pt_entry_t *epte,
4078 vm_map_address_t *eva,
4079 bool *need_strong_sync __unused,
4080 int options)
4081 {
4082 pt_entry_t *cpte;
4083 size_t npages = 0;
4084 int num_removed, num_unwired;
4085 int num_pte_changed;
4086 unsigned int pai = 0;
4087 pmap_paddr_t pa;
4088 int num_external, num_internal, num_reusable;
4089 int num_alt_internal;
4090 uint64_t num_compressed, num_alt_compressed;
4091 int16_t refcnt = 0;
4092
4093 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4094
4095 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4096 uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4097
4098 if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4099 panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4100 }
4101
4102 num_removed = 0;
4103 num_unwired = 0;
4104 num_pte_changed = 0;
4105 num_external = 0;
4106 num_internal = 0;
4107 num_reusable = 0;
4108 num_compressed = 0;
4109 num_alt_internal = 0;
4110 num_alt_compressed = 0;
4111
4112 #if XNU_MONITOR
4113 bool ro_va = false;
4114 if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4115 ro_va = true;
4116 }
4117 #endif
4118 for (cpte = bpte; cpte < epte;
4119 cpte += PAGE_RATIO, va += pmap_page_size) {
4120 pt_entry_t spte;
4121 boolean_t managed = FALSE;
4122
4123 /*
4124 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4125 * so we need to be as aggressive as possible in checking for preemption when we can.
4126 */
4127 if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4128 *eva = va;
4129 break;
4130 }
4131
4132 spte = *((volatile pt_entry_t*)cpte);
4133
4134 while (!managed) {
4135 if (pmap != kernel_pmap &&
4136 (options & PMAP_OPTIONS_REMOVE) &&
4137 (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4138 /*
4139 * "pmap" must be locked at this point,
4140 * so this should not race with another
4141 * pmap_remove_range() or pmap_enter().
4142 */
4143
4144 /* one less "compressed"... */
4145 num_compressed++;
4146 if (spte & ARM_PTE_COMPRESSED_ALT) {
4147 /* ... but it used to be "ALTACCT" */
4148 num_alt_compressed++;
4149 }
4150
4151 /* clear marker */
4152 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4153 /*
4154 * "refcnt" also accounts for
4155 * our "compressed" markers,
4156 * so let's update it here.
4157 */
4158 --refcnt;
4159 spte = *((volatile pt_entry_t*)cpte);
4160 }
4161 /*
4162 * It may be possible for the pte to transition from managed
4163 * to unmanaged in this timeframe; for now, elide the assert.
4164 * We should break out as a consequence of checking pa_valid.
4165 */
4166 //assert(!ARM_PTE_IS_COMPRESSED(spte));
4167 pa = pte_to_pa(spte);
4168 if (!pa_valid(pa)) {
4169 #if XNU_MONITOR
4170 unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4171 #endif
4172 #if XNU_MONITOR
4173 if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4174 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4175 panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4176 __func__, (uint64_t)pa);
4177 }
4178 #endif
4179 break;
4180 }
4181 pai = pa_index(pa);
4182 pvh_lock(pai);
4183 spte = *((volatile pt_entry_t*)cpte);
4184 pa = pte_to_pa(spte);
4185 if (pai == pa_index(pa)) {
4186 managed = TRUE;
4187 break; // Leave pai locked as we will unlock it after we free the PV entry
4188 }
4189 pvh_unlock(pai);
4190 }
4191
4192 if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4193 /*
4194 * There used to be a valid mapping here but it
4195 * has already been removed when the page was
4196 * sent to the VM compressor, so nothing left to
4197 * remove now...
4198 */
4199 continue;
4200 }
4201
4202 /* remove the translation, do not flush the TLB */
4203 if (*cpte != ARM_PTE_TYPE_FAULT) {
4204 assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4205 assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4206 #if MACH_ASSERT
4207 if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4208 panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4209 cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4210 }
4211 #endif
4212 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4213 num_pte_changed++;
4214 }
4215
4216 if ((spte != ARM_PTE_TYPE_FAULT) &&
4217 (pmap != kernel_pmap)) {
4218 assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4219 assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4220 --refcnt;
4221 }
4222
4223 if (pte_is_wired(spte)) {
4224 pte_set_wired(pmap, cpte, 0);
4225 num_unwired++;
4226 }
4227 /*
4228 * if not managed, we're done
4229 */
4230 if (!managed) {
4231 continue;
4232 }
4233
4234 #if XNU_MONITOR
4235 if (__improbable(ro_va)) {
4236 pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4237 }
4238 #endif
4239
4240 /*
4241 * find and remove the mapping from the chain for this
4242 * physical address.
4243 */
4244 bool is_internal, is_altacct;
4245 pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4246
4247 if (is_altacct) {
4248 assert(is_internal);
4249 num_internal++;
4250 num_alt_internal++;
4251 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4252 ppattr_clear_altacct(pai);
4253 ppattr_clear_internal(pai);
4254 }
4255 } else if (is_internal) {
4256 if (ppattr_test_reusable(pai)) {
4257 num_reusable++;
4258 } else {
4259 num_internal++;
4260 }
4261 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4262 ppattr_clear_internal(pai);
4263 }
4264 } else {
4265 num_external++;
4266 }
4267 pvh_unlock(pai);
4268 num_removed++;
4269 }
4270
4271 /*
4272 * Update the counts
4273 */
4274 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4275
4276 if (pmap != kernel_pmap) {
4277 if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4278 panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4279 }
4280
4281 /* update ledgers */
4282 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4283 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4284 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4285 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4286 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4287 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4288 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4289 /* make needed adjustments to phys_footprint */
4290 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4291 ((num_internal -
4292 num_alt_internal) +
4293 (num_compressed -
4294 num_alt_compressed)) * pmap_page_size);
4295 }
4296
4297 /* flush the ptable entries we have written */
4298 if (num_pte_changed > 0) {
4299 FLUSH_PTE_STRONG();
4300 }
4301
4302 return num_pte_changed;
4303 }
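/*
 * Illustrative sketch (entry_was_removed() is hypothetical): the refcount
 * batching used above. Rather than atomically adjusting the pagetable
 * refcount once per PTE, pmap_remove_range_options() accumulates a signed
 * delta locally and applies it with a single atomic add after the loop,
 * panicking if the page table's refcount would underflow.
 *
 *	int16_t delta = 0;
 *	for (pt_entry_t *p = bpte; p < epte; p++) {
 *		if (entry_was_removed(p)) {
 *			--delta;                         // one fewer live entry
 *		}
 *	}
 *	if ((delta != 0) &&
 *	    (OSAddAtomic16(delta, (SInt16 *)&(ptep_get_info(bpte)->refcnt)) <= 0)) {
 *		panic("over-release of pagetable refcount");
 *	}
 */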
4304
4305
4306 /*
4307 * Remove the given range of addresses
4308 * from the specified map.
4309 *
4310 * It is assumed that the start and end are properly
4311 * rounded to the hardware page size.
4312 */
4313 void
4314 pmap_remove(
4315 pmap_t pmap,
4316 vm_map_address_t start,
4317 vm_map_address_t end)
4318 {
4319 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4320 }
4321
4322 MARK_AS_PMAP_TEXT vm_map_address_t
4323 pmap_remove_options_internal(
4324 pmap_t pmap,
4325 vm_map_address_t start,
4326 vm_map_address_t end,
4327 int options)
4328 {
4329 vm_map_address_t eva = end;
4330 pt_entry_t *bpte, *epte;
4331 pt_entry_t *pte_p;
4332 tt_entry_t *tte_p;
4333 int remove_count = 0;
4334 bool need_strong_sync = false;
4335 bool unlock = true;
4336
4337 if (__improbable(end < start)) {
4338 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4339 }
4340 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4341 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4342 }
4343
4344 validate_pmap_mutable(pmap);
4345
4346 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4347
4348 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4349
4350 tte_p = pmap_tte(pmap, start);
4351
4352 if (tte_p == (tt_entry_t *) NULL) {
4353 goto done;
4354 }
4355
4356 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
4357 pte_p = (pt_entry_t *) ttetokv(*tte_p);
4358 bpte = &pte_p[pte_index(pt_attr, start)];
4359 epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
4360
4361 remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
4362 &need_strong_sync, options);
4363
4364 if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
4365 pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
4366 remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
4367 unlock = false; // pmap_tte_deallocate() has dropped the lock
4368 }
4369 }
4370
4371 done:
4372 if (unlock) {
4373 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
4374 }
4375
4376 if (remove_count > 0) {
4377 PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
4378 }
4379 return eva;
4380 }
4381
4382 void
4383 pmap_remove_options(
4384 pmap_t pmap,
4385 vm_map_address_t start,
4386 vm_map_address_t end,
4387 int options)
4388 {
4389 vm_map_address_t va;
4390
4391 if (pmap == PMAP_NULL) {
4392 return;
4393 }
4394
4395 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4396
4397 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4398 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4399 VM_KERNEL_ADDRHIDE(end));
4400
4401 #if MACH_ASSERT
4402 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4403 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4404 pmap, (uint64_t)start, (uint64_t)end);
4405 }
4406 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4407 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4408 pmap, (uint64_t)start, (uint64_t)end);
4409 }
4410 #endif
4411
4412 /*
4413 * We allow single-page requests to execute non-preemptibly,
4414 * as it doesn't make sense to sample AST_URGENT for a single-page
4415 * operation, and there are a couple of special use cases that
4416 * require a non-preemptible single-page operation.
4417 */
4418 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4419 pmap_verify_preemptible();
4420 }
4421
4422 /*
4423 * Invalidate the translation buffer first
4424 */
4425 va = start;
4426 while (va < end) {
4427 vm_map_address_t l;
4428
4429 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4430 if (l > end) {
4431 l = end;
4432 }
4433
4434 #if XNU_MONITOR
4435 va = pmap_remove_options_ppl(pmap, va, l, options);
4436
4437 pmap_ledger_check_balance(pmap);
4438 #else
4439 va = pmap_remove_options_internal(pmap, va, l, options);
4440 #endif
4441 }
4442
4443 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4444 }
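/*
 * Illustrative sketch (remove_chunk() is a hypothetical stand-in for the
 * per-chunk call): the twig-aligned chunking performed by
 * pmap_remove_options(). Each iteration stops at the next boundary of the
 * enclosing twig-level (e.g. L2) table so that a single chunk never spans
 * more than one leaf page table.
 *
 *	vm_map_address_t va = start;
 *	while (va < end) {
 *		vm_map_address_t l = (va + pt_attr_twig_size(pt_attr)) &
 *		    ~pt_attr_twig_offmask(pt_attr);      // next twig boundary
 *		if (l > end) {
 *			l = end;
 *		}
 *		va = remove_chunk(pmap, va, l);
 *	}
 */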
4445
4446
4447 /*
4448 * Remove phys addr if mapped in specified map
4449 */
4450 void
4451 pmap_remove_some_phys(
4452 __unused pmap_t map,
4453 __unused ppnum_t pn)
4454 {
4455 /* Implement to support working set code */
4456 }
4457
4458 /*
4459 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4460 * switch a thread onto a new vm_map.
4461 */
4462 void
4463 pmap_switch_user(thread_t thread, vm_map_t new_map)
4464 {
4465 pmap_t new_pmap = new_map->pmap;
4466
4467
4468 thread->map = new_map;
4469 pmap_set_pmap(new_pmap, thread);
4470
4471 }
4472
4473 void
4474 pmap_set_pmap(
4475 pmap_t pmap,
4476 #if !__ARM_USER_PROTECT__
4477 __unused
4478 #endif
4479 thread_t thread)
4480 {
4481 pmap_switch(pmap);
4482 #if __ARM_USER_PROTECT__
4483 thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
4484 thread->machine.asid = pmap->hw_asid;
4485 #endif
4486 }
4487
4488 static void
4489 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4490 {
4491 #if (__ARM_VMSA__ == 7)
4492 flush_core_tlb_asid_async(pmap->hw_asid);
4493 #else
4494 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4495 #endif
4496 }
4497
4498 static inline bool
4499 pmap_user_ttb_is_clear(void)
4500 {
4501 #if (__ARM_VMSA__ > 7)
4502 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4503 #else
4504 return get_mmu_ttb() == kernel_pmap->ttep;
4505 #endif
4506 }
4507
4508 MARK_AS_PMAP_TEXT void
4509 pmap_switch_internal(
4510 pmap_t pmap)
4511 {
4512 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4513 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4514 #if XNU_MONITOR
4515 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4516 #endif
4517 validate_pmap_mutable(pmap);
4518 uint16_t asid_index = pmap->hw_asid;
4519 bool do_asid_flush = false;
4520 bool do_commpage_flush = false;
4521
4522 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4523 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4524 }
4525 #if __ARM_KERNEL_PROTECT__
4526 asid_index >>= 1;
4527 #endif
4528
4529 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4530 #if (__ARM_VMSA__ > 7)
4531 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4532 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4533 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4534 #endif
4535 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4536 bool break_before_make = do_shared_region_flush;
4537
4538 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4539 asid_index -= 1;
4540 pmap_update_plru(asid_index);
4541
4542 /* Paranoia. */
4543 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4544
4545 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4546 uint8_t new_sw_asid = pmap->sw_asid;
4547 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4548
4549 if (new_sw_asid != last_sw_asid) {
4550 /*
4551 * If the virtual ASID of the new pmap does not match the virtual ASID
4552 * last seen on this CPU for the physical ASID (that was a mouthful),
4553 * then this switch runs the risk of aliasing. We need to flush the
4554 * TLB for this physical ASID in this case.
4555 */
4556 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4557 do_asid_flush = true;
4558 break_before_make = true;
4559 }
4560 }
4561
4562 #if __ARM_MIXED_PAGE_SIZE__
4563 if (pt_attr->pta_tcr_value != get_tcr()) {
4564 break_before_make = true;
4565 }
4566 #endif
4567 #if __ARM_MIXED_PAGE_SIZE__
4568 /*
4569 * For mixed page size configurations, we need to flush the global commpage mappings from
4570 * the TLB when transitioning between address spaces with different page sizes. Otherwise
4571 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
4572 * partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4573 * conflict abort or other unpredictable behavior.
4574 */
4575 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4576 do_commpage_flush = true;
4577 }
4578 if (do_commpage_flush) {
4579 break_before_make = true;
4580 }
4581 #endif
4582 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4583 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4584 pmap_clear_user_ttb_internal();
4585 }
4586
4587 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4588 * to flush the userspace mappings for that region. Those mappings are global
4589 * and will not be protected by the ASID. It should also be cheaper to flush the
4590 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4591 if (__improbable(do_shared_region_flush)) {
4592 #if __ARM_RANGE_TLBI__
4593 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4594 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4595
4596 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4597 * There may still be non-global entries that overlap with the incoming pmap's
4598 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4599 * must necessarily belong to a different ASID than the incoming pmap, or they would
4600 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4601 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4602 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4603 * to consider additional invalidation here in the future. */
4604 if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
4605 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4606 } else {
4607 do_asid_flush = false;
4608 flush_core_tlb_async();
4609 }
4610 #else
4611 do_asid_flush = false;
4612 flush_core_tlb_async();
4613 #endif // __ARM_RANGE_TLBI__
4614 }
4615
4616 #if __ARM_MIXED_PAGE_SIZE__
4617 if (__improbable(do_commpage_flush)) {
4618 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4619 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4620 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4621 flush_core_tlb_allrange_async(rtlbi_param);
4622 }
4623 #endif
4624 if (__improbable(do_asid_flush)) {
4625 pmap_flush_core_tlb_asid_async(pmap);
4626 #if DEVELOPMENT || DEBUG
4627 os_atomic_inc(&pmap_asid_flushes, relaxed);
4628 #endif
4629 }
4630 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4631 sync_tlb_flush_local();
4632 }
4633
4634 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4635 }
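/*
 * Illustrative sketch (not a definitive implementation): the range-versus-full
 * TLB flush decision made for the outgoing shared region above. If the region
 * is small enough for the range-invalidate TLBI instructions (where supported),
 * only its entries are flushed; otherwise the entire local TLB is invalidated,
 * which also makes a separate ASID flush unnecessary.
 *
 *	if (npages <= ARM64_TLB_RANGE_PAGES) {
 *		flush_core_tlb_allrange_async(
 *		    generate_rtlbi_param((ppnum_t)npages, 0, region_start, page_shift));
 *	} else {
 *		do_asid_flush = false;           // subsumed by the full flush
 *		flush_core_tlb_async();
 *	}
 *	sync_tlb_flush_local();
 */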
4636
4637 void
4638 pmap_switch(
4639 pmap_t pmap)
4640 {
4641 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4642 #if XNU_MONITOR
4643 pmap_switch_ppl(pmap);
4644 #else
4645 pmap_switch_internal(pmap);
4646 #endif
4647 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4648 }
4649
4650 void
4651 pmap_page_protect(
4652 ppnum_t ppnum,
4653 vm_prot_t prot)
4654 {
4655 pmap_page_protect_options(ppnum, prot, 0, NULL);
4656 }
4657
4658 /*
4659 * Routine: pmap_page_protect_options
4660 *
4661 * Function:
4662 * Lower the permission for all mappings to a given
4663 * page.
4664 */
4665 MARK_AS_PMAP_TEXT static void
4666 pmap_page_protect_options_with_flush_range(
4667 ppnum_t ppnum,
4668 vm_prot_t prot,
4669 unsigned int options,
4670 pmap_tlb_flush_range_t *flush_range)
4671 {
4672 pmap_paddr_t phys = ptoa(ppnum);
4673 pv_entry_t **pv_h;
4674 pv_entry_t *pve_p, *orig_pve_p;
4675 pv_entry_t *pveh_p;
4676 pv_entry_t *pvet_p;
4677 pt_entry_t *pte_p, *orig_pte_p;
4678 pv_entry_t *new_pve_p;
4679 pt_entry_t *new_pte_p;
4680 vm_offset_t pvh_flags;
4681 unsigned int pai;
4682 bool remove;
4683 bool set_NX;
4684 unsigned int pvh_cnt = 0;
4685 unsigned int pass1_updated = 0;
4686 unsigned int pass2_updated = 0;
4687
4688 assert(ppnum != vm_page_fictitious_addr);
4689
4690 /* Only work with managed pages. */
4691 if (!pa_valid(phys)) {
4692 return;
4693 }
4694
4695 /*
4696 * Determine the new protection.
4697 */
4698 switch (prot) {
4699 case VM_PROT_ALL:
4700 return; /* nothing to do */
4701 case VM_PROT_READ:
4702 case VM_PROT_READ | VM_PROT_EXECUTE:
4703 remove = false;
4704 break;
4705 default:
4706 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4707 options = options & ~PMAP_OPTIONS_NOFLUSH;
4708 remove = true;
4709 break;
4710 }
4711
4712 pmap_cpu_data_t *pmap_cpu_data = NULL;
4713 if (remove) {
4714 #if !XNU_MONITOR
4715 mp_disable_preemption();
4716 #endif
4717 pmap_cpu_data = pmap_get_cpu_data();
4718 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4719 /*
4720 * Ensure the store to inflight_disconnect will be observed before any of the
4721 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4722 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4723 * another CPU, in between this function's clearing a PTE and dropping the
4724 * corresponding pagetable refcount. That can lead to a panic if the
4725 * destroying thread observes a non-zero refcount. For this we need a store-
4726 * store barrier; a store-release operation would not be sufficient.
4727 */
4728 os_atomic_thread_fence(release);
4729 }
4730
4731 pai = pa_index(phys);
4732 pvh_lock(pai);
4733 pv_h = pai_to_pvh(pai);
4734 pvh_flags = pvh_get_flags(pv_h);
4735
4736 #if XNU_MONITOR
4737 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4738 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4739 }
4740 if (__improbable(ppattr_pa_test_monitor(phys))) {
4741 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4742 }
4743 #endif
4744
4745 orig_pte_p = pte_p = PT_ENTRY_NULL;
4746 orig_pve_p = pve_p = PV_ENTRY_NULL;
4747 pveh_p = PV_ENTRY_NULL;
4748 pvet_p = PV_ENTRY_NULL;
4749 new_pve_p = PV_ENTRY_NULL;
4750 new_pte_p = PT_ENTRY_NULL;
4751
4752
4753 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4754 orig_pte_p = pte_p = pvh_ptep(pv_h);
4755 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4756 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4757 pveh_p = pve_p;
4758 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4759 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4760 }
4761
4762 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4763 int pve_ptep_idx = 0;
4764
4765 /*
4766 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4767 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4768 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4769 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4770 * operation, TLB invalidation may be handled by the caller so it's possible for
4771 * tlb_flush_needed to be true while issue_tlbi is false.
4772 */
4773 bool issue_tlbi = false;
4774 bool tlb_flush_needed = false;
4775 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4776 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4777 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4778 bool update = false;
4779
4780 if (pve_p != PV_ENTRY_NULL) {
4781 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4782 if (pte_p == PT_ENTRY_NULL) {
4783 goto protect_skip_pve_pass1;
4784 }
4785 }
4786
4787 #ifdef PVH_FLAG_IOMMU
4788 if (pvh_ptep_is_iommu(pte_p)) {
4789 #if XNU_MONITOR
4790 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4791 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4792 ppnum, ptep_get_iommu(pte_p), pve_p);
4793 }
4794 #endif
4795 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4796 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4797 ppnum, ptep_get_iommu(pte_p), pve_p);
4798 }
4799 goto protect_skip_pve_pass1;
4800 }
4801 #endif
4802 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4803 const pmap_t pmap = ptdp->pmap;
4804 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4805
4806 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4807 #if MACH_ASSERT
4808 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4809 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4810 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4811 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4812
4813 pv_entry_t *check_pvep = pve_p;
4814
4815 do {
4816 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4817 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4818 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4819 }
4820 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4821
4822 /* Restore previous PTEP value. */
4823 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4824 }
4825 #endif
4826 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4827 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4828 }
4829
4830 #if DEVELOPMENT || DEBUG
4831 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4832 #else
4833 if ((prot & VM_PROT_EXECUTE))
4834 #endif
4835 {
4836 set_NX = false;
4837 } else {
4838 set_NX = true;
4839 }
4840
4841 /* Remove the mapping if new protection is NONE */
4842 if (remove) {
4843 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4844 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4845 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4846 pt_entry_t spte = *pte_p;
4847
4848 if (pte_is_wired(spte)) {
4849 pte_set_wired(pmap, pte_p, 0);
4850 spte = *pte_p;
4851 if (pmap != kernel_pmap) {
4852 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4853 }
4854 }
4855
4856 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4857 (uint64_t)spte, pte_p, ppnum);
4858
4859 if (compress && is_internal && (pmap != kernel_pmap)) {
4860 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4861 /* mark this PTE as having been "compressed" */
4862 tmplate = ARM_PTE_COMPRESSED;
4863 if (is_altacct) {
4864 tmplate |= ARM_PTE_COMPRESSED_ALT;
4865 }
4866 } else {
4867 tmplate = ARM_PTE_TYPE_FAULT;
4868 }
4869
4870 assert(spte != tmplate);
4871 write_pte_fast(pte_p, tmplate);
4872 update = true;
4873 ++pass1_updated;
4874
4875 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4876
4877 if (pmap != kernel_pmap) {
4878 if (ppattr_test_reusable(pai) &&
4879 is_internal &&
4880 !is_altacct) {
4881 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4882 } else if (!is_internal) {
4883 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4884 }
4885
4886 if (is_altacct) {
4887 assert(is_internal);
4888 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4889 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4890 if (options & PMAP_OPTIONS_COMPRESSOR) {
4891 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4892 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4893 }
4894 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4895 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4896 } else if (ppattr_test_reusable(pai)) {
4897 assert(is_internal);
4898 if (options & PMAP_OPTIONS_COMPRESSOR) {
4899 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4900 /* was not in footprint, but is now */
4901 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4902 }
4903 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4904 } else if (is_internal) {
4905 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4906
4907 /*
4908 * Update all stats related to physical footprint, which only
4909 * deals with internal pages.
4910 */
4911 if (options & PMAP_OPTIONS_COMPRESSOR) {
4912 /*
4913 * This removal is only being done so we can send this page to
4914 * the compressor; therefore it mustn't affect total task footprint.
4915 */
4916 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4917 } else {
4918 /*
4919 * This internal page isn't going to the compressor, so adjust stats to keep
4920 * phys_footprint up to date.
4921 */
4922 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4923 }
4924 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4925 } else {
4926 /* external page: no impact on ledgers */
4927 }
4928 }
4929 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4930 } else {
4931 pt_entry_t spte = *pte_p;
4932 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4933
4934 if (pmap == kernel_pmap) {
4935 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4936 } else {
4937 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4938 }
4939
4940 /*
4941 * While the naive implementation of this would serve to add execute
4942 * permission, this is not how the VM uses this interface, or how
4943 * x86_64 implements it. So ignore requests to add execute permissions.
4944 */
4945 if (set_NX) {
4946 tmplate |= pt_attr_leaf_xn(pt_attr);
4947 }
4948
4949
4950 assert(spte != ARM_PTE_TYPE_FAULT);
4951 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4952
4953 if (spte != tmplate) {
4954 /*
4955 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4956 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4957 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4958 * should always be cleared by this function.
4959 */
4960 pte_set_was_writeable(tmplate, true);
4961 write_pte_fast(pte_p, tmplate);
4962 update = true;
4963 ++pass1_updated;
4964 } else if (pte_was_writeable(tmplate)) {
4965 /*
4966 * We didn't change any of the relevant permission bits in the PTE, so we don't need
4967 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
4968 * write access to a page, this function should always at least clear that flag for
4969 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
4970 * these mappings go through vm_fault(). We therefore don't want those accesses to
4971 * be handled through arm_fast_fault().
4972 */
4973 pte_set_was_writeable(tmplate, false);
4974 write_pte_fast(pte_p, tmplate);
4975 }
4976 }
4977
4978 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
4979 tlb_flush_needed = true;
4980 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
4981 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4982 issue_tlbi = true;
4983 }
4984 }
4985 protect_skip_pve_pass1:
4986 pte_p = PT_ENTRY_NULL;
4987 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4988 pve_ptep_idx = 0;
4989 pve_p = pve_next(pve_p);
4990 }
4991 }
4992
4993 if (tlb_flush_needed) {
4994 FLUSH_PTE_STRONG();
4995 }
4996
4997 if (!remove && !issue_tlbi) {
4998 goto protect_finish;
4999 }
5000
5001 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5002 pv_entry_t **pve_pp = pv_h;
5003 pve_p = orig_pve_p;
5004 pte_p = orig_pte_p;
5005 pve_ptep_idx = 0;
5006
5007 /*
5008 * We need to keep track of whether a particular PVE list contains IOMMU
5009 * mappings when removing entries, because we should only remove CPU
5010 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5011 * it around.
5012 */
5013 bool iommu_mapping_in_pve = false;
5014 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5015 if (pve_p != PV_ENTRY_NULL) {
5016 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5017 if (pte_p == PT_ENTRY_NULL) {
5018 goto protect_skip_pve_pass2;
5019 }
5020 }
5021
5022 #ifdef PVH_FLAG_IOMMU
5023 if (pvh_ptep_is_iommu(pte_p)) {
5024 iommu_mapping_in_pve = true;
5025 if (remove && (pve_p == PV_ENTRY_NULL)) {
5026 /*
5027 * We've found an IOMMU entry and it's the only entry in the PV list.
5028 * We don't discard IOMMU entries, so simply set up the new PV list to
5029 * contain the single IOMMU PTE and exit the loop.
5030 */
5031 new_pte_p = pte_p;
5032 break;
5033 }
5034 goto protect_skip_pve_pass2;
5035 }
5036 #endif
5037 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5038 const pmap_t pmap = ptdp->pmap;
5039 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5040
5041 if (remove) {
5042 if (!compress && (pmap != kernel_pmap)) {
5043 /*
5044 * We must wait to decrement the refcount until we're completely finished using the PTE
5045 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5046 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5047 * under us.
5048 */
5049 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5050 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5051 }
5052 }
5053 /* Remove this CPU mapping from PVE list. */
5054 if (pve_p != PV_ENTRY_NULL) {
5055 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5056 }
5057 } else {
5058 pt_entry_t spte = *pte_p;
5059 if (pte_was_writeable(spte)) {
5060 pte_set_was_writeable(spte, false);
5061 write_pte_fast(pte_p, spte);
5062 } else {
5063 goto protect_skip_pve_pass2;
5064 }
5065 }
5066 ++pass2_updated;
5067 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5068 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5069 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5070 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
5071 }
5072
5073 protect_skip_pve_pass2:
5074 pte_p = PT_ENTRY_NULL;
5075 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5076 pve_ptep_idx = 0;
5077
5078 if (remove) {
5079 /**
5080 * If there are any IOMMU mappings in the PVE list, preserve
5081 * those mappings in a new PVE list (new_pve_p) which will later
5082 * become the new PVH entry. Keep track of the CPU mappings in
5083 * pveh_p/pvet_p so they can be deallocated later.
5084 */
5085 if (iommu_mapping_in_pve) {
5086 iommu_mapping_in_pve = false;
5087 pv_entry_t *temp_pve_p = pve_next(pve_p);
5088 pve_remove(pv_h, pve_pp, pve_p);
5089 pveh_p = pvh_pve_list(pv_h);
5090 pve_p->pve_next = new_pve_p;
5091 new_pve_p = pve_p;
5092 pve_p = temp_pve_p;
5093 continue;
5094 } else {
5095 pvet_p = pve_p;
5096 pvh_cnt++;
5097 }
5098 }
5099
5100 pve_pp = pve_next_ptr(pve_p);
5101 pve_p = pve_next(pve_p);
5102 iommu_mapping_in_pve = false;
5103 }
5104 }
5105
5106 protect_finish:
5107
5108 #ifdef PVH_FLAG_EXEC
5109 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5110 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5111 }
5112 #endif
5113 if (__improbable(pass1_updated != pass2_updated)) {
5114 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5115 __func__, pass1_updated, pass2_updated);
5116 }
5117 /* if we removed a bunch of entries, take care of them now */
5118 if (remove) {
5119 if (new_pve_p != PV_ENTRY_NULL) {
5120 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5121 pvh_set_flags(pv_h, pvh_flags);
5122 } else if (new_pte_p != PT_ENTRY_NULL) {
5123 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5124 pvh_set_flags(pv_h, pvh_flags);
5125 } else {
5126 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5127 }
5128 }
5129
5130 if (flush_range && tlb_flush_needed) {
5131 if (!remove) {
5132 flush_range->ptfr_flush_needed = true;
5133 tlb_flush_needed = false;
5134 }
5135 }
5136
5137 /*
5138 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5139 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5140 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5141 * a page to be repurposed while it is still live in the TLBs.
5142 */
5143 if (remove && tlb_flush_needed) {
5144 sync_tlb_flush();
5145 }
5146
5147 pvh_unlock(pai);
5148
5149 if (remove) {
5150 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5151 #if !XNU_MONITOR
5152 mp_enable_preemption();
5153 #endif
5154 }
5155
5156 if (!remove && tlb_flush_needed) {
5157 sync_tlb_flush();
5158 }
5159
5160 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5161 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5162 }
5163 }
5164
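/*
 * Internal form of pmap_page_protect_options(). A non-NULL arg indicates that the
 * VM ultimately wants the TLBs flushed, so PMAP_OPTIONS_NOFLUSH is cleared before
 * the work is handed to pmap_page_protect_options_with_flush_range() with no
 * batched flush range.
 */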
5165 MARK_AS_PMAP_TEXT void
5166 pmap_page_protect_options_internal(
5167 ppnum_t ppnum,
5168 vm_prot_t prot,
5169 unsigned int options,
5170 void *arg)
5171 {
5172 if (arg != NULL) {
5173 /*
5174 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5175 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5176 * TLBs much more precisely if we do so inline with the pagetable updates, and the PPL
5177 * security model requires that we not exit the PPL without performing the required TLB flushes anyway.
5178 * In that case, force the flush to take place.
5179 */
5180 options &= ~PMAP_OPTIONS_NOFLUSH;
5181 }
5182 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5183 }
5184
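/*
 * Lower the access protection on all mappings of the given managed physical page.
 * Fictitious and unmanaged pages are ignored, and a request for VM_PROT_ALL is a
 * no-op since it cannot lower anything. On XNU_MONITOR configurations the update
 * is carried out inside the PPL.
 */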
5185 void
5186 pmap_page_protect_options(
5187 ppnum_t ppnum,
5188 vm_prot_t prot,
5189 unsigned int options,
5190 void *arg)
5191 {
5192 pmap_paddr_t phys = ptoa(ppnum);
5193
5194 assert(ppnum != vm_page_fictitious_addr);
5195
5196 /* Only work with managed pages. */
5197 if (!pa_valid(phys)) {
5198 return;
5199 }
5200
5201 /*
5202 * Determine the new protection.
5203 */
5204 if (prot == VM_PROT_ALL) {
5205 return; /* nothing to do */
5206 }
5207
5208 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5209
5210 #if XNU_MONITOR
5211 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5212 #else
5213 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5214 #endif
5215
5216 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5217 }
5218
5219
5220 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
5221 MARK_AS_PMAP_TEXT void
5222 pmap_disable_user_jop_internal(pmap_t pmap)
5223 {
5224 if (pmap == kernel_pmap) {
5225 panic("%s: called with kernel_pmap", __func__);
5226 }
5227 validate_pmap_mutable(pmap);
5228 pmap->disable_jop = true;
5229 }
5230
5231 void
5232 pmap_disable_user_jop(pmap_t pmap)
5233 {
5234 #if XNU_MONITOR
5235 pmap_disable_user_jop_ppl(pmap);
5236 #else
5237 pmap_disable_user_jop_internal(pmap);
5238 #endif
5239 }
5240 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5241
5242 /*
5243 * Indicates if the pmap layer enforces some additional restrictions on the
5244 * given set of protections.
5245 */
5246 bool
5247 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5248 {
5249 return false;
5250 }
5251
5252 /*
5253 * Set the physical protection on the
5254 * specified range of this map as requested.
5255 * VERY IMPORTANT: Will not increase permissions.
5256 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5257 */
5258 void
5259 pmap_protect(
5260 pmap_t pmap,
5261 vm_map_address_t b,
5262 vm_map_address_t e,
5263 vm_prot_t prot)
5264 {
5265 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5266 }
5267
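/*
 * Internal form of pmap_protect_options(): restrict the permissions on the mappings
 * within [start, end), which must not span a twig-level (L2) entry boundary.
 * Returns the VA at which processing stopped; this may be short of 'end' if a
 * pending preemption was detected, in which case the caller retries from the
 * returned address.
 */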
5268 MARK_AS_PMAP_TEXT vm_map_address_t
5269 pmap_protect_options_internal(
5270 pmap_t pmap,
5271 vm_map_address_t start,
5272 vm_map_address_t end,
5273 vm_prot_t prot,
5274 unsigned int options,
5275 __unused void *args)
5276 {
5277 tt_entry_t *tte_p;
5278 pt_entry_t *bpte_p, *epte_p;
5279 pt_entry_t *pte_p;
5280 boolean_t set_NX = TRUE;
5281 #if (__ARM_VMSA__ > 7)
5282 boolean_t set_XO = FALSE;
5283 #endif
5284 boolean_t should_have_removed = FALSE;
5285 bool need_strong_sync = false;
5286
5287 /* Validate the pmap input before accessing its data. */
5288 validate_pmap_mutable(pmap);
5289
5290 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5291
5292 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5293 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5294 }
5295
5296 #if DEVELOPMENT || DEBUG
5297 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5298 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5299 should_have_removed = TRUE;
5300 }
5301 } else
5302 #endif
5303 {
5304 /* Determine the new protection. */
5305 switch (prot) {
5306 #if (__ARM_VMSA__ > 7)
5307 case VM_PROT_EXECUTE:
5308 set_XO = TRUE;
5309 OS_FALLTHROUGH;
5310 #endif
5311 case VM_PROT_READ:
5312 case VM_PROT_READ | VM_PROT_EXECUTE:
5313 break;
5314 case VM_PROT_READ | VM_PROT_WRITE:
5315 case VM_PROT_ALL:
5316 return end; /* nothing to do */
5317 default:
5318 should_have_removed = TRUE;
5319 }
5320 }
5321
5322 if (should_have_removed) {
5323 panic("%s: should have been a remove operation, "
5324 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5325 __FUNCTION__,
5326 pmap, (void *)start, (void *)end, prot, options, args);
5327 }
5328
5329 #if DEVELOPMENT || DEBUG
5330 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5331 #else
5332 if ((prot & VM_PROT_EXECUTE))
5333 #endif
5334 {
5335 set_NX = FALSE;
5336 } else {
5337 set_NX = TRUE;
5338 }
5339
5340 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5341 vm_map_address_t va = start;
5342 unsigned int npages = 0;
5343
5344 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5345
5346 tte_p = pmap_tte(pmap, start);
5347
5348 if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5349 bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5350 bpte_p = &bpte_p[pte_index(pt_attr, start)];
5351 epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5352 pte_p = bpte_p;
5353
5354 for (pte_p = bpte_p;
5355 pte_p < epte_p;
5356 pte_p += PAGE_RATIO, va += pmap_page_size) {
5357 ++npages;
5358 if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5359 pmap_pending_preemption())) {
5360 break;
5361 }
5362 pt_entry_t spte;
5363 #if DEVELOPMENT || DEBUG
5364 boolean_t force_write = FALSE;
5365 #endif
5366
5367 spte = *((volatile pt_entry_t*)pte_p);
5368
5369 if ((spte == ARM_PTE_TYPE_FAULT) ||
5370 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5371 continue;
5372 }
5373
5374 pmap_paddr_t pa;
5375 unsigned int pai = 0;
5376 boolean_t managed = FALSE;
5377
5378 while (!managed) {
5379 /*
5380 * It may be possible for the pte to transition from managed
5381 * to unmanaged in this timeframe; for now, elide the assert.
5382 * We should break out as a consequence of checking pa_valid.
5383 */
5384 // assert(!ARM_PTE_IS_COMPRESSED(spte));
5385 pa = pte_to_pa(spte);
5386 if (!pa_valid(pa)) {
5387 break;
5388 }
5389 pai = pa_index(pa);
5390 pvh_lock(pai);
5391 spte = *((volatile pt_entry_t*)pte_p);
5392 pa = pte_to_pa(spte);
5393 if (pai == pa_index(pa)) {
5394 managed = TRUE;
5395 break; // Leave the PVH locked; we will unlock it after updating the PTE
5396 }
5397 pvh_unlock(pai);
5398 }
5399
5400 if ((spte == ARM_PTE_TYPE_FAULT) ||
5401 ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5402 continue;
5403 }
5404
5405 pt_entry_t tmplate;
5406
5407 if (pmap == kernel_pmap) {
5408 #if DEVELOPMENT || DEBUG
5409 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5410 force_write = TRUE;
5411 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5412 } else
5413 #endif
5414 {
5415 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5416 }
5417 } else {
5418 #if DEVELOPMENT || DEBUG
5419 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5420 assert(pmap->type != PMAP_TYPE_NESTED);
5421 force_write = TRUE;
5422 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5423 } else
5424 #endif
5425 {
5426 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5427 }
5428 }
5429
5430 /*
5431 * XXX Removing "NX" would
5432 * grant "execute" access
5433 * immediately, bypassing any
5434 * checks VM might want to do
5435 * in its soft fault path.
5436 * pmap_protect() and co. are
5437 * not allowed to increase
5438 * access permissions.
5439 */
5440 if (set_NX) {
5441 tmplate |= pt_attr_leaf_xn(pt_attr);
5442 } else {
5443 #if (__ARM_VMSA__ > 7)
5444 if (pmap == kernel_pmap) {
5445 /* do NOT clear "PNX"! */
5446 tmplate |= ARM_PTE_NX;
5447 } else {
5448 /* do NOT clear "NX"! */
5449 tmplate |= pt_attr_leaf_x(pt_attr);
5450 if (set_XO) {
5451 tmplate &= ~ARM_PTE_APMASK;
5452 tmplate |= pt_attr_leaf_rona(pt_attr);
5453 }
5454 }
5455 #endif
5456 }
5457
5458 #if DEVELOPMENT || DEBUG
5459 if (force_write) {
5460 /*
5461 * TODO: Run CS/Monitor checks here.
5462 */
5463 if (managed) {
5464 /*
5465 * We are marking the page as writable,
5466 * so we consider it to be modified and
5467 * referenced.
5468 */
5469 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5470 tmplate |= ARM_PTE_AF;
5471
5472 if (ppattr_test_reffault(pai)) {
5473 ppattr_clear_reffault(pai);
5474 }
5475
5476 if (ppattr_test_modfault(pai)) {
5477 ppattr_clear_modfault(pai);
5478 }
5479 }
5480 } else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5481 /*
5482 * An immediate request for anything other than
5483 * write should still mark the page as
5484 * referenced if managed.
5485 */
5486 if (managed) {
5487 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5488 tmplate |= ARM_PTE_AF;
5489
5490 if (ppattr_test_reffault(pai)) {
5491 ppattr_clear_reffault(pai);
5492 }
5493 }
5494 }
5495 #endif
5496
5497 /* We do not expect to take a write fast-fault on this entry. */
5498 pte_set_was_writeable(tmplate, false);
5499
5500 write_pte_fast(pte_p, tmplate);
5501
5502 if (managed) {
5503 pvh_assert_locked(pai);
5504 pvh_unlock(pai);
5505 }
5506 }
5507 FLUSH_PTE_STRONG();
5508 PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5509 } else {
5510 va = end;
5511 }
5512
5513 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5514 return va;
5515 }
5516
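/*
 * Restrict the permissions on the mappings in [b, e) to 'prot', splitting the range
 * at twig-level (L2) boundaries and processing each chunk via the internal (or PPL)
 * helper. Write-permission requests are ignored, since pmap_protect() never grants
 * access, and unsupported protection combinations are handled as removals.
 */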
5517 void
5518 pmap_protect_options(
5519 pmap_t pmap,
5520 vm_map_address_t b,
5521 vm_map_address_t e,
5522 vm_prot_t prot,
5523 unsigned int options,
5524 __unused void *args)
5525 {
5526 vm_map_address_t l, beg;
5527
5528 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5529
5530 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5531 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5532 pmap, (uint64_t)b, (uint64_t)e);
5533 }
5534
5535 /*
5536 * We allow single-page requests to execute non-preemptibly,
5537 * as it doesn't make sense to sample AST_URGENT for a single-page
5538 * operation, and there are a couple of special use cases that
5539 * require a non-preemptible single-page operation.
5540 */
5541 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5542 pmap_verify_preemptible();
5543 }
5544
5545 #if DEVELOPMENT || DEBUG
5546 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5547 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5548 pmap_remove_options(pmap, b, e, options);
5549 return;
5550 }
5551 } else
5552 #endif
5553 {
5554 /* Determine the new protection. */
5555 switch (prot) {
5556 case VM_PROT_EXECUTE:
5557 case VM_PROT_READ:
5558 case VM_PROT_READ | VM_PROT_EXECUTE:
5559 break;
5560 case VM_PROT_READ | VM_PROT_WRITE:
5561 case VM_PROT_ALL:
5562 return; /* nothing to do */
5563 default:
5564 pmap_remove_options(pmap, b, e, options);
5565 return;
5566 }
5567 }
5568
5569 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5570 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5571 VM_KERNEL_ADDRHIDE(e));
5572
5573 beg = b;
5574
5575 while (beg < e) {
5576 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5577
5578 if (l > e) {
5579 l = e;
5580 }
5581
5582 #if XNU_MONITOR
5583 beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5584 #else
5585 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5586 #endif
5587 }
5588
5589 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5590 }
5591
5592 /**
5593 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5594 *
5595 * @param pmap pmap to insert the pages into.
5596 * @param va virtual address to map the pages into.
5597 * @param pa page number of the first physical page to map.
5598 * @param size block size, in number of pages.
5599 * @param prot mapping protection attributes.
5600 * @param attr flags to pass to pmap_enter().
5601 *
5602 * @return KERN_SUCCESS.
5603 */
5604 kern_return_t
5605 pmap_map_block(
5606 pmap_t pmap,
5607 addr64_t va,
5608 ppnum_t pa,
5609 uint32_t size,
5610 vm_prot_t prot,
5611 int attr,
5612 unsigned int flags)
5613 {
5614 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5615 }
5616
5617 /**
5618 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5619 * As opposed to pmap_map_block(), this function takes
5620 * a physical address as an input and operates using the
5621 * page size associated with the input pmap.
5622 *
5623 * @param pmap pmap to insert the pages into.
5624 * @param va virtual address to map the pages into.
5625 * @param pa physical address of the first physical page to map.
5626 * @param size block size, in number of pages.
5627 * @param prot mapping protection attributes.
5628 * @param attr flags to pass to pmap_enter().
5629 *
5630 * @return KERN_SUCCESS.
5631 */
5632 kern_return_t
5633 pmap_map_block_addr(
5634 pmap_t pmap,
5635 addr64_t va,
5636 pmap_paddr_t pa,
5637 uint32_t size,
5638 vm_prot_t prot,
5639 int attr,
5640 unsigned int flags)
5641 {
5642 #if __ARM_MIXED_PAGE_SIZE__
5643 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5644 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5645 #else
5646 const uint64_t pmap_page_size = PAGE_SIZE;
5647 #endif
5648
5649 for (ppnum_t page = 0; page < size; page++) {
5650 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5651 panic("%s: failed pmap_enter_addr, "
5652 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5653 __FUNCTION__,
5654 pmap, va, (uint64_t)pa, size, prot, flags);
5655 }
5656
5657 va += pmap_page_size;
5658 pa += pmap_page_size;
5659 }
5660
5661 return KERN_SUCCESS;
5662 }
5663
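/*
 * Convenience wrapper around pmap_enter_options_addr() with no options.
 */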
5664 kern_return_t
5665 pmap_enter_addr(
5666 pmap_t pmap,
5667 vm_map_address_t v,
5668 pmap_paddr_t pa,
5669 vm_prot_t prot,
5670 vm_prot_t fault_type,
5671 unsigned int flags,
5672 boolean_t wired)
5673 {
5674 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5675 }
5676
5677 /*
5678 * Insert the given physical page (pn) at
5679 * the specified virtual address (v) in the
5680 * target physical map with the protection requested.
5681 *
5682 * If specified, the page will be wired down, meaning
5683 * that the related pte cannot be reclaimed.
5684 *
5685 * NB: This is the only routine which MAY NOT lazy-evaluate
5686 * or lose information. That is, this routine must actually
5687 * insert this page into the given map eventually (i.e., it must
5688 * make forward progress).
5689 */
5690 kern_return_t
5691 pmap_enter(
5692 pmap_t pmap,
5693 vm_map_address_t v,
5694 ppnum_t pn,
5695 vm_prot_t prot,
5696 vm_prot_t fault_type,
5697 unsigned int flags,
5698 boolean_t wired)
5699 {
5700 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5701 }
5702
5703 /*
5704 * Attempt to commit the pte.
5705 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5706 * Performs no page table or accounting writes on failure.
5707 */
5708 static inline bool
5709 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5710 {
5711 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5712 bool success = false, changed_wiring = false;
5713
5714 __unreachable_ok_push
5715 if (TEST_PAGE_RATIO_4) {
5716 /*
5717 * 16K virtual pages w/ 4K hw pages.
5718 * We actually need to update 4 ptes here which can't easily be done atomically.
5719 * As a result we require the exclusive pmap lock.
5720 */
5721 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5722 *old_pte = *pte_p;
5723 if (*old_pte == new_pte) {
5724 /* Another thread completed this operation. Nothing to do here. */
5725 success = true;
5726 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5727 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5728 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5729 success = false;
5730 } else {
5731 write_pte_fast(pte_p, new_pte);
5732 success = true;
5733 }
5734 } else {
5735 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5736 }
5737 __unreachable_ok_pop
5738
5739 if (success && *old_pte != new_pte) {
5740 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5741 FLUSH_PTE_STRONG();
5742 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5743 } else {
5744 FLUSH_PTE();
5745 __builtin_arm_isb(ISB_SY);
5746 }
5747 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5748 (new_pte & ARM_PTE_WIRED) != 0 :
5749 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5750
5751 if (pmap != kernel_pmap && changed_wiring) {
5752 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5753 if (new_pte & ARM_PTE_WIRED) {
5754 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5755 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5756 } else {
5757 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5758 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5759 }
5760 }
5761
5762 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5763 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5764 }
5765 return success;
5766 }
5767
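/*
 * Translate VM_WIMG_* cache attributes into the corresponding PTE memory attribute
 * index, shareability, and execute-never bits. The physical address is only
 * consulted to decide whether a VM_WIMG_IO mapping targets DRAM and can therefore
 * use the reordered/combined posted attribute.
 */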
5768 MARK_AS_PMAP_TEXT static pt_entry_t
5769 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5770 {
5771 pt_entry_t pte;
5772
5773 switch (wimg & (VM_WIMG_MASK)) {
5774 case VM_WIMG_IO:
5775 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5776 // Device-nGnRnE. On H14+, accesses to them can be reordered by the
5777 // AP, while preserving the security benefits of using a device
5778 // mapping against side-channel attacks. On pre-H14 platforms,
5779 // the accesses will still be strongly ordered.
5780 if (is_dram_addr(pa)) {
5781 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5782 } else {
5783 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5784 }
5785 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5786 break;
5787 case VM_WIMG_RT:
5788 #if HAS_UCNORMAL_MEM
5789 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5790 #else
5791 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5792 #endif
5793 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5794 break;
5795 case VM_WIMG_POSTED:
5796 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5797 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5798 break;
5799 case VM_WIMG_POSTED_REORDERED:
5800 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5801 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5802 break;
5803 case VM_WIMG_POSTED_COMBINED_REORDERED:
5804 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5805 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5806 break;
5807 case VM_WIMG_WCOMB:
5808 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5809 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5810 break;
5811 case VM_WIMG_WTHRU:
5812 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5813 #if (__ARM_VMSA__ > 7)
5814 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5815 #else
5816 pte |= ARM_PTE_SH;
5817 #endif
5818 break;
5819 case VM_WIMG_COPYBACK:
5820 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5821 #if (__ARM_VMSA__ > 7)
5822 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5823 #else
5824 pte |= ARM_PTE_SH;
5825 #endif
5826 break;
5827 case VM_WIMG_INNERWBACK:
5828 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5829 #if (__ARM_VMSA__ > 7)
5830 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5831 #else
5832 pte |= ARM_PTE_SH;
5833 #endif
5834 break;
5835 default:
5836 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5837 #if (__ARM_VMSA__ > 7)
5838 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5839 #else
5840 pte |= ARM_PTE_SH;
5841 #endif
5842 }
5843
5844 return pte;
5845 }
5846
5847
5848 /*
5849 * Construct a PTE (and the physical page attributes) for the given virtual to
5850 * physical mapping.
5851 *
5852 * This function has no side effects, so it is safe to call while attempting a
5853 * pmap_enter transaction.
5854 */
5855 MARK_AS_PMAP_TEXT static pt_entry_t
5856 pmap_construct_pte(
5857 const pmap_t pmap,
5858 vm_map_address_t va,
5859 pmap_paddr_t pa,
5860 vm_prot_t prot,
5861 vm_prot_t fault_type,
5862 boolean_t wired,
5863 const pt_attr_t* const pt_attr,
5864 uint16_t *pp_attr_bits /* OUTPUT */
5865 )
5866 {
5867 bool set_NX = false, set_XO = false;
5868 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5869 assert(pp_attr_bits != NULL);
5870 *pp_attr_bits = 0;
5871
5872 if (wired) {
5873 pte |= ARM_PTE_WIRED;
5874 }
5875
5876 #if DEVELOPMENT || DEBUG
5877 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5878 #else
5879 if ((prot & VM_PROT_EXECUTE))
5880 #endif
5881 {
5882 set_NX = false;
5883 } else {
5884 set_NX = true;
5885 }
5886
5887 #if (__ARM_VMSA__ > 7)
5888 if (prot == VM_PROT_EXECUTE) {
5889 set_XO = true;
5890 }
5891 #endif
5892
5893 if (set_NX) {
5894 pte |= pt_attr_leaf_xn(pt_attr);
5895 } else {
5896 #if (__ARM_VMSA__ > 7)
5897 if (pmap == kernel_pmap) {
5898 pte |= ARM_PTE_NX;
5899 } else {
5900 pte |= pt_attr_leaf_x(pt_attr);
5901 }
5902 #endif
5903 }
5904
5905 if (pmap == kernel_pmap) {
5906 #if __ARM_KERNEL_PROTECT__
5907 pte |= ARM_PTE_NG;
5908 #endif /* __ARM_KERNEL_PROTECT__ */
5909 if (prot & VM_PROT_WRITE) {
5910 pte |= ARM_PTE_AP(AP_RWNA);
5911 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5912 } else {
5913 pte |= ARM_PTE_AP(AP_RONA);
5914 *pp_attr_bits |= PP_ATTR_REFERENCED;
5915 }
5916 #if (__ARM_VMSA__ == 7)
5917 if ((_COMM_PAGE_BASE_ADDRESS <= va) && (va < _COMM_PAGE_BASE_ADDRESS + _COMM_PAGE_AREA_LENGTH)) {
5918 pte = (pte & ~(ARM_PTE_APMASK)) | ARM_PTE_AP(AP_RORO);
5919 }
5920 #endif
5921 } else {
5922 if (pmap->type != PMAP_TYPE_NESTED) {
5923 pte |= ARM_PTE_NG;
5924 } else if ((pmap->nested_region_asid_bitmap)
5925 && (va >= pmap->nested_region_addr)
5926 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5927 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5928
5929 if ((pmap->nested_region_asid_bitmap)
5930 && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5931 pte |= ARM_PTE_NG;
5932 }
5933 }
5934 #if MACH_ASSERT
5935 if (pmap->nested_pmap != NULL) {
5936 vm_map_address_t nest_vaddr;
5937 pt_entry_t *nest_pte_p;
5938
5939 nest_vaddr = va;
5940
5941 if ((nest_vaddr >= pmap->nested_region_addr)
5942 && (nest_vaddr < (pmap->nested_region_addr + pmap->nested_region_size))
5943 && ((nest_pte_p = pmap_pte(pmap->nested_pmap, nest_vaddr)) != PT_ENTRY_NULL)
5944 && (*nest_pte_p != ARM_PTE_TYPE_FAULT)
5945 && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p, nest_pte_p))
5946 && (((*nest_pte_p) & ARM_PTE_NG) != ARM_PTE_NG)) {
5947 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5948
5949 if ((pmap->nested_pmap->nested_region_asid_bitmap)
5950 && !testbit(index, (int *)pmap->nested_pmap->nested_region_asid_bitmap)) {
5951 panic("pmap_enter(): Global attribute conflict nest_pte_p=%p pmap=%p va=0x%llx spte=0x%llx",
5952 nest_pte_p, pmap, (uint64_t)va, (uint64_t)*nest_pte_p);
5953 }
5954 }
5955 }
5956 #endif
5957 if (prot & VM_PROT_WRITE) {
5958 assert(pmap->type != PMAP_TYPE_NESTED);
5959 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5960 if (fault_type & VM_PROT_WRITE) {
5961 if (set_XO) {
5962 pte |= pt_attr_leaf_rwna(pt_attr);
5963 } else {
5964 pte |= pt_attr_leaf_rw(pt_attr);
5965 }
5966 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5967 } else {
5968 if (set_XO) {
5969 pte |= pt_attr_leaf_rona(pt_attr);
5970 } else {
5971 pte |= pt_attr_leaf_ro(pt_attr);
5972 }
5973 /*
5974 * Mark the page as MODFAULT so that a subsequent write
5975 * may be handled through arm_fast_fault().
5976 */
5977 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5978 pte_set_was_writeable(pte, true);
5979 }
5980 } else {
5981 if (set_XO) {
5982 pte |= pt_attr_leaf_rwna(pt_attr);
5983 } else {
5984 pte |= pt_attr_leaf_rw(pt_attr);
5985 }
5986 *pp_attr_bits |= PP_ATTR_REFERENCED;
5987 }
5988 } else {
5989 if (set_XO) {
5990 pte |= pt_attr_leaf_rona(pt_attr);
5991 } else {
5992 pte |= pt_attr_leaf_ro(pt_attr);
5993 }
5994 *pp_attr_bits |= PP_ATTR_REFERENCED;
5995 }
5996 }
5997
5998 pte |= ARM_PTE_AF;
5999 return pte;
6000 }
6001
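/*
 * Internal form of pmap_enter_options(): create (or update) the mapping from v to
 * pa in the given pmap. The PTE update is performed as a retryable transaction so
 * that the pmap lock can usually be held shared; pv-list, physical-attribute, and
 * ledger updates are only committed once the PTE write succeeds. Returns
 * KERN_RESOURCE_SHORTAGE if PMAP_OPTIONS_NOWAIT is set and the required page table
 * or PV entries cannot be allocated.
 */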
6002 MARK_AS_PMAP_TEXT kern_return_t
6003 pmap_enter_options_internal(
6004 pmap_t pmap,
6005 vm_map_address_t v,
6006 pmap_paddr_t pa,
6007 vm_prot_t prot,
6008 vm_prot_t fault_type,
6009 unsigned int flags,
6010 boolean_t wired,
6011 unsigned int options)
6012 {
6013 ppnum_t pn = (ppnum_t)atop(pa);
6014 pt_entry_t pte;
6015 pt_entry_t spte;
6016 pt_entry_t *pte_p;
6017 bool refcnt_updated;
6018 bool wiredcnt_updated;
6019 bool ro_va = false;
6020 unsigned int wimg_bits;
6021 bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6022 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6023 kern_return_t kr = KERN_SUCCESS;
6024 uint16_t pp_attr_bits;
6025 volatile uint16_t *refcnt;
6026 volatile uint16_t *wiredcnt;
6027 pv_free_list_t *local_pv_free;
6028
6029 validate_pmap_mutable(pmap);
6030
6031 #if XNU_MONITOR
6032 if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6033 panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
6034 }
6035 #endif
6036
6037 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6038
6039 if ((v) & pt_attr_leaf_offmask(pt_attr)) {
6040 panic("pmap_enter_options() pmap %p v 0x%llx",
6041 pmap, (uint64_t)v);
6042 }
6043
6044 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6045 panic("pmap_enter_options() pmap %p pa 0x%llx",
6046 pmap, (uint64_t)pa);
6047 }
6048
6049 /* The PA should not extend beyond the architected physical address space */
6050 pa &= ARM_PTE_PAGE_MASK;
6051
6052 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6053 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6054 extern vm_offset_t ctrr_test_page;
6055 if (__probable(v != ctrr_test_page))
6056 #endif
6057 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6058 }
6059 if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6060 if (__improbable(prot != VM_PROT_READ)) {
6061 panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6062 __func__, (unsigned long long)v, prot);
6063 }
6064 ro_va = true;
6065 }
6066 assert(pn != vm_page_fictitious_addr);
6067
6068 refcnt_updated = false;
6069 wiredcnt_updated = false;
6070
6071 if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6072 /*
6073 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6074 *
6075 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree.
6076 */
6077 lock_mode = PMAP_LOCK_EXCLUSIVE;
6078 }
6079 pmap_lock(pmap, lock_mode);
6080
6081 /*
6082 * Expand pmap to include this pte. Assume that
6083 * pmap is always expanded to include enough hardware
6084 * pages to map one VM page.
6085 */
6086 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6087 /* Must unlock to expand the pmap. */
6088 pmap_unlock(pmap, lock_mode);
6089
6090 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6091
6092 if (kr != KERN_SUCCESS) {
6093 return kr;
6094 }
6095
6096 pmap_lock(pmap, lock_mode);
6097 }
6098
6099 if (options & PMAP_OPTIONS_NOENTER) {
6100 pmap_unlock(pmap, lock_mode);
6101 return KERN_SUCCESS;
6102 }
6103
6104 /*
6105 * Since we may not hold the pmap lock exclusive, updating the pte is
6106 * done via a cmpxchg loop.
6107 * We need to be careful about modifying non-local data structures before committing
6108 * the new pte, since we may need to redo the transaction.
6109 */
6110 spte = os_atomic_load(pte_p, relaxed);
6111 while (!committed) {
6112 refcnt = NULL;
6113 wiredcnt = NULL;
6114 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6115 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6116
6117 if (pmap != kernel_pmap) {
6118 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6119 refcnt = &ptd_info->refcnt;
6120 wiredcnt = &ptd_info->wiredcnt;
6121 /*
6122 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
6123 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6124 * or acquire the pmap lock exclusive.
6125 */
6126 if (!wiredcnt_updated) {
6127 OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6128 wiredcnt_updated = true;
6129 }
6130 if (!refcnt_updated) {
6131 OSAddAtomic16(1, (volatile int16_t*)refcnt);
6132 refcnt_updated = true;
6133 drop_refcnt = true;
6134 }
6135 }
6136
6137 if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6138 /*
6139 * There is already a mapping here & it's for a different physical page.
6140 * First remove that mapping.
6141 *
6142 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6143 */
6144 if (lock_mode == PMAP_LOCK_SHARED) {
6145 if (pmap_lock_shared_to_exclusive(pmap)) {
6146 lock_mode = PMAP_LOCK_EXCLUSIVE;
6147 } else {
6148 /*
6149 * We failed to upgrade to an exclusive lock.
6150 * As a result we no longer hold the lock at all,
6151 * so we need to re-acquire it and restart the transaction.
6152 */
6153 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6154 lock_mode = PMAP_LOCK_EXCLUSIVE;
6155 /* pmap might have changed after we dropped the lock. Try again. */
6156 spte = os_atomic_load(pte_p, relaxed);
6157 continue;
6158 }
6159 }
6160 pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6161 spte = ARM_PTE_TYPE_FAULT;
6162 assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6163 }
6164
6165 pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6166
6167 if (pa_valid(pa)) {
6168 unsigned int pai;
6169 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6170
6171 is_internal = FALSE;
6172 is_altacct = FALSE;
6173
6174 pai = pa_index(pa);
6175
6176 pvh_lock(pai);
6177
6178 /*
6179 * Make sure that the current per-cpu PV free list has
6180 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6181 * if the transaction succeeds. We're either in the
6182 * PPL (which can't be preempted) or we've explicitly disabled preemption.
6183 * Note that we can still be interrupted, but a primary
6184 * interrupt handler can never enter the pmap.
6185 */
6186 #if !XNU_MONITOR
6187 assert(get_preemption_level() > 0);
6188 #endif
6189 local_pv_free = &pmap_get_cpu_data()->pv_free;
6190 pv_entry_t **pv_h = pai_to_pvh(pai);
6191 const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6192 !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6193
6194 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6195 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6196 int new_allocated_pves = 0;
6197
6198 while (new_allocated_pves < 2) {
6199 local_pv_free = &pmap_get_cpu_data()->pv_free;
6200 pv_status = pv_alloc(pmap, pai, lock_mode, &new_pve_p[new_allocated_pves]);
6201 if (pv_status == PV_ALLOC_FAIL) {
6202 break;
6203 } else if (pv_status == PV_ALLOC_RETRY) {
6204 /*
6205 * In the case that pv_alloc() had to grab a new page of PVEs,
6206 * it will have dropped the pmap lock while doing so.
6207 * On non-PPL devices, dropping the lock re-enables preemption so we may
6208 * be on a different CPU now.
6209 */
6210 local_pv_free = &pmap_get_cpu_data()->pv_free;
6211 } else {
6212 /* If we've gotten this far then a node should've been allocated. */
6213 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6214
6215 new_allocated_pves++;
6216 }
6217 }
6218
6219 for (int i = 0; i < new_allocated_pves; i++) {
6220 pv_free(new_pve_p[i]);
6221 }
6222 }
6223
6224 if (pv_status == PV_ALLOC_FAIL) {
6225 pvh_unlock(pai);
6226 kr = KERN_RESOURCE_SHORTAGE;
6227 break;
6228 } else if (pv_status == PV_ALLOC_RETRY) {
6229 pvh_unlock(pai);
6230 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6231 spte = os_atomic_load(pte_p, relaxed);
6232 continue;
6233 }
6234
6235 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6236 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6237 } else {
6238 wimg_bits = pmap_cache_attributes(pn);
6239 }
6240
6241 /* We may be retrying this operation after dropping the PVH lock.
6242 * Cache attributes for the physical page may have changed while the lock
6243 * was dropped, so clear any cache attributes we may have previously set
6244 * in the PTE template. */
6245 pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6246 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6247
6248 #if XNU_MONITOR
6249 /* The regular old kernel is not allowed to remap PPL pages. */
6250 if (__improbable(ppattr_pa_test_monitor(pa))) {
6251 panic("%s: page belongs to PPL, "
6252 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6253 __FUNCTION__,
6254 pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6255 }
6256
6257 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6258 panic("%s: page locked down, "
6259 "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6260 __FUNCTION__,
6261 pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6262 }
6263 #endif
6264
6265
6266 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6267 if (!committed) {
6268 pvh_unlock(pai);
6269 continue;
6270 }
6271 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6272 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6273
6274 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6275 /*
6276 * If there was already a valid pte here then we reuse its reference
6277 * on the ptd and drop the one that we took above.
6278 */
6279 drop_refcnt = had_valid_mapping;
6280
6281 if (!had_valid_mapping) {
6282 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6283 int pve_ptep_idx = 0;
6284 pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6285 /* We did all the allocations up top. So this shouldn't be able to fail. */
6286 if (pv_status != PV_ALLOC_SUCCESS) {
6287 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6288 __func__, pv_status, new_pve_p, pmap);
6289 }
6290
6291 if (pmap != kernel_pmap) {
6292 if (options & PMAP_OPTIONS_INTERNAL) {
6293 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6294 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6295 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6296 /*
6297 * Make a note to ourselves that this
6298 * mapping is using alternative
6299 * accounting. We'll need this in order
6300 * to know which ledger to debit when
6301 * the mapping is removed.
6302 *
6303 * The altacct bit must be set while
6304 * the pv head is locked. Defer the
6305 * ledger accounting until after we've
6306 * dropped the lock.
6307 */
6308 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6309 is_altacct = TRUE;
6310 }
6311 }
6312 if (ppattr_test_reusable(pai) &&
6313 !is_altacct) {
6314 is_reusable = TRUE;
6315 } else if (options & PMAP_OPTIONS_INTERNAL) {
6316 is_internal = TRUE;
6317 } else {
6318 is_external = TRUE;
6319 }
6320 }
6321 }
6322
6323 pvh_unlock(pai);
6324
6325 if (pp_attr_bits != 0) {
6326 ppattr_pa_set_bits(pa, pp_attr_bits);
6327 }
6328
6329 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6330 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6331
6332 if (is_internal) {
6333 /*
6334 * Make corresponding adjustments to
6335 * phys_footprint statistics.
6336 */
6337 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6338 if (is_altacct) {
6339 /*
6340 * If this page is internal and
6341 * in an IOKit region, credit
6342 * the task's total count of
6343 * dirty, internal IOKit pages.
6344 * It should *not* count towards
6345 * the task's total physical
6346 * memory footprint, because
6347 * this entire region was
6348 * already billed to the task
6349 * at the time the mapping was
6350 * created.
6351 *
6352 * Put another way, this is
6353 * internal++ and
6354 * alternate_accounting++, so
6355 * net effect on phys_footprint
6356 * is 0. That means: don't
6357 * touch phys_footprint here.
6358 */
6359 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6360 } else {
6361 if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6362 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6363 skip_footprint_debit = true;
6364 } else {
6365 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6366 }
6367 }
6368 }
6369 if (is_reusable) {
6370 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6371 } else if (is_external) {
6372 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6373 }
6374 }
6375 } else {
6376 if (prot & VM_PROT_EXECUTE) {
6377 kr = KERN_FAILURE;
6378 break;
6379 }
6380
6381 wimg_bits = pmap_cache_attributes(pn);
6382 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6383 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6384 }
6385
6386 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6387
6388 #if XNU_MONITOR
6389 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
6390 uint64_t xprr_perm = pte_to_xprr_perm(pte);
6391 switch (xprr_perm) {
6392 case XPRR_KERN_RO_PERM:
6393 break;
6394 case XPRR_KERN_RW_PERM:
6395 pte &= ~ARM_PTE_XPRR_MASK;
6396 pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
6397 break;
6398 default:
6399 panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
6400 }
6401 }
6402 #endif
6403 committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6404 if (committed) {
6405 had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6406 assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6407
6408 /**
6409 * If there was already a valid pte here then we reuse its
6410 * reference on the ptd and drop the one that we took above.
6411 */
6412 drop_refcnt = had_valid_mapping;
6413 }
6414 }
6415 if (committed) {
6416 if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6417 assert(pmap != kernel_pmap);
6418
6419 /* One less "compressed" */
6420 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6421 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6422
6423 if (spte & ARM_PTE_COMPRESSED_ALT) {
6424 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6425 } else if (!skip_footprint_debit) {
6426 /* Was part of the footprint */
6427 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6428 }
6429 /* The old entry held a reference so drop the extra one that we took above. */
6430 drop_refcnt = true;
6431 }
6432 }
6433 }
6434
6435 if (drop_refcnt && refcnt != NULL) {
6436 assert(refcnt_updated);
6437 if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6438 panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6439 }
6440 }
6441
6442 if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6443 panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6444 }
6445
6446 pmap_unlock(pmap, lock_mode);
6447
6448 if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6449 pmap_phys_write_disable(v);
6450 }
6451
6452 return kr;
6453 }
6454
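/*
 * Enter a mapping for the given physical address. On XNU_MONITOR configurations,
 * resource shortages reported by the PPL are satisfied by donating pages and
 * retrying, unless PMAP_OPTIONS_NOWAIT was requested.
 */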
6455 kern_return_t
6456 pmap_enter_options_addr(
6457 pmap_t pmap,
6458 vm_map_address_t v,
6459 pmap_paddr_t pa,
6460 vm_prot_t prot,
6461 vm_prot_t fault_type,
6462 unsigned int flags,
6463 boolean_t wired,
6464 unsigned int options,
6465 __unused void *arg)
6466 {
6467 kern_return_t kr = KERN_FAILURE;
6468
6469
6470 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6471 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6472
6473
6474 #if XNU_MONITOR
6475 /*
6476 * If NOWAIT was not requested, loop until the enter does not
6477 * fail due to lack of resources.
6478 */
6479 while ((kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) {
6480 pmap_alloc_page_for_ppl((options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6481 if (options & PMAP_OPTIONS_NOWAIT) {
6482 break;
6483 }
6484 }
6485
6486 pmap_ledger_check_balance(pmap);
6487 #else
6488 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6489 #endif
6490
6491 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6492
6493 return kr;
6494 }
6495
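/*
 * Page-number variant of pmap_enter_options_addr().
 */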
6496 kern_return_t
6497 pmap_enter_options(
6498 pmap_t pmap,
6499 vm_map_address_t v,
6500 ppnum_t pn,
6501 vm_prot_t prot,
6502 vm_prot_t fault_type,
6503 unsigned int flags,
6504 boolean_t wired,
6505 unsigned int options,
6506 __unused void *arg)
6507 {
6508 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6509 }
6510
6511 /*
6512 * Routine: pmap_change_wiring
6513 * Function: Change the wiring attribute for a map/virtual-address
6514 * pair.
6515 * In/out conditions:
6516 * The mapping must already exist in the pmap.
6517 */
6518 MARK_AS_PMAP_TEXT void
6519 pmap_change_wiring_internal(
6520 pmap_t pmap,
6521 vm_map_address_t v,
6522 boolean_t wired)
6523 {
6524 pt_entry_t *pte_p;
6525 pmap_paddr_t pa;
6526
6527 validate_pmap_mutable(pmap);
6528
6529 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6530
6531 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6532
6533 pte_p = pmap_pte(pmap, v);
6534 if (pte_p == PT_ENTRY_NULL) {
6535 if (!wired) {
6536 /*
6537 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6538 * may have been freed by a remove operation.
6539 */
6540 goto pmap_change_wiring_return;
6541 } else {
6542 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6543 }
6544 }
6545 /*
6546 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6547 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6548 */
6549 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6550
6551 while (pa_valid(pa)) {
6552 pmap_paddr_t new_pa;
6553
6554 pvh_lock(pa_index(pa));
6555 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6556
6557 if (pa == new_pa) {
6558 break;
6559 }
6560
6561 pvh_unlock(pa_index(pa));
6562 pa = new_pa;
6563 }
6564
6565 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6566 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6567 if (!wired) {
6568 /* PTE cleared by prior remove/disconnect operation */
6569 goto pmap_change_wiring_cleanup;
6570 } else {
6571 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6572 __func__, pte_p, (uint64_t)*pte_p, pmap);
6573 }
6574 }
6575
6576 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6577 if (wired != pte_is_wired(*pte_p)) {
6578 pte_set_wired(pmap, pte_p, wired);
6579 if (pmap != kernel_pmap) {
6580 if (wired) {
6581 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6582 } else {
6583 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6584 }
6585 }
6586 }
6587
6588 pmap_change_wiring_cleanup:
6589 if (pa_valid(pa)) {
6590 pvh_unlock(pa_index(pa));
6591 }
6592
6593 pmap_change_wiring_return:
6594 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6595 }
6596
6597 void
6598 pmap_change_wiring(
6599 pmap_t pmap,
6600 vm_map_address_t v,
6601 boolean_t wired)
6602 {
6603 #if XNU_MONITOR
6604 pmap_change_wiring_ppl(pmap, v, wired);
6605
6606 pmap_ledger_check_balance(pmap);
6607 #else
6608 pmap_change_wiring_internal(pmap, v, wired);
6609 #endif
6610 }
6611
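/*
 * Look up the physical address backing 'va' by walking the pmap's page tables,
 * taking the pmap lock shared for non-kernel pmaps.
 */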
6612 MARK_AS_PMAP_TEXT pmap_paddr_t
6613 pmap_find_pa_internal(
6614 pmap_t pmap,
6615 addr64_t va)
6616 {
6617 pmap_paddr_t pa = 0;
6618
6619 validate_pmap(pmap);
6620
6621 if (pmap != kernel_pmap) {
6622 pmap_lock(pmap, PMAP_LOCK_SHARED);
6623 }
6624
6625 pa = pmap_vtophys(pmap, va);
6626
6627 if (pmap != kernel_pmap) {
6628 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6629 }
6630
6631 return pa;
6632 }
6633
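/*
 * Attempt to translate 'va' using only the MMU (no software page table walk and no
 * locks). Returns 0 if the address does not belong to the kernel pmap or the
 * current thread's pmap, or if no valid translation exists.
 */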
6634 pmap_paddr_t
6635 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6636 {
6637 pmap_paddr_t pa = 0;
6638
6639 if (pmap == kernel_pmap) {
6640 pa = mmu_kvtop(va);
6641 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6642 /*
6643 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6644 * translation even if PAN would prevent kernel access through the translation.
6645 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6646 */
6647 pa = mmu_uvtop(va);
6648 }
6649 return pa;
6650 }
6651
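/*
 * Translate 'va' to a physical address, first via the MMU and then, if that fails,
 * by walking the page tables (through the PPL on XNU_MONITOR configurations when
 * not in the debugger). Returns 0 if no valid mapping exists.
 */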
6652 pmap_paddr_t
6653 pmap_find_pa(
6654 pmap_t pmap,
6655 addr64_t va)
6656 {
6657 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6658
6659 if (pa != 0) {
6660 return pa;
6661 }
6662
6663 if (not_in_kdp) {
6664 #if XNU_MONITOR
6665 return pmap_find_pa_ppl(pmap, va);
6666 #else
6667 return pmap_find_pa_internal(pmap, va);
6668 #endif
6669 } else {
6670 return pmap_vtophys(pmap, va);
6671 }
6672 }
6673
6674 ppnum_t
6675 pmap_find_phys_nofault(
6676 pmap_t pmap,
6677 addr64_t va)
6678 {
6679 ppnum_t ppn;
6680 ppn = atop(pmap_find_pa_nofault(pmap, va));
6681 return ppn;
6682 }
6683
6684 ppnum_t
6685 pmap_find_phys(
6686 pmap_t pmap,
6687 addr64_t va)
6688 {
6689 ppnum_t ppn;
6690 ppn = atop(pmap_find_pa(pmap, va));
6691 return ppn;
6692 }
6693
6694 /**
6695 * Translate a kernel virtual address into a physical address.
6696 *
6697 * @param va The kernel virtual address to translate. Does not work on user
6698 * virtual addresses.
6699 *
6700 * @return The physical address if the translation was successful, or zero if
6701 * no valid mappings were found for the given virtual address.
6702 */
6703 pmap_paddr_t
6704 kvtophys(vm_offset_t va)
6705 {
6706 /**
6707 * Attempt to do the translation first in hardware using the AT (address
6708 * translation) instruction. This will attempt to use the MMU to do the
6709 * translation for us.
6710 */
6711 pmap_paddr_t pa = mmu_kvtop(va);
6712
6713 if (pa) {
6714 return pa;
6715 }
6716
6717 /* If the MMU can't find the mapping, then manually walk the page tables. */
6718 return pmap_vtophys(kernel_pmap, va);
6719 }
6720
6721 /**
6722 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6723 * points to a non-kernel-managed physical page, then this call will panic().
6724 *
6725 * @note The output of this function is guaranteed to be a kernel-managed
6726 * physical page, which means it's safe to pass the output directly to
6727 * pa_index() to create a physical address index for various pmap data
6728 * structures.
6729 *
6730 * @param va The kernel virtual address to translate. Does not work on user
6731 * virtual addresses.
6732 *
6733 * @return The translated physical address for the given virtual address.
6734 */
6735 pmap_paddr_t
6736 kvtophys_nofail(vm_offset_t va)
6737 {
6738 pmap_paddr_t pa = kvtophys(va);
6739
6740 if (!pa_valid(pa)) {
6741 panic("%s: Invalid or non-kernel-managed physical page returned, "
6742 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6743 }
6744
6745 return pa;
6746 }
6747
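/*
 * Walk the pmap's page tables in software to translate 'va' to a physical address.
 * Handles both leaf and block mappings and returns 0 if the address is outside the
 * pmap's range or no valid mapping exists.
 */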
6748 pmap_paddr_t
6749 pmap_vtophys(
6750 pmap_t pmap,
6751 addr64_t va)
6752 {
6753 if ((va < pmap->min) || (va >= pmap->max)) {
6754 return 0;
6755 }
6756
6757 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6758
6759 #if (__ARM_VMSA__ == 7)
6760 tt_entry_t *tte_p, tte;
6761 pt_entry_t *pte_p;
6762 pmap_paddr_t pa;
6763
6764 tte_p = pmap_tte(pmap, va);
6765 if (tte_p == (tt_entry_t *) NULL) {
6766 return (pmap_paddr_t) 0;
6767 }
6768
6769 tte = *tte_p;
6770 if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
6771 pte_p = (pt_entry_t *) ttetokv(tte) + pte_index(pt_attr, va);
6772 pa = pte_to_pa(*pte_p) | (va & ARM_PGMASK);
6774 #if DEVELOPMENT || DEBUG
6775 if (atop(pa) != 0 &&
6776 ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
6777 panic("pmap_vtophys(%p,0x%llx): compressed pte_p=%p 0x%llx with ppn=0x%x",
6778 pmap, va, pte_p, (uint64_t) (*pte_p), atop(pa));
6779 }
6780 #endif /* DEVELOPMENT || DEBUG */
6781 } else if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
6782 if ((tte & ARM_TTE_BLOCK_SUPER) == ARM_TTE_BLOCK_SUPER) {
6783 pa = suptte_to_pa(tte) | (va & ARM_TT_L1_SUPER_OFFMASK);
6784 } else {
6785 pa = sectte_to_pa(tte) | (va & ARM_TT_L1_BLOCK_OFFMASK);
6786 }
6787 } else {
6788 pa = 0;
6789 }
6790 #else
6791 tt_entry_t * ttp = NULL;
6792 tt_entry_t * ttep = NULL;
6793 tt_entry_t tte = ARM_TTE_EMPTY;
6794 pmap_paddr_t pa = 0;
6795 unsigned int cur_level;
6796
6797 ttp = pmap->tte;
6798
6799 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6800 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6801
6802 tte = *ttep;
6803
6804 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6805 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6806 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6807 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6808
6809 if ((tte & valid_mask) != valid_mask) {
6810 return (pmap_paddr_t) 0;
6811 }
6812
6813 /* This detects both leaf entries and intermediate block mappings. */
6814 if ((tte & type_mask) == type_block) {
6815 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6816 break;
6817 }
6818
6819 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6820 }
6821 #endif
6822
6823 return pa;
6824 }
6825
6826 /*
6827 * pmap_init_pte_page - Initialize a page table page.
6828 */
6829 MARK_AS_PMAP_TEXT void
6830 pmap_init_pte_page(
6831 pmap_t pmap,
6832 pt_entry_t *pte_p,
6833 vm_offset_t va,
6834 unsigned int ttlevel,
6835 boolean_t alloc_ptd)
6836 {
6837 pt_desc_t *ptdp = NULL;
6838 pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6839
6840 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6841 if (alloc_ptd) {
6842 /*
6843 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6844 * on 4KB hardware, we may already have allocated a page table descriptor for a
6845 * bootstrap request, so we check for an existing PTD here.
6846 */
6847 ptdp = ptd_alloc(pmap);
6848 if (ptdp == NULL) {
6849 panic("%s: unable to allocate PTD", __func__);
6850 }
6851 pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6852 } else {
6853 panic("pmap_init_pte_page(): pte_p %p", pte_p);
6854 }
6855 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6856 ptdp = pvh_ptd(pvh);
6857 } else {
6858 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6859 }
6860
// The barrier below ensures that prior updates to the page are visible to the
// page table walker before the page is linked into the previous-level TTE.
6863 __builtin_arm_dmb(DMB_ISHST);
6864 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6865 }
6866
6867 /*
6868 * Routine: pmap_expand
6869 *
6870 * Expands a pmap to be able to map the specified virtual address.
6871 *
6872 * Allocates new memory for the default (COARSE) translation table
6873 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6874 * also allocates space for the corresponding pv entries.
6875 *
6876 * Nothing should be locked.
6877 */
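/*
 * Illustrative caller pattern (a sketch only, not the exact logic used by
 * pmap_enter; 'v', 'options', and the target level are placeholders):
 *
 *	while (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
 *		kern_return_t kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
 *		if (kr != KERN_SUCCESS) {
 *			return kr;
 *		}
 *	}
 */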
6878 MARK_AS_PMAP_TEXT static kern_return_t
6879 pmap_expand(
6880 pmap_t pmap,
6881 vm_map_address_t v,
6882 unsigned int options,
6883 unsigned int level)
6884 {
6885 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6886
6887 if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6888 return KERN_INVALID_ADDRESS;
6889 }
6890 #if (__ARM_VMSA__ == 7)
6891 vm_offset_t pa;
6892 tt_entry_t *tte_p;
6893 tt_entry_t *tt_p;
6894 unsigned int i;
6895
6896 #if DEVELOPMENT || DEBUG
6897 /*
6898 * We no longer support root level expansion; panic in case something
6899 * still attempts to trigger it.
6900 */
6901 i = tte_index(pt_attr, v);
6902
6903 if (i >= pmap->tte_index_max) {
6904 panic("%s: index out of range, index=%u, max=%u, "
6905 "pmap=%p, addr=%p, options=%u, level=%u",
6906 __func__, i, pmap->tte_index_max,
6907 pmap, (void *)v, options, level);
6908 }
6909 #endif /* DEVELOPMENT || DEBUG */
6910
6911 if (level == 1) {
6912 return KERN_SUCCESS;
6913 }
6914
6915 {
6916 tt_entry_t *tte_next_p;
6917
6918 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6919 pa = 0;
6920 if (pmap_pte(pmap, v) != PT_ENTRY_NULL) {
6921 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6922 return KERN_SUCCESS;
6923 }
6924 tte_p = &pmap->tte[ttenum(v & ~ARM_TT_L1_PT_OFFMASK)];
6925 for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6926 if (tte_to_pa(*tte_next_p)) {
6927 pa = tte_to_pa(*tte_next_p);
6928 break;
6929 }
6930 tte_next_p++;
6931 }
6932 pa = pa & ~PAGE_MASK;
6933 if (pa) {
6934 tte_p = &pmap->tte[ttenum(v)];
6935 *tte_p = pa_to_tte(pa) | (((v >> ARM_TT_L1_SHIFT) & 0x3) << 10) | ARM_TTE_TYPE_TABLE;
6936 FLUSH_PTE();
6937 PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
6938 VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
6939 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6940 return KERN_SUCCESS;
6941 }
6942 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6943 }
6944 v = v & ~ARM_TT_L1_PT_OFFMASK;
6945
6946
6947 while (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6948 /*
6949 * Allocate a VM page for the level 2 page table entries.
6950 */
6951 while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L2_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
6952 if (options & PMAP_OPTIONS_NOWAIT) {
6953 return KERN_RESOURCE_SHORTAGE;
6954 }
6955 VM_PAGE_WAIT();
6956 }
6957
6958 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6959 /*
6960 * See if someone else expanded us first
6961 */
6962 if (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6963 tt_entry_t *tte_next_p;
6964
6965 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE);
6966 pa = kvtophys_nofail((vm_offset_t)tt_p);
6967 tte_p = &pmap->tte[ttenum(v)];
6968 for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6969 *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE;
6970 PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + (i * ARM_TT_L1_SIZE)),
6971 VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + ((i + 1) * ARM_TT_L1_SIZE)), *tte_p);
6972 tte_next_p++;
6973 pa = pa + 0x400;
6974 }
6975 FLUSH_PTE();
6976
6977 pa = 0x0ULL;
6978 tt_p = (tt_entry_t *)NULL;
6979 }
6980 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6981 if (tt_p != (tt_entry_t *)NULL) {
6982 pmap_tt_deallocate(pmap, tt_p, PMAP_TT_L2_LEVEL);
6983 tt_p = (tt_entry_t *)NULL;
6984 }
6985 }
6986 return KERN_SUCCESS;
6987 #else
6988 pmap_paddr_t pa;
6989 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6990 tt_entry_t *tte_p;
6991 tt_entry_t *tt_p;
6992
6993 pa = 0x0ULL;
6994 tt_p = (tt_entry_t *)NULL;
6995
6996 for (; ttlevel < level; ttlevel++) {
6997 pmap_lock(pmap, PMAP_LOCK_SHARED);
6998
6999 if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
7000 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7001 while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
7002 if (options & PMAP_OPTIONS_NOWAIT) {
7003 return KERN_RESOURCE_SHORTAGE;
7004 }
7005 #if XNU_MONITOR
7006 panic("%s: failed to allocate tt, "
7007 "pmap=%p, v=%p, options=0x%x, level=%u",
7008 __FUNCTION__,
7009 pmap, (void *)v, options, level);
7010 #else
7011 VM_PAGE_WAIT();
7012 #endif
7013 }
7014 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7015 if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
7016 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
7017 pa = kvtophys_nofail((vm_offset_t)tt_p);
7018 tte_p = pmap_ttne(pmap, ttlevel, v);
7019 *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7020 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7021 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
7022 pa = 0x0ULL;
7023 tt_p = (tt_entry_t *)NULL;
7024 }
7025 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7026 } else {
7027 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7028 }
7029
7030 if (tt_p != (tt_entry_t *)NULL) {
7031 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7032 tt_p = (tt_entry_t *)NULL;
7033 }
7034 }
7035
7036 return KERN_SUCCESS;
7037 #endif
7038 }
7039
7040 /*
7041 * Routine: pmap_collect
7042 * Function:
7043 * Garbage collects the physical map system for
7044 * pages which are no longer used.
 * Success need not be guaranteed -- that is, some pages
 * that are no longer referenced may be left uncollected
 * while others are collected.
7048 */
7049 void
7050 pmap_collect(pmap_t pmap)
7051 {
7052 if (pmap == PMAP_NULL) {
7053 return;
7054 }
7055
7056 #if 0
7057 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7058 if ((pmap->nested == FALSE) && (pmap != kernel_pmap)) {
7059 /* TODO: Scan for vm page assigned to top level page tables with no reference */
7060 }
7061 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7062 #endif
7063
7064 return;
7065 }
7066
7067 /*
7068 * Routine: pmap_gc
7069 * Function:
7070 * Pmap garbage collection
7071 * Called by the pageout daemon when pages are scarce.
7072 *
7073 */
7074 void
7075 pmap_gc(
7076 void)
7077 {
7078 #if XNU_MONITOR
7079 /*
7080 * We cannot invoke the scheduler from the PPL, so for now we elide the
7081 * GC logic if the PPL is enabled.
7082 */
7083 #endif
7084 #if !XNU_MONITOR
7085 pmap_t pmap, pmap_next;
7086 boolean_t gc_wait;
7087
7088 if (pmap_gc_allowed &&
7089 (pmap_gc_allowed_by_time_throttle ||
7090 pmap_gc_forced)) {
7091 pmap_gc_forced = FALSE;
7092 pmap_gc_allowed_by_time_throttle = FALSE;
7093 pmap_simple_lock(&pmaps_lock);
7094 pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&map_pmap_list));
7095 while (!queue_end(&map_pmap_list, (queue_entry_t)pmap)) {
7096 if (!(pmap->gc_status & PMAP_GC_INFLIGHT)) {
7097 pmap->gc_status |= PMAP_GC_INFLIGHT;
7098 }
7099 pmap_simple_unlock(&pmaps_lock);
7100
7101 pmap_collect(pmap);
7102
7103 pmap_simple_lock(&pmaps_lock);
7104 gc_wait = (pmap->gc_status & PMAP_GC_WAIT);
7105 pmap->gc_status &= ~(PMAP_GC_INFLIGHT | PMAP_GC_WAIT);
7106 pmap_next = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&pmap->pmaps));
7107 if (gc_wait) {
7108 if (!queue_end(&map_pmap_list, (queue_entry_t)pmap_next)) {
7109 pmap_next->gc_status |= PMAP_GC_INFLIGHT;
7110 }
7111 pmap_simple_unlock(&pmaps_lock);
7112 thread_wakeup((event_t) &pmap->gc_status);
7113 pmap_simple_lock(&pmaps_lock);
7114 }
7115 pmap = pmap_next;
7116 }
7117 pmap_simple_unlock(&pmaps_lock);
7118 }
7119 #endif
7120 }
7121
7122 /*
7123 * By default, don't attempt pmap GC more frequently
 * than once per minute.
7125 */
7126
7127 void
7128 compute_pmap_gc_throttle(
7129 void *arg __unused)
7130 {
7131 pmap_gc_allowed_by_time_throttle = TRUE;
7132 }
7133
7134 /*
 * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
 *
 * Invalidates all of the instruction cache on a physical page and
 * pushes any dirty data from the data cache for the same physical page.
7139 */
7140
7141 kern_return_t
7142 pmap_attribute_cache_sync(
7143 ppnum_t pp,
7144 vm_size_t size,
7145 __unused vm_machine_attribute_t attribute,
7146 __unused vm_machine_attribute_val_t * value)
7147 {
7148 if (size > PAGE_SIZE) {
7149 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7150 } else {
7151 cache_sync_page(pp);
7152 }
7153
7154 return KERN_SUCCESS;
7155 }
7156
7157 /*
7158 * pmap_sync_page_data_phys(ppnum_t pp)
7159 *
7160 * Invalidates all of the instruction cache on a physical page and
7161 * pushes any dirty data from the data cache for the same physical page
7162 */
7163 void
7164 pmap_sync_page_data_phys(
7165 ppnum_t pp)
7166 {
7167 cache_sync_page(pp);
7168 }
7169
7170 /*
7171 * pmap_sync_page_attributes_phys(ppnum_t pp)
7172 *
7173 * Write back and invalidate all cachelines on a physical page.
7174 */
7175 void
7176 pmap_sync_page_attributes_phys(
7177 ppnum_t pp)
7178 {
7179 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7180 }
7181
7182 #if CONFIG_COREDUMP
7183 /* temporary workaround */
7184 boolean_t
7185 coredumpok(
7186 vm_map_t map,
7187 mach_vm_offset_t va)
7188 {
7189 pt_entry_t *pte_p;
7190 pt_entry_t spte;
7191
7192 pte_p = pmap_pte(map->pmap, va);
7193 if (0 == pte_p) {
7194 return FALSE;
7195 }
7196 spte = *pte_p;
7197 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7198 }
7199 #endif
7200
7201 void
7202 fillPage(
7203 ppnum_t pn,
7204 unsigned int fill)
7205 {
7206 unsigned int *addr;
7207 int count;
7208
7209 addr = (unsigned int *) phystokv(ptoa(pn));
7210 count = PAGE_SIZE / sizeof(unsigned int);
7211 while (count--) {
7212 *addr++ = fill;
7213 }
7214 }
7215
7216 extern void mapping_set_mod(ppnum_t pn);
7217
7218 void
7219 mapping_set_mod(
7220 ppnum_t pn)
7221 {
7222 pmap_set_modify(pn);
7223 }
7224
7225 extern void mapping_set_ref(ppnum_t pn);
7226
7227 void
7228 mapping_set_ref(
7229 ppnum_t pn)
7230 {
7231 pmap_set_reference(pn);
7232 }
7233
7234 /*
7235 * Clear specified attribute bits.
7236 *
7237 * Try to force an arm_fast_fault() for all mappings of
7238 * the page - to force attributes to be set again at fault time.
7239 * If the forcing succeeds, clear the cached bits at the head.
7240 * Otherwise, something must have been wired, so leave the cached
7241 * attributes alone.
7242 */
7243 MARK_AS_PMAP_TEXT static void
7244 phys_attribute_clear_with_flush_range(
7245 ppnum_t pn,
7246 unsigned int bits,
7247 int options,
7248 void *arg,
7249 pmap_tlb_flush_range_t *flush_range)
7250 {
7251 pmap_paddr_t pa = ptoa(pn);
7252 vm_prot_t allow_mode = VM_PROT_ALL;
7253
7254 #if XNU_MONITOR
7255 if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7256 panic("%s: illegal request, "
7257 "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7258 __FUNCTION__,
7259 pn, bits, options, arg, flush_range);
7260 }
7261 #endif
7262 if ((arg != NULL) || (flush_range != NULL)) {
7263 options = options & ~PMAP_OPTIONS_NOFLUSH;
7264 }
7265
7266 if (__improbable((bits & PP_ATTR_MODIFIED) &&
7267 (options & PMAP_OPTIONS_NOFLUSH))) {
7268 panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
7269 "should not clear 'modified' without flushing TLBs\n",
7270 pn, bits, options, arg, flush_range);
7271 }
7272
7273 assert(pn != vm_page_fictitious_addr);
7274
7275 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7276 assert(bits == PP_ATTR_MODIFIED);
7277
7278 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7279 /*
7280 * We short circuit this case; it should not need to
7281 * invoke arm_force_fast_fault, so just clear the modified bit.
7282 * pmap_page_protect has taken care of resetting
7283 * the state so that we'll see the next write as a fault to
7284 * the VM (i.e. we don't want a fast fault).
7285 */
7286 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7287 return;
7288 }
7289 if (bits & PP_ATTR_REFERENCED) {
7290 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7291 }
7292 if (bits & PP_ATTR_MODIFIED) {
7293 allow_mode &= ~VM_PROT_WRITE;
7294 }
7295
7296 if (bits == PP_ATTR_NOENCRYPT) {
7297 /*
7298 * We short circuit this case; it should not need to
7299 * invoke arm_force_fast_fault, so just clear and
7300 * return. On ARM, this bit is just a debugging aid.
7301 */
7302 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7303 return;
7304 }
7305
7306 if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7307 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7308 }
7309 }
7310
7311 MARK_AS_PMAP_TEXT void
7312 phys_attribute_clear_internal(
7313 ppnum_t pn,
7314 unsigned int bits,
7315 int options,
7316 void *arg)
7317 {
7318 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7319 }
7320
7321 #if __ARM_RANGE_TLBI__
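/*
 * Clear the given attribute bits for every managed page mapped by the leaf
 * PTEs of a single twig-level region covering [start, end).
 *
 * Returns the VA at which processing stopped; this may be less than 'end' if
 * preemption is pending, in which case the caller is expected to re-drive the
 * operation from the returned address.
 */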
7322 MARK_AS_PMAP_TEXT static vm_map_address_t
7323 phys_attribute_clear_twig_internal(
7324 pmap_t pmap,
7325 vm_map_address_t start,
7326 vm_map_address_t end,
7327 unsigned int bits,
7328 unsigned int options,
7329 pmap_tlb_flush_range_t *flush_range)
7330 {
7331 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7332 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7333 assert(end >= start);
7334 assert((end - start) <= pt_attr_twig_size(pt_attr));
7335 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7336 vm_map_address_t va = start;
7337 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7338 tt_entry_t *tte_p;
7339 tte_p = pmap_tte(pmap, start);
7340 unsigned int npages = 0;
7341
7342 if (tte_p == (tt_entry_t *) NULL) {
7343 return end;
7344 }
7345
7346 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
7347 pte_p = (pt_entry_t *) ttetokv(*tte_p);
7348
7349 start_pte_p = &pte_p[pte_index(pt_attr, start)];
7350 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7351 assert(end_pte_p >= start_pte_p);
7352 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7353 if (__improbable(npages++ && pmap_pending_preemption())) {
7354 return va;
7355 }
7356 pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
7357 if (pa_valid(pa)) {
7358 ppnum_t pn = (ppnum_t) atop(pa);
7359 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7360 }
7361 }
7362 }
7363 return end;
7364 }
7365
7366 MARK_AS_PMAP_TEXT vm_map_address_t
7367 phys_attribute_clear_range_internal(
7368 pmap_t pmap,
7369 vm_map_address_t start,
7370 vm_map_address_t end,
7371 unsigned int bits,
7372 unsigned int options)
7373 {
7374 if (__improbable(end < start)) {
7375 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7376 }
7377 validate_pmap_mutable(pmap);
7378
7379 vm_map_address_t va = start;
7380 pmap_tlb_flush_range_t flush_range = {
7381 .ptfr_pmap = pmap,
7382 .ptfr_start = start,
7383 .ptfr_end = end,
7384 .ptfr_flush_needed = false
7385 };
7386
7387 pmap_lock(pmap, PMAP_LOCK_SHARED);
7388 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7389
7390 while (va < end) {
7391 vm_map_address_t curr_end;
7392
7393 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7394 if (curr_end > end) {
7395 curr_end = end;
7396 }
7397
7398 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7399 if ((va < curr_end) || pmap_pending_preemption()) {
7400 break;
7401 }
7402 }
7403 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7404 if (flush_range.ptfr_flush_needed) {
7405 flush_range.ptfr_end = va;
7406 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7407 flush_range.ptfr_start,
7408 flush_range.ptfr_end - flush_range.ptfr_start,
7409 flush_range.ptfr_pmap,
7410 true);
7411 sync_tlb_flush();
7412 }
7413 return va;
7414 }
7415
7416 static void
7417 phys_attribute_clear_range(
7418 pmap_t pmap,
7419 vm_map_address_t start,
7420 vm_map_address_t end,
7421 unsigned int bits,
7422 unsigned int options)
7423 {
7424 /*
7425 * We allow single-page requests to execute non-preemptibly,
7426 * as it doesn't make sense to sample AST_URGENT for a single-page
7427 * operation, and there are a couple of special use cases that
7428 * require a non-preemptible single-page operation.
7429 */
7430 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7431 pmap_verify_preemptible();
7432 }
7433
7434 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7435
7436 while (start < end) {
7437 #if XNU_MONITOR
7438 start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
7439 #else
7440 start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7441 #endif
7442 }
7443
7444 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7445 }
7446 #endif /* __ARM_RANGE_TLBI__ */
7447
7448 static void
7449 phys_attribute_clear(
7450 ppnum_t pn,
7451 unsigned int bits,
7452 int options,
7453 void *arg)
7454 {
7455 /*
7456 * Do we really want this tracepoint? It will be extremely chatty.
7457 * Also, should we have a corresponding trace point for the set path?
7458 */
7459 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7460
7461 #if XNU_MONITOR
7462 phys_attribute_clear_ppl(pn, bits, options, arg);
7463 #else
7464 phys_attribute_clear_internal(pn, bits, options, arg);
7465 #endif
7466
7467 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7468 }
7469
7470 /*
7471 * Set specified attribute bits.
7472 *
 * Set the cached value in the physical page attribute table because we
 * have no per-mapping hardware support for the referenced and
 * modified bits.
7476 */
7477 MARK_AS_PMAP_TEXT void
7478 phys_attribute_set_internal(
7479 ppnum_t pn,
7480 unsigned int bits)
7481 {
7482 pmap_paddr_t pa = ptoa(pn);
7483 assert(pn != vm_page_fictitious_addr);
7484
7485 #if XNU_MONITOR
7486 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7487 panic("%s: illegal request, "
7488 "pn=%u, bits=%#x",
7489 __FUNCTION__,
7490 pn, bits);
7491 }
7492 #endif
7493
7494 ppattr_pa_set_bits(pa, (uint16_t)bits);
7495
7496 return;
7497 }
7498
7499 static void
7500 phys_attribute_set(
7501 ppnum_t pn,
7502 unsigned int bits)
7503 {
7504 #if XNU_MONITOR
7505 phys_attribute_set_ppl(pn, bits);
7506 #else
7507 phys_attribute_set_internal(pn, bits);
7508 #endif
7509 }
7510
7511
7512 /*
7513 * Check specified attribute bits.
7514 *
 * Use the software-cached bits (there is no hardware support).
7516 */
7517 static boolean_t
7518 phys_attribute_test(
7519 ppnum_t pn,
7520 unsigned int bits)
7521 {
7522 pmap_paddr_t pa = ptoa(pn);
7523 assert(pn != vm_page_fictitious_addr);
7524 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7525 }
7526
7527
7528 /*
7529 * Set the modify/reference bits on the specified physical page.
7530 */
7531 void
7532 pmap_set_modify(ppnum_t pn)
7533 {
7534 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7535 }
7536
7537
7538 /*
7539 * Clear the modify bits on the specified physical page.
7540 */
7541 void
7542 pmap_clear_modify(
7543 ppnum_t pn)
7544 {
7545 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7546 }
7547
7548
7549 /*
7550 * pmap_is_modified:
7551 *
7552 * Return whether or not the specified physical page is modified
7553 * by any physical maps.
7554 */
7555 boolean_t
7556 pmap_is_modified(
7557 ppnum_t pn)
7558 {
7559 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7560 }
7561
7562
7563 /*
7564 * Set the reference bit on the specified physical page.
7565 */
7566 static void
7567 pmap_set_reference(
7568 ppnum_t pn)
7569 {
7570 phys_attribute_set(pn, PP_ATTR_REFERENCED);
7571 }
7572
7573 /*
7574 * Clear the reference bits on the specified physical page.
7575 */
7576 void
7577 pmap_clear_reference(
7578 ppnum_t pn)
7579 {
7580 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7581 }
7582
7583
7584 /*
7585 * pmap_is_referenced:
7586 *
7587 * Return whether or not the specified physical page is referenced
7588 * by any physical maps.
7589 */
7590 boolean_t
7591 pmap_is_referenced(
7592 ppnum_t pn)
7593 {
7594 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7595 }
7596
7597 /*
7598 * pmap_get_refmod(phys)
7599 * returns the referenced and modified bits of the specified
7600 * physical page.
7601 */
7602 unsigned int
7603 pmap_get_refmod(
7604 ppnum_t pn)
7605 {
7606 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7607 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7608 }
7609
7610 static inline unsigned int
7611 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7612 {
7613 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7614 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7615 }
7616
7617 /*
7618 * pmap_clear_refmod(phys, mask)
7619 * clears the referenced and modified bits as specified by the mask
7620 * of the specified physical page.
7621 */
7622 void
7623 pmap_clear_refmod_options(
7624 ppnum_t pn,
7625 unsigned int mask,
7626 unsigned int options,
7627 void *arg)
7628 {
7629 unsigned int bits;
7630
7631 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7632 phys_attribute_clear(pn, bits, options, arg);
7633 }
7634
7635 /*
7636 * Perform pmap_clear_refmod_options on a virtual address range.
 * The operation will be performed in bulk, and TLB flushes will be
 * coalesced if possible.
7639 *
7640 * Returns true if the operation is supported on this platform.
7641 * If this function returns false, the operation is not supported and
7642 * nothing has been modified in the pmap.
7643 */
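/*
 * Hypothetical caller sketch (not the actual VM logic; the per-page 'pn'
 * iteration is elided): fall back to the per-page interface when the ranged
 * operation is unsupported.
 *
 *	if (!pmap_clear_refmod_range_options(pmap, start, end, VM_MEM_MODIFIED, 0)) {
 *		... clear each page individually via pmap_clear_refmod(pn, VM_MEM_MODIFIED) ...
 *	}
 */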
7644 bool
7645 pmap_clear_refmod_range_options(
7646 pmap_t pmap __unused,
7647 vm_map_address_t start __unused,
7648 vm_map_address_t end __unused,
7649 unsigned int mask __unused,
7650 unsigned int options __unused)
7651 {
7652 #if __ARM_RANGE_TLBI__
7653 unsigned int bits;
7654 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7655 phys_attribute_clear_range(pmap, start, end, bits, options);
7656 return true;
7657 #else /* __ARM_RANGE_TLBI__ */
7658 #pragma unused(pmap, start, end, mask, options)
7659 /*
7660 * This operation allows the VM to bulk modify refmod bits on a virtually
 * contiguous range of addresses. This is a large performance improvement on
 * platforms that support ranged TLBI instructions. On older platforms we can
 * only flush per page or flush the entire ASID, so we currently support this
 * operation only on platforms with ranged TLBI instructions; on other
 * platforms we require that the VM modify the bits on a per-page basis.
7667 */
7668 return false;
7669 #endif /* __ARM_RANGE_TLBI__ */
7670 }
7671
7672 void
7673 pmap_clear_refmod(
7674 ppnum_t pn,
7675 unsigned int mask)
7676 {
7677 pmap_clear_refmod_options(pn, mask, 0, NULL);
7678 }
7679
7680 unsigned int
7681 pmap_disconnect_options(
7682 ppnum_t pn,
7683 unsigned int options,
7684 void *arg)
7685 {
7686 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7687 /*
7688 * On ARM, the "modified" bit is managed by software, so
7689 * we know up-front if the physical page is "modified",
7690 * without having to scan all the PTEs pointing to it.
 * The caller should have made the VM page "busy" so no one
7692 * should be able to establish any new mapping and "modify"
7693 * the page behind us.
7694 */
7695 if (pmap_is_modified(pn)) {
7696 /*
7697 * The page has been modified and will be sent to
7698 * the VM compressor.
7699 */
7700 options |= PMAP_OPTIONS_COMPRESSOR;
7701 } else {
7702 /*
7703 * The page hasn't been modified and will be freed
7704 * instead of compressed.
7705 */
7706 }
7707 }
7708
7709 /* disconnect the page */
7710 pmap_page_protect_options(pn, 0, options, arg);
7711
7712 /* return ref/chg status */
7713 return pmap_get_refmod(pn);
7714 }
7715
7716 /*
7717 * Routine:
7718 * pmap_disconnect
7719 *
7720 * Function:
7721 * Disconnect all mappings for this page and return reference and change status
7722 * in generic format.
7723 *
7724 */
7725 unsigned int
7726 pmap_disconnect(
7727 ppnum_t pn)
7728 {
7729 pmap_page_protect(pn, 0); /* disconnect the page */
7730 return pmap_get_refmod(pn); /* return ref/chg status */
7731 }
7732
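/*
 * Return whether the inclusive physical page range [first, last] may overlap
 * the pmap-managed region bounded by vm_first_phys and vm_last_phys. This is
 * only a bounds check; it does not examine individual pages.
 */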
7733 boolean_t
7734 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7735 {
7736 if (ptoa(first) >= vm_last_phys) {
7737 return FALSE;
7738 }
7739 if (ptoa(last) < vm_first_phys) {
7740 return FALSE;
7741 }
7742
7743 return TRUE;
7744 }
7745
7746 /*
7747 * The state maintained by the noencrypt functions is used as a
7748 * debugging aid on ARM. This incurs some overhead on the part
7749 * of the caller. A special case check in phys_attribute_clear
7750 * (the most expensive path) currently minimizes this overhead,
7751 * but stubbing these functions out on RELEASE kernels yields
7752 * further wins.
7753 */
7754 boolean_t
7755 pmap_is_noencrypt(
7756 ppnum_t pn)
7757 {
7758 #if DEVELOPMENT || DEBUG
7759 boolean_t result = FALSE;
7760
7761 if (!pa_valid(ptoa(pn))) {
7762 return FALSE;
7763 }
7764
7765 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7766
7767 return result;
7768 #else
7769 #pragma unused(pn)
7770 return FALSE;
7771 #endif
7772 }
7773
7774 void
7775 pmap_set_noencrypt(
7776 ppnum_t pn)
7777 {
7778 #if DEVELOPMENT || DEBUG
7779 if (!pa_valid(ptoa(pn))) {
7780 return;
7781 }
7782
7783 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7784 #else
7785 #pragma unused(pn)
7786 #endif
7787 }
7788
7789 void
7790 pmap_clear_noencrypt(
7791 ppnum_t pn)
7792 {
7793 #if DEVELOPMENT || DEBUG
7794 if (!pa_valid(ptoa(pn))) {
7795 return;
7796 }
7797
7798 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7799 #else
7800 #pragma unused(pn)
7801 #endif
7802 }
7803
7804 #if XNU_MONITOR
7805 boolean_t
7806 pmap_is_monitor(ppnum_t pn)
7807 {
7808 assert(pa_valid(ptoa(pn)));
7809 return phys_attribute_test(pn, PP_ATTR_MONITOR);
7810 }
7811 #endif
7812
7813 void
7814 pmap_lock_phys_page(ppnum_t pn)
7815 {
7816 #if !XNU_MONITOR
7817 unsigned int pai;
7818 pmap_paddr_t phys = ptoa(pn);
7819
7820 if (pa_valid(phys)) {
7821 pai = pa_index(phys);
7822 pvh_lock(pai);
7823 } else
7824 #else
7825 (void)pn;
7826 #endif
7827 { simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
7828 }
7829
7830
7831 void
7832 pmap_unlock_phys_page(ppnum_t pn)
7833 {
7834 #if !XNU_MONITOR
7835 unsigned int pai;
7836 pmap_paddr_t phys = ptoa(pn);
7837
7838 if (pa_valid(phys)) {
7839 pai = pa_index(phys);
7840 pvh_unlock(pai);
7841 } else
7842 #else
7843 (void)pn;
7844 #endif
7845 { simple_unlock(&phys_backup_lock);}
7846 }
7847
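/*
 * Switch the current CPU to the given pmap's user translation table base,
 * updating the per-CPU nested pmap state. On the ARMv8 path this programs
 * TTBR0 with the pmap's table root and hardware ASID (and TCR when mixed
 * page sizes are in use); for the kernel pmap the user TTB is cleared instead.
 */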
7848 MARK_AS_PMAP_TEXT static void
7849 pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
7850 {
7851 #if (__ARM_VMSA__ == 7)
7852 cpu_data_ptr->cpu_user_pmap = pmap;
7853 cpu_data_ptr->cpu_user_pmap_stamp = pmap->stamp;
7854 if (pmap != kernel_pmap) {
7855 cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7856 }
7857
7858 #if MACH_ASSERT && __ARM_USER_PROTECT__
7859 {
7860 unsigned int ttbr0_val, ttbr1_val;
7861 __asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val));
7862 __asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val));
7863 if (ttbr0_val != ttbr1_val) {
7864 panic("Misaligned ttbr0 %08X", ttbr0_val);
7865 }
7866 if (pmap->ttep & 0x1000) {
7867 panic("Misaligned ttbr0 %08X", pmap->ttep);
7868 }
7869 }
7870 #endif
7871 #if !__ARM_USER_PROTECT__
7872 set_mmu_ttb(pmap->ttep);
7873 set_context_id(pmap->hw_asid);
7874 #endif
7875
7876 #else /* (__ARM_VMSA__ == 7) */
7877
7878 if (pmap != kernel_pmap) {
7879 cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7880 cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
7881 NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
7882 cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
7883 cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
7884 #if __ARM_MIXED_PAGE_SIZE__
7885 cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
7886 #endif
7887 }
7888
7889
7890 #if __ARM_MIXED_PAGE_SIZE__
7891 if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
7892 set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
7893 }
7894 #endif /* __ARM_MIXED_PAGE_SIZE__ */
7895
7896
7897 if (pmap != kernel_pmap) {
7898 set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
7899 } else if (!pmap_user_ttb_is_clear()) {
7900 pmap_clear_user_ttb_internal();
7901 }
7902 #endif /* (__ARM_VMSA__ == 7) */
7903 }
7904
7905 MARK_AS_PMAP_TEXT void
7906 pmap_clear_user_ttb_internal(void)
7907 {
7908 #if (__ARM_VMSA__ > 7)
7909 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7910 #else
7911 set_mmu_ttb(kernel_pmap->ttep);
7912 #endif
7913 }
7914
7915 void
7916 pmap_clear_user_ttb(void)
7917 {
7918 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7919 #if XNU_MONITOR
7920 pmap_clear_user_ttb_ppl();
7921 #else
7922 pmap_clear_user_ttb_internal();
7923 #endif
7924 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7925 }
7926
7927
7928 #if defined(__arm64__)
7929 /*
7930 * Marker for use in multi-pass fast-fault PV list processing.
7931 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7932 * these functions, as compressed PTEs should never be present in PV lists.
7933 * Note that this only holds true for arm64; for arm32 we don't have enough
7934 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7935 * and WRITEABLE marker depending on whether the PTE is valid.
7936 */
7937 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7938 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7939 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7940 #endif
7941
7942
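/*
 * Downgrade all mappings of the given physical page so that accesses not
 * permitted by 'allow_mode' will take a software fault, allowing the ref/mod
 * state to be gathered again in arm_fast_fault().
 *
 * The PV list is walked in two passes: pass 1 rewrites PTEs and marks those
 * that will require TLB invalidation, and pass 2 issues the invalidations
 * (or defers them to the caller when 'flush_range' covers the mapping).
 *
 * Returns FALSE if the page is unmanaged or a wired mapping prevented the
 * update; TRUE otherwise.
 */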
7943 MARK_AS_PMAP_TEXT static boolean_t
7944 arm_force_fast_fault_with_flush_range(
7945 ppnum_t ppnum,
7946 vm_prot_t allow_mode,
7947 int options,
7948 pmap_tlb_flush_range_t *flush_range)
7949 {
7950 pmap_paddr_t phys = ptoa(ppnum);
7951 pv_entry_t *pve_p;
7952 pt_entry_t *pte_p;
7953 unsigned int pai;
7954 unsigned int pass1_updated = 0;
7955 unsigned int pass2_updated = 0;
7956 boolean_t result;
7957 pv_entry_t **pv_h;
7958 bool is_reusable;
7959 bool ref_fault;
7960 bool mod_fault;
7961 bool clear_write_fault = false;
7962 bool ref_aliases_mod = false;
7963 bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
7964
7965 assert(ppnum != vm_page_fictitious_addr);
7966
7967 if (!pa_valid(phys)) {
7968 return FALSE; /* Not a managed page. */
7969 }
7970
7971 result = TRUE;
7972 ref_fault = false;
7973 mod_fault = false;
7974 pai = pa_index(phys);
7975 if (__probable(mustsynch)) {
7976 pvh_lock(pai);
7977 }
7978 pv_h = pai_to_pvh(pai);
7979
7980 #if XNU_MONITOR
7981 if (__improbable(ppattr_pa_test_monitor(phys))) {
7982 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
7983 }
7984 #endif
7985 pte_p = PT_ENTRY_NULL;
7986 pve_p = PV_ENTRY_NULL;
7987 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
7988 pte_p = pvh_ptep(pv_h);
7989 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
7990 pve_p = pvh_pve_list(pv_h);
7991 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
7992 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
7993 }
7994
7995 is_reusable = ppattr_test_reusable(pai);
7996
7997 /*
7998 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
7999 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
8000 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
8001 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
8002 * operation, TLB invalidation may be handled by the caller so it's possible for
8003 * tlb_flush_needed to be true while issue_tlbi is false.
8004 */
8005 bool issue_tlbi = false;
8006 bool tlb_flush_needed = false;
8007
8008 pv_entry_t *orig_pve_p = pve_p;
8009 pt_entry_t *orig_pte_p = pte_p;
8010 int pve_ptep_idx = 0;
8011
8012 /*
8013 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8014 * TLB invalidation in pass 2.
8015 */
8016 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8017 pt_entry_t spte;
8018 pt_entry_t tmplate;
8019
8020 if (pve_p != PV_ENTRY_NULL) {
8021 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8022 if (pte_p == PT_ENTRY_NULL) {
8023 goto fff_skip_pve_pass1;
8024 }
8025 }
8026
8027 #ifdef PVH_FLAG_IOMMU
8028 if (pvh_ptep_is_iommu(pte_p)) {
8029 goto fff_skip_pve_pass1;
8030 }
8031 #endif
8032 if (*pte_p == ARM_PTE_EMPTY) {
8033 panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8034 }
8035 if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
8036 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8037 }
8038
8039 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8040 const pmap_t pmap = ptdp->pmap;
8041 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8042 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8043
8044 assert(va >= pmap->min && va < pmap->max);
8045
8046 /* update pmap stats and ledgers */
8047 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8048 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8049 if (is_altacct) {
8050 /*
8051 * We do not track "reusable" status for
8052 * "alternate accounting" mappings.
8053 */
8054 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8055 is_reusable &&
8056 is_internal &&
8057 pmap != kernel_pmap) {
8058 /* one less "reusable" */
8059 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8060 /* one more "internal" */
8061 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8062 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8063
8064 /*
8065 * Since the page is being marked non-reusable, we assume that it will be
8066 * modified soon. Avoid the cost of another trap to handle the fast
8067 * fault when we next write to this page.
8068 */
8069 clear_write_fault = true;
8070 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8071 !is_reusable &&
8072 is_internal &&
8073 pmap != kernel_pmap) {
8074 /* one more "reusable" */
8075 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8076 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8077 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8078 }
8079
8080 bool wiredskip = pte_is_wired(*pte_p) &&
8081 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8082
8083 if (wiredskip) {
8084 result = FALSE;
8085 goto fff_skip_pve_pass1;
8086 }
8087
8088 spte = *pte_p;
8089 tmplate = spte;
8090
8091 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8092 /* read protection sets the pte to fault */
8093 tmplate = tmplate & ~ARM_PTE_AF;
8094 ref_fault = true;
8095 }
8096 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8097 /* take away write permission if set */
8098 if (pmap == kernel_pmap) {
8099 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8100 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8101 pte_set_was_writeable(tmplate, true);
8102 mod_fault = true;
8103 }
8104 } else {
8105 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8106 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8107 pte_set_was_writeable(tmplate, true);
8108 mod_fault = true;
8109 }
8110 }
8111 }
8112
8113 #if MACH_ASSERT && XNU_MONITOR
8114 if (is_pte_xprr_protected(pmap, spte)) {
8115 if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8116 panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8117 "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
8118 __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8119 ppnum, options, allow_mode);
8120 }
8121 }
8122 #endif /* MACH_ASSERT && XNU_MONITOR */
8123
8124 if (result && (tmplate != spte)) {
8125 if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
8126 !(options & PMAP_OPTIONS_NOFLUSH)) {
8127 tlb_flush_needed = true;
8128 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8129 va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
8130 #ifdef ARM_PTE_FF_MARKER
8131 assert(!(spte & ARM_PTE_FF_MARKER));
8132 tmplate |= ARM_PTE_FF_MARKER;
8133 ++pass1_updated;
8134 #endif
8135 issue_tlbi = true;
8136 }
8137 }
8138 write_pte_fast(pte_p, tmplate);
8139 }
8140
8141 fff_skip_pve_pass1:
8142 pte_p = PT_ENTRY_NULL;
8143 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8144 pve_ptep_idx = 0;
8145 pve_p = pve_next(pve_p);
8146 }
8147 }
8148
8149 if (tlb_flush_needed) {
8150 FLUSH_PTE_STRONG();
8151 }
8152
8153 if (!issue_tlbi) {
8154 goto fff_finish;
8155 }
8156
8157 /* Pass 2: Issue any required TLB invalidations */
8158 pve_p = orig_pve_p;
8159 pte_p = orig_pte_p;
8160 pve_ptep_idx = 0;
8161
8162 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8163 if (pve_p != PV_ENTRY_NULL) {
8164 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8165 if (pte_p == PT_ENTRY_NULL) {
8166 goto fff_skip_pve_pass2;
8167 }
8168 }
8169
8170 #ifdef PVH_FLAG_IOMMU
8171 if (pvh_ptep_is_iommu(pte_p)) {
8172 goto fff_skip_pve_pass2;
8173 }
8174 #endif
8175
8176 #ifdef ARM_PTE_FF_MARKER
8177 pt_entry_t spte = *pte_p;
8178
8179 if (!(spte & ARM_PTE_FF_MARKER)) {
8180 goto fff_skip_pve_pass2;
8181 } else {
8182 spte &= (~ARM_PTE_FF_MARKER);
8183 /* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8184 write_pte_fast(pte_p, spte);
8185 ++pass2_updated;
8186 }
8187 #endif
8188 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8189 const pmap_t pmap = ptdp->pmap;
8190 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8191
8192 if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8193 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8194 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
8195 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8196 }
8197
8198 fff_skip_pve_pass2:
8199 pte_p = PT_ENTRY_NULL;
8200 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8201 pve_ptep_idx = 0;
8202 pve_p = pve_next(pve_p);
8203 }
8204 }
8205
8206 fff_finish:
8207 if (__improbable(pass1_updated != pass2_updated)) {
8208 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8209 __func__, pass1_updated, pass2_updated);
8210 }
8211
8212 /*
8213 * If we are using the same approach for ref and mod
8214 * faults on this PTE, do not clear the write fault;
8215 * this would cause both ref and mod to be set on the
8216 * page again, and prevent us from taking ANY read/write
8217 * fault on the mapping.
8218 */
8219 if (clear_write_fault && !ref_aliases_mod) {
8220 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
8221 }
8222 if (tlb_flush_needed) {
8223 if (flush_range) {
8224 /* Delayed flush. Signal to the caller that the flush is needed. */
8225 flush_range->ptfr_flush_needed = true;
8226 } else {
8227 sync_tlb_flush();
8228 }
8229 }
8230
8231 /* update global "reusable" status for this page */
8232 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8233 ppattr_clear_reusable(pai);
8234 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8235 ppattr_set_reusable(pai);
8236 }
8237
8238 if (mod_fault) {
8239 ppattr_set_modfault(pai);
8240 }
8241 if (ref_fault) {
8242 ppattr_set_reffault(pai);
8243 }
8244 if (__probable(mustsynch)) {
8245 pvh_unlock(pai);
8246 }
8247 return result;
8248 }
8249
8250 MARK_AS_PMAP_TEXT boolean_t
8251 arm_force_fast_fault_internal(
8252 ppnum_t ppnum,
8253 vm_prot_t allow_mode,
8254 int options)
8255 {
8256 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8257 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8258 }
8259 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8260 }
8261
8262 /*
8263 * Routine: arm_force_fast_fault
8264 *
8265 * Function:
8266 * Force all mappings for this page to fault according
8267 * to the access modes allowed, so we can gather ref/modify
8268 * bits again.
8269 */
8270
8271 boolean_t
8272 arm_force_fast_fault(
8273 ppnum_t ppnum,
8274 vm_prot_t allow_mode,
8275 int options,
8276 __unused void *arg)
8277 {
8278 pmap_paddr_t phys = ptoa(ppnum);
8279
8280 assert(ppnum != vm_page_fictitious_addr);
8281
8282 if (!pa_valid(phys)) {
8283 return FALSE; /* Not a managed page. */
8284 }
8285
8286 #if XNU_MONITOR
8287 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8288 #else
8289 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8290 #endif
8291 }
8292
8293 /*
8294 * Routine: arm_clear_fast_fault
8295 *
8296 * Function:
8297 * Clear pending force fault for all mappings for this page based on
8298 * the observed fault type, update ref/modify bits.
8299 */
8300 MARK_AS_PMAP_TEXT static boolean_t
8301 arm_clear_fast_fault(
8302 ppnum_t ppnum,
8303 vm_prot_t fault_type,
8304 pt_entry_t *pte_p)
8305 {
8306 pmap_paddr_t pa = ptoa(ppnum);
8307 pv_entry_t *pve_p;
8308 unsigned int pai;
8309 boolean_t result;
8310 bool tlb_flush_needed = false;
8311 pv_entry_t **pv_h;
8312 unsigned int npve = 0;
8313 unsigned int pass1_updated = 0;
8314 unsigned int pass2_updated = 0;
8315
8316 assert(ppnum != vm_page_fictitious_addr);
8317
8318 if (!pa_valid(pa)) {
8319 return FALSE; /* Not a managed page. */
8320 }
8321
8322 result = FALSE;
8323 pai = pa_index(pa);
8324 pvh_assert_locked(pai);
8325 pv_h = pai_to_pvh(pai);
8326
8327 pve_p = PV_ENTRY_NULL;
8328 if (pte_p == PT_ENTRY_NULL) {
8329 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8330 pte_p = pvh_ptep(pv_h);
8331 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8332 pve_p = pvh_pve_list(pv_h);
8333 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8334 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
8335 }
8336 }
8337
8338 pv_entry_t *orig_pve_p = pve_p;
8339 pt_entry_t *orig_pte_p = pte_p;
8340 int pve_ptep_idx = 0;
8341
8342 /*
8343 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8344 * TLB invalidation in pass 2.
8345 */
8346 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8347 pt_entry_t spte;
8348 pt_entry_t tmplate;
8349
8350 if (pve_p != PV_ENTRY_NULL) {
8351 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8352 if (pte_p == PT_ENTRY_NULL) {
8353 goto cff_skip_pve_pass1;
8354 }
8355 }
8356
8357 #ifdef PVH_FLAG_IOMMU
8358 if (pvh_ptep_is_iommu(pte_p)) {
8359 goto cff_skip_pve_pass1;
8360 }
8361 #endif
8362 if (*pte_p == ARM_PTE_EMPTY) {
8363 panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8364 }
8365
8366 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8367 const pmap_t pmap = ptdp->pmap;
8368 __assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8369
8370 assert(va >= pmap->min && va < pmap->max);
8371
8372 spte = *pte_p;
8373 tmplate = spte;
8374
8375 if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8376 {
8377 if (pmap == kernel_pmap) {
8378 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8379 } else {
8380 assert(pmap->type != PMAP_TYPE_NESTED);
8381 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8382 }
8383 }
8384
8385 tmplate |= ARM_PTE_AF;
8386
8387 pte_set_was_writeable(tmplate, false);
8388 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8389 } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8390 tmplate = spte | ARM_PTE_AF;
8391
8392 {
8393 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
8394 }
8395 }
8396
8397 #if MACH_ASSERT && XNU_MONITOR
8398 if (is_pte_xprr_protected(pmap, spte)) {
8399 if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8400 panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8401 "ppnum=0x%x, fault_type=0x%x",
8402 __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8403 ppnum, fault_type);
8404 }
8405 }
8406 #endif /* MACH_ASSERT && XNU_MONITOR */
8407
8408 assert(spte != ARM_PTE_TYPE_FAULT);
8409 if (spte != tmplate) {
8410 if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
8411 #ifdef ARM_PTE_FF_MARKER
8412 assert(!(spte & ARM_PTE_FF_MARKER));
8413 tmplate |= ARM_PTE_FF_MARKER;
8414 ++pass1_updated;
8415 #endif
8416 tlb_flush_needed = true;
8417 }
8418 write_pte_fast(pte_p, tmplate);
8419 result = TRUE;
8420 }
8421
8422 cff_skip_pve_pass1:
8423 pte_p = PT_ENTRY_NULL;
8424 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8425 pve_ptep_idx = 0;
8426 pve_p = pve_next(pve_p);
8427 ++npve;
8428 if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8429 break;
8430 }
8431 }
8432 }
8433
8434 if (!tlb_flush_needed) {
8435 goto cff_finish;
8436 }
8437
8438 FLUSH_PTE_STRONG();
8439
8440 /* Pass 2: Issue any required TLB invalidations */
8441 pve_p = orig_pve_p;
8442 pte_p = orig_pte_p;
8443 pve_ptep_idx = 0;
8444 npve = 0;
8445
8446 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8447 if (pve_p != PV_ENTRY_NULL) {
8448 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8449 if (pte_p == PT_ENTRY_NULL) {
8450 goto cff_skip_pve_pass2;
8451 }
8452 }
8453
8454 #ifdef PVH_FLAG_IOMMU
8455 if (pvh_ptep_is_iommu(pte_p)) {
8456 goto cff_skip_pve_pass2;
8457 }
8458 #endif
8459
8460 #ifdef ARM_PTE_FF_MARKER
8461 pt_entry_t spte = *pte_p;
8462
8463 if (!(spte & ARM_PTE_FF_MARKER)) {
8464 goto cff_skip_pve_pass2;
8465 } else {
8466 spte &= (~ARM_PTE_FF_MARKER);
8467 /* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8468 write_pte_fast(pte_p, spte);
8469 ++pass2_updated;
8470 }
8471 #endif
8472 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8473 const pmap_t pmap = ptdp->pmap;
8474 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8475
8476 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8477
8478 cff_skip_pve_pass2:
8479 pte_p = PT_ENTRY_NULL;
8480 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8481 pve_ptep_idx = 0;
8482 pve_p = pve_next(pve_p);
8483 ++npve;
8484 if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8485 break;
8486 }
8487 }
8488 }
8489
8490 cff_finish:
8491 if (__improbable(pass1_updated != pass2_updated)) {
8492 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8493 __func__, pass1_updated, pass2_updated);
8494 }
8495 if (tlb_flush_needed) {
8496 sync_tlb_flush();
8497 }
8498 return result;
8499 }
8500
8501 /*
8502 * Determine if the fault was induced by software tracking of
8503 * modify/reference bits. If so, re-enable the mapping (and set
8504 * the appropriate bits).
8505 *
8506 * Returns KERN_SUCCESS if the fault was induced and was
8507 * successfully handled.
8508 *
8509 * Returns KERN_FAILURE if the fault was not induced and
8510 * the function was unable to deal with it.
8511 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8513 * disallows this type of access.
8514 */
8515 MARK_AS_PMAP_TEXT kern_return_t
8516 arm_fast_fault_internal(
8517 pmap_t pmap,
8518 vm_map_address_t va,
8519 vm_prot_t fault_type,
8520 __unused bool was_af_fault,
8521 __unused bool from_user)
8522 {
8523 kern_return_t result = KERN_FAILURE;
8524 pt_entry_t *ptep;
8525 pt_entry_t spte = ARM_PTE_TYPE_FAULT;
8526 unsigned int pai;
8527 pmap_paddr_t pa;
8528 validate_pmap_mutable(pmap);
8529
8530 pmap_lock(pmap, PMAP_LOCK_SHARED);
8531
8532 /*
8533 * If the entry doesn't exist, is completely invalid, or is already
8534 * valid, we can't fix it here.
8535 */
8536
8537 const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8538 ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8539 if (ptep != PT_ENTRY_NULL) {
8540 while (true) {
8541 spte = *((volatile pt_entry_t*)ptep);
8542
8543 pa = pte_to_pa(spte);
8544
8545 if ((spte == ARM_PTE_TYPE_FAULT) ||
8546 ARM_PTE_IS_COMPRESSED(spte, ptep)) {
8547 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8548 return result;
8549 }
8550
8551 if (!pa_valid(pa)) {
8552 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8553 #if XNU_MONITOR
8554 if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
8555 return KERN_PROTECTION_FAILURE;
8556 } else
8557 #endif
8558 return result;
8559 }
8560 pai = pa_index(pa);
8561 pvh_lock(pai);
8562 if (*ptep == spte) {
8563 /*
8564 * Double-check the spte value, as we care about the AF bit.
8565 * It's also possible that pmap_page_protect() transitioned the
8566 * PTE to compressed/empty before we grabbed the PVH lock.
8567 */
8568 break;
8569 }
8570 pvh_unlock(pai);
8571 }
8572 } else {
8573 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8574 return result;
8575 }
8576
8577
8578 if ((result != KERN_SUCCESS) &&
8579 ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
8580 /*
8581 * An attempted access will always clear ref/mod fault state, as
8582 * appropriate for the fault type. arm_clear_fast_fault will
8583 * update the associated PTEs for the page as appropriate; if
8584 * any PTEs are updated, we redrive the access. If the mapping
8585 * does not actually allow for the attempted access, the
8586 * following fault will (hopefully) fail to update any PTEs, and
8587 * thus cause arm_fast_fault to decide that it failed to handle
8588 * the fault.
8589 */
8590 if (ppattr_test_reffault(pai)) {
8591 ppattr_clear_reffault(pai);
8592 }
8593 if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
8594 ppattr_clear_modfault(pai);
8595 }
8596
8597 if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
8598 /*
8599 * Should this preserve KERN_PROTECTION_FAILURE? The
 * cost of not doing so is another fault in a case
8601 * that should already result in an exception.
8602 */
8603 result = KERN_SUCCESS;
8604 }
8605 }
8606
8607 /*
8608 * If the PTE already has sufficient permissions, we can report the fault as handled.
8609 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
 * on mappings of the same page.
8611 */
8612 if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8613 uintptr_t ap_ro, ap_rw, ap_x;
8614 if (pmap == kernel_pmap) {
8615 ap_ro = ARM_PTE_AP(AP_RONA);
8616 ap_rw = ARM_PTE_AP(AP_RWNA);
8617 ap_x = ARM_PTE_NX;
8618 } else {
8619 ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8620 ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8621 ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8622 }
8623 /*
8624 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8625 * hardware they may be xPRR-protected, in which case they'll be handled
8626 * by the is_pte_xprr_protected() case above. Additionally, the exception
8627 * handling path currently does not call arm_fast_fault() without at least
8628 * VM_PROT_READ in fault_type.
8629 */
8630 if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8631 (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8632 if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8633 result = KERN_SUCCESS;
8634 }
8635 }
8636 }
8637
8638 if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
8639 /*
8640 * A prior arm_clear_fast_fault() operation may have returned early due to
8641 * another pending PV list operation or an excessively large PV list.
8642 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8643 * taking a fault on the same mapping.
8644 */
8645 result = KERN_SUCCESS;
8646 }
8647
8648 pvh_unlock(pai);
8649 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8650 return result;
8651 }
8652
8653 kern_return_t
8654 arm_fast_fault(
8655 pmap_t pmap,
8656 vm_map_address_t va,
8657 vm_prot_t fault_type,
8658 bool was_af_fault,
8659 __unused bool from_user)
8660 {
8661 kern_return_t result = KERN_FAILURE;
8662
8663 if (va < pmap->min || va >= pmap->max) {
8664 return result;
8665 }
8666
8667 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8668 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8669 from_user);
8670
8671 #if (__ARM_VMSA__ == 7)
8672 if (pmap != kernel_pmap) {
8673 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
8674 pmap_t cur_pmap;
8675 pmap_t cur_user_pmap;
8676
8677 cur_pmap = current_pmap();
8678 cur_user_pmap = cpu_data_ptr->cpu_user_pmap;
8679
8680 if ((cur_user_pmap == cur_pmap) && (cur_pmap == pmap)) {
8681 if (cpu_data_ptr->cpu_user_pmap_stamp != pmap->stamp) {
8682 pmap_set_pmap(pmap, current_thread());
8683 result = KERN_SUCCESS;
8684 goto done;
8685 }
8686 }
8687 }
8688 #endif
8689
8690 #if XNU_MONITOR
8691 result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8692 #else
8693 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8694 #endif
8695
8696 #if (__ARM_VMSA__ == 7)
8697 done:
8698 #endif
8699
8700 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8701
8702 return result;
8703 }
8704
8705 void
8706 pmap_copy_page(
8707 ppnum_t psrc,
8708 ppnum_t pdst)
8709 {
8710 bcopy_phys((addr64_t) (ptoa(psrc)),
8711 (addr64_t) (ptoa(pdst)),
8712 PAGE_SIZE);
8713 }
8714
8715
8716 /*
 * pmap_copy_part_page copies part of the specified (machine independent) pages.
8718 */
8719 void
8720 pmap_copy_part_page(
8721 ppnum_t psrc,
8722 vm_offset_t src_offset,
8723 ppnum_t pdst,
8724 vm_offset_t dst_offset,
8725 vm_size_t len)
8726 {
8727 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8728 (addr64_t) (ptoa(pdst) + dst_offset),
8729 len);
8730 }
8731
8732
8733 /*
8734 * pmap_zero_page zeros the specified (machine independent) page.
8735 */
8736 void
8737 pmap_zero_page(
8738 ppnum_t pn)
8739 {
8740 assert(pn != vm_page_fictitious_addr);
8741 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8742 }
8743
8744 /*
8745 * pmap_zero_part_page
8746 * zeros the specified (machine independent) part of a page.
8747 */
8748 void
8749 pmap_zero_part_page(
8750 ppnum_t pn,
8751 vm_offset_t offset,
8752 vm_size_t len)
8753 {
8754 assert(pn != vm_page_fictitious_addr);
8755 assert(offset + len <= PAGE_SIZE);
8756 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8757 }
8758
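/*
 * Map the low-memory globals page (lowGlo) at its fixed LOWGLOBAL_ALIAS
 * address as a read-only, non-executable, writeback-cacheable kernel mapping.
 */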
8759 void
8760 pmap_map_globals(
8761 void)
8762 {
8763 pt_entry_t *ptep, pte;
8764
8765 ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
8766 assert(ptep != PT_ENTRY_NULL);
8767 assert(*ptep == ARM_PTE_EMPTY);
8768
8769 pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
8770 #if __ARM_KERNEL_PROTECT__
8771 pte |= ARM_PTE_NG;
8772 #endif /* __ARM_KERNEL_PROTECT__ */
8773 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8774 #if (__ARM_VMSA__ > 7)
8775 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8776 #else
8777 pte |= ARM_PTE_SH;
8778 #endif
8779 *ptep = pte;
8780 FLUSH_PTE();
8781 PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);
8782
8783 #if KASAN
8784 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8785 #endif
8786 }
8787
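/*
 * Return the kernel virtual address of per-CPU copy window 'index' for the
 * given CPU number; panics if 'index' is not below CPUWINDOWS_MAX.
 */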
8788 vm_offset_t
8789 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8790 {
8791 if (__improbable(index >= CPUWINDOWS_MAX)) {
8792 panic("%s: invalid index %u", __func__, index);
8793 }
8794 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8795 }
8796
8797 MARK_AS_PMAP_TEXT unsigned int
8798 pmap_map_cpu_windows_copy_internal(
8799 ppnum_t pn,
8800 vm_prot_t prot,
8801 unsigned int wimg_bits)
8802 {
8803 pt_entry_t *ptep = NULL, pte;
8804 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8805 unsigned int cpu_num;
8806 unsigned int i;
8807 vm_offset_t cpu_copywindow_vaddr = 0;
8808 bool need_strong_sync = false;
8809
8810 #if XNU_MONITOR
8811 unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
8812 need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
8813 #endif
8814
8815 #if XNU_MONITOR
8816 #ifdef __ARM_COHERENT_IO__
8817 if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
8818 panic("%s: attempted to map a managed page, "
8819 "pn=%u, prot=0x%x, wimg_bits=0x%x",
8820 __FUNCTION__,
8821 pn, prot, wimg_bits);
8822 }
8823 if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
8824 panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
8825 }
8826
8827 #else /* __ARM_COHERENT_IO__ */
8828 #error CPU copy windows are not properly supported with both the PPL and incoherent IO
8829 #endif /* __ARM_COHERENT_IO__ */
8830 #endif /* XNU_MONITOR */
8831 cpu_num = pmap_cpu_data->cpu_number;
8832
8833 for (i = 0; i < CPUWINDOWS_MAX; i++) {
8834 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
8835 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8836 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
8837 if (*ptep == ARM_PTE_TYPE_FAULT) {
8838 break;
8839 }
8840 }
8841 if (i == CPUWINDOWS_MAX) {
8842 panic("pmap_map_cpu_windows_copy: out of CPU copy windows");
8843 }
8844
8845 pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8846 #if __ARM_KERNEL_PROTECT__
8847 pte |= ARM_PTE_NG;
8848 #endif /* __ARM_KERNEL_PROTECT__ */
8849
8850 pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8851
8852 if (prot & VM_PROT_WRITE) {
8853 pte |= ARM_PTE_AP(AP_RWNA);
8854 } else {
8855 pte |= ARM_PTE_AP(AP_RONA);
8856 }
8857
8858 write_pte_fast(ptep, pte);
8859 /*
8860 * Invalidate the TLB. This also covers reuse of cpu_copywindow_vaddr by this mapping when it
8861 * interrupts pmap_unmap_cpu_windows_copy() between clearing the PTE and its TLB invalidate.
8862 */
8863 FLUSH_PTE_STRONG();
8864 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
8865 pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;
8866
8867 return i;
8868 }
8869
8870 unsigned int
8871 pmap_map_cpu_windows_copy(
8872 ppnum_t pn,
8873 vm_prot_t prot,
8874 unsigned int wimg_bits)
8875 {
8876 #if XNU_MONITOR
8877 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8878 #else
8879 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8880 #endif
8881 }
8882
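/**
 * Tear down the mapping previously established by pmap_map_cpu_windows_copy()
 * in the per-CPU copy window identified by 'index' on the current CPU.
 */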
8883 MARK_AS_PMAP_TEXT void
8884 pmap_unmap_cpu_windows_copy_internal(
8885 unsigned int index)
8886 {
8887 pt_entry_t *ptep;
8888 unsigned int cpu_num;
8889 vm_offset_t cpu_copywindow_vaddr = 0;
8890 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8891
8892 cpu_num = pmap_cpu_data->cpu_number;
8893
8894 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8895 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8896 * (which are likely to have been on I/O memory) are complete before
8897 * tearing down the mapping. */
8898 __builtin_arm_dsb(DSB_SY);
8899 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8900 write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8901 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8902 }
8903
8904 void
8905 pmap_unmap_cpu_windows_copy(
8906 unsigned int index)
8907 {
8908 #if XNU_MONITOR
8909 return pmap_unmap_cpu_windows_copy_ppl(index);
8910 #else
8911 return pmap_unmap_cpu_windows_copy_internal(index);
8912 #endif
8913 }
8914
8915 #if XNU_MONITOR
8916
8917 MARK_AS_PMAP_TEXT void
8918 pmap_invoke_with_page(
8919 ppnum_t page_number,
8920 void *ctx,
8921 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8922 {
8923 #pragma unused(page_number, ctx, callback)
8924 }
8925
8926 /*
8927 * Loop over every pmap_io_range (I/O ranges marked as owned by
8928 * the PPL in the device tree) and conditionally call callback() on each range
8929 * that needs to be included in the hibernation image.
8930 *
8931 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8932 * context is needed in the callback.
8933 * @param callback Callback function invoked on each range whose wimg flags include PMAP_IO_RANGE_NEEDS_HIBERNATING.
8934 */
8935 MARK_AS_PMAP_TEXT void
8936 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8937 {
8938 extern const pmap_io_range_t* io_attr_table;
8939 extern const unsigned int num_io_rgns;
8940 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8941 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8942 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8943 }
8944 }
8945 }
8946
8947 /**
8948 * Set the HASHED pv_head_table flag for the passed-in physical page if it's a
8949 * PPL-owned page. Otherwise, do nothing.
8950 *
8951 * @param addr Physical address of the page to set the HASHED flag on.
8952 */
8953 MARK_AS_PMAP_TEXT void
8954 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8955 {
8956 /* Ignore non-managed kernel memory. */
8957 if (!pa_valid(addr)) {
8958 return;
8959 }
8960
8961 const unsigned int pai = pa_index(addr);
8962 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8963 pv_entry_t **pv_h = pai_to_pvh(pai);
8964
8965 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8966 pvh_lock(pai);
8967 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8968 pvh_unlock(pai);
8969 }
8970 }
8971
8972 /**
8973 * Loop through every physical page in the system and clear out the HASHED flag
8974 * on every PPL-owned page. That flag is used to keep track of which pages have
8975 * been hashed into the hibernation image during the hibernation entry process.
8976 *
8977 * The HASHED flag needs to be cleared out between hibernation cycles because the
8978 * pv_head_table and pp_attr_table entries might have been copied into the hibernation
8979 * image with the HASHED flag set on certain pages. It's important to clear the
8980 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8981 * into the hibernation image can't be compromised across hibernation cycles.
8982 */
8983 MARK_AS_PMAP_TEXT void
8984 pmap_clear_ppl_hashed_flag_all(void)
8985 {
8986 const unsigned int last_index = pa_index(vm_last_phys);
8987 pv_entry_t **pv_h = NULL;
8988
8989 for (int pai = 0; pai < last_index; ++pai) {
8990 pv_h = pai_to_pvh(pai);
8991
8992 /* Test for PPL-owned pages that have the HASHED flag set in their pv_head_table entries. */
8993 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8994 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8995 pvh_lock(pai);
8996 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8997 pvh_unlock(pai);
8998 }
8999 }
9000 }
9001
9002 /**
9003 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9004 * ppl_hib driver will call this after all wired pages have been copied into the
9005 * hibernation image.
9006 */
9007 MARK_AS_PMAP_TEXT void
9008 pmap_check_ppl_hashed_flag_all(void)
9009 {
9010 const unsigned int last_index = pa_index(vm_last_phys);
9011 pv_entry_t **pv_h = NULL;
9012
9013 for (int pai = 0; pai < last_index; ++pai) {
9014 pv_h = pai_to_pvh(pai);
9015
9016 /**
9017 * The PMAP stacks are explicitly not saved into the image so skip checking
9018 * the pages that contain the PMAP stacks.
9019 */
9020 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9021 (pai < pa_index(pmap_stacks_end_pa));
9022
9023 if (!is_pmap_stack &&
9024 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9025 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9026 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9027 }
9028 }
9029 }
9030
9031 #endif /* XNU_MONITOR */
9032
9033 /*
9034 * Indicate that a pmap is intended to be used as a nested pmap
9035 * within one or more larger address spaces. This must be set
9036 * before pmap_nest() is called with this pmap as the 'subordinate'.
9037 */
9038 MARK_AS_PMAP_TEXT void
9039 pmap_set_nested_internal(
9040 pmap_t pmap)
9041 {
9042 validate_pmap_mutable(pmap);
9043 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
9044 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
9045 __func__, pmap, pmap->type);
9046 }
9047 pmap->type = PMAP_TYPE_NESTED;
9048 pmap_get_pt_ops(pmap)->free_id(pmap);
9049 }
9050
9051 void
9052 pmap_set_nested(
9053 pmap_t pmap)
9054 {
9055 #if XNU_MONITOR
9056 pmap_set_nested_ppl(pmap);
9057 #else
9058 pmap_set_nested_internal(pmap);
9059 #endif
9060 }
9061
9062 /*
9063 * pmap_trim_range(pmap, start, end)
9064 *
9065 * pmap = pmap to operate on
9066 * start = start of the range
9067 * end = end of the range
9068 *
9069 * Attempts to deallocate TTEs for the given range within the pmap's nested region.
9070 */
9071 MARK_AS_PMAP_TEXT static void
9072 pmap_trim_range(
9073 pmap_t pmap,
9074 addr64_t start,
9075 addr64_t end)
9076 {
9077 addr64_t cur;
9078 addr64_t nested_region_start;
9079 addr64_t nested_region_end;
9080 addr64_t adjusted_start;
9081 addr64_t adjusted_end;
9082 addr64_t adjust_offmask;
9083 tt_entry_t * tte_p;
9084 pt_entry_t * pte_p;
9085 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9086
9087 if (__improbable(end < start)) {
9088 panic("%s: invalid address range, "
9089 "pmap=%p, start=%p, end=%p",
9090 __func__,
9091 pmap, (void*)start, (void*)end);
9092 }
9093
9094 nested_region_start = pmap->nested_region_addr;
9095 nested_region_end = nested_region_start + pmap->nested_region_size;
9096
9097 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9098 panic("%s: range outside nested region %p-%p, "
9099 "pmap=%p, start=%p, end=%p",
9100 __func__, (void *)nested_region_start, (void *)nested_region_end,
9101 pmap, (void*)start, (void*)end);
9102 }
9103
9104 /* Contract the range to TT page boundaries. */
9105 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9106 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9107 adjusted_end = end & ~adjust_offmask;
9108
9109 /* Iterate over the range, trying to remove TTEs. */
9110 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
9111 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9112
9113 tte_p = pmap_tte(pmap, cur);
9114
9115 if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
9116 pte_p = (pt_entry_t *) ttetokv(*tte_p);
9117
9118 /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
9119 if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
9120 /* Deallocate for the nested map. */
9121 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9122 } else if (pmap->type == PMAP_TYPE_USER) {
9123 /**
9124 * Just remove for the parent map. If the leaf table pointed
9125 * to by the TTE being removed (owned by the nested pmap)
9126 * has any mappings, then this call will panic. This
9127 * enforces the policy that tables being trimmed must be
9128 * empty to prevent possible use-after-free attacks.
9129 */
9130 pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9131 } else {
9132 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9133 }
9134 } else {
9135 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9136 }
9137 }
9138
9139 #if (__ARM_VMSA__ > 7)
9140 /* Remove empty L2 TTs. */
9141 adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9142 adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
9143
9144 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
9145 /* For each L1 entry in our range... */
9146 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9147
9148 bool remove_tt1e = true;
9149 tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9150 tt_entry_t * tt2e_start;
9151 tt_entry_t * tt2e_end;
9152 tt_entry_t * tt2e_p;
9153 tt_entry_t tt1e;
9154
9155 if (tt1e_p == NULL) {
9156 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9157 continue;
9158 }
9159
9160 tt1e = *tt1e_p;
9161
9162 if (tt1e == ARM_TTE_TYPE_FAULT) {
9163 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9164 continue;
9165 }
9166
9167 tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9168 tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9169
9170 for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9171 if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9172 /*
9173 * If any TTEs are populated, don't remove the
9174 * L1 TT.
9175 */
9176 remove_tt1e = false;
9177 }
9178 }
9179
9180 if (remove_tt1e) {
9181 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9182 } else {
9183 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9184 }
9185 }
9186 #endif /* (__ARM_VMSA__ > 7) */
9187 }
9188
9189 /*
9190 * pmap_trim_internal(grand, subord, vstart, size)
9191 *
9192 * grand = pmap subord is nested in
9193 * subord = nested pmap
9194 * vstart = start of the used range in grand
9195 * size = size of the used range
9196 *
9197 * Attempts to trim the shared region page tables down to only cover the given
9198 * range in subord and grand.
9199 */
9200 MARK_AS_PMAP_TEXT void
9201 pmap_trim_internal(
9202 pmap_t grand,
9203 pmap_t subord,
9204 addr64_t vstart,
9205 uint64_t size)
9206 {
9207 addr64_t vend;
9208 addr64_t adjust_offmask;
9209
9210 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9211 panic("%s: grand addr wraps around, "
9212 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9213 __func__, grand, subord, (void*)vstart, size);
9214 }
9215
9216 validate_pmap_mutable(grand);
9217 validate_pmap(subord);
9218
9219 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9220
9221 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9222
9223 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9224 panic("%s: subord is of non-nestable type 0x%hhx, "
9225 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9226 __func__, subord->type, grand, subord, (void*)vstart, size);
9227 }
9228
9229 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9230 panic("%s: grand is of unsupported type 0x%hhx for nesting, "
9231 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9232 __func__, grand->type, grand, subord, (void*)vstart, size);
9233 }
9234
9235 if (__improbable(grand->nested_pmap != subord)) {
9236 panic("%s: grand->nested != subord, "
9237 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9238 __func__, grand, subord, (void*)vstart, size);
9239 }
9240
9241 if (__improbable((size != 0) &&
9242 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9243 panic("%s: grand range not in nested region, "
9244 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9245 __func__, grand, subord, (void*)vstart, size);
9246 }
9247
9248
9249 if (!grand->nested_has_no_bounds_ref) {
9250 assert(subord->nested_bounds_set);
9251
9252 if (!grand->nested_bounds_set) {
9253 /* Inherit the bounds from subord. */
9254 grand->nested_region_true_start = subord->nested_region_true_start;
9255 grand->nested_region_true_end = subord->nested_region_true_end;
9256 grand->nested_bounds_set = true;
9257 }
9258
9259 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9260 return;
9261 }
9262
9263 if ((!subord->nested_bounds_set) && size) {
9264 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9265
9266 subord->nested_region_true_start = vstart;
9267 subord->nested_region_true_end = vend;
9268 subord->nested_region_true_start &= ~adjust_offmask;
9269
9270 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9271 panic("%s: padded true end wraps around, "
9272 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9273 __func__, grand, subord, (void*)vstart, size);
9274 }
9275
9276 subord->nested_region_true_end &= ~adjust_offmask;
9277 subord->nested_bounds_set = true;
9278 }
9279
9280 if (subord->nested_bounds_set) {
9281 /* Inherit the bounds from subord. */
9282 grand->nested_region_true_start = subord->nested_region_true_start;
9283 grand->nested_region_true_end = subord->nested_region_true_end;
9284 grand->nested_bounds_set = true;
9285
9286 /* If we know the bounds, we can trim the pmap. */
9287 grand->nested_has_no_bounds_ref = false;
9288 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9289 } else {
9290 /* Don't trim if we don't know the bounds. */
9291 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9292 return;
9293 }
9294
9295 /* Trim grand to only cover the given range. */
9296 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9297 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9298
9299 /* Try to trim subord. */
9300 pmap_trim_subord(subord);
9301 }
9302
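/*
 * pmap_trim_self(pmap)
 *
 * If pmap holds a no-bounds reference on its nested pmap, drop that reference
 * and, if the nested pmap's true bounds are known, trim pmap down to those
 * bounds, then attempt to trim the nested pmap as well.
 */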
9303 MARK_AS_PMAP_TEXT static void
9304 pmap_trim_self(pmap_t pmap)
9305 {
9306 if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
9307 /* If we have a no bounds ref, we need to drop it. */
9308 pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9309 pmap->nested_has_no_bounds_ref = false;
9310 boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9311 vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9312 vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9313 pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9314
9315 if (nested_bounds_set) {
9316 pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9317 pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9318 }
9319 /*
9320 * Try trimming the nested pmap, in case we had the
9321 * last reference.
9322 */
9323 pmap_trim_subord(pmap->nested_pmap);
9324 }
9325 }
9326
9327 /*
9328 * pmap_trim_subord(subord)
9329 *
9330 * subord = nested pmap we are attempting to trim
9331 *
9332 * Drops one no-bounds reference on subord and trims it down to its true
9333 * bounds if that was the last such reference and the bounds are known.
9334 */
9335 MARK_AS_PMAP_TEXT static void
9336 pmap_trim_subord(pmap_t subord)
9337 {
9338 bool contract_subord = false;
9339
9340 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9341
9342 subord->nested_no_bounds_refcnt--;
9343
9344 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9345 /* If this was the last no bounds reference, trim subord. */
9346 contract_subord = true;
9347 }
9348
9349 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9350
9351 if (contract_subord) {
9352 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9353 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9354 }
9355 }
9356
9357 void
9358 pmap_trim(
9359 pmap_t grand,
9360 pmap_t subord,
9361 addr64_t vstart,
9362 uint64_t size)
9363 {
9364 #if XNU_MONITOR
9365 pmap_trim_ppl(grand, subord, vstart, size);
9366
9367 pmap_ledger_check_balance(grand);
9368 pmap_ledger_check_balance(subord);
9369 #else
9370 pmap_trim_internal(grand, subord, vstart, size);
9371 #endif
9372 }
9373
9374 #if HAS_APPLE_PAC
9375 void *
9376 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9377 {
9378 void *res = NULL;
9379 uint64_t current_intr_state = pmap_interrupts_disable();
9380
9381 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9382 switch (key) {
9383 case ptrauth_key_asia:
9384 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9385 break;
9386 case ptrauth_key_asda:
9387 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9388 break;
9389 default:
9390 panic("attempt to sign user pointer without process independent key");
9391 }
9392 ml_disable_user_jop_key(jop_key, saved_jop_state);
9393
9394 pmap_interrupts_restore(current_intr_state);
9395
9396 return res;
9397 }
9398
9399 void *
9400 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9401 {
9402 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9403 }
9404
9405 void *
9406 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9407 {
9408 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9409 panic("attempt to auth user pointer without process independent key");
9410 }
9411
9412 void *res = NULL;
9413 uint64_t current_intr_state = pmap_interrupts_disable();
9414
9415 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9416 res = ml_auth_ptr_unchecked(value, key, discriminator);
9417 ml_disable_user_jop_key(jop_key, saved_jop_state);
9418
9419 pmap_interrupts_restore(current_intr_state);
9420
9421 return res;
9422 }
9423
9424 void *
9425 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9426 {
9427 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9428 }
9429 #endif /* HAS_APPLE_PAC */
9430
9431 /*
9432 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9433 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9434 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9435 * return value, to indicate where a preempted [un]nest operation should resume.
9436 * When the return value contains the ending address of the nested region with
9437 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9438 */
9439 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9440
9441 /*
9442 * kern_return_t pmap_nest(grand, subord, vstart, size)
9443 *
9444 * grand = the pmap that subord will be nested into
9445 * subord = the pmap to be nested within grand
9446 * vstart = start of the range in grand to be nested
9447 * size = size of the nested area (up to 16TB)
9448 *
9449 * Inserts one pmap into another. This is used to implement shared segments.
9450 *
9451 */
9452
9453 /**
9454 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9455 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9456 * This function operates in 3 main phases:
9457 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9458 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9459 * the mapping range are present in subord.
9460 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9461 * contains pointers to subord's leaf-level pagetable pages for the specified
9462 * VA range.
9463 *
9464 * This function may return early due to pending AST_URGENT preemption; if so
9465 * it will indicate the need to be re-entered.
9466 *
9467 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9468 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9469 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9470 * @param size twig-aligned size of the nesting range
9471 * @param vrestart the twig-aligned starting address of the current call. May contain
9472 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9473 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9474 * KERN_RESOURCE_SHORTAGE on allocation failure.
9475 *
9476 * @return the virtual address at which to restart the operation, possibly including
9477 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9478 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9479 */
9480 MARK_AS_PMAP_TEXT vm_map_offset_t
9481 pmap_nest_internal(
9482 pmap_t grand,
9483 pmap_t subord,
9484 addr64_t vstart,
9485 uint64_t size,
9486 vm_map_offset_t vrestart,
9487 kern_return_t *krp)
9488 {
9489 kern_return_t kr = KERN_FAILURE;
9490 vm_map_offset_t vaddr;
9491 tt_entry_t *stte_p;
9492 tt_entry_t *gtte_p;
9493 unsigned int nested_region_asid_bitmap_size;
9494 unsigned int* nested_region_asid_bitmap;
9495 int expand_options = 0;
9496 bool deref_subord = true;
9497
9498 addr64_t vend;
9499 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9500 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9501 }
9502 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9503 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9504 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9505 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9506 }
9507
9508 assert(krp != NULL);
9509 validate_pmap_mutable(grand);
9510 validate_pmap(subord);
9511 #if XNU_MONITOR
9512 /*
9513 * Ordering is important here. validate_pmap() has already ensured subord is a
9514 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9515 * be in the process of being destroyed. If destruction is already committed,
9516 * then the check of ref_count below will cover us. If destruction is initiated
9517 * during or after this call, then pmap_destroy() will catch the non-zero
9518 * nested_count.
9519 */
9520 os_atomic_inc(&subord->nested_count, relaxed);
9521 os_atomic_thread_fence(seq_cst);
9522 #endif
9523 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9524 panic("%s: invalid subordinate pmap %p", __func__, subord);
9525 }
9526
9527 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9528 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9529 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9530 }
9531
9532 #if XNU_MONITOR
9533 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9534 #endif
9535
9536 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9537 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9538 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9539 grand, vstart, size, (unsigned long long)vrestart);
9540 }
9541
9542 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9543 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9544 }
9545
9546 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9547 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9548 }
9549
9550 if (subord->nested_region_asid_bitmap == NULL) {
9551 nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
9552
9553 #if XNU_MONITOR
9554 pmap_paddr_t pa = 0;
9555
9556 if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9557 panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
9558 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9559 __FUNCTION__, nested_region_asid_bitmap_size,
9560 grand, subord, vstart, size);
9561 }
9562
9563 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9564
9565 if (kr != KERN_SUCCESS) {
9566 goto nest_cleanup;
9567 }
9568
9569 assert(pa);
9570
9571 nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9572 #else
9573 nested_region_asid_bitmap = kalloc_data(
9574 nested_region_asid_bitmap_size * sizeof(unsigned int),
9575 Z_WAITOK | Z_ZERO);
9576 #endif
9577
9578 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9579 if (subord->nested_region_asid_bitmap == NULL) {
9580 subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
9581 subord->nested_region_addr = vstart;
9582 subord->nested_region_size = (mach_vm_offset_t) size;
9583
9584 /**
9585 * Ensure that the rest of the subord->nested_region_* fields are
9586 * initialized and visible before setting the nested_region_asid_bitmap
9587 * field (which is used as the flag to say that the rest are initialized).
9588 */
9589 __builtin_arm_dmb(DMB_ISHST);
9590 subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
9591 nested_region_asid_bitmap = NULL;
9592 }
9593 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9594 if (nested_region_asid_bitmap != NULL) {
9595 #if XNU_MONITOR
9596 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9597 #else
9598 kfree_data(nested_region_asid_bitmap,
9599 nested_region_asid_bitmap_size * sizeof(unsigned int));
9600 #endif
9601 }
9602 }
9603
9604 /**
9605 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9606 * speculated before their initialization.
9607 */
9608 __builtin_arm_dmb(DMB_ISHLD);
9609
9610 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9611 uint64_t new_size;
9612 unsigned int new_nested_region_asid_bitmap_size;
9613 unsigned int* new_nested_region_asid_bitmap;
9614
9615 nested_region_asid_bitmap = NULL;
9616 nested_region_asid_bitmap_size = 0;
9617 new_size = vend - subord->nested_region_addr;
9618
9619 /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
9620 new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
9621
9622 #if XNU_MONITOR
9623 pmap_paddr_t pa = 0;
9624
9625 if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9626 panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
9627 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9628 __FUNCTION__, new_nested_region_asid_bitmap_size,
9629 grand, subord, vstart, new_size);
9630 }
9631
9632 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9633
9634 if (kr != KERN_SUCCESS) {
9635 goto nest_cleanup;
9636 }
9637
9638 assert(pa);
9639
9640 new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9641 #else
9642 new_nested_region_asid_bitmap = kalloc_data(
9643 new_nested_region_asid_bitmap_size * sizeof(unsigned int),
9644 Z_WAITOK | Z_ZERO);
9645 #endif
9646 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9647 if (subord->nested_region_size < new_size) {
9648 bcopy(subord->nested_region_asid_bitmap,
9649 new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
9650 nested_region_asid_bitmap_size = subord->nested_region_asid_bitmap_size;
9651 nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
9652 subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
9653 subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
9654 subord->nested_region_size = new_size;
9655 new_nested_region_asid_bitmap = NULL;
9656 }
9657 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9658 if (nested_region_asid_bitmap != NULL) {
9659 #if XNU_MONITOR
9660 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9661 #else
9662 kfree_data(nested_region_asid_bitmap,
9663 nested_region_asid_bitmap_size * sizeof(unsigned int));
9664 #endif
9665 }
9666 if (new_nested_region_asid_bitmap != NULL) {
9667 #if XNU_MONITOR
9668 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9669 #else
9670 kfree_data(new_nested_region_asid_bitmap,
9671 new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9672 #endif
9673 }
9674 }
9675
9676 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9677
9678 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9679 /*
9680 * If this is grand's first nesting operation, keep the reference on subord.
9681 * It will be released by pmap_destroy_internal() when grand is destroyed.
9682 */
9683 deref_subord = false;
9684
9685 if (!subord->nested_bounds_set) {
9686 /*
9687 * We are nesting without the shared region's bounds
9688 * being known. We'll have to trim the pmap later.
9689 */
9690 grand->nested_has_no_bounds_ref = true;
9691 subord->nested_no_bounds_refcnt++;
9692 }
9693
9694 grand->nested_region_addr = vstart;
9695 grand->nested_region_size = (mach_vm_offset_t) size;
9696 } else {
9697 if (__improbable(grand->nested_pmap != subord)) {
9698 panic("pmap_nest() pmap %p already has a different nested pmap", grand);
9699 } else if (__improbable(grand->nested_region_addr > vstart)) {
9700 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9701 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9702 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9703 }
9704 }
9705
9706 vaddr = vrestart & ~PMAP_NEST_GRAND;
9707 if (vaddr < subord->nested_region_true_start) {
9708 vaddr = subord->nested_region_true_start;
9709 }
9710
9711 addr64_t true_end = vend;
9712 if (true_end > subord->nested_region_true_end) {
9713 true_end = subord->nested_region_true_end;
9714 }
9715 __unused unsigned int ttecount = 0;
9716
9717 if (vrestart & PMAP_NEST_GRAND) {
9718 goto nest_grand;
9719 }
9720 #if (__ARM_VMSA__ == 7)
9721
9722 while (vaddr < true_end) {
9723 stte_p = pmap_tte(subord, vaddr);
9724 if ((stte_p == (tt_entry_t *)NULL) || (((*stte_p) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) {
9725 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9726 kr = pmap_expand(subord, vaddr, expand_options, PMAP_TT_L2_LEVEL);
9727
9728 if (kr != KERN_SUCCESS) {
9729 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9730 goto done;
9731 }
9732
9733 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9734 }
9735 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9736 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9737 stte_p = pmap_tte(grand, vaddr);
9738 if (stte_p == (tt_entry_t *)NULL) {
9739 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9740 kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_L1_LEVEL);
9741
9742 if (kr != KERN_SUCCESS) {
9743 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9744 goto done;
9745 }
9746 } else {
9747 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9748 kr = KERN_SUCCESS;
9749 }
9750 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9751 vaddr += ARM_TT_L1_SIZE;
9752 vrestart = vaddr;
9753 }
9754
9755 #else
9756 while (vaddr < true_end) {
9757 stte_p = pmap_tte(subord, vaddr);
9758 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9759 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9760 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9761
9762 if (kr != KERN_SUCCESS) {
9763 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9764 goto done;
9765 }
9766
9767 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9768 }
9769 vaddr += pt_attr_twig_size(pt_attr);
9770 vrestart = vaddr;
9771 ++ttecount;
9772 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9773 pmap_pending_preemption())) {
9774 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9775 kr = KERN_SUCCESS;
9776 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9777 goto done;
9778 }
9779 }
9780 #endif
9781 /*
9782 * copy TTEs from subord pmap into grand pmap
9783 */
9784
9785 vaddr = (vm_map_offset_t) vstart;
9786 if (vaddr < subord->nested_region_true_start) {
9787 vaddr = subord->nested_region_true_start;
9788 }
9789 vrestart = vaddr | PMAP_NEST_GRAND;
9790
9791 nest_grand:
9792 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9793 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9794 #if (__ARM_VMSA__ == 7)
9795 while (vaddr < true_end) {
9796 stte_p = pmap_tte(subord, vaddr);
9797 gtte_p = pmap_tte(grand, vaddr);
9798 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9799 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9800 __func__, gtte_p, grand);
9801 }
9802 *gtte_p = *stte_p;
9803 vaddr += ARM_TT_L1_SIZE;
9804 }
9805 vrestart = vaddr | PMAP_NEST_GRAND;
9806 #else
9807 while (vaddr < true_end) {
9808 stte_p = pmap_tte(subord, vaddr);
9809 gtte_p = pmap_tte(grand, vaddr);
9810 if (gtte_p == PT_ENTRY_NULL) {
9811 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9812 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9813 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9814
9815 if (kr != KERN_SUCCESS) {
9816 goto done;
9817 }
9818
9819 gtte_p = pmap_tt2e(grand, vaddr);
9820 }
9821 /* Don't leak a page table page. Don't violate break-before-make. */
9822 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9823 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9824 __func__, gtte_p, grand);
9825 }
9826 *gtte_p = *stte_p;
9827
9828 vaddr += pt_attr_twig_size(pt_attr);
9829 vrestart = vaddr | PMAP_NEST_GRAND;
9830 ++ttecount;
9831 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9832 pmap_pending_preemption())) {
9833 break;
9834 }
9835 }
9836 #endif
9837 if (vaddr >= true_end) {
9838 vrestart = vend | PMAP_NEST_GRAND;
9839 }
9840
9841 kr = KERN_SUCCESS;
9842 done:
9843
9844 FLUSH_PTE();
9845 __builtin_arm_isb(ISB_SY);
9846
9847 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9848 #if XNU_MONITOR
9849 nest_cleanup:
9850 if (kr != KERN_SUCCESS) {
9851 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9852 *krp = kr;
9853 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9854 }
9855 #else
9856 if (kr != KERN_SUCCESS) {
9857 *krp = kr;
9858 }
9859 #endif
9860 if (deref_subord) {
9861 #if XNU_MONITOR
9862 os_atomic_dec(&subord->nested_count, relaxed);
9863 #endif
9864 pmap_destroy_internal(subord);
9865 }
9866 return vrestart;
9867 }
9868
9869 kern_return_t
9870 pmap_nest(
9871 pmap_t grand,
9872 pmap_t subord,
9873 addr64_t vstart,
9874 uint64_t size)
9875 {
9876 kern_return_t kr = KERN_SUCCESS;
9877 vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
9878 vm_map_offset_t vend = vaddr + size;
9879 __unused vm_map_offset_t vlast = vaddr;
9880
9881 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9882 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9883 VM_KERNEL_ADDRHIDE(vstart));
9884
9885 pmap_verify_preemptible();
9886 #if XNU_MONITOR
9887 while (vaddr != (vend | PMAP_NEST_GRAND)) {
9888 vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
9889 if (kr == KERN_RESOURCE_SHORTAGE) {
9890 pmap_alloc_page_for_ppl(0);
9891 kr = KERN_SUCCESS;
9892 } else if (kr != KERN_SUCCESS) {
9893 break;
9894 } else if (vaddr == vlast) {
9895 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9896 __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
9897 }
9898 vlast = vaddr;
9899 }
9900
9901 pmap_ledger_check_balance(grand);
9902 pmap_ledger_check_balance(subord);
9903 #else
9904 while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
9905 vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
9906 }
9907 #endif
9908
9909 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9910
9911 return kr;
9912 }
9913
9914 /*
9915 * kern_return_t pmap_unnest(grand, vaddr, size)
9916 *
9917 * grand = the pmap that will have the virtual range unnested
9918 * vaddr = start of range in pmap to be unnested
9919 * size = size of range in pmap to be unnested
9920 *
9921 */
9922
9923 kern_return_t
9924 pmap_unnest(
9925 pmap_t grand,
9926 addr64_t vaddr,
9927 uint64_t size)
9928 {
9929 return pmap_unnest_options(grand, vaddr, size, 0);
9930 }
9931
9932 /**
9933 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9934 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9935 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9936 * still have the region nested. The mappings in 'grand' will be left empty
9937 * with the assumption that they will be demand-filled by subsequent access faults.
9938 *
9939 * This function operates in 2 main phases:
9940 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9941 * them non-global.
9942 * 2. Clearing of the twig-level TTEs for the address range in grand.
9943 *
9944 * This function may return early due to pending AST_URGENT preemption; if so
9945 * it will indicate the need to be re-entered.
9946 *
9947 * @param grand pmap from which to unnest mappings
9948 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9949 * @param size twig-aligned size of the nested range
9950 * @param vrestart the page-aligned starting address of the current call. May contain
9951 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9952 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9953 * grand is being torn down and step 1) above is not needed.
9954 *
9955 * @return the virtual address at which to restart the operation, possibly including
9956 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9957 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9958 */
9959 MARK_AS_PMAP_TEXT vm_map_offset_t
9960 pmap_unnest_options_internal(
9961 pmap_t grand,
9962 addr64_t vaddr,
9963 uint64_t size,
9964 vm_map_offset_t vrestart,
9965 unsigned int option)
9966 {
9967 vm_map_offset_t start;
9968 vm_map_offset_t addr;
9969 tt_entry_t *tte_p;
9970 unsigned int current_index;
9971 unsigned int start_index;
9972 unsigned int max_index;
9973 unsigned int entry_count = 0;
9974
9975 addr64_t vend;
9976 addr64_t true_end;
9977 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9978 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9979 }
9980 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9981 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9982 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9983 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9984 }
9985
9986 validate_pmap_mutable(grand);
9987
9988 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9989
9990 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9991 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9992 (unsigned long long)vaddr, (unsigned long long)size);
9993 }
9994
9995 if (__improbable(grand->nested_pmap == NULL)) {
9996 panic("%s: %p has no nested pmap", __func__, grand);
9997 }
9998
9999 true_end = vend;
10000 if (true_end > grand->nested_pmap->nested_region_true_end) {
10001 true_end = grand->nested_pmap->nested_region_true_end;
10002 }
10003
10004 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10005 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10006 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10007 }
10008
10009 pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10010
10011 start = vrestart;
10012 if (start < grand->nested_pmap->nested_region_true_start) {
10013 start = grand->nested_pmap->nested_region_true_start;
10014 }
10015 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10016 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10017 bool flush_tlb = false;
10018
10019 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10020 pt_entry_t *bpte, *cpte;
10021
10022 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10023
10024 bpte = pmap_pte(grand->nested_pmap, addr);
10025
10026 /*
10027 * If we've re-entered this function partway through unnesting a leaf region, the
10028 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10029 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
10030 * address.
10031 */
10032 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
10033 (addr & pt_attr_twig_offmask(pt_attr))) {
10034 /*
10035 * Mark the 'twig' region as being unnested. Every mapping entered within
10036 * the nested pmap in this region will now be marked non-global. Do this
10037 * before marking any of the PTEs within the region as non-global to avoid
10038 * the possibility of pmap_enter() subsequently inserting a global mapping
10039 * in the region, which could lead to a TLB conflict if a non-global entry
10040 * is later inserted for the same VA in a pmap which has fully unnested this
10041 * region.
10042 */
10043 setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
10044 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10045 pmap_paddr_t pa;
10046 unsigned int pai = 0;
10047 boolean_t managed = FALSE;
10048 pt_entry_t spte;
10049
10050 if ((*cpte != ARM_PTE_TYPE_FAULT)
10051 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10052 spte = *((volatile pt_entry_t*)cpte);
10053 while (!managed) {
10054 pa = pte_to_pa(spte);
10055 if (!pa_valid(pa)) {
10056 break;
10057 }
10058 pai = pa_index(pa);
10059 pvh_lock(pai);
10060 spte = *((volatile pt_entry_t*)cpte);
10061 pa = pte_to_pa(spte);
10062 if (pai == pa_index(pa)) {
10063 managed = TRUE;
10064 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10065 }
10066 pvh_unlock(pai);
10067 }
10068
10069 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10070 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10071 flush_tlb = true;
10072 }
10073
10074 if (managed) {
10075 pvh_assert_locked(pai);
10076 pvh_unlock(pai);
10077 }
10078 }
10079
10080 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10081 vrestart = addr;
10082 ++entry_count;
10083 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10084 pmap_pending_preemption())) {
10085 goto unnest_subord_done;
10086 }
10087 }
10088 }
10089 addr = vlim;
10090 vrestart = addr;
10091 ++entry_count;
10092 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10093 pmap_pending_preemption())) {
10094 break;
10095 }
10096 }
10097
10098 unnest_subord_done:
10099 if (flush_tlb) {
10100 FLUSH_PTE_STRONG();
10101 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10102 }
10103
10104 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10105 if (current_index < max_index) {
10106 return vrestart;
10107 }
10108 }
10109
10110 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
10111
10112 /*
10113 * Invalidate all twig-level TTEs for the nested segment at vaddr in pmap grand.
10114 */
10115 if (vrestart & PMAP_NEST_GRAND) {
10116 addr = vrestart & ~PMAP_NEST_GRAND;
10117 if (__improbable((addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10118 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10119 }
10120 } else {
10121 addr = vaddr;
10122 vrestart = vaddr | PMAP_NEST_GRAND;
10123 }
10124
10125 if (addr < grand->nested_pmap->nested_region_true_start) {
10126 addr = grand->nested_pmap->nested_region_true_start;
10127 }
10128
10129 while (addr < true_end) {
10130 tte_p = pmap_tte(grand, addr);
10131 /*
10132 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10133 * so it's possible that a region we're trying to unnest may not have been
10134 * nested in the first place.
10135 */
10136 if (tte_p != NULL) {
10137 *tte_p = ARM_TTE_TYPE_FAULT;
10138 }
10139 addr += pt_attr_twig_size(pt_attr);
10140 vrestart = addr | PMAP_NEST_GRAND;
10141 ++entry_count;
10142 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10143 pmap_pending_preemption())) {
10144 break;
10145 }
10146 }
10147 if (addr >= true_end) {
10148 vrestart = vend | PMAP_NEST_GRAND;
10149 }
10150
10151 FLUSH_PTE_STRONG();
10152 PMAP_UPDATE_TLBS(grand, vaddr, addr, false, false);
10153
10154 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10155
10156 return vrestart;
10157 }
10158
10159 kern_return_t
10160 pmap_unnest_options(
10161 pmap_t grand,
10162 addr64_t vaddr,
10163 uint64_t size,
10164 unsigned int option)
10165 {
10166 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10167 vm_map_offset_t vend = vaddr + size;
10168 __unused vm_map_offset_t vlast = vrestart;
10169
10170 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10171 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10172
10173 pmap_verify_preemptible();
10174 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10175 #if XNU_MONITOR
10176 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10177 if (vrestart == vlast) {
10178 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
10179 __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
10180 }
10181 vlast = vrestart;
10182 #else
10183 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10184 #endif
10185 }
10186
10187 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10188
10189 return KERN_SUCCESS;
10190 }
10191
10192 boolean_t
10193 pmap_adjust_unnest_parameters(
10194 __unused pmap_t p,
10195 __unused vm_map_offset_t *s,
10196 __unused vm_map_offset_t *e)
10197 {
10198 return TRUE; /* to get to log_unnest_badness()... */
10199 }
10200
10201 /*
10202 * disable no-execute capability on
10203 * the specified pmap
10204 */
10205 #if DEVELOPMENT || DEBUG
10206 void
10207 pmap_disable_NX(
10208 pmap_t pmap)
10209 {
10210 pmap->nx_enabled = FALSE;
10211 }
10212 #else
10213 void
10214 pmap_disable_NX(
10215 __unused pmap_t pmap)
10216 {
10217 }
10218 #endif
10219
10220 /*
10221 * flush a range of hardware TLB entries.
10222 * NOTE: assumes the smallest TLB entry in use will be for
10223 * an ARM small page (4K).
10224 */
10225
10226 #define ARM_FULL_TLB_FLUSH_THRESHOLD 64
10227
10228 #if __ARM_RANGE_TLBI__
10229 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
10230 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_PAGES
10231 #else
10232 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10233 #endif // __ARM_RANGE_TLBI__
10234
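/*
 * Queue (without waiting for completion) TLB invalidations for the given VA
 * range in the given pmap, selecting per-entry, ranged, or full ASID/TLB
 * invalidation based on the size of the range.
 */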
10235 static void
10236 flush_mmu_tlb_region_asid_async(
10237 vm_offset_t va,
10238 size_t length,
10239 pmap_t pmap,
10240 bool last_level_only __unused)
10241 {
10242 #if (__ARM_VMSA__ == 7)
10243 vm_offset_t end = va + length;
10244 uint32_t asid;
10245
10246 asid = pmap->hw_asid;
10247
10248 if (length / ARM_SMALL_PAGE_SIZE > ARM_FULL_TLB_FLUSH_THRESHOLD) {
10249 boolean_t flush_all = FALSE;
10250
10251 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10252 flush_all = TRUE;
10253 }
10254 if (flush_all) {
10255 flush_mmu_tlb_async();
10256 } else {
10257 flush_mmu_tlb_asid_async(asid);
10258 }
10259
10260 return;
10261 }
10262 if (pmap->type == PMAP_TYPE_NESTED) {
10263 #if !__ARM_MP_EXT__
10264 flush_mmu_tlb();
10265 #else
10266 va = arm_trunc_page(va);
10267 while (va < end) {
10268 flush_mmu_tlb_mva_entries_async(va);
10269 va += ARM_SMALL_PAGE_SIZE;
10270 }
10271 #endif
10272 return;
10273 }
10274 va = arm_trunc_page(va) | (asid & 0xff);
10275 flush_mmu_tlb_entries_async(va, end);
10276
10277 #else
10278 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
10279 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
10280 ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
10281 uint32_t asid;
10282
10283 asid = pmap->hw_asid;
10284
10285 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
10286 boolean_t flush_all = FALSE;
10287
10288 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10289 flush_all = TRUE;
10290 }
10291 if (flush_all) {
10292 flush_mmu_tlb_async();
10293 } else {
10294 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
10295 }
10296 return;
10297 }
10298 #if __ARM_RANGE_TLBI__
10299 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
10300 va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
10301 if (pmap->type == PMAP_TYPE_NESTED) {
10302 flush_mmu_tlb_allrange_async(va, last_level_only);
10303 } else {
10304 flush_mmu_tlb_range_async(va, last_level_only);
10305 }
10306 return;
10307 }
10308 #endif
10309 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
10310 va = tlbi_asid(asid) | tlbi_addr(va);
10311
10312 if (pmap->type == PMAP_TYPE_NESTED) {
10313 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
10314 } else {
10315 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
10316 }
10317
10318 #endif
10319 }
10320
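/*
 * Queue an invalidation (without waiting for completion) of all TLB entries
 * belonging to the given pmap's ASID.
 */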
10321 MARK_AS_PMAP_TEXT static void
10322 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10323 {
10324 #if (__ARM_VMSA__ == 7)
10325 flush_mmu_tlb_asid_async(pmap->hw_asid);
10326 #else /* (__ARM_VMSA__ == 7) */
10327 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
10328 #endif /* (__ARM_VMSA__ == 7) */
10329 }
10330
10331 void
10332 flush_mmu_tlb_region(
10333 vm_offset_t va,
10334 unsigned length)
10335 {
10336 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
10337 sync_tlb_flush();
10338 }
10339
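/*
 * Return the VM_WIMG_* caching attributes of the given physical page. For
 * addresses not managed by the pmap (e.g. I/O), consult the I/O attribute
 * table, defaulting to VM_WIMG_IO.
 */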
10340 unsigned int
10341 pmap_cache_attributes(
10342 ppnum_t pn)
10343 {
10344 pmap_paddr_t paddr;
10345 unsigned int pai;
10346 unsigned int result;
10347 pp_attr_t pp_attr_current;
10348
10349 paddr = ptoa(pn);
10350
10351 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10352
10353 if (!pa_valid(paddr)) {
10354 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10355 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10356 }
10357
10358 result = VM_WIMG_DEFAULT;
10359
10360 pai = pa_index(paddr);
10361
10362 pp_attr_current = pp_attr_table[pai];
10363 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10364 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10365 }
10366 return result;
10367 }
10368
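/*
 * Perform any cache maintenance required when a page's caching attributes
 * change from wimg_bits_prev to wimg_bits_new.
 */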
10369 MARK_AS_PMAP_TEXT static void
10370 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10371 {
10372 if ((wimg_bits_prev != wimg_bits_new)
10373 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10374 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10375 && (wimg_bits_new != VM_WIMG_COPYBACK))
10376 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10377 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10378 pmap_sync_page_attributes_phys(pn);
10379 }
10380
10381 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10382 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10383 }
10384 }
10385
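/*
 * Update the caching attributes of a compressor-owned page and perform the
 * cache maintenance required by the attribute change.
 */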
10386 MARK_AS_PMAP_TEXT __unused void
10387 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10388 {
10389 pmap_paddr_t paddr = ptoa(pn);
10390 const unsigned int pai = pa_index(paddr);
10391
10392 if (__improbable(!pa_valid(paddr))) {
10393 panic("%s called on non-managed page 0x%08x", __func__, pn);
10394 }
10395
10396 pvh_lock(pai);
10397
10398 #if XNU_MONITOR
10399 if (__improbable(ppattr_pa_test_monitor(paddr))) {
10400 panic("%s invoked on PPL page 0x%08x", __func__, pn);
10401 }
10402 #endif
10403
10404 pmap_update_cache_attributes_locked(pn, new_cacheattr);
10405
10406 pvh_unlock(pai);
10407
10408 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10409 }
10410
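/*
 * Return a kernel virtual address for the given compressor page, switching
 * the page to the default (writeback) caching policy first if its current
 * attributes differ.
 */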
10411 void *
10412 pmap_map_compressor_page(ppnum_t pn)
10413 {
10414 #if __ARM_PTE_PHYSMAP__
10415 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10416 if (cacheattr != VM_WIMG_DEFAULT) {
10417 #if XNU_MONITOR
10418 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10419 #else
10420 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10421 #endif
10422 }
10423 #endif
10424 return (void*)phystokv(ptoa(pn));
10425 }
10426
10427 void
10428 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10429 {
10430 #if __ARM_PTE_PHYSMAP__
10431 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10432 if (cacheattr != VM_WIMG_DEFAULT) {
10433 #if XNU_MONITOR
10434 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10435 #else
10436 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10437 #endif
10438 }
10439 #endif
10440 }
10441
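/**
 * Set the caching attributes of one page as part of a batched update covering
 * 'page_cnt' pages. When 'doit' is FALSE this is a dry run used to determine
 * how cache maintenance for the batch should be performed; when TRUE the
 * pp_attr_table entry and any existing mappings of the page are updated.
 */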
10442 MARK_AS_PMAP_TEXT boolean_t
10443 pmap_batch_set_cache_attributes_internal(
10444 ppnum_t pn,
10445 unsigned int cacheattr,
10446 unsigned int page_cnt,
10447 unsigned int page_index,
10448 boolean_t doit,
10449 unsigned int *res)
10450 {
10451 pmap_paddr_t paddr;
10452 unsigned int pai;
10453 pp_attr_t pp_attr_current;
10454 pp_attr_t pp_attr_template;
10455 unsigned int wimg_bits_prev, wimg_bits_new;
10456
10457 if (cacheattr & VM_WIMG_USE_DEFAULT) {
10458 cacheattr = VM_WIMG_DEFAULT;
10459 }
10460
10461 if ((doit == FALSE) && (*res == 0)) {
10462 pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10463 *res = page_cnt;
10464 pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10465 if (platform_cache_batch_wimg(cacheattr & (VM_WIMG_MASK), page_cnt << PAGE_SHIFT) == FALSE) {
10466 return FALSE;
10467 }
10468 }
10469
10470 paddr = ptoa(pn);
10471
10472 if (!pa_valid(paddr)) {
10473 panic("pmap_batch_set_cache_attributes(): pn 0x%08x not managed", pn);
10474 }
10475
10476 pai = pa_index(paddr);
10477
10478 if (doit) {
10479 pvh_lock(pai);
10480 #if XNU_MONITOR
10481 if (ppattr_pa_test_monitor(paddr)) {
10482 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10483 }
10484 #endif
10485 }
10486
10487 do {
10488 pp_attr_current = pp_attr_table[pai];
10489 wimg_bits_prev = VM_WIMG_DEFAULT;
10490 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10491 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10492 }
10493
10494 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10495
10496 if (!doit) {
10497 break;
10498 }
10499
10500 /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10501 * to avoid losing simultaneous updates to other bits like refmod. */
10502 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10503
10504 wimg_bits_new = VM_WIMG_DEFAULT;
10505 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10506 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10507 }
10508
10509 if (doit) {
10510 if (wimg_bits_new != wimg_bits_prev) {
10511 pmap_update_cache_attributes_locked(pn, cacheattr);
10512 }
10513 pvh_unlock(pai);
10514 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10515 pmap_force_dcache_clean(phystokv(paddr), PAGE_SIZE);
10516 }
10517 } else {
10518 if (wimg_bits_new == VM_WIMG_COPYBACK) {
10519 return FALSE;
10520 }
10521 if (wimg_bits_prev == wimg_bits_new) {
10522 pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10523 *res = *res - 1;
10524 pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10525 if (!platform_cache_batch_wimg(wimg_bits_new, (*res) << PAGE_SHIFT)) {
10526 return FALSE;
10527 }
10528 }
10529 return TRUE;
10530 }
10531
10532 if (page_cnt == (page_index + 1)) {
10533 wimg_bits_prev = VM_WIMG_COPYBACK;
10534 if ((wimg_bits_prev != wimg_bits_new)
10535 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10536 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10537 && (wimg_bits_new != VM_WIMG_COPYBACK))
10538 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10539 && ((wimg_bits_new != VM_WIMG_COPYBACK) && (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10540 platform_cache_flush_wimg(wimg_bits_new);
10541 }
10542 }
10543
10544 return TRUE;
10545 }
10546
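/*
 * A minimal usage sketch of the batch interface (caller-side names such as
 * 'pages' are hypothetical; the real caller lives in the VM layer and may
 * differ in detail). The expected protocol is a preflight pass with
 * doit == FALSE over every page, followed by a commit pass with doit == TRUE
 * if the preflight indicates a batched update is worthwhile:
 *
 *	unsigned int res = 0;
 *	boolean_t batch = TRUE;
 *	for (unsigned int i = 0; batch && (i < page_cnt); i++) {
 *		batch = pmap_batch_set_cache_attributes(pages[i], cacheattr, page_cnt, i, FALSE, &res);
 *	}
 *	if (batch) {
 *		for (unsigned int i = 0; i < page_cnt; i++) {
 *			(void) pmap_batch_set_cache_attributes(pages[i], cacheattr, page_cnt, i, TRUE, &res);
 *		}
 *	} else {
 *		// Fall back to per-page updates, e.g. pmap_set_cache_attributes().
 *	}
 */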
10547 boolean_t
10548 pmap_batch_set_cache_attributes(
10549 ppnum_t pn,
10550 unsigned int cacheattr,
10551 unsigned int page_cnt,
10552 unsigned int page_index,
10553 boolean_t doit,
10554 unsigned int *res)
10555 {
10556 #if XNU_MONITOR
10557 return pmap_batch_set_cache_attributes_ppl(pn, cacheattr, page_cnt, page_index, doit, res);
10558 #else
10559 return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res);
10560 #endif
10561 }
10562
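/**
 * Common worker for setting the cache attributes of a single managed page.
 *
 * @param pn Page number of the page to update.
 * @param cacheattr New VM_WIMG cache attributes.
 * @param external On PPL systems, TRUE when the request originated outside the
 *                 PPL (PPL-owned pages are rejected) and FALSE when it
 *                 originated within the PPL (only PPL-owned pages are
 *                 accepted). Unused on non-PPL systems.
 */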
10563 MARK_AS_PMAP_TEXT static void
10564 pmap_set_cache_attributes_priv(
10565 ppnum_t pn,
10566 unsigned int cacheattr,
10567 boolean_t external __unused)
10568 {
10569 pmap_paddr_t paddr;
10570 unsigned int pai;
10571 pp_attr_t pp_attr_current;
10572 pp_attr_t pp_attr_template;
10573 unsigned int wimg_bits_prev, wimg_bits_new;
10574
10575 paddr = ptoa(pn);
10576
10577 if (!pa_valid(paddr)) {
10578 return; /* Not a managed page. */
10579 }
10580
10581 if (cacheattr & VM_WIMG_USE_DEFAULT) {
10582 cacheattr = VM_WIMG_DEFAULT;
10583 }
10584
10585 pai = pa_index(paddr);
10586
10587 pvh_lock(pai);
10588
10589 #if XNU_MONITOR
10590 if (external && ppattr_pa_test_monitor(paddr)) {
10591 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10592 } else if (!external && !ppattr_pa_test_monitor(paddr)) {
10593 panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
10594 }
10595 #endif
10596
10597 do {
10598 pp_attr_current = pp_attr_table[pai];
10599 wimg_bits_prev = VM_WIMG_DEFAULT;
10600 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10601 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10602 }
10603
10604 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10605
10606 /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10607 * to avoid losing simultaneous updates to other bits like refmod. */
10608 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10609
10610 wimg_bits_new = VM_WIMG_DEFAULT;
10611 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10612 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10613 }
10614
10615 if (wimg_bits_new != wimg_bits_prev) {
10616 pmap_update_cache_attributes_locked(pn, cacheattr);
10617 }
10618
10619 pvh_unlock(pai);
10620
10621 pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
10622 }
10623
10624 MARK_AS_PMAP_TEXT void
10625 pmap_set_cache_attributes_internal(
10626 ppnum_t pn,
10627 unsigned int cacheattr)
10628 {
10629 pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10630 }
10631
10632 void
10633 pmap_set_cache_attributes(
10634 ppnum_t pn,
10635 unsigned int cacheattr)
10636 {
10637 #if XNU_MONITOR
10638 pmap_set_cache_attributes_ppl(pn, cacheattr);
10639 #else
10640 pmap_set_cache_attributes_internal(pn, cacheattr);
10641 #endif
10642 }
10643
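/**
 * Update the cache attributes of every mapping of a physical page: the
 * physical aperture mapping (when present) as well as every mapping recorded
 * in the page's PV list, flushing the TLB for any mapping that was rewritten.
 * Expects to be called with the page's PVH lock held.
 *
 * @param ppnum Page number of the physical page to update.
 * @param attributes New VM_WIMG cache attributes.
 */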
10644 MARK_AS_PMAP_TEXT void
10645 pmap_update_cache_attributes_locked(
10646 ppnum_t ppnum,
10647 unsigned attributes)
10648 {
10649 pmap_paddr_t phys = ptoa(ppnum);
10650 pv_entry_t *pve_p;
10651 pt_entry_t *pte_p;
10652 pv_entry_t **pv_h;
10653 pt_entry_t tmplate;
10654 unsigned int pai;
10655 boolean_t tlb_flush_needed = FALSE;
10656
10657 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10658
10659 if (pmap_panic_dev_wimg_on_managed) {
10660 switch (attributes & VM_WIMG_MASK) {
10661 case VM_WIMG_IO: // nGnRnE
10662 case VM_WIMG_POSTED: // nGnRE
10663 /* supported on DRAM, but slow, so we disallow */
10664
10665 case VM_WIMG_POSTED_REORDERED: // nGRE
10666 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10667 /* unsupported on DRAM */
10668
10669 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10670 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10671 break;
10672
10673 default:
10674 /* not device type memory, all good */
10675
10676 break;
10677 }
10678 }
10679
10680 #if __ARM_PTE_PHYSMAP__
10681 vm_offset_t kva = phystokv(phys);
10682 pte_p = pmap_pte(kernel_pmap, kva);
10683
10684 tmplate = *pte_p;
10685 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10686 #if XNU_MONITOR
10687 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10688 #else
10689 tmplate |= wimg_to_pte(attributes, phys);
10690 #endif
10691 #if (__ARM_VMSA__ > 7)
10692 if (tmplate & ARM_PTE_HINT_MASK) {
10693 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10694 __FUNCTION__, pte_p, (void *)kva, tmplate);
10695 }
10696 #endif
10697 write_pte_strong(pte_p, tmplate);
10698 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10699 tlb_flush_needed = TRUE;
10700 #endif
10701
10702 pai = pa_index(phys);
10703
10704 pv_h = pai_to_pvh(pai);
10705
10706 pte_p = PT_ENTRY_NULL;
10707 pve_p = PV_ENTRY_NULL;
10708 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10709 pte_p = pvh_ptep(pv_h);
10710 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10711 pve_p = pvh_pve_list(pv_h);
10712 pte_p = PT_ENTRY_NULL;
10713 }
10714
10715 int pve_ptep_idx = 0;
10716 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10717 vm_map_address_t va;
10718 pmap_t pmap;
10719
10720 if (pve_p != PV_ENTRY_NULL) {
10721 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10722 if (pte_p == PT_ENTRY_NULL) {
10723 goto cache_skip_pve;
10724 }
10725 }
10726
10727 #ifdef PVH_FLAG_IOMMU
10728 if (pvh_ptep_is_iommu(pte_p)) {
10729 goto cache_skip_pve;
10730 }
10731 #endif
10732 pmap = ptep_get_pmap(pte_p);
10733 va = ptep_get_va(pte_p);
10734
10735 tmplate = *pte_p;
10736 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10737 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
10738
10739 write_pte_strong(pte_p, tmplate);
10740 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10741 tlb_flush_needed = TRUE;
10742
10743 cache_skip_pve:
10744 pte_p = PT_ENTRY_NULL;
10745 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10746 pve_ptep_idx = 0;
10747 pve_p = pve_next(pve_p);
10748 }
10749 }
10750 if (tlb_flush_needed) {
10751 pmap_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
10752 }
10753
10754 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
10755 }
10756
10757 #if (__ARM_VMSA__ == 7)
10758 void
10759 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10760 vm_map_address_t *user_commpage_addr)
10761 {
10762 pmap_paddr_t pa;
10763 kern_return_t kr;
10764
10765 assert(kernel_data_addr != NULL);
10766 assert(kernel_text_addr != NULL);
10767 assert(user_commpage_addr != NULL);
10768
10769 (void) pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, 0);
10770
10771 kr = pmap_enter(kernel_pmap, _COMM_PAGE_BASE_ADDRESS, atop(pa), VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10772 assert(kr == KERN_SUCCESS);
10773
10774 *kernel_data_addr = phystokv(pa);
10775 // We don't have a PFZ for 32-bit ARM; these are always 0
10776 *kernel_text_addr = 0;
10777 *user_commpage_addr = 0;
10778 }
10779
10780 #else /* __ARM_VMSA__ == 7 */
10781
10782 /**
10783 * Mark a pmap as being dedicated to use for a commpage mapping.
10784 * The pmap itself will never be activated on a CPU; its mappings will
10785 * only be embedded in userspace pmaps at a fixed virtual address.
10786 *
10787 * @param pmap the pmap to mark as belonging to a commpage.
10788 */
10789 static void
10790 pmap_set_commpage(pmap_t pmap)
10791 {
10792 #if XNU_MONITOR
10793 assert(!pmap_ppl_locked_down);
10794 #endif
10795 assert(pmap->type == PMAP_TYPE_USER);
10796 pmap->type = PMAP_TYPE_COMMPAGE;
10797 /*
10798 * Free the pmap's ASID. This pmap should not ever be directly
10799 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
10800 * ASID space contention but will also cause pmap_switch() to panic
10801 * if an attacker tries to activate this pmap. Disable preemption to
10802 * accommodate the *_nopreempt spinlock in free_asid().
10803 */
10804 mp_disable_preemption();
10805 pmap_get_pt_ops(pmap)->free_id(pmap);
10806 mp_enable_preemption();
10807 }
10808
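/**
 * Rewrite the leaf (L3) PTE for a commpage mapping, preserving the existing
 * output physical address but replacing all other PTE bits with the supplied
 * template. Panics if no leaf PTE exists for the address.
 *
 * @param pmap The commpage pmap containing the mapping.
 * @param address Virtual address whose leaf PTE should be rewritten.
 * @param template PTE attribute bits to install (physical address excluded).
 */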
10809 static void
10810 pmap_update_tt3e(
10811 pmap_t pmap,
10812 vm_address_t address,
10813 tt_entry_t template)
10814 {
10815 tt_entry_t *ptep, pte;
10816
10817 ptep = pmap_tt3e(pmap, address);
10818 if (ptep == NULL) {
10819 panic("%s: no ptep?", __FUNCTION__);
10820 }
10821
10822 pte = *ptep;
10823 pte = tte_to_pa(pte) | template;
10824 write_pte_strong(ptep, pte);
10825 }
10826
10827 /* Note absence of non-global bit */
10828 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10829 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10830 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
10831 | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10832
10833 /* Note absence of non-global bit and no-execute bit. */
10834 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10835 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10836 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
10837 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10838
10839 void
10840 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10841 vm_map_address_t *user_text_addr)
10842 {
10843 kern_return_t kr;
10844 pmap_paddr_t data_pa = 0; // data address
10845 pmap_paddr_t text_pa = 0; // text address
10846
10847 *kernel_data_addr = 0;
10848 *kernel_text_addr = 0;
10849 *user_text_addr = 0;
10850
10851 #if XNU_MONITOR
10852 data_pa = pmap_alloc_page_for_kern(0);
10853 assert(data_pa);
10854 memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
10855 #if CONFIG_ARM_PFZ
10856 text_pa = pmap_alloc_page_for_kern(0);
10857 assert(text_pa);
10858 memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
10859 #endif
10860
10861 #else /* XNU_MONITOR */
10862 (void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
10863 #if CONFIG_ARM_PFZ
10864 (void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
10865 #endif
10866
10867 #endif /* XNU_MONITOR */
10868
10869 /*
10870 * In order to avoid burning extra pages on mapping the shared page, we
10871 * create a dedicated pmap for the shared page. We forcibly nest the
10872 * translation tables from this pmap into other pmaps. The level we
10873 * will nest at depends on the MMU configuration (page size, TTBR range,
10874 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
10875 *
10876 * Note that this is NOT "the nested pmap" (which is used to nest the
10877 * shared cache).
10878 *
10879 * Note that we update parameters of the entry for our unique needs (NG
10880 * entry, etc.).
10881 */
10882 sharedpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
10883 assert(sharedpage_pmap_default != NULL);
10884 pmap_set_commpage(sharedpage_pmap_default);
10885
10886 /* The user 64-bit mapping... */
10887 kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10888 assert(kr == KERN_SUCCESS);
10889 pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10890 #if CONFIG_ARM_PFZ
10891 /* User mapping of comm page text section for 64 bit mapping only
10892 *
10893 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10894 * user processes to get this page mapped in, they should never call into
10895 * this page.
10896 *
10897 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10898 * is slid in the same L3 as the data commpage. It is either outside the
10899 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
10900 * it is reserved and unavailable to mach VM for future mappings.
10901 */
10902 const pt_attr_t * const pt_attr = pmap_get_pt_attr(sharedpage_pmap_default);
10903 int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
10904
10905 vm_map_address_t commpage_text_va = 0;
10906
10907 do {
10908 int text_leaf_index = random() % num_ptes;
10909
10910 // Generate a VA for the commpage text with the same root and twig index as data
10911 // comm page, but with new leaf index we've just generated.
10912 commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
10913 commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
10914 } while (commpage_text_va == _COMM_PAGE64_BASE_ADDRESS); // Try again if we collide (should be unlikely)
10915
10916 // Assert that this is empty
10917 __assert_only pt_entry_t *ptep = pmap_pte(sharedpage_pmap_default, commpage_text_va);
10918 assert(ptep != PT_ENTRY_NULL);
10919 assert(*ptep == ARM_TTE_EMPTY);
10920
10921 // At this point, we've found the address we want to insert our comm page at
10922 kr = pmap_enter_addr(sharedpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10923 assert(kr == KERN_SUCCESS);
10924 // Mark it as global page R/X so that it doesn't get thrown out on tlb flush
10925 pmap_update_tt3e(sharedpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
10926
10927 *user_text_addr = commpage_text_va;
10928 #endif
10929
10930 /* ...and the user 32-bit mapping. */
10931 kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10932 assert(kr == KERN_SUCCESS);
10933 pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10934
10935 #if __ARM_MIXED_PAGE_SIZE__
10936 /**
10937 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
10938 * new set of page tables that point to the exact same 16K shared page as
10939 * before. Only the first 4K of the 16K shared page is mapped since that's
10940 * the only part that contains relevant data.
10941 */
10942 sharedpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
10943 assert(sharedpage_pmap_4k != NULL);
10944 pmap_set_commpage(sharedpage_pmap_4k);
10945
10946 /* The user 64-bit mapping... */
10947 kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10948 assert(kr == KERN_SUCCESS);
10949 pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10950
10951 /* ...and the user 32-bit mapping. */
10952 kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10953 assert(kr == KERN_SUCCESS);
10954 pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10955
10956 #endif
10957
10958 /* For manipulation in kernel, go straight to physical page */
10959 *kernel_data_addr = phystokv(data_pa);
10960 *kernel_text_addr = (text_pa) ? phystokv(text_pa) : 0;
10961 }
10962
10963
10964 /*
10965 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10966 * with user controlled TTEs for regions that aren't explicitly reserved by the
10967 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10968 */
10969 #if (ARM_PGSHIFT == 14)
10970 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10971 #elif (ARM_PGSHIFT == 12)
10972 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= VM_MAX_ADDRESS);
10973 #else
10974 #error Nested shared page mapping is unsupported on this config
10975 #endif
10976
10977 MARK_AS_PMAP_TEXT kern_return_t
10978 pmap_insert_sharedpage_internal(
10979 pmap_t pmap)
10980 {
10981 kern_return_t kr = KERN_SUCCESS;
10982 vm_offset_t sharedpage_vaddr;
10983 pt_entry_t *ttep, *src_ttep;
10984 int options = 0;
10985 pmap_t sharedpage_pmap = sharedpage_pmap_default;
10986
10987 /* Validate the pmap input before accessing its data. */
10988 validate_pmap_mutable(pmap);
10989
10990 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10991 const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
10992
10993 #if __ARM_MIXED_PAGE_SIZE__
10994 #if !__ARM_16K_PG__
10995 /* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
10996 #error "pmap_insert_sharedpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10997 #endif /* !__ARM_16K_PG__ */
10998
10999 /* Choose the correct shared page pmap to use. */
11000 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11001 if (pmap_page_size == 16384) {
11002 sharedpage_pmap = sharedpage_pmap_default;
11003 } else if (pmap_page_size == 4096) {
11004 sharedpage_pmap = sharedpage_pmap_4k;
11005 } else {
11006 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11007 }
11008 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11009
11010 #if XNU_MONITOR
11011 options |= PMAP_OPTIONS_NOWAIT;
11012 #endif /* XNU_MONITOR */
11013
11014 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11015 #error We assume a single page.
11016 #endif
11017
11018 if (pmap_is_64bit(pmap)) {
11019 sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11020 } else {
11021 sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11022 }
11023
11024
11025 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11026
11027 /*
11028 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
11029 * two (2MB) depending on the address space layout. For 16KB pages, each level
11030 * one entry is 64GB, so we must go to the second level entry (32MB) in order
11031 * to "nest".
11032 *
11033 * Note: This is not "nesting" in the shared cache sense. This definition of
11034 * nesting just means inserting pointers to pre-allocated tables inside of
11035 * the passed in pmap to allow us to share page tables (which map the shared
11036 * page) for every task. This saves at least one page of memory per process
11037 * compared to creating new page tables in every process for mapping the
11038 * shared page.
11039 */
11040
11041 /**
11042 * Allocate the twig page tables if needed, and slam a pointer to the shared
11043 * page's tables into place.
11044 */
11045 while ((ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr)) == TT_ENTRY_NULL) {
11046 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11047
11048 kr = pmap_expand(pmap, sharedpage_vaddr, options, sharedpage_level);
11049
11050 if (kr != KERN_SUCCESS) {
11051 #if XNU_MONITOR
11052 if (kr == KERN_RESOURCE_SHORTAGE) {
11053 return kr;
11054 } else
11055 #endif
11056 {
11057 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
11058 }
11059 }
11060
11061 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11062 }
11063
11064 if (*ttep != ARM_PTE_EMPTY) {
11065 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
11066 }
11067
11068 src_ttep = pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr);
11069
11070 *ttep = *src_ttep;
11071 FLUSH_PTE_STRONG();
11072
11073 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11074
11075 return kr;
11076 }
11077
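/**
 * Remove the commpage mapping from a pmap by clearing the twig-level entry
 * that was nested from the shared page pmap, then flushing the TLB for the
 * commpage address. Panics if something other than the commpage tables is
 * found at that slot.
 *
 * @param pmap The pmap from which to unmap the commpage.
 */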
11078 static void
11079 pmap_unmap_sharedpage(
11080 pmap_t pmap)
11081 {
11082 pt_entry_t *ttep;
11083 vm_offset_t sharedpage_vaddr;
11084 pmap_t sharedpage_pmap = sharedpage_pmap_default;
11085
11086 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11087 const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
11088
11089 #if __ARM_MIXED_PAGE_SIZE__
11090 #if !__ARM_16K_PG__
11091 /* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
11092 #error "pmap_unmap_sharedpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11093 #endif /* !__ARM_16K_PG__ */
11094
11095 /* Choose the correct shared page pmap to use. */
11096 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11097 if (pmap_page_size == 16384) {
11098 sharedpage_pmap = sharedpage_pmap_default;
11099 } else if (pmap_page_size == 4096) {
11100 sharedpage_pmap = sharedpage_pmap_4k;
11101 } else {
11102 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11103 }
11104 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11105
11106 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11107 #error We assume a single page.
11108 #endif
11109
11110 if (pmap_is_64bit(pmap)) {
11111 sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11112 } else {
11113 sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11114 }
11115
11116
11117 ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr);
11118
11119 if (ttep == NULL) {
11120 return;
11121 }
11122
11123 /* It had better be mapped to the shared page. */
11124 if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr)) {
11125 panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
11126 }
11127
11128 *ttep = ARM_TTE_EMPTY;
11129 FLUSH_PTE_STRONG();
11130
11131 flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, pmap, false);
11132 sync_tlb_flush();
11133 }
11134
11135 void
11136 pmap_insert_sharedpage(
11137 pmap_t pmap)
11138 {
11139 #if XNU_MONITOR
11140 kern_return_t kr = KERN_FAILURE;
11141
11142 while ((kr = pmap_insert_sharedpage_ppl(pmap)) == KERN_RESOURCE_SHORTAGE) {
11143 pmap_alloc_page_for_ppl(0);
11144 }
11145
11146 pmap_ledger_check_balance(pmap);
11147
11148 if (kr != KERN_SUCCESS) {
11149 panic("%s: failed to insert the shared page, kr=%d, "
11150 "pmap=%p",
11151 __FUNCTION__, kr,
11152 pmap);
11153 }
11154 #else
11155 pmap_insert_sharedpage_internal(pmap);
11156 #endif
11157 }
11158
11159 static boolean_t
11160 pmap_is_64bit(
11161 pmap_t pmap)
11162 {
11163 return pmap->is_64bit;
11164 }
11165
11166 bool
11167 pmap_is_exotic(
11168 pmap_t pmap __unused)
11169 {
11170 return false;
11171 }
11172
11173 #endif
11174
11175 /* ARMTODO -- provide an implementation that accounts for
11176 * holes in the physical map, if any.
11177 */
11178 boolean_t
11179 pmap_valid_page(
11180 ppnum_t pn)
11181 {
11182 return pa_valid(ptoa(pn));
11183 }
11184
11185 boolean_t
11186 pmap_bootloader_page(
11187 ppnum_t pn)
11188 {
11189 pmap_paddr_t paddr = ptoa(pn);
11190
11191 if (pa_valid(paddr)) {
11192 return FALSE;
11193 }
11194 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11195 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11196 }
11197
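/**
 * Scan the page tables covering [va_start, va_end) one twig (intermediate
 * level) at a time and report whether any leaf PTE in that range is populated.
 *
 * @return TRUE if no mappings were found in the range (or pmap is NULL),
 *         FALSE if at least one PTE is populated.
 */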
11198 MARK_AS_PMAP_TEXT boolean_t
11199 pmap_is_empty_internal(
11200 pmap_t pmap,
11201 vm_map_offset_t va_start,
11202 vm_map_offset_t va_end)
11203 {
11204 vm_map_offset_t block_start, block_end;
11205 tt_entry_t *tte_p;
11206
11207 if (pmap == NULL) {
11208 return TRUE;
11209 }
11210
11211 validate_pmap(pmap);
11212
11213 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11214 unsigned int initial_not_in_kdp = not_in_kdp;
11215
11216 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11217 pmap_lock(pmap, PMAP_LOCK_SHARED);
11218 }
11219
11220 #if (__ARM_VMSA__ == 7)
11221 if (tte_index(pt_attr, va_end) >= pmap->tte_index_max) {
11222 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11223 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11224 }
11225 return TRUE;
11226 }
11227 #endif
11228
11229 /* TODO: This will be faster if we increment ttep at each level. */
11230 block_start = va_start;
11231
11232 while (block_start < va_end) {
11233 pt_entry_t *bpte_p, *epte_p;
11234 pt_entry_t *pte_p;
11235
11236 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11237 if (block_end > va_end) {
11238 block_end = va_end;
11239 }
11240
11241 tte_p = pmap_tte(pmap, block_start);
11242 if ((tte_p != PT_ENTRY_NULL)
11243 && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
11244 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11245 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11246 epte_p = &pte_p[pte_index(pt_attr, block_end)];
11247
11248 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11249 if (*pte_p != ARM_PTE_EMPTY) {
11250 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11251 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11252 }
11253 return FALSE;
11254 }
11255 }
11256 }
11257 block_start = block_end;
11258 }
11259
11260 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11261 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11262 }
11263
11264 return TRUE;
11265 }
11266
11267 boolean_t
11268 pmap_is_empty(
11269 pmap_t pmap,
11270 vm_map_offset_t va_start,
11271 vm_map_offset_t va_end)
11272 {
11273 #if XNU_MONITOR
11274 return pmap_is_empty_ppl(pmap, va_start, va_end);
11275 #else
11276 return pmap_is_empty_internal(pmap, va_start, va_end);
11277 #endif
11278 }
11279
11280 vm_map_offset_t
11281 pmap_max_offset(
11282 boolean_t is64,
11283 unsigned int option)
11284 {
11285 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11286 }
11287
11288 vm_map_offset_t
11289 pmap_max_64bit_offset(
11290 __unused unsigned int option)
11291 {
11292 vm_map_offset_t max_offset_ret = 0;
11293
11294 #if defined(__arm64__)
11295 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000) // end of shared region + 512MB for various purposes
11296 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
11297 "Minimum address space size outside allowable range");
11298 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11299 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11300 max_offset_ret = arm64_pmap_max_offset_default;
11301 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11302 max_offset_ret = min_max_offset;
11303 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11304 max_offset_ret = MACH_VM_MAX_ADDRESS;
11305 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11306 if (arm64_pmap_max_offset_default) {
11307 max_offset_ret = arm64_pmap_max_offset_default;
11308 } else if (max_mem > 0xC0000000) {
11309 max_offset_ret = min_max_offset + 0x138000000; // Max offset is 13.375GB for devices with > 3GB of memory
11310 } else if (max_mem > 0x40000000) {
11311 max_offset_ret = min_max_offset + 0x38000000; // Max offset is 9.375GB for devices with > 1GB and <= 3GB of memory
11312 } else {
11313 max_offset_ret = min_max_offset;
11314 }
11315 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11316 if (arm64_pmap_max_offset_default) {
11317 // Allow the boot-arg to override jumbo size
11318 max_offset_ret = arm64_pmap_max_offset_default;
11319 } else {
11320 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11321 }
11322 } else {
11323 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11324 }
11325
11326 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11327 assert(max_offset_ret >= min_max_offset);
11328 #else
11329 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11330 #endif
11331
11332 return max_offset_ret;
11333 }
11334
11335 vm_map_offset_t
11336 pmap_max_32bit_offset(
11337 unsigned int option)
11338 {
11339 vm_map_offset_t max_offset_ret = 0;
11340
11341 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11342 max_offset_ret = arm_pmap_max_offset_default;
11343 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11344 max_offset_ret = 0x80000000;
11345 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11346 max_offset_ret = VM_MAX_ADDRESS;
11347 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11348 if (arm_pmap_max_offset_default) {
11349 max_offset_ret = arm_pmap_max_offset_default;
11350 } else if (max_mem > 0x20000000) {
11351 max_offset_ret = 0x80000000;
11352 } else {
11353 max_offset_ret = 0x80000000;
11354 }
11355 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11356 max_offset_ret = 0x80000000;
11357 } else {
11358 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11359 }
11360
11361 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11362 return max_offset_ret;
11363 }
11364
11365 #if CONFIG_DTRACE
11366 /*
11367 * Constrain DTrace copyin/copyout actions
11368 */
11369 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11370 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11371
11372 kern_return_t
11373 dtrace_copyio_preflight(
11374 __unused addr64_t va)
11375 {
11376 if (current_map() == kernel_map) {
11377 return KERN_FAILURE;
11378 } else {
11379 return KERN_SUCCESS;
11380 }
11381 }
11382
11383 kern_return_t
11384 dtrace_copyio_postflight(
11385 __unused addr64_t va)
11386 {
11387 return KERN_SUCCESS;
11388 }
11389 #endif /* CONFIG_DTRACE */
11390
11391
11392 void
11393 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11394 {
11395 }
11396
11397
11398 void
11399 pmap_flush(
11400 __unused pmap_flush_context *cpus_to_flush)
11401 {
11402 /* not implemented yet */
11403 return;
11404 }
11405
11406 #if XNU_MONITOR
11407
11408 /*
11409 * Enforce that the address range described by kva and nbytes is not currently
11410 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11411 * unintentionally writing to PPL-owned memory.
11412 */
11413 static void
11414 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11415 {
11416 vm_offset_t end;
11417 if (os_add_overflow(kva, nbytes, &end)) {
11418 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11419 }
11420 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11421 pmap_paddr_t pa = kvtophys_nofail(ckva);
11422 pp_attr_t attr;
11423 unsigned int pai = pa_index(pa);
11424 if (ckva == phystokv(pa)) {
11425 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11426 }
11427 do {
11428 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11429 if (attr & PP_ATTR_MONITOR) {
11430 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11431 }
11432 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11433 }
11434 }
11435
11436 static void
11437 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
11438 {
11439 vm_offset_t end;
11440 if (os_add_overflow(kva, nbytes, &end)) {
11441 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11442 }
11443 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11444 pmap_paddr_t pa = kvtophys_nofail(ckva);
11445
11446 if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
11447 panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
11448 }
11449 assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
11450 ppattr_pa_clear_no_monitor(pa);
11451 }
11452 }
11453
11454 /**
11455 * Lock down a page, making all mappings read-only, and preventing further
11456 * mappings or removal of this particular kva's mapping. Effectively, it makes
11457 * the physical page at kva immutable (see the ppl_writable parameter for an
11458 * exception to this).
11459 *
11460 * @param kva Valid address to any mapping of the physical page to lockdown.
11461 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11462 * @param ppl_writable True if the PPL should still be able to write to the page
11463 * using the physical aperture mapping. False will make the
11464 * page read-only for both the kernel and PPL in the
11465 * physical aperture.
11466 */
11467 MARK_AS_PMAP_TEXT static void
11468 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11469 {
11470 const pmap_paddr_t pa = kvtophys_nofail(kva);
11471 const unsigned int pai = pa_index(pa);
11472
11473 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11474 pvh_lock(pai);
11475 pv_entry_t **pvh = pai_to_pvh(pai);
11476 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11477
11478 if (__improbable(ppattr_pa_test_monitor(pa))) {
11479 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11480 }
11481
11482 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11483 panic("%s: %#lx already locked down/executable (%#llx)",
11484 __func__, kva, (uint64_t)pvh_flags);
11485 }
11486
11487 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11488
11489 /* Update the physical aperture mapping to prevent kernel write access. */
11490 const unsigned int new_xprr_perm =
11491 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11492 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11493
11494 pvh_unlock(pai);
11495
11496 pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0, NULL);
11497
11498 /**
11499 * Double-check that the mapping didn't change physical addresses before the
11500 * LOCKDOWN flag was set (there is a brief window between the above
11501 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11502 *
11503 * This doesn't solve the ABA problem, but this doesn't have to since once
11504 * the pvh_lock() is grabbed no new mappings can be created on this physical
11505 * page without the LOCKDOWN flag already set (so any future mappings can
11506 * only be RO, and no existing mappings can be removed).
11507 */
11508 if (kvtophys_nofail(kva) != pa) {
11509 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11510 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11511 }
11512 }
11513
11514 /**
11515 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11516 * kernel once again.
11517 *
11518 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11519 * to unlockdown a page that was never locked down will panic.
11520 *
11521 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11522 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11523 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11524 * passed to the paired pmap_ppl_lockdown_page() call. Any
11525 * deviation will result in a panic.
11526 */
11527 MARK_AS_PMAP_TEXT static void
11528 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11529 {
11530 pvh_assert_locked(pai);
11531 pv_entry_t **pvh = pai_to_pvh(pai);
11532 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11533
11534 if (__improbable(!(pvh_flags & lockdown_flag))) {
11535 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11536 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11537 }
11538
11539 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11540
11541 /* Restore the pre-lockdown physical aperture mapping permissions. */
11542 const unsigned int old_xprr_perm =
11543 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11544 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11545 }
11546
11547 /**
11548 * Release a page from being locked down to the PPL, making it writable to the
11549 * kernel once again.
11550 *
11551 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11552 * to unlockdown a page that was never locked down will panic.
11553 *
11554 * @param kva Valid address to any mapping of the physical page to unlockdown.
11555 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11556 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11557 * passed to the paired pmap_ppl_lockdown_page() call. Any
11558 * deviation will result in a panic.
11559 */
11560 MARK_AS_PMAP_TEXT static void
11561 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11562 {
11563 const pmap_paddr_t pa = kvtophys_nofail(kva);
11564 const unsigned int pai = pa_index(pa);
11565
11566 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11567 pvh_lock(pai);
11568 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11569 pvh_unlock(pai);
11570 }
11571
11572 #else /* XNU_MONITOR */
11573
11574 static void __unused
11575 pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11576 {
11577 }
11578
11579 static void __unused
11580 pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11581 {
11582 }
11583
11584 #endif /* !XNU_MONITOR */
11585
11586
11587 MARK_AS_PMAP_TEXT static inline void
11588 pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11589 {
11590 #if XNU_MONITOR
11591 pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11592 #else
11593 pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
11594 #endif
11595 }
11596
11597 MARK_AS_PMAP_TEXT static inline void
11598 pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11599 {
11600 #if XNU_MONITOR
11601 pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11602 #else
11603 pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
11604 #endif
11605 }
11606
11607 /**
11608 * Perform basic validation checks on the destination only (not the source) and
11609 * the corresponding offset/size prior to writing to a read-only allocation.
11610 *
11611 * @note Should be called before writing to an allocation from the read
11612 * only allocator.
11613 *
11614 * @param zid The ID of the zone the allocation belongs to.
11615 * @param va VA of element being modified (destination).
11616 * @param offset Offset being written to, in the element.
11617 * @param new_data_size Size of modification.
11618 *
11619 */
11620
11621 MARK_AS_PMAP_TEXT static void
11622 pmap_ro_zone_validate_element_dst(
11623 zone_id_t zid,
11624 vm_offset_t va,
11625 vm_offset_t offset,
11626 vm_size_t new_data_size)
11627 {
11628 vm_size_t elem_size = zone_elem_size_ro(zid);
11629 vm_offset_t sum = 0, page = trunc_page(va);
11630
11631 if (__improbable(new_data_size > (elem_size - offset))) {
11632 panic("%s: New data size %lu too large for elem size %lu at addr %p",
11633 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11634 }
11635 if (__improbable(offset >= elem_size)) {
11636 panic("%s: Offset %lu too large for elem size %lu at addr %p",
11637 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11638 }
11639 if (__improbable(os_add3_overflow(va, offset, new_data_size, &sum))) {
11640 panic("%s: Integer addition overflow %p + %lu + %lu = %lu",
11641 __func__, (void*)va, (uintptr_t)offset, (uintptr_t) new_data_size,
11642 (uintptr_t)sum);
11643 }
11644 if (__improbable((va - page) % elem_size)) {
11645 panic("%s: Start of element %p is not aligned to element size %lu",
11646 __func__, (void *)va, (uintptr_t)elem_size);
11647 }
11648
11649 /* Check element is from correct zone */
11650 zone_require_ro(zid, elem_size, (void*)va);
11651 }
11652
11653
11654 /**
11655 * Perform basic validation checks on the source, destination, and
11656 * corresponding offset/size prior to writing to a read-only allocation.
11657 *
11658 * @note Should be called before writing to an allocation from the read
11659 * only allocator.
11660 *
11661 * @param zid The ID of the zone the allocation belongs to.
11662 * @param va VA of element being modified (destination).
11663 * @param offset Offset being written to, in the element.
11664 * @param new_data Pointer to new data (source).
11665 * @param new_data_size Size of modification.
11666 *
11667 */
11668
11669 MARK_AS_PMAP_TEXT static void
11670 pmap_ro_zone_validate_element(
11671 zone_id_t zid,
11672 vm_offset_t va,
11673 vm_offset_t offset,
11674 const vm_offset_t new_data,
11675 vm_size_t new_data_size)
11676 {
11677 vm_offset_t sum = 0;
11678
11679 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11680 panic("%s: Integer addition overflow %p + %lu = %lu",
11681 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11682 }
11683
11684 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11685 }
11686
11687 /**
11688 * Ensure that physical page is locked down and pinned, before writing to it.
11689 *
11690 * @note Should be called before writing to an allocation from the read
11691 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page;
11692 * ensure that the latter is called after the modification.
11693 *
11694 *
11695 * @param pa Physical address of the element being modified.
11696 * @param va Virtual address of element being modified.
11697 * @param size Size of the modification.
11698 *
11699 */
11700
11701 MARK_AS_PMAP_TEXT static void
11702 pmap_ro_zone_lock_phy_page(
11703 const pmap_paddr_t pa,
11704 vm_offset_t va,
11705 vm_size_t size)
11706 {
11707 const unsigned int pai = pa_index(pa);
11708 pvh_lock(pai);
11709
11710 /* Ensure that the physical page is locked down */
11711 #if XNU_MONITOR
11712 pv_entry_t **pvh = pai_to_pvh(pai);
11713 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
11714 panic("%s: Physical page not locked down %llx", __func__, pa);
11715 }
11716 #endif /* XNU_MONITOR */
11717
11718 /* Ensure page can't become PPL-owned memory before the memcpy occurs */
11719 pmap_pin_kernel_pages(va, size);
11720 }
11721
11722 /**
11723 * Unlock and unpin physical page after writing to it.
11724 *
11725 * @note Should be called after writing to an allocation from the read
11726 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page;
11727 * ensure that the latter has been called prior to the modification.
11728 *
11729 * @param pa Physical address of the element that was modified.
11730 * @param va Virtual address of element that was modified.
11731 * @param size Size of the modification.
11732 *
11733 */
11734
11735 MARK_AS_PMAP_TEXT static void
11736 pmap_ro_zone_unlock_phy_page(
11737 const pmap_paddr_t pa,
11738 vm_offset_t va,
11739 vm_size_t size)
11740 {
11741 const unsigned int pai = pa_index(pa);
11742 pmap_unpin_kernel_pages(va, size);
11743 pvh_unlock(pai);
11744 }
11745
11746 /**
11747 * Function to copy kauth_cred from new_data to kv.
11748 * Function defined in "kern_prot.c"
11749 *
11750 * @note Will be removed upon completion of
11751 * <rdar://problem/72635194> Compiler PAC support for memcpy.
11752 *
11753 * @param kv Address to copy new data to.
11754 * @param new_data Pointer to new data.
11755 *
11756 */
11757
11758 extern void
11759 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11760
11761 /**
11762 * Zalloc-specific memcpy that writes through the physical aperture
11763 * and ensures the element being modified is from a read-only zone.
11764 *
11765 * @note Designed to work only with the zone allocator's read-only submap.
11766 *
11767 * @param zid The ID of the zone to allocate from.
11768 * @param va VA of element to be modified.
11769 * @param offset Offset from element.
11770 * @param new_data Pointer to new data.
11771 * @param new_data_size Size of modification.
11772 *
11773 */
11774
11775 void
11776 pmap_ro_zone_memcpy(
11777 zone_id_t zid,
11778 vm_offset_t va,
11779 vm_offset_t offset,
11780 const vm_offset_t new_data,
11781 vm_size_t new_data_size)
11782 {
11783 #if XNU_MONITOR
11784 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
11785 #else /* XNU_MONITOR */
11786 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11787 #endif /* XNU_MONITOR */
11788 }
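/*
 * A minimal usage sketch (the zone ID, element, and field below are purely
 * illustrative): callers on the zone allocator's read-only mutation path are
 * expected to route writes to read-only elements through this routine rather
 * than storing to the element's VA directly, e.g.:
 *
 *	// 'ro_zone_id', 'elem', and 'struct ro_elem' are hypothetical stand-ins
 *	// for a real read-only zone and an element allocated from it.
 *	uint32_t new_value = 1;
 *	pmap_ro_zone_memcpy(ro_zone_id, (vm_offset_t)elem,
 *	    offsetof(struct ro_elem, field), (vm_offset_t)&new_value, sizeof(new_value));
 */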
11789
11790 MARK_AS_PMAP_TEXT void
11791 pmap_ro_zone_memcpy_internal(
11792 zone_id_t zid,
11793 vm_offset_t va,
11794 vm_offset_t offset,
11795 const vm_offset_t new_data,
11796 vm_size_t new_data_size)
11797 {
11798 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11799
11800 if (!new_data || new_data_size == 0) {
11801 return;
11802 }
11803
11804 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11805 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11806 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11807 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11808 }
11809
11810 /**
11811 * Zalloc-specific function to atomically mutate fields of an element that
11812 * belongs to a read-only zone, via the physical aperture.
11813 *
11814 * @note Designed to work only with the zone allocator's read-only submap.
11815 *
11816 * @param zid The ID of the zone the element belongs to.
11817 * @param va VA of element to be modified.
11818 * @param offset Offset in element.
11819 * @param op Atomic operation to perform.
11820 * @param value Mutation value.
11821 *
11822 */
11823
11824 uint64_t
11825 pmap_ro_zone_atomic_op(
11826 zone_id_t zid,
11827 vm_offset_t va,
11828 vm_offset_t offset,
11829 zro_atomic_op_t op,
11830 uint64_t value)
11831 {
11832 #if XNU_MONITOR
11833 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
11834 #else /* XNU_MONITOR */
11835 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11836 #endif /* XNU_MONITOR */
11837 }
11838
11839 MARK_AS_PMAP_TEXT uint64_t
11840 pmap_ro_zone_atomic_op_internal(
11841 zone_id_t zid,
11842 vm_offset_t va,
11843 vm_offset_t offset,
11844 zro_atomic_op_t op,
11845 uint64_t value)
11846 {
11847 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11848 vm_size_t value_size = op & 0xf;
11849
11850 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11851 pmap_ro_zone_lock_phy_page(pa, va, value_size);
11852 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11853 pmap_ro_zone_unlock_phy_page(pa, va, value_size);
11854
11855 return value;
11856 }
11857
11858 /**
11859 * bzero for allocations from read-only zones that writes through the
11860 * physical aperture.
11861 *
11862 * @note This is called by the zfree path of all allocations from read
11863 * only zones.
11864 *
11865 * @param zid The ID of the zone the allocation belongs to.
11866 * @param va VA of element to be zeroed.
11867 * @param offset Offset in the element.
11868 * @param size Size of allocation.
11869 *
11870 */
11871
11872 void
11873 pmap_ro_zone_bzero(
11874 zone_id_t zid,
11875 vm_offset_t va,
11876 vm_offset_t offset,
11877 vm_size_t size)
11878 {
11879 #if XNU_MONITOR
11880 pmap_ro_zone_bzero_ppl(zid, va, offset, size);
11881 #else /* XNU_MONITOR */
11882 pmap_ro_zone_bzero_internal(zid, va, offset, size);
11883 #endif /* XNU_MONITOR */
11884 }
11885
11886 MARK_AS_PMAP_TEXT void
11887 pmap_ro_zone_bzero_internal(
11888 zone_id_t zid,
11889 vm_offset_t va,
11890 vm_offset_t offset,
11891 vm_size_t size)
11892 {
11893 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11894 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11895 pmap_ro_zone_lock_phy_page(pa, va, size);
11896 bzero((void*)phystokv(pa), size);
11897 pmap_ro_zone_unlock_phy_page(pa, va, size);
11898 }
11899
11900 /**
11901 * Removes write access from the physical aperture.
11902 *
11903 * @note For non-PPL devices, it simply makes all virtual mappings RO.
11904 * @note Designed to work only with the zone allocator's read-only submap.
11905 *
11906 * @param va VA of the page to remove write access from.
11907 *
11908 */
11909 MARK_AS_PMAP_TEXT static void
11910 pmap_phys_write_disable(vm_address_t va)
11911 {
11912 #if XNU_MONITOR
11913 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
11914 #else /* XNU_MONITOR */
11915 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
11916 #endif /* XNU_MONITOR */
11917 }
11918
11919 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
11920
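/**
 * Count the resident and compressed bytes mapped within [start, end) for a
 * single twig-level page table. The range must be page-aligned and must not
 * span more than one twig entry; pmap_query_resident() handles splitting a
 * larger range into per-twig calls.
 *
 * @return The number of resident bytes, or PMAP_RESIDENT_INVALID if the pmap
 *         is NULL or no page table exists for the range.
 */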
11921 MARK_AS_PMAP_TEXT mach_vm_size_t
11922 pmap_query_resident_internal(
11923 pmap_t pmap,
11924 vm_map_address_t start,
11925 vm_map_address_t end,
11926 mach_vm_size_t *compressed_bytes_p)
11927 {
11928 mach_vm_size_t resident_bytes = 0;
11929 mach_vm_size_t compressed_bytes = 0;
11930
11931 pt_entry_t *bpte, *epte;
11932 pt_entry_t *pte_p;
11933 tt_entry_t *tte_p;
11934
11935 if (pmap == NULL) {
11936 return PMAP_RESIDENT_INVALID;
11937 }
11938
11939 validate_pmap(pmap);
11940
11941 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11942
11943 /* Ensure that this request is valid, and addresses exactly one TTE. */
11944 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11945 (end % pt_attr_page_size(pt_attr)))) {
11946 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11947 }
11948
11949 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11950 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11951 }
11952
11953 pmap_lock(pmap, PMAP_LOCK_SHARED);
11954 tte_p = pmap_tte(pmap, start);
11955 if (tte_p == (tt_entry_t *) NULL) {
11956 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11957 return PMAP_RESIDENT_INVALID;
11958 }
11959 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
11960 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11961 bpte = &pte_p[pte_index(pt_attr, start)];
11962 epte = &pte_p[pte_index(pt_attr, end)];
11963
11964 for (; bpte < epte; bpte++) {
11965 if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
11966 compressed_bytes += pt_attr_page_size(pt_attr);
11967 } else if (pa_valid(pte_to_pa(*bpte))) {
11968 resident_bytes += pt_attr_page_size(pt_attr);
11969 }
11970 }
11971 }
11972 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11973
11974 if (compressed_bytes_p) {
11975 pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11976 *compressed_bytes_p += compressed_bytes;
11977 pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11978 }
11979
11980 return resident_bytes;
11981 }
11982
11983 mach_vm_size_t
11984 pmap_query_resident(
11985 pmap_t pmap,
11986 vm_map_address_t start,
11987 vm_map_address_t end,
11988 mach_vm_size_t *compressed_bytes_p)
11989 {
11990 mach_vm_size_t total_resident_bytes;
11991 mach_vm_size_t compressed_bytes;
11992 vm_map_address_t va;
11993
11994
11995 if (pmap == PMAP_NULL) {
11996 if (compressed_bytes_p) {
11997 *compressed_bytes_p = 0;
11998 }
11999 return 0;
12000 }
12001
12002 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12003
12004 total_resident_bytes = 0;
12005 compressed_bytes = 0;
12006
12007 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12008 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12009 VM_KERNEL_ADDRHIDE(end));
12010
12011 va = start;
12012 while (va < end) {
12013 vm_map_address_t l;
12014 mach_vm_size_t resident_bytes;
12015
12016 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12017
12018 if (l > end) {
12019 l = end;
12020 }
12021 #if XNU_MONITOR
12022 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12023 #else
12024 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12025 #endif
12026 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12027 break;
12028 }
12029
12030 total_resident_bytes += resident_bytes;
12031
12032 va = l;
12033 }
12034
12035 if (compressed_bytes_p) {
12036 *compressed_bytes_p = compressed_bytes;
12037 }
12038
12039 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12040 total_resident_bytes);
12041
12042 return total_resident_bytes;
12043 }
12044
12045 #if MACH_ASSERT
12046 static void
12047 pmap_check_ledgers(
12048 pmap_t pmap)
12049 {
12050 int pid;
12051 char *procname;
12052
12053 if (pmap->pmap_pid == 0) {
12054 /*
12055 * This pmap was not or is no longer fully associated
12056 * with a task (e.g. the old pmap after a fork()/exec() or
12057 * spawn()). Its "ledger" still points at a task that is
12058 * now using a different (and active) address space, so
12059 * we can't check that all the pmap ledgers are balanced here.
12060 *
12061 * If the "pid" is set, that means that we went through
12062 * pmap_set_process() in task_terminate_internal(), so
12063 * this task's ledger should not have been re-used and
12064 * all the pmap ledgers should be back to 0.
12065 */
12066 return;
12067 }
12068
12069 pid = pmap->pmap_pid;
12070 procname = pmap->pmap_procname;
12071
12072 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12073 }
12074 #endif /* MACH_ASSERT */
12075
12076 void
12077 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
12078 {
12079 }
12080
12081 /**
12082 * The minimum shared region nesting size is used by the VM to determine when to
12083 * break up large mappings to nested regions. The smallest size that these
12084 * mappings can be broken into is determined by the page table level at which
12085 * those regions are nested and by the size of the page tables.
12086 *
12087 * For instance, if a nested region is nesting at L2 for a process utilizing
12088 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12089 * block entry).
12090 *
12091 * @param pmap The target pmap to determine the block size based on whether it's
12092 * using 16KB or 4KB page tables.
12093 */
12094 uint64_t
12095 pmap_shared_region_size_min(__unused pmap_t pmap)
12096 {
12097 #if (__ARM_VMSA__ > 7)
12098 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12099
12100 /**
12101 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12102 * 4KB pages). This means that a target pmap will contain L2 entries that
12103 * point to shared L3 page tables in the shared region pmap.
12104 */
12105 return pt_attr_twig_size(pt_attr);
12106
12107 #else
12108 return ARM_NESTING_SIZE_MIN;
12109 #endif
12110 }
12111
12112 boolean_t
12113 pmap_enforces_execute_only(
12114 #if (__ARM_VMSA__ == 7)
12115 __unused
12116 #endif
12117 pmap_t pmap)
12118 {
12119 #if (__ARM_VMSA__ > 7)
12120 return pmap != kernel_pmap;
12121 #else
12122 return FALSE;
12123 #endif
12124 }
12125
12126 MARK_AS_PMAP_TEXT void
12127 pmap_set_vm_map_cs_enforced_internal(
12128 pmap_t pmap,
12129 bool new_value)
12130 {
12131 validate_pmap_mutable(pmap);
12132 pmap->pmap_vm_map_cs_enforced = new_value;
12133 }
12134
12135 void
12136 pmap_set_vm_map_cs_enforced(
12137 pmap_t pmap,
12138 bool new_value)
12139 {
12140 #if XNU_MONITOR
12141 pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
12142 #else
12143 pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
12144 #endif
12145 }
12146
12147 extern int cs_process_enforcement_enable;
12148 bool
12149 pmap_get_vm_map_cs_enforced(
12150 pmap_t pmap)
12151 {
12152 if (cs_process_enforcement_enable) {
12153 return true;
12154 }
12155 return pmap->pmap_vm_map_cs_enforced;
12156 }
12157
12158 MARK_AS_PMAP_TEXT void
12159 pmap_set_jit_entitled_internal(
12160 __unused pmap_t pmap)
12161 {
12162 return;
12163 }
12164
12165 void
12166 pmap_set_jit_entitled(
12167 pmap_t pmap)
12168 {
12169 #if XNU_MONITOR
12170 pmap_set_jit_entitled_ppl(pmap);
12171 #else
12172 pmap_set_jit_entitled_internal(pmap);
12173 #endif
12174 }
12175
12176 bool
12177 pmap_get_jit_entitled(
12178 __unused pmap_t pmap)
12179 {
12180 return false;
12181 }
12182
12183 MARK_AS_PMAP_TEXT kern_return_t
12184 pmap_query_page_info_internal(
12185 pmap_t pmap,
12186 vm_map_offset_t va,
12187 int *disp_p)
12188 {
12189 pmap_paddr_t pa;
12190 int disp;
12191 unsigned int pai;
12192 pt_entry_t *pte;
12193 pv_entry_t **pv_h, *pve_p;
12194
12195 if (pmap == PMAP_NULL || pmap == kernel_pmap) {
12196 pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12197 *disp_p = 0;
12198 pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12199 return KERN_INVALID_ARGUMENT;
12200 }
12201
12202 disp = 0;
12203
12204 validate_pmap(pmap);
12205 pmap_lock(pmap, PMAP_LOCK_SHARED);
12206
12207 pte = pmap_pte(pmap, va);
12208 if (pte == PT_ENTRY_NULL) {
12209 goto done;
12210 }
12211
12212 pa = pte_to_pa(*((volatile pt_entry_t*)pte));
12213 if (pa == 0) {
12214 if (ARM_PTE_IS_COMPRESSED(*pte, pte)) {
12215 disp |= PMAP_QUERY_PAGE_COMPRESSED;
12216 if (*pte & ARM_PTE_COMPRESSED_ALT) {
12217 disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
12218 }
12219 }
12220 } else {
12221 disp |= PMAP_QUERY_PAGE_PRESENT;
12222 pai = pa_index(pa);
12223 if (!pa_valid(pa)) {
12224 goto done;
12225 }
12226 pvh_lock(pai);
12227 pv_h = pai_to_pvh(pai);
12228 pve_p = PV_ENTRY_NULL;
12229 int pve_ptep_idx = 0;
12230 if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
12231 pve_p = pvh_pve_list(pv_h);
12232 while (pve_p != PV_ENTRY_NULL &&
12233 (pve_ptep_idx = pve_find_ptep_index(pve_p, pte)) == -1) {
12234 pve_p = pve_next(pve_p);
12235 }
12236 }
12237
12238 if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
12239 disp |= PMAP_QUERY_PAGE_ALTACCT;
12240 } else if (ppattr_test_reusable(pai)) {
12241 disp |= PMAP_QUERY_PAGE_REUSABLE;
12242 } else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
12243 disp |= PMAP_QUERY_PAGE_INTERNAL;
12244 }
12245 pvh_unlock(pai);
12246 }
12247
12248 done:
12249 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12250 pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12251 *disp_p = disp;
12252 pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12253 return KERN_SUCCESS;
12254 }
12255
12256 kern_return_t
12257 pmap_query_page_info(
12258 pmap_t pmap,
12259 vm_map_offset_t va,
12260 int *disp_p)
12261 {
12262 #if XNU_MONITOR
12263 return pmap_query_page_info_ppl(pmap, va, disp_p);
12264 #else
12265 return pmap_query_page_info_internal(pmap, va, disp_p);
12266 #endif
12267 }
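/*
 * Illustrative (hypothetical) usage of pmap_query_page_info(): a caller
 * interested in the disposition of a user VA might interpret the returned
 * PMAP_QUERY_PAGE_* bits roughly as follows; "map" and "va" are assumed to be
 * supplied by the caller.
 *
 * int disp = 0;
 * if (pmap_query_page_info(map->pmap, va, &disp) == KERN_SUCCESS) {
 *         if (disp & PMAP_QUERY_PAGE_PRESENT) {
 *                 // The VA is backed by a physical page.
 *         } else if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
 *                 // The VA's contents live in the compressor.
 *         }
 * }
 */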
12268
12269
12270
12271 static vm_map_size_t
12272 pmap_user_va_size(pmap_t pmap __unused)
12273 {
12274 #if (__ARM_VMSA__ == 7)
12275 return VM_MAX_ADDRESS;
12276 #else
12277 #if __ARM_MIXED_PAGE_SIZE__
12278 uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12279 return 1ULL << (64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK));
12280 #else
12281 return 1ULL << (64 - T0SZ_BOOT);
12282 #endif
12283 #endif /* (__ARM_VMSA__ == 7) */
12284 }
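/*
 * Worked example of the computation above: if T0SZ were programmed to 25, the
 * maximum user VA size reported would be 1ULL << (64 - 25) = 2^39 bytes
 * (512GB). The actual T0SZ comes from T0SZ_BOOT or from the pmap's TCR value,
 * depending on the configuration.
 */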
12285
12286
12287
12288 kern_return_t
12289 pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
12290 const vm_size_t __unused trust_cache_len)
12291 {
12292 // Unsupported
12293 return KERN_NOT_SUPPORTED;
12294 }
12295
12296 pmap_tc_ret_t
12297 pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
12298 const vm_size_t __unused trust_cache_len,
12299 uint8_t const * __unused img4_manifest,
12300 const vm_size_t __unused img4_manifest_buffer_len,
12301 const vm_size_t __unused img4_manifest_actual_len,
12302 bool __unused dry_run)
12303 {
12304 // Unsupported
12305 return PMAP_TC_UNKNOWN_FORMAT;
12306 }
12307
12308 bool
12309 pmap_in_ppl(void)
12310 {
12311 // Unsupported
12312 return false;
12313 }
12314
12315 bool
12316 pmap_has_ppl(void)
12317 {
12318 // Unsupported
12319 return false;
12320 }
12321
12322 void
12323 pmap_lockdown_image4_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12324 {
12325 // Unsupported
12326 }
12327
12328 void
12329 pmap_lockdown_image4_late_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12330 {
12331 // Unsupported
12332 }
12333
12334 void *
12335 pmap_claim_reserved_ppl_page(void)
12336 {
12337 // Unsupported
12338 return NULL;
12339 }
12340
12341 void
12342 pmap_free_reserved_ppl_page(void __unused *kva)
12343 {
12344 // Unsupported
12345 }
12346
12347
12348 MARK_AS_PMAP_TEXT bool
12349 pmap_is_trust_cache_loaded_internal(const uuid_t uuid)
12350 {
12351 bool found = false;
12352
12353 pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12354
12355 for (struct pmap_image4_trust_cache const *c = pmap_image4_trust_caches; c != NULL; c = c->next) {
12356 if (bcmp(uuid, c->module->uuid, sizeof(uuid_t)) == 0) {
12357 found = true;
12358 goto done;
12359 }
12360 }
12361
12362 #ifdef PLATFORM_BridgeOS
12363 for (struct pmap_legacy_trust_cache const *c = pmap_legacy_trust_caches; c != NULL; c = c->next) {
12364 if (bcmp(uuid, c->uuid, sizeof(uuid_t)) == 0) {
12365 found = true;
12366 goto done;
12367 }
12368 }
12369 #endif
12370
12371 done:
12372 pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12373 return found;
12374 }
12375
12376 bool
12377 pmap_is_trust_cache_loaded(const uuid_t uuid)
12378 {
12379 #if XNU_MONITOR
12380 return pmap_is_trust_cache_loaded_ppl(uuid);
12381 #else
12382 return pmap_is_trust_cache_loaded_internal(uuid);
12383 #endif
12384 }
12385
12386 MARK_AS_PMAP_TEXT bool
12387 pmap_lookup_in_loaded_trust_caches_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12388 {
12389 struct pmap_image4_trust_cache const *cache = NULL;
12390 #ifdef PLATFORM_BridgeOS
12391 struct pmap_legacy_trust_cache const *legacy = NULL;
12392 #endif
12393
12394 pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12395
12396 for (cache = pmap_image4_trust_caches; cache != NULL; cache = cache->next) {
12397 uint8_t hash_type = 0, flags = 0;
12398
12399 if (lookup_in_trust_cache_module(cache->module, cdhash, &hash_type, &flags)) {
12400 goto done;
12401 }
12402 }
12403
12404 #ifdef PLATFORM_BridgeOS
12405 for (legacy = pmap_legacy_trust_caches; legacy != NULL; legacy = legacy->next) {
12406 for (uint32_t i = 0; i < legacy->num_hashes; i++) {
12407 if (bcmp(legacy->hashes[i], cdhash, CS_CDHASH_LEN) == 0) {
12408 goto done;
12409 }
12410 }
12411 }
12412 #endif
12413
12414 done:
12415 pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12416
12417 if (cache != NULL) {
12418 return true;
12419 #ifdef PLATFORM_BridgeOS
12420 } else if (legacy != NULL) {
12421 return true;
12422 #endif
12423 }
12424
12425 return false;
12426 }
12427
12428 bool
12429 pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN])
12430 {
12431 #if XNU_MONITOR
12432 return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
12433 #else
12434 return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
12435 #endif
12436 }
12437
12438 MARK_AS_PMAP_TEXT uint32_t
12439 pmap_lookup_in_static_trust_cache_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12440 {
12441 // Awkward indirection, because the PPL macros currently force their functions to be static.
12442 return lookup_in_static_trust_cache(cdhash);
12443 }
12444
12445 uint32_t
12446 pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
12447 {
12448 #if XNU_MONITOR
12449 return pmap_lookup_in_static_trust_cache_ppl(cdhash);
12450 #else
12451 return pmap_lookup_in_static_trust_cache_internal(cdhash);
12452 #endif
12453 }
12454
12455 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
12456 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
12457
12458 MARK_AS_PMAP_TEXT void
12459 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12460 {
12461
12462 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12463 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
12464 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12465
12466 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X", cdhash[0], cdhash[1], cdhash[2], cdhash[3]);
12467 }
12468
12469 MARK_AS_PMAP_TEXT bool
12470 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12471 {
12472 bool match = false;
12473
12474 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12475 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
12476 match = true;
12477 }
12478 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12479
12480 if (match) {
12481 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
12482 }
12483
12484 return match;
12485 }
12486
12487 void
12488 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12489 {
12490 #if XNU_MONITOR
12491 pmap_set_compilation_service_cdhash_ppl(cdhash);
12492 #else
12493 pmap_set_compilation_service_cdhash_internal(cdhash);
12494 #endif
12495 }
12496
12497 bool
12498 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12499 {
12500 #if XNU_MONITOR
12501 return pmap_match_compilation_service_cdhash_ppl(cdhash);
12502 #else
12503 return pmap_match_compilation_service_cdhash_internal(cdhash);
12504 #endif
12505 }
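/*
 * Illustrative (hypothetical) usage: a client registers the compilation
 * service's CDHash once and later asks the pmap layer whether a candidate
 * CDHash matches it; "cdhash" is assumed to be supplied by the caller.
 *
 * pmap_set_compilation_service_cdhash(cdhash);
 * ...
 * bool matched = pmap_match_compilation_service_cdhash(cdhash);
 */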
12506
12507 /*
12508 * As part of supporting local signing on the device, we need the PMAP layer
12509 * to store the local signing key so that PMAP_CS can validate with it. We
12510 * store it at the PMAP layer such that it is accessible to both AMFI and
12511 * PMAP_CS should they need it.
12512 */
12513 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
12514 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
12515
12516 MARK_AS_PMAP_TEXT void
12517 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12518 {
12519 bool key_set = false;
12520
12521 /*
12522 * os_atomic_cmpxchg returns true if the exchange was successful. For us, a
12523 * successful exchange means that the local signing public key has _not_ yet
12524 * been set. If the key has already been set, we panic, since the kernel
12525 * should never attempt to set the key more than once.
12526 */
12527 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
12528
12529 if (key_set) {
12530 panic("attempted to set the local signing public key multiple times");
12531 }
12532
12533 memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
12534 pmap_cs_log_info("set local signing public key");
12535 }
12536
12537 void
12538 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12539 {
12540 #if XNU_MONITOR
12541 return pmap_set_local_signing_public_key_ppl(public_key);
12542 #else
12543 return pmap_set_local_signing_public_key_internal(public_key);
12544 #endif
12545 }
12546
12547 uint8_t*
12548 pmap_get_local_signing_public_key(void)
12549 {
12550 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
12551
12552 if (key_set) {
12553 return pmap_local_signing_public_key;
12554 }
12555
12556 return NULL;
12557 }
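/*
 * Illustrative (hypothetical) usage: the key is expected to be provided exactly
 * once, after which readers may retrieve it; "key" is assumed to be a
 * caller-supplied buffer of PMAP_ECC_P384_PUBLIC_KEY_SIZE bytes.
 *
 * pmap_set_local_signing_public_key(key); // panics if called a second time
 * uint8_t *pub = pmap_get_local_signing_public_key(); // NULL until the key is set
 */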
12558
12559 /*
12560 * Locally signed applications need to be explicitly authorized by an entitled application
12561 * before we allow them to run.
12562 */
12563 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
12564 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
12565
12566 MARK_AS_PMAP_TEXT void
12567 pmap_unrestrict_local_signing_internal(
12568 const uint8_t cdhash[CS_CDHASH_LEN])
12569 {
12570
12571 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12572 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12573 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12574
12575 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
12576 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
12577 }
12578
12579 void
12580 pmap_unrestrict_local_signing(
12581 const uint8_t cdhash[CS_CDHASH_LEN])
12582 {
12583 #if XNU_MONITOR
12584 return pmap_unrestrict_local_signing_ppl(cdhash);
12585 #else
12586 return pmap_unrestrict_local_signing_internal(cdhash);
12587 #endif
12588 }
12589
12590 #if PMAP_CS
12591 MARK_AS_PMAP_TEXT static void
12592 pmap_restrict_local_signing(void)
12593 {
12594 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12595 memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
12596 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12597 }
12598
12599 MARK_AS_PMAP_TEXT static bool
12600 pmap_local_signing_restricted(
12601 const uint8_t cdhash[CS_CDHASH_LEN])
12602 {
12603 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12604 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12605 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12606
12607 return ret != 0;
12608 }
12609
12610 MARK_AS_PMAP_TEXT bool
12611 pmap_cs_query_entitlements_internal(
12612 pmap_t pmap,
12613 CEQuery_t query,
12614 size_t queryLength,
12615 CEQueryContext_t finalContext)
12616 {
12617 struct pmap_cs_code_directory *cd_entry = NULL;
12618 bool ret = false;
12619
12620 if (!pmap_cs) {
12621 panic("PMAP_CS: cannot query for entitlements as pmap_cs is turned off");
12622 }
12623
12624 /*
12625 * When a pmap has not been passed in, we assume the caller wants to check the
12626 * entitlements on the current user space process.
12627 */
12628 if (pmap == NULL) {
12629 pmap = current_pmap();
12630 }
12631
12632 if (pmap == kernel_pmap) {
12633 /*
12634 * Instead of panicking we will just return false.
12635 */
12636 return false;
12637 }
12638
12639 if (query == NULL || queryLength > 64) {
12640 panic("PMAP_CS: bogus entitlements query");
12641 } else {
12642 pmap_cs_assert_addr((vm_address_t)query, sizeof(CEQueryOperation_t) * queryLength, false, true);
12643 }
12644
12645 if (finalContext != NULL) {
12646 pmap_cs_assert_addr((vm_address_t)finalContext, sizeof(*finalContext), false, false);
12647 }
12648
12649 validate_pmap(pmap);
12650 pmap_lock(pmap, PMAP_LOCK_SHARED);
12651
12652 cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
12653 if (cd_entry == NULL) {
12654 pmap_cs_log_error("attempted to query entitlements from an invalid pmap or a retired code directory");
12655 goto out;
12656 }
12657
12658 if (cd_entry->ce_ctx == NULL) {
12659 pmap_cs_log_debug("%s: code signature doesn't have any entitlements", cd_entry->identifier);
12660 goto out;
12661 }
12662
12663 der_vm_context_t executionContext = cd_entry->ce_ctx->der_context;
12664
12665 for (size_t op = 0; op < queryLength; op++) {
12666 executionContext = amfi->CoreEntitlements.der_vm_execute(executionContext, query[op]);
12667 }
12668
12669 if (amfi->CoreEntitlements.der_vm_context_is_valid(executionContext)) {
12670 ret = true;
12671 if (finalContext != NULL) {
12672 pmap_pin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12673 finalContext->der_context = executionContext;
12674 pmap_unpin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12675 }
12676 } else {
12677 ret = false;
12678 }
12679
12680 out:
12681 if (cd_entry) {
12682 lck_rw_unlock_shared(&cd_entry->rwlock);
12683 cd_entry = NULL;
12684 }
12685 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12686
12687 return ret;
12688 }
12689 #endif /* PMAP_CS */
12690
12691 bool
12692 pmap_query_entitlements(
12693 __unused pmap_t pmap,
12694 __unused CEQuery_t query,
12695 __unused size_t queryLength,
12696 __unused CEQueryContext_t finalContext)
12697 {
12698 #if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
12699 panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
12700 #else
12701
12702 #if XNU_MONITOR
12703 return pmap_cs_query_entitlements_ppl(pmap, query, queryLength, finalContext);
12704 #else
12705 return pmap_cs_query_entitlements_internal(pmap, query, queryLength, finalContext);
12706 #endif
12707
12708 #endif /* !PMAP_SUPPORTS_ENTITLEMENT_CHECKS */
12709 }
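/*
 * Illustrative (hypothetical) call pattern: callers must guard on the feature
 * macro, and may pass a NULL pmap to query the current user-space process.
 * "query" and "query_length" are assumed to describe a caller-built CEQuery_t
 * array of at most 64 operations.
 *
 * #if PMAP_SUPPORTS_ENTITLEMENT_CHECKS
 * bool satisfied = pmap_query_entitlements(NULL, query, query_length, NULL);
 * #endif
 */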
12710
12711 MARK_AS_PMAP_TEXT void
12712 pmap_footprint_suspend_internal(
12713 vm_map_t map,
12714 boolean_t suspend)
12715 {
12716 #if DEVELOPMENT || DEBUG
12717 if (suspend) {
12718 current_thread()->pmap_footprint_suspended = TRUE;
12719 map->pmap->footprint_was_suspended = TRUE;
12720 } else {
12721 current_thread()->pmap_footprint_suspended = FALSE;
12722 }
12723 #else /* DEVELOPMENT || DEBUG */
12724 (void) map;
12725 (void) suspend;
12726 #endif /* DEVELOPMENT || DEBUG */
12727 }
12728
12729 void
12730 pmap_footprint_suspend(
12731 vm_map_t map,
12732 boolean_t suspend)
12733 {
12734 #if XNU_MONITOR
12735 pmap_footprint_suspend_ppl(map, suspend);
12736 #else
12737 pmap_footprint_suspend_internal(map, suspend);
12738 #endif
12739 }
12740
12741 MARK_AS_PMAP_TEXT void
12742 pmap_nop_internal(pmap_t pmap __unused)
12743 {
12744 validate_pmap_mutable(pmap);
12745 }
12746
12747 void
12748 pmap_nop(pmap_t pmap)
12749 {
12750 #if XNU_MONITOR
12751 pmap_nop_ppl(pmap);
12752 #else
12753 pmap_nop_internal(pmap);
12754 #endif
12755 }
12756
12757 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
12758
12759 struct page_table_dump_header {
12760 uint64_t pa;
12761 uint64_t num_entries;
12762 uint64_t start_va;
12763 uint64_t end_va;
12764 };
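/*
 * Buffer layout produced by pmap_dump_page_tables(): for each page table
 * visited at a level selected by level_mask, a page_table_dump_header is
 * emitted, immediately followed by a verbatim copy of that table's entries
 * (num_entries * sizeof(tt_entry_t) bytes).
 */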
12765
12766 static kern_return_t
12767 pmap_dump_page_tables_recurse(pmap_t pmap,
12768 const tt_entry_t *ttp,
12769 unsigned int cur_level,
12770 unsigned int level_mask,
12771 uint64_t start_va,
12772 void *buf_start,
12773 void *buf_end,
12774 size_t *bytes_copied)
12775 {
12776 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12777 uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
12778
12779 uint64_t size = pt_attr->pta_level_info[cur_level].size;
12780 uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
12781 uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
12782 uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
12783
12784 void *bufp = (uint8_t*)buf_start + *bytes_copied;
12785
12786 if (cur_level == pt_attr_root_level(pt_attr)) {
12787 num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
12788 }
12789
12790 uint64_t tt_size = num_entries * sizeof(tt_entry_t);
12791 const tt_entry_t *tt_end = &ttp[num_entries];
12792
12793 if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
12794 return KERN_INSUFFICIENT_BUFFER_SIZE;
12795 }
12796
12797 if (level_mask & (1U << cur_level)) {
12798 struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
12799 header->pa = ml_static_vtop((vm_offset_t)ttp);
12800 header->num_entries = num_entries;
12801 header->start_va = start_va;
12802 header->end_va = start_va + (num_entries * size);
12803
12804 bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
12805 *bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
12806 }
12807 uint64_t current_va = start_va;
12808
12809 for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
12810 tt_entry_t tte = *ttep;
12811
12812 if (!(tte & valid_mask)) {
12813 continue;
12814 }
12815
12816 if ((tte & type_mask) == type_block) {
12817 continue;
12818 } else {
12819 if (cur_level >= pt_attr_leaf_level(pt_attr)) {
12820 panic("%s: corrupt entry %#llx at %p, "
12821 "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
12822 __FUNCTION__, tte, ttep,
12823 ttp, cur_level, bufp, buf_end);
12824 }
12825
12826 const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
12827
12828 kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
12829 level_mask, current_va, buf_start, buf_end, bytes_copied);
12830
12831 if (recurse_result != KERN_SUCCESS) {
12832 return recurse_result;
12833 }
12834 }
12835 }
12836
12837 return KERN_SUCCESS;
12838 }
12839
12840 kern_return_t
12841 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12842 {
12843 if (not_in_kdp) {
12844 panic("pmap_dump_page_tables must only be called from kernel debugger context");
12845 }
12846 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12847 level_mask, pmap->min, bufp, buf_end, bytes_copied);
12848 }
12849
12850 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12851
12852 kern_return_t
12853 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
12854 unsigned int level_mask __unused, size_t *bytes_copied __unused)
12855 {
12856 return KERN_NOT_SUPPORTED;
12857 }
12858 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12859
12860
12861 #ifdef CONFIG_XNUPOST
12862 #ifdef __arm64__
12863 static volatile bool pmap_test_took_fault = false;
12864
12865 static bool
12866 pmap_test_fault_handler(arm_saved_state_t * state)
12867 {
12868 bool retval = false;
12869 uint32_t esr = get_saved_state_esr(state);
12870 esr_exception_class_t class = ESR_EC(esr);
12871 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
12872
12873 if ((class == ESR_EC_DABORT_EL1) &&
12874 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
12875 pmap_test_took_fault = true;
12876 /* Return to the instruction immediately following the faulting access. */
12877 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12878 retval = true;
12879 }
12880
12881 return retval;
12882 }
12883
12884 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
12885 static NOKASAN bool
12886 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
12887 {
12888 pmap_t old_pmap = NULL;
12889
12890 pmap_test_took_fault = false;
12891
12892 /*
12893 * We're potentially switching pmaps without using the normal thread
12894 * mechanism; disable interrupts and preemption to avoid any unexpected
12895 * memory accesses.
12896 */
12897 uint64_t old_int_state = pmap_interrupts_disable();
12898 mp_disable_preemption();
12899
12900 if (pmap != NULL) {
12901 old_pmap = current_pmap();
12902 pmap_switch(pmap);
12903
12904 /* Disable PAN; pmap shouldn't be the kernel pmap. */
12905 #if __ARM_PAN_AVAILABLE__
12906 __builtin_arm_wsr("pan", 0);
12907 #endif /* __ARM_PAN_AVAILABLE__ */
12908 }
12909
12910 ml_expect_fault_begin(pmap_test_fault_handler, va);
12911
12912 if (is_write) {
12913 *((volatile uint64_t*)(va)) = 0xdec0de;
12914 } else {
12915 volatile uint64_t tmp = *((volatile uint64_t*)(va));
12916 (void)tmp;
12917 }
12918
12919 /* Save the fault bool, and undo the gross stuff we did. */
12920 bool took_fault = pmap_test_took_fault;
12921 ml_expect_fault_end();
12922
12923 if (pmap != NULL) {
12924 #if __ARM_PAN_AVAILABLE__
12925 __builtin_arm_wsr("pan", 1);
12926 #endif /* __ARM_PAN_AVAILABLE__ */
12927
12928 pmap_switch(old_pmap);
12929 }
12930
12931 mp_enable_preemption();
12932 pmap_interrupts_restore(old_int_state);
12933 bool retval = (took_fault == should_fault);
12934 return retval;
12935 }
12936
12937 static bool
12938 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12939 {
12940 bool retval = pmap_test_access(pmap, va, should_fault, false);
12941
12942 if (!retval) {
12943 T_FAIL("%s: %s, "
12944 "pmap=%p, va=%p, should_fault=%u",
12945 __func__, should_fault ? "did not fault" : "faulted",
12946 pmap, (void*)va, (unsigned)should_fault);
12947 }
12948
12949 return retval;
12950 }
12951
12952 static bool
12953 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12954 {
12955 bool retval = pmap_test_access(pmap, va, should_fault, true);
12956
12957 if (!retval) {
12958 T_FAIL("%s: %s, "
12959 "pmap=%p, va=%p, should_fault=%u",
12960 __func__, should_fault ? "did not fault" : "faulted",
12961 pmap, (void*)va, (unsigned)should_fault);
12962 }
12963
12964 return retval;
12965 }
12966
12967 static bool
12968 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12969 {
12970 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12971 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12972
12973 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12974
12975 if (!retval) {
12976 T_FAIL("%s: bits=%u, "
12977 "pa=%p, should_be_set=%u",
12978 __func__, bits,
12979 (void*)pa, should_be_set);
12980 }
12981
12982 return retval;
12983 }
12984
12985 static __attribute__((noinline)) bool
12986 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12987 {
12988 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12989 return retval;
12990 }
12991
12992 static int
12993 pmap_test_test_config(unsigned int flags)
12994 {
12995 T_LOG("running pmap_test_test_config flags=0x%X", flags);
12996 unsigned int map_count = 0;
12997 unsigned long page_ratio = 0;
12998 pmap_t pmap = pmap_create_options(NULL, 0, flags);
12999
13000 if (!pmap) {
13001 panic("Failed to allocate pmap");
13002 }
13003
13004 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
13005 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
13006 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
13007 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
13008
13009 if (pmap_page_size <= native_page_size) {
13010 page_ratio = native_page_size / pmap_page_size;
13011 } else {
13012 /*
13013 * This configuration would imply a page_ratio of less than 1, which is
13014 * not currently supported by the pmap layer; panic.
13015 */
13016 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu, "
13017 "flags=%u",
13018 __func__, native_page_size, pmap_page_size,
13019 flags);
13020 }
13021
13022 if (PAGE_RATIO > 1) {
13023 /*
13024 * The kernel is deliberately pretending to have 16KB pages.
13025 * The pmap layer has code that supports this, so pretend the
13026 * page size is larger than it is.
13027 */
13028 pmap_page_size = PAGE_SIZE;
13029 native_page_size = PAGE_SIZE;
13030 }
13031
13032 /*
13033 * Get two pages from the VM; one to be mapped wired, and one to be
13034 * mapped nonwired.
13035 */
13036 vm_page_t unwired_vm_page = vm_page_grab();
13037 vm_page_t wired_vm_page = vm_page_grab();
13038
13039 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
13040 panic("Failed to grab VM pages");
13041 }
13042
13043 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
13044 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
13045
13046 pmap_paddr_t pa = ptoa(pn);
13047 pmap_paddr_t wired_pa = ptoa(wired_pn);
13048
13049 /*
13050 * We'll start mappings at the second twig TT. This keeps us from only
13051 * using the first entry in each TT, which would trivially be address
13052 * 0; one of the things we will need to test is retrieving the VA for
13053 * a given PTE.
13054 */
13055 vm_map_address_t va_base = pmap_twig_size;
13056 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
13057
13058 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
13059 /*
13060 * Not exactly a functional failure, but this test relies on
13061 * there being a spare PTE slot we can use to pin the TT.
13062 */
13063 panic("Cannot pin translation table");
13064 }
13065
13066 /*
13067 * Create the wired mapping; this will prevent the pmap layer from
13068 * reclaiming our test TTs, which would interfere with this test
13069 * ("interfere" -> "make it panic").
13070 */
13071 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
13072
13073 #if XNU_MONITOR
13074 /*
13075 * If the PPL is enabled, make sure that the kernel cannot write
13076 * to PPL memory.
13077 */
13078 if (!pmap_ppl_disable) {
13079 T_LOG("Validate that kernel cannot write to PPL memory.");
13080 pt_entry_t * ptep = pmap_pte(pmap, va_base);
13081 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
13082 }
13083 #endif
13084
13085 /*
13086 * Create read-only mappings of the nonwired page; if the pmap does
13087 * not use the same page size as the kernel, create multiple mappings
13088 * so that the kernel page is fully mapped.
13089 */
13090 for (map_count = 0; map_count < page_ratio; map_count++) {
13091 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
13092 }
13093
13094 /* Validate that all the PTEs have the expected PA and VA. */
13095 for (map_count = 0; map_count < page_ratio; map_count++) {
13096 pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
13097
13098 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
13099 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
13100 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
13101 }
13102
13103 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
13104 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
13105 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
13106 }
13107 }
13108
13109 T_LOG("Validate that reads to our mapping do not fault.");
13110 pmap_test_read(pmap, va_base, false);
13111
13112 T_LOG("Validate that writes to our mapping fault.");
13113 pmap_test_write(pmap, va_base, true);
13114
13115 T_LOG("Make the first mapping writable.");
13116 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13117
13118 T_LOG("Validate that writes to our mapping do not fault.");
13119 pmap_test_write(pmap, va_base, false);
13120
13121
13122 T_LOG("Make the first mapping XO.");
13123 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
13124
13125 T_LOG("Validate that reads to our mapping do not fault.");
13126 pmap_test_read(pmap, va_base, false);
13127
13128 T_LOG("Validate that writes to our mapping fault.");
13129 pmap_test_write(pmap, va_base, true);
13130
13131
13132 /*
13133 * For page ratios of greater than 1: validate that writes to the other
13134 * mappings still fault. Remove the mappings afterwards (we're done
13135 * with page ratio testing).
13136 */
13137 for (map_count = 1; map_count < page_ratio; map_count++) {
13138 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
13139 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
13140 }
13141
13142 T_LOG("Mark the page unreferenced and unmodified.");
13143 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13144 pmap_test_check_refmod(pa, 0);
13145
13146 /*
13147 * Begin testing the ref/mod state machine. Re-enter the mapping with
13148 * different protection/fault_type settings, and confirm that the
13149 * ref/mod state matches our expectations at each step.
13150 */
13151 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
13152 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
13153 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13154
13155 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
13156 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13157 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13158 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13159
13160 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
13161 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13162 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
13163 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13164
13165 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
13166 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
13167 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13168
13169 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
13170 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13171 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13172 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13173
13174 /*
13175 * Shared memory testing; we'll have two mappings; one read-only,
13176 * one read-write.
13177 */
13178 vm_map_address_t rw_base = va_base;
13179 vm_map_address_t ro_base = va_base + pmap_page_size;
13180
13181 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13182 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13183
13184 /*
13185 * Test that we take faults as expected for unreferenced/unmodified
13186 * pages. Also test the arm_fast_fault interface, to ensure that
13187 * mapping permissions change as expected.
13188 */
13189 T_LOG("!ref/!mod: expect no access");
13190 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13191 pmap_test_read_write(pmap, ro_base, false, false);
13192 pmap_test_read_write(pmap, rw_base, false, false);
13193
13194 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
13195 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
13196 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13197 pmap_test_read_write(pmap, ro_base, true, false);
13198 pmap_test_read_write(pmap, rw_base, true, false);
13199
13200 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
13201 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13202 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13203 pmap_test_read_write(pmap, ro_base, true, false);
13204 pmap_test_read_write(pmap, rw_base, true, true);
13205
13206 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
13207 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13208 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13209 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13210 pmap_test_read_write(pmap, ro_base, true, false);
13211 pmap_test_read_write(pmap, rw_base, true, true);
13212
13213 T_LOG("RW protect both mappings; should not change protections.");
13214 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13215 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13216 pmap_test_read_write(pmap, ro_base, true, false);
13217 pmap_test_read_write(pmap, rw_base, true, true);
13218
13219 T_LOG("Read protect both mappings; RW mapping should become RO.");
13220 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
13221 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
13222 pmap_test_read_write(pmap, ro_base, true, false);
13223 pmap_test_read_write(pmap, rw_base, true, false);
13224
13225 T_LOG("RW protect the page; mappings should not change protections.");
13226 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13227 pmap_page_protect(pn, VM_PROT_ALL);
13228 pmap_test_read_write(pmap, ro_base, true, false);
13229 pmap_test_read_write(pmap, rw_base, true, true);
13230
13231 T_LOG("Read protect the page; RW mapping should become RO.");
13232 pmap_page_protect(pn, VM_PROT_READ);
13233 pmap_test_read_write(pmap, ro_base, true, false);
13234 pmap_test_read_write(pmap, rw_base, true, false);
13235
13236 T_LOG("Validate that disconnect removes all known mappings of the page.");
13237 pmap_disconnect(pn);
13238 if (!pmap_verify_free(pn)) {
13239 T_FAIL("Page still has mappings");
13240 }
13241
13242 T_LOG("Remove the wired mapping, so we can tear down the test map.");
13243 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
13244 pmap_destroy(pmap);
13245
13246 T_LOG("Release the pages back to the VM.");
13247 vm_page_lock_queues();
13248 vm_page_free(unwired_vm_page);
13249 vm_page_free(wired_vm_page);
13250 vm_page_unlock_queues();
13251
13252 T_LOG("Testing successful!");
13253 return 0;
13254 }
13255 #endif /* __arm64__ */
13256
13257 kern_return_t
13258 pmap_test(void)
13259 {
13260 T_LOG("Starting pmap_tests");
13261 #ifdef __arm64__
13262 int flags = 0;
13263 flags |= PMAP_CREATE_64BIT;
13264
13265 #if __ARM_MIXED_PAGE_SIZE__
13266 T_LOG("Testing VM_PAGE_SIZE_4KB");
13267 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
13268 T_LOG("Testing VM_PAGE_SIZE_16KB");
13269 pmap_test_test_config(flags);
13270 #else /* __ARM_MIXED_PAGE_SIZE__ */
13271 pmap_test_test_config(flags);
13272 #endif /* __ARM_MIXED_PAGE_SIZE__ */
13273
13274 #endif /* __arm64__ */
13275 T_PASS("completed pmap_test successfully");
13276 return KERN_SUCCESS;
13277 }
13278 #endif /* CONFIG_XNUPOST */
13279
13280 /*
13281 * The following function should never make it to RELEASE code, since
13282 * it provides a way to get the PPL to modify text pages.
13283 */
13284 #if DEVELOPMENT || DEBUG
13285
13286 #define ARM_UNDEFINED_INSN 0xe7f000f0
13287 #define ARM_UNDEFINED_INSN_THUMB 0xde00
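/*
 * Permanently-undefined instruction encodings for A32 and Thumb, respectively;
 * executing either is expected to raise an undefined-instruction exception.
 */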
13288
13289 /**
13290 * Forcibly overwrite executable text with an illegal instruction.
13291 *
13292 * @note Only used for xnu unit testing.
13293 *
13294 * @param pa The physical address to corrupt.
13295 *
13296 * @return KERN_SUCCESS on success.
13297 */
13298 kern_return_t
13299 pmap_test_text_corruption(pmap_paddr_t pa)
13300 {
13301 #if XNU_MONITOR
13302 return pmap_test_text_corruption_ppl(pa);
13303 #else /* XNU_MONITOR */
13304 return pmap_test_text_corruption_internal(pa);
13305 #endif /* XNU_MONITOR */
13306 }
13307
13308 MARK_AS_PMAP_TEXT kern_return_t
13309 pmap_test_text_corruption_internal(pmap_paddr_t pa)
13310 {
13311 vm_offset_t va = phystokv(pa);
13312 unsigned int pai = pa_index(pa);
13313
13314 assert(pa_valid(pa));
13315
13316 pvh_lock(pai);
13317
13318 pv_entry_t **pv_h = pai_to_pvh(pai);
13319 assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
13320 #if defined(PVH_FLAG_EXEC)
13321 const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
13322
13323 if (need_ap_twiddle) {
13324 pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
13325 }
13326 #endif /* defined(PVH_FLAG_EXEC) */
13327
13328 /*
13329 * The low bit in an instruction address indicates a THUMB instruction
13330 */
13331 if (va & 1) {
13332 va &= ~(vm_offset_t)1;
13333 *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
13334 } else {
13335 *(uint32_t *)va = ARM_UNDEFINED_INSN;
13336 }
13337
13338 #if defined(PVH_FLAG_EXEC)
13339 if (need_ap_twiddle) {
13340 pmap_set_ptov_ap(pai, AP_RONA, FALSE);
13341 }
13342 #endif /* defined(PVH_FLAG_EXEC) */
13343
13344 InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
13345
13346 pvh_unlock(pai);
13347
13348 return KERN_SUCCESS;
13349 }
13350
13351 #endif /* DEVELOPMENT || DEBUG */
13352