1 /*
2 * Copyright (c) 2011-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/backtrace.h>
42 #include <kern/bits.h>
43 #include <kern/ecc.h>
44 #include <kern/thread.h>
45 #include <kern/sched.h>
46 #include <kern/zalloc.h>
47 #include <kern/zalloc_internal.h>
48 #include <kern/kalloc.h>
49 #include <kern/spl.h>
50 #include <kern/startup.h>
51 #include <kern/trap_telemetry.h>
52 #include <kern/trustcache.h>
53
54 #include <os/overflow.h>
55
56 #include <vm/pmap.h>
57 #include <vm/pmap_cs.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern.h>
60 #include <vm/vm_protos.h>
61 #include <vm/vm_object_internal.h>
62 #include <vm/vm_page_internal.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/cpm_internal.h>
65
66
67 #include <libkern/section_keywords.h>
68 #include <sys/errno.h>
69
70 #include <libkern/amfi/amfi.h>
71 #include <sys/trusted_execution_monitor.h>
72 #include <sys/trust_caches.h>
73 #include <sys/code_signing.h>
74
75 #include <machine/atomic.h>
76 #include <machine/thread.h>
77 #include <machine/lowglobals.h>
78
79 #include <arm/caches_internal.h>
80 #include <arm/cpu_data.h>
81 #include <arm/cpu_data_internal.h>
82 #include <arm/cpu_capabilities.h>
83 #include <arm/cpu_number.h>
84 #include <arm/machine_cpu.h>
85 #include <arm/misc_protos.h>
86 #include <arm/trap_internal.h>
87 #include <arm64/sptm/pmap/pmap_internal.h>
88 #include <arm64/sptm/sptm.h>
89
90 #include <arm64/proc_reg.h>
91 #include <pexpert/arm64/boot.h>
92 #include <arm64/ppl/uat.h>
93 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
94 #include <arm64/amcc_rorgn.h>
95 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
96
97 #include <pexpert/device_tree.h>
98
99 #include <san/kasan.h>
100 #include <sys/cdefs.h>
101
102 #if defined(HAS_APPLE_PAC)
103 #include <ptrauth.h>
104 #endif
105
106 #ifdef CONFIG_XNUPOST
107 #include <tests/xnupost.h>
108 #endif
109
110
111 #if HIBERNATION
112 #include <IOKit/IOHibernatePrivate.h>
113 #endif /* HIBERNATION */
114
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116
117 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
118
119
120 /**
121 * Per-CPU data used to do setup and post-processing for SPTM calls.
122 * On the setup side, this structure is used to store parameters for batched SPTM operations.
123 * These parameters may be large (upwards of 1K), and given that SPTM calls are generally
124 * issued from preemption-disabled contexts anyway, it's better to store them in per-CPU
125 * data rather than the local stack.
126 * On the post-processing side, this structure exposes a pointer to the SPTM's per-CPU array
127  * of 'prev_ptes', that is, the prior value encountered in each PTE at the time of the SPTM's
128 * atomic update of that PTE.
129 */
130 pmap_sptm_percpu_data_t PERCPU_DATA(pmap_sptm_percpu);
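/*
 * Illustrative sketch (not part of the original source): a minimal access pattern
 * for the per-CPU SPTM data. Because an SPTM call and its post-processing must run
 * on the same CPU, the structure should only be consulted with preemption disabled.
 * PERCPU_GET() is the generic xnu per-CPU accessor; the field usage described in
 * the comments below is an assumption for illustration only.
 *
 *     mp_disable_preemption();
 *     pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
 *     // ...stage batched SPTM operation parameters in sptm_pcpu, issue the SPTM
 *     // call, then read back the exposed 'prev_ptes' array for post-processing...
 *     mp_enable_preemption();
 */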
131
132 /**
133 * Reference group for global tracking of all outstanding pmap references.
134 */
135 os_refgrp_decl(static, pmap_refgrp, "pmap", NULL);
136
137 /* Boot-arg to enable/disable the use of XNU_KERNEL_RESTRICTED type in SPTM. */
138 TUNABLE(bool, use_xnu_restricted, "xnu_restricted", true);
139
140 extern u_int32_t random(void); /* from <libkern/libkern.h> */
141
142 static bool alloc_asid(pmap_t pmap);
143 static void free_asid(pmap_t pmap);
144 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
145 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
146
147 const struct page_table_ops native_pt_ops =
148 {
149 .alloc_id = alloc_asid,
150 .free_id = free_asid,
151 .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
152 .wimg_to_pte = wimg_to_pte,
153 };
154
155 const struct page_table_level_info pmap_table_level_info_16k[] =
156 {
157 [0] = {
158 .size = ARM_16K_TT_L0_SIZE,
159 .offmask = ARM_16K_TT_L0_OFFMASK,
160 .shift = ARM_16K_TT_L0_SHIFT,
161 .index_mask = ARM_16K_TT_L0_INDEX_MASK,
162 .valid_mask = ARM_TTE_VALID,
163 .type_mask = ARM_TTE_TYPE_MASK,
164 .type_block = ARM_TTE_TYPE_BLOCK
165 },
166 [1] = {
167 .size = ARM_16K_TT_L1_SIZE,
168 .offmask = ARM_16K_TT_L1_OFFMASK,
169 .shift = ARM_16K_TT_L1_SHIFT,
170 .index_mask = ARM_16K_TT_L1_INDEX_MASK,
171 .valid_mask = ARM_TTE_VALID,
172 .type_mask = ARM_TTE_TYPE_MASK,
173 .type_block = ARM_TTE_TYPE_BLOCK
174 },
175 [2] = {
176 .size = ARM_16K_TT_L2_SIZE,
177 .offmask = ARM_16K_TT_L2_OFFMASK,
178 .shift = ARM_16K_TT_L2_SHIFT,
179 .index_mask = ARM_16K_TT_L2_INDEX_MASK,
180 .valid_mask = ARM_TTE_VALID,
181 .type_mask = ARM_TTE_TYPE_MASK,
182 .type_block = ARM_TTE_TYPE_BLOCK
183 },
184 [3] = {
185 .size = ARM_16K_TT_L3_SIZE,
186 .offmask = ARM_16K_TT_L3_OFFMASK,
187 .shift = ARM_16K_TT_L3_SHIFT,
188 .index_mask = ARM_16K_TT_L3_INDEX_MASK,
189 .valid_mask = ARM_PTE_TYPE_VALID,
190 .type_mask = ARM_TTE_TYPE_MASK,
191 .type_block = ARM_TTE_TYPE_L3BLOCK
192 }
193 };
194
195 const struct page_table_level_info pmap_table_level_info_4k[] =
196 {
197 [0] = {
198 .size = ARM_4K_TT_L0_SIZE,
199 .offmask = ARM_4K_TT_L0_OFFMASK,
200 .shift = ARM_4K_TT_L0_SHIFT,
201 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
202 .valid_mask = ARM_TTE_VALID,
203 .type_mask = ARM_TTE_TYPE_MASK,
204 .type_block = ARM_TTE_TYPE_BLOCK
205 },
206 [1] = {
207 .size = ARM_4K_TT_L1_SIZE,
208 .offmask = ARM_4K_TT_L1_OFFMASK,
209 .shift = ARM_4K_TT_L1_SHIFT,
210 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
211 .valid_mask = ARM_TTE_VALID,
212 .type_mask = ARM_TTE_TYPE_MASK,
213 .type_block = ARM_TTE_TYPE_BLOCK
214 },
215 [2] = {
216 .size = ARM_4K_TT_L2_SIZE,
217 .offmask = ARM_4K_TT_L2_OFFMASK,
218 .shift = ARM_4K_TT_L2_SHIFT,
219 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
220 .valid_mask = ARM_TTE_VALID,
221 .type_mask = ARM_TTE_TYPE_MASK,
222 .type_block = ARM_TTE_TYPE_BLOCK
223 },
224 [3] = {
225 .size = ARM_4K_TT_L3_SIZE,
226 .offmask = ARM_4K_TT_L3_OFFMASK,
227 .shift = ARM_4K_TT_L3_SHIFT,
228 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
229 .valid_mask = ARM_PTE_TYPE_VALID,
230 .type_mask = ARM_TTE_TYPE_MASK,
231 .type_block = ARM_TTE_TYPE_L3BLOCK
232 }
233 };
234
235 const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
236 {
237 [0] = { /* Unused */
238 .size = ARM_4K_TT_L0_SIZE,
239 .offmask = ARM_4K_TT_L0_OFFMASK,
240 .shift = ARM_4K_TT_L0_SHIFT,
241 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
242 .valid_mask = ARM_TTE_VALID,
243 .type_mask = ARM_TTE_TYPE_MASK,
244 .type_block = ARM_TTE_TYPE_BLOCK
245 },
246 [1] = { /* Concatenated, so index mask is larger than normal */
247 .size = ARM_4K_TT_L1_SIZE,
248 .offmask = ARM_4K_TT_L1_OFFMASK,
249 .shift = ARM_4K_TT_L1_SHIFT,
250 #ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
251 .index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
252 #else
253 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
254 #endif
255 .valid_mask = ARM_TTE_VALID,
256 .type_mask = ARM_TTE_TYPE_MASK,
257 .type_block = ARM_TTE_TYPE_BLOCK
258 },
259 [2] = {
260 .size = ARM_4K_TT_L2_SIZE,
261 .offmask = ARM_4K_TT_L2_OFFMASK,
262 .shift = ARM_4K_TT_L2_SHIFT,
263 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
264 .valid_mask = ARM_TTE_VALID,
265 .type_mask = ARM_TTE_TYPE_MASK,
266 .type_block = ARM_TTE_TYPE_BLOCK
267 },
268 [3] = {
269 .size = ARM_4K_TT_L3_SIZE,
270 .offmask = ARM_4K_TT_L3_OFFMASK,
271 .shift = ARM_4K_TT_L3_SHIFT,
272 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
273 .valid_mask = ARM_PTE_TYPE_VALID,
274 .type_mask = ARM_TTE_TYPE_MASK,
275 .type_block = ARM_TTE_TYPE_L3BLOCK
276 }
277 };
278
279 const struct page_table_attr pmap_pt_attr_4k = {
280 .pta_level_info = pmap_table_level_info_4k,
281 .pta_root_level = (T0SZ_BOOT - 16) / 9,
282 #if __ARM_MIXED_PAGE_SIZE__
283 .pta_commpage_level = PMAP_TT_L2_LEVEL,
284 #else /* __ARM_MIXED_PAGE_SIZE__ */
285 #if __ARM_16K_PG__
286 .pta_commpage_level = PMAP_TT_L2_LEVEL,
287 #else /* __ARM_16K_PG__ */
288 .pta_commpage_level = PMAP_TT_L1_LEVEL,
289 #endif /* __ARM_16K_PG__ */
290 #endif /* __ARM_MIXED_PAGE_SIZE__ */
291 .pta_max_level = PMAP_TT_L3_LEVEL,
292 .pta_ops = &native_pt_ops,
293 .ap_ro = ARM_PTE_AP(AP_RORO),
294 .ap_rw = ARM_PTE_AP(AP_RWRW),
295 .ap_rona = ARM_PTE_AP(AP_RONA),
296 .ap_rwna = ARM_PTE_AP(AP_RWNA),
297 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
298 .ap_x = ARM_PTE_PNX,
299 #if __ARM_MIXED_PAGE_SIZE__
300 .pta_tcr_value = TCR_EL1_4KB,
301 #endif /* __ARM_MIXED_PAGE_SIZE__ */
302 .pta_page_size = 4096,
303 .pta_page_shift = 12,
304 .geometry_id = SPTM_PT_GEOMETRY_4K,
305 };
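/*
 * Worked example (illustrative, not from the original source) for the
 * .pta_root_level formula above, assuming a 4K granule where each translation
 * level resolves 9 bits of VA and the page offset occupies 12 bits:
 *
 *     T0SZ_BOOT = 16  ->  48-bit VA  ->  (16 - 16) / 9 = 0  (root at L0)
 *     T0SZ_BOOT = 25  ->  39-bit VA  ->  (25 - 16) / 9 = 1  (root at L1)
 *
 * i.e. every additional 9 bits shaved off the VA width by T0SZ lets the table
 * walk start one level lower.
 */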
306
307 const struct page_table_attr pmap_pt_attr_16k = {
308 .pta_level_info = pmap_table_level_info_16k,
309 .pta_root_level = PMAP_TT_L1_LEVEL,
310 .pta_commpage_level = PMAP_TT_L2_LEVEL,
311 .pta_max_level = PMAP_TT_L3_LEVEL,
312 .pta_ops = &native_pt_ops,
313 .ap_ro = ARM_PTE_AP(AP_RORO),
314 .ap_rw = ARM_PTE_AP(AP_RWRW),
315 .ap_rona = ARM_PTE_AP(AP_RONA),
316 .ap_rwna = ARM_PTE_AP(AP_RWNA),
317 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
318 .ap_x = ARM_PTE_PNX,
319 #if __ARM_MIXED_PAGE_SIZE__
320 .pta_tcr_value = TCR_EL1_16KB,
321 #endif /* __ARM_MIXED_PAGE_SIZE__ */
322 .pta_page_size = 16384,
323 .pta_page_shift = 14,
324 .geometry_id = SPTM_PT_GEOMETRY_16K,
325 };
326
327 #if __ARM_16K_PG__
328 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
329 #else /* !__ARM_16K_PG__ */
330 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
331 #endif /* !__ARM_16K_PG__ */
332
333
334 #if DEVELOPMENT || DEBUG
335 int vm_footprint_suspend_allowed = 1;
336
337 extern int pmap_ledgers_panic;
338 extern int pmap_ledgers_panic_leeway;
339
340 #endif /* DEVELOPMENT || DEBUG */
341
342 #if DEVELOPMENT || DEBUG
343 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
344 (current_thread()->pmap_footprint_suspended)
345 #else /* DEVELOPMENT || DEBUG */
346 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
347 #endif /* DEVELOPMENT || DEBUG */
348
349 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
350
351
352 /* Keeps track of whether the pmap has been bootstrapped */
353 SECURITY_READ_ONLY_LATE(bool) pmap_bootstrapped = false;
354
355 /*
356  * Represents a TLB range that will be flushed before returning from the pmap operation.
357 * Used by phys_attribute_clear_range to defer flushing pages in this range until
358 * the end of the operation, and to accumulate batched operations for submission
359 * to the SPTM as a performance optimization.
360 */
361 typedef struct pmap_tlb_flush_range {
362 /* Address space in which the flush region resides */
363 pmap_t ptfr_pmap;
364
365 /* Page-aligned beginning of the flush region */
366 vm_map_address_t ptfr_start;
367
368 /* Page-aligned non-inclusive end of the flush region */
369 vm_map_address_t ptfr_end;
370
371 /**
372 * Address of current PTE position in ptfr_pmap's [ptfr_start, ptfr_end) region.
373 * This is meant to be set up by the caller of pmap_page_protect_options_with_flush_range()
374 * or arm_force_fast_fault_with_flush_range(), and used by those functions to determine
375 * when a given mapping can be added to the SPTM's per-CPU region templates array vs.
376 * the more complex task of adding it to the disjoint ops array.
377 */
378 pt_entry_t *current_ptep;
379
380 /**
381 * Starting VA for any not-yet-submitted per-CPU region templates. This is meant to be
382 * set up by the caller of pmap_page_protect_options_with_flush_range() or
383 * arm_force_fast_fault_with_flush_range() and used by pmap_multipage_op_submit_region()
384 * when issuing the SPTM call to purge any pending region ops.
385 */
386 vm_map_address_t pending_region_start;
387
388 /**
389 * Number of entries in the per-CPU SPTM region templates array which have not
390 * yet been submitted to the SPTM.
391 */
392 unsigned int pending_region_entries;
393
394 /**
395 * Indicates whether at least one region entry was added to the per-CPU region ops
396 * array since the last time this field was checked. Intended to be cleared by the
397 * caller.
398 */
399 bool region_entry_added;
400
401 /**
402 * Marker for the current paddr "header" entry in the per-CPU SPTM disjoint ops array.
403 * This field is intended to be modified only by pmap_multipage_op_submit_disjoint()
404 * and pmap_multipage_op_add_page(), and should be treated as opaque by callers
405 * of those functions.
406 */
407 sptm_update_disjoint_multipage_op_t *current_header;
408
409 /**
410 * Position in the per-CPU SPTM ops array of the first ordinary
411 * sptm_disjoint_op_t entry following [current_header]. This is the starting
412 * point at which mappings should be inserted for the page described by
413 * [current_header].
414 */
415 unsigned int current_header_first_mapping_index;
416
417 /**
418 * Number of entries in the per-CPU SPTM disjoint ops array, including paddr headers,
419 * which have not yet been submitted to the SPTM.
420 */
421 unsigned int pending_disjoint_entries;
422
423 /**
424 * This field is used by the preemption check interval logic on the
425 * phys_attribute_clear_range() path to determine when sufficient
426 * forward progress has been made to check for and (if necessary)
427 * handle pending preemption.
428 */
429 unsigned int processed_entries;
430
431 /**
432 * Indicates whether the top-level caller needs to flush the TLB for
433 * the region in [ptfr_pmap] described by [ptfr_start, ptfr_end).
434 * This will be set if the SPTM indicates that it needed to alter
435 * any valid mapping within this region and SPTM_UPDATE_DEFER_TLBI
436 * was passed to the relevant SPTM call(s).
437 */
438 bool ptfr_flush_needed;
439 } pmap_tlb_flush_range_t;
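/*
 * Minimal usage sketch (illustrative only; 'pmap', 'start' and 'end' are
 * hypothetical local names): callers typically zero-initialize the bookkeeping
 * fields, set the pmap and bounds, and perform a deferred TLB flush at the end
 * if the SPTM reported that a valid mapping was modified.
 *
 *     pmap_tlb_flush_range_t flush_range = {
 *         .ptfr_pmap = pmap,
 *         .ptfr_start = start,
 *         .ptfr_end = end,
 *         .ptfr_flush_needed = false,
 *     };
 *     // ...batched page-protect / fast-fault operations accumulate here...
 *     if (flush_range.ptfr_flush_needed) {
 *         flush_mmu_tlb_region_asid_async(start, end - start, pmap, true);
 *         sync_tlb_flush();
 *     }
 */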
440
441
442
443 /* Virtual memory region for early allocation */
444 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
445 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
446 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
447
448 extern uint8_t bootstrap_pagetables[];
449
450 extern unsigned int not_in_kdp;
451
452 extern vm_offset_t first_avail;
453
454 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
455 extern vm_offset_t virtual_space_end; /* End of kernel address space */
456 extern vm_offset_t static_memory_end;
457
458 extern const vm_map_address_t physmap_base;
459 extern const vm_map_address_t physmap_end;
460
461 extern int maxproc, hard_maxproc;
462
463 extern bool sdsb_io_rgns_present;
464
465 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
466 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
467
468 /* The number of address bits one TTBR can cover. */
469 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
470
471 /*
472 * The bounds on our TTBRs. These are for sanity checking that
473 * an address is accessible by a TTBR before we attempt to map it.
474 */
475
476 /* The level of the root of a page table. */
477 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
478
479 /* The number of entries in the root TT of a page table. */
480 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
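/*
 * Worked example (illustrative only, assuming T0SZ_BOOT = 25 and a 16K granule,
 * i.e. ARM_PGSHIFT = 14 and TTE_SHIFT = 3 for 8-byte table entries):
 *
 *     PGTABLE_ADDR_BITS           = 64 - 25 = 39
 *     bits resolved per level     = ARM_PGSHIFT - TTE_SHIFT = 11
 *     arm64_root_pgtable_level    = 3 - ((39 - 1 - 14) / 11) = 3 - 2 = 1
 *     arm64_root_pgtable_num_ttes = 2 << ((39 - 1 - 14) % 11) = 2 << 2 = 8
 *
 * That is, a 39-bit address space with 16K pages starts its walk at L1 with an
 * 8-entry root table.
 */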
481
482 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
483 const pmap_t kernel_pmap = &kernel_pmap_store;
484
485 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
486
487 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
488 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
489
490 typedef struct tt_free_entry {
491 struct tt_free_entry *next;
492 } tt_free_entry_t;
493
494 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
495 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
496 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
497 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
498 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
499 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
500 _Atomic unsigned int inuse_iommu_pages_count[SPTM_IOMMUS_N_IDS] = {0}; /* number of active pages for each IOMMU class */
501
502 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
503 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
504
505 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
506 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
507
508 /* Lock group used for all pmap object locks. */
509 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
510
511 #if DEVELOPMENT || DEBUG
512 int nx_enabled = 1; /* enable no-execute protection */
513 int allow_data_exec = 0; /* No apps may execute data */
514 int allow_stack_exec = 0; /* No apps may execute from the stack */
515 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
516 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
517 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
518 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
519 #else /* DEVELOPMENT || DEBUG */
520 const int nx_enabled = 1; /* enable no-execute protection */
521 const int allow_data_exec = 0; /* No apps may execute data */
522 const int allow_stack_exec = 0; /* No apps may execute from the stack */
523 #endif /* DEVELOPMENT || DEBUG */
524
525
526 #if MACH_ASSERT
527 static void pmap_check_ledgers(pmap_t pmap);
528 #else
529 static inline void
530 pmap_check_ledgers(__unused pmap_t pmap)
531 {
532 }
533 #endif /* MACH_ASSERT */
534
535 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
536
537 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
539
540 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
541
542 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
543
544 /* end of shared region + 512MB for various purposes */
545 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
546 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
547 "Minimum address space size outside allowable range");
548
549 // Max offset is 15.375GB for devices with "large" memory config
550 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
551 // Max offset is 11.375GB for devices with "small" memory config
552 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
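/*
 * Arithmetic check (illustrative; assumes SHARED_REGION_BASE_ARM64 = 0x180000000
 * and SHARED_REGION_SIZE_ARM64 = 0x100000000, i.e. 6GB + 4GB):
 *
 *     ARM64_MIN_MAX_ADDRESS         = 6GB + 4GB + 0.5GB              = 10.5GB
 *     ARM64_MAX_OFFSET_DEVICE_LARGE = 10.5GB + 0x138000000 (4.875GB) = 15.375GB
 *     ARM64_MAX_OFFSET_DEVICE_SMALL = 10.5GB + 0x38000000  (0.875GB) = 11.375GB
 *
 * which matches the "15.375GB" and "11.375GB" figures quoted above.
 */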
553
554
555 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
556 "Large device address space size outside allowable range");
557 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
558 "Small device address space size outside allowable range");
559
560 # ifdef XNU_TARGET_OS_OSX
561 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
562 # else
563 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
564 # endif
565
566 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
567 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
568 #else
569 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
570 #endif
571
572 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
573 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
574 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
575 #if !HAS_16BIT_ASID
576 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
577 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
578 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
579 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
580 #else
581 static uint16_t last_allocated_asid = 0;
582 #endif /* !HAS_16BIT_ASID */
583
584
585 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table;
586 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table;
587 #if __ARM_MIXED_PAGE_SIZE__
588 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table;
589 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table;
590 #endif
591 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_data_pa = 0;
592 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_text_pa = 0;
593 SECURITY_READ_ONLY_LATE(static vm_map_address_t) commpage_text_user_va = 0;
594 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_ro_data_pa = 0;
595
596
597 #if (DEVELOPMENT || DEBUG)
598 /* Caches whether the SPTM sysreg API has been enabled by the SPTM */
599 SECURITY_READ_ONLY_LATE(static bool) sptm_sysreg_available = false;
600 #endif /* (DEVELOPMENT || DEBUG) */
601
602 /* PTE Define Macros */
603
604 #ifndef SPTM_PTE_IN_FLIGHT_MARKER
605 /* SPTM TODO: Get rid of this once we export SPTM_PTE_IN_FLIGHT_MARKER from the SPTM. */
606 #define SPTM_PTE_IN_FLIGHT_MARKER 0x80U
607 #endif /* SPTM_PTE_IN_FLIGHT_MARKER */
608
609 /**
610 * Determine whether a PTE has been marked as compressed. This function also panics if
611 * the PTE contains bits that shouldn't be present in a compressed PTE, which is most of them.
612 *
613 * @param pte the PTE contents to check
614 * @param ptep the address of the PTE contents, for diagnostic purposes only
615 *
616 * @return true if the PTE is compressed, false otherwise
617 */
618 static inline bool
619 pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep)
620 {
621 const bool compressed = (!pte_is_valid(pte) && (pte & ARM_PTE_COMPRESSED));
622 /**
623 * Check for bits that shouldn't be present in a compressed PTE. This is everything except the
624 * compressed/compressed-alt bits, as well as the SPTM's in-flight marker which may be set while
625 * the SPTM is in the process of flushing the TLBs after marking a previously-valid PTE as
626 * compressed.
627 */
628 if (__improbable(compressed && (pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER)))) {
629 panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?",
630 ptep, pte, pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER));
631 }
632 return compressed;
633 }
634
635 #define pte_is_wired(pte) \
636 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
637
638 #define pte_was_writeable(pte) \
639 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
640
641 #define pte_set_was_writeable(pte, was_writeable) \
642 do { \
643 if ((was_writeable)) { \
644 (pte) |= ARM_PTE_WRITEABLE; \
645 } else { \
646 (pte) &= ~ARM_PTE_WRITEABLE; \
647 } \
648 } while(0)
649
650
651 /**
652  * Update wired-mapping accounting in the PTD and ledger.
653 *
654 * @param pmap The pmap against which to update accounting
655 * @param pte_p The PTE whose wired state is being changed
656 * @param wired Indicates whether the PTE is being wired or unwired.
657 */
658 static inline void
659 pte_update_wiredcnt(pmap_t pmap, pt_entry_t *pte_p, boolean_t wired)
660 {
661 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
662 unsigned short *ptd_wiredcnt_ptr = &(ptep_get_info(pte_p)->wiredcnt);
663 if (wired) {
664 if (__improbable(os_atomic_inc_orig(ptd_wiredcnt_ptr, relaxed) == UINT16_MAX)) {
665 panic("pmap %p (pte %p): wired count overflow", pmap, pte_p);
666 }
667 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
668 } else {
669 if (__improbable(os_atomic_dec_orig(ptd_wiredcnt_ptr, relaxed) == 0)) {
670 panic("pmap %p (pte %p): wired count underflow", pmap, pte_p);
671 }
672 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
673 }
674 }
675
676 /*
677 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
678 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
679 * will observe the updated PTE.
680 */
681 #define FLUSH_PTE() \
682 __builtin_arm_dmb(DMB_ISH);
683
684 /*
685 * Synchronize updates to PTEs that were previously valid and thus may be cached in
686 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
687 * TLBI. This should only require a store-store barrier, as subsequent accesses in
688 * program order will not issue until the DSB completes. Prior loads may be reordered
689 * after the barrier, but their behavior should not be materially affected by the
690 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
691 * matter for loads until the access is re-driven well after the TLB update is
692 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
693 * we should be in a position to handle access faults. For "voluntary" PTE access
694 * restriction due to unmapping or protection, the decision to restrict access should
695 * have a data dependency on prior loads in order to avoid a data race.
696 */
697 #define FLUSH_PTE_STRONG() \
698 __builtin_arm_dsb(DSB_ISHST);
699
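/*
 * Quick reference (illustrative, not from the original source) for how these
 * barriers pair with the PTE writers defined below:
 *
 *     write_pte_fast(ptep, pte);    // PTE write(s) only, no barrier
 *     write_pte(ptep, pte);         // PTE write(s) + FLUSH_PTE()        (DMB ISH)
 *     write_pte_strong(ptep, pte);  // PTE write(s) + FLUSH_PTE_STRONG() (DSB ISHST),
 *                                   // suitable when a TLBI immediately follows
 */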
700 /**
701 * Write enough page table entries to map a single VM page. On systems where the
702 * VM page size does not match the hardware page size, multiple page table
703 * entries will need to be written.
704 *
705 * @note This function does not emit a barrier to ensure these page table writes
706  *       have completed before continuing; such a barrier is commonly needed. In the
707  *       case where a DMB or DSB barrier is needed, use the write_pte() or
708  *       write_pte_strong() function, respectively, instead of this one.
709 *
710 * @param ptep Pointer to the first page table entry to update.
711 * @param pte The value to write into each page table entry. In the case that
712 * multiple PTEs are updated to a non-empty value, then the address
713 * in this value will automatically be incremented for each PTE
714 * write.
715 */
716 static void
717 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
718 {
719 /**
720 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
721 * systems, which is why it's checked at runtime instead of compile time.
722 * The "unreachable" warning needs to be suppressed because it still is a
723 * compile time constant on some systems.
724 */
725 __unreachable_ok_push
726 if (TEST_PAGE_RATIO_4) {
727 if (((uintptr_t)ptep) & 0x1f) {
728 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
729 __func__, ptep, (void*)pte);
730 }
731
732 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
733 /**
734 * If we're writing an empty/compressed PTE value, then don't
735 * auto-increment the address for each PTE write.
736 */
737 *ptep = pte;
738 *(ptep + 1) = pte;
739 *(ptep + 2) = pte;
740 *(ptep + 3) = pte;
741 } else {
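/*
 * Non-empty PTE: each of the four 4K hardware PTEs maps the next 4K chunk of
 * the 16K VM page, so the output address embedded in the PTE is stepped by
 * 0x1000 per entry.
 */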
742 *ptep = pte;
743 *(ptep + 1) = pte | 0x1000;
744 *(ptep + 2) = pte | 0x2000;
745 *(ptep + 3) = pte | 0x3000;
746 }
747 } else {
748 *ptep = pte;
749 }
750 __unreachable_ok_pop
751 }
752
753 /**
754 * Writes enough page table entries to map a single VM page and then ensures
755 * those writes complete by executing a Data Memory Barrier.
756 *
757 * @note The DMB issued by this function is not strong enough to protect against
758 * TLB invalidates from being reordered above the PTE writes. If a TLBI
759 * instruction is going to immediately be called after this write, it's
760 * recommended to call write_pte_strong() instead of this function.
761 *
762 * See the function header for write_pte_fast() for more details on the
763 * parameters.
764 */
765 void
766 write_pte(pt_entry_t *ptep, pt_entry_t pte)
767 {
768 write_pte_fast(ptep, pte);
769 FLUSH_PTE();
770 }
771
772 /**
773 * Retrieve the pmap structure for the thread running on the current CPU.
774 */
775 pmap_t
776 current_pmap()
777 {
778 const pmap_t current = vm_map_pmap(current_thread()->map);
779 assert(current != NULL);
780 return current;
781 }
782
783 #if DEVELOPMENT || DEBUG
784
785 /*
786 * Trace levels are controlled by a bitmask in which each
787 * level can be enabled/disabled by the (1<<level) position
788 * in the boot arg
789 * Level 0: PPL extension functionality
790 * Level 1: pmap lifecycle (create/destroy/switch)
791 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
792 * Level 3: internal state management (attributes/fast-fault)
793 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
794 */
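/*
 * Example (illustrative only): to trace pmap lifecycle (level 1) and mapping
 * lifecycle (level 2) events, boot with a mask of (1 << 1) | (1 << 2), i.e. the
 * boot-arg parsed in pmap_bootstrap():
 *
 *     pmap_trace=0x6
 */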
795
796 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
797
798 #define PMAP_TRACE(level, ...) \
799 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
800 KDBG_RELEASE(__VA_ARGS__); \
801 }
802 #else /* DEVELOPMENT || DEBUG */
803
804 #define PMAP_TRACE(level, ...)
805
806 #endif /* DEVELOPMENT || DEBUG */
807
808
809 /*
810 * Internal function prototypes (forward declarations).
811 */
812
813 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
814
815 static void pmap_set_reference(ppnum_t pn);
816
817 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
818
819 static kern_return_t pmap_expand(
820 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
821
822 static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t);
823
824 static tt_entry_t *pmap_tt1_allocate(pmap_t, uint8_t);
825
826 static void pmap_tt1_deallocate(pmap_t, tt_entry_t *);
827
828 static kern_return_t pmap_tt_allocate(
829 pmap_t, tt_entry_t **, unsigned int, unsigned int);
830
831 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
832 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
833 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
834
835 static void pmap_unmap_commpage(
836 pmap_t pmap);
837
838 static boolean_t
839 pmap_is_64bit(pmap_t);
840
841
842 static void pmap_flush_tlb_for_paddr_async(pmap_paddr_t);
843
844 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
845
846 static boolean_t arm_clear_fast_fault(
847 ppnum_t ppnum,
848 vm_prot_t fault_type,
849 uintptr_t pvh,
850 pt_entry_t *pte_p,
851 pp_attr_t attrs_to_clear);
852
853 static void pmap_trim_self(pmap_t pmap);
854 static void pmap_trim_subord(pmap_t subord);
855
856
857 /*
858 * Temporary prototypes, while we wait for pmap_enter to move to taking an
859 * address instead of a page number.
860 */
861 kern_return_t
862 pmap_enter(
863 pmap_t pmap,
864 vm_map_address_t v,
865 ppnum_t pn,
866 vm_prot_t prot,
867 vm_prot_t fault_type,
868 unsigned int flags,
869 boolean_t wired,
870 pmap_mapping_type_t mapping_type);
871
872 static kern_return_t
873 pmap_enter_addr(
874 pmap_t pmap,
875 vm_map_address_t v,
876 pmap_paddr_t pa,
877 vm_prot_t prot,
878 vm_prot_t fault_type,
879 unsigned int flags,
880 boolean_t wired,
881 pmap_mapping_type_t mapping_type);
882
883 kern_return_t
884 pmap_enter_options_addr(
885 pmap_t pmap,
886 vm_map_address_t v,
887 pmap_paddr_t pa,
888 vm_prot_t prot,
889 vm_prot_t fault_type,
890 unsigned int flags,
891 boolean_t wired,
892 unsigned int options,
893 __unused void *arg,
894 pmap_mapping_type_t mapping_type);
895
896 #ifdef CONFIG_XNUPOST
897 kern_return_t pmap_test(void);
898 #endif /* CONFIG_XNUPOST */
899
900 PMAP_SUPPORT_PROTOTYPES(
901 kern_return_t,
902 arm_fast_fault, (pmap_t pmap,
903 vm_map_address_t va,
904 vm_prot_t fault_type,
905 bool was_af_fault,
906 bool from_user), ARM_FAST_FAULT_INDEX);
907
908 PMAP_SUPPORT_PROTOTYPES(
909 boolean_t,
910 arm_force_fast_fault, (ppnum_t ppnum,
911 vm_prot_t allow_mode,
912 int options), ARM_FORCE_FAST_FAULT_INDEX);
913
914 MARK_AS_PMAP_TEXT static boolean_t
915 arm_force_fast_fault_with_flush_range(
916 ppnum_t ppnum,
917 vm_prot_t allow_mode,
918 int options,
919 locked_pvh_t *locked_pvh,
920 pp_attr_t bits_to_clear,
921 pmap_tlb_flush_range_t *flush_range);
922
923 PMAP_SUPPORT_PROTOTYPES(
924 void,
925 pmap_batch_set_cache_attributes, (
926 const unified_page_list_t * page_list,
927 unsigned int cacheattr,
928 bool update_attr_table), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
929
930 PMAP_SUPPORT_PROTOTYPES(
931 void,
932 pmap_change_wiring, (pmap_t pmap,
933 vm_map_address_t v,
934 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
935
936 PMAP_SUPPORT_PROTOTYPES(
937 pmap_t,
938 pmap_create_options, (ledger_t ledger,
939 vm_map_size_t size,
940 unsigned int flags,
941 kern_return_t * kr), PMAP_CREATE_INDEX);
942
943 PMAP_SUPPORT_PROTOTYPES(
944 void,
945 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
946
947 PMAP_SUPPORT_PROTOTYPES(
948 kern_return_t,
949 pmap_enter_options, (pmap_t pmap,
950 vm_map_address_t v,
951 pmap_paddr_t pa,
952 vm_prot_t prot,
953 vm_prot_t fault_type,
954 unsigned int flags,
955 boolean_t wired,
956 unsigned int options,
957 pmap_mapping_type_t mapping_type), PMAP_ENTER_OPTIONS_INDEX);
958
959 PMAP_SUPPORT_PROTOTYPES(
960 pmap_paddr_t,
961 pmap_find_pa, (pmap_t pmap,
962 addr64_t va), PMAP_FIND_PA_INDEX);
963
964 PMAP_SUPPORT_PROTOTYPES(
965 kern_return_t,
966 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
967
968
969 PMAP_SUPPORT_PROTOTYPES(
970 boolean_t,
971 pmap_is_empty, (pmap_t pmap,
972 vm_map_offset_t va_start,
973 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
974
975
976 PMAP_SUPPORT_PROTOTYPES(
977 unsigned int,
978 pmap_map_cpu_windows_copy, (ppnum_t pn,
979 vm_prot_t prot,
980 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
981
982 PMAP_SUPPORT_PROTOTYPES(
983 void,
984 pmap_ro_zone_memcpy, (zone_id_t zid,
985 vm_offset_t va,
986 vm_offset_t offset,
987 const vm_offset_t new_data,
988 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
989
990 PMAP_SUPPORT_PROTOTYPES(
991 uint64_t,
992 pmap_ro_zone_atomic_op, (zone_id_t zid,
993 vm_offset_t va,
994 vm_offset_t offset,
995 zro_atomic_op_t op,
996 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
997
998 PMAP_SUPPORT_PROTOTYPES(
999 void,
1000 pmap_ro_zone_bzero, (zone_id_t zid,
1001 vm_offset_t va,
1002 vm_offset_t offset,
1003 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1004
1005 PMAP_SUPPORT_PROTOTYPES(
1006 kern_return_t,
1007 pmap_nest, (pmap_t grand,
1008 pmap_t subord,
1009 addr64_t vstart,
1010 uint64_t size), PMAP_NEST_INDEX);
1011
1012 PMAP_SUPPORT_PROTOTYPES(
1013 void,
1014 pmap_page_protect_options, (ppnum_t ppnum,
1015 vm_prot_t prot,
1016 unsigned int options,
1017 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1018
1019 PMAP_SUPPORT_PROTOTYPES(
1020 vm_map_address_t,
1021 pmap_protect_options, (pmap_t pmap,
1022 vm_map_address_t start,
1023 vm_map_address_t end,
1024 vm_prot_t prot,
1025 unsigned int options,
1026 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1027
1028 PMAP_SUPPORT_PROTOTYPES(
1029 kern_return_t,
1030 pmap_query_page_info, (pmap_t pmap,
1031 vm_map_offset_t va,
1032 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1033
1034 PMAP_SUPPORT_PROTOTYPES(
1035 mach_vm_size_t,
1036 pmap_query_resident, (pmap_t pmap,
1037 vm_map_address_t start,
1038 vm_map_address_t end,
1039 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1040
1041 PMAP_SUPPORT_PROTOTYPES(
1042 void,
1043 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1044
1045 PMAP_SUPPORT_PROTOTYPES(
1046 vm_map_address_t,
1047 pmap_remove_options, (pmap_t pmap,
1048 vm_map_address_t start,
1049 vm_map_address_t end,
1050 int options), PMAP_REMOVE_OPTIONS_INDEX);
1051
1052
1053 PMAP_SUPPORT_PROTOTYPES(
1054 void,
1055 pmap_set_cache_attributes, (ppnum_t pn,
1056 unsigned int cacheattr,
1057 bool update_attr_table), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1058
1059 PMAP_SUPPORT_PROTOTYPES(
1060 void,
1061 pmap_update_compressor_page, (ppnum_t pn,
1062 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1063
1064 PMAP_SUPPORT_PROTOTYPES(
1065 void,
1066 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1067
1068 #if MACH_ASSERT
1069 PMAP_SUPPORT_PROTOTYPES(
1070 void,
1071 pmap_set_process, (pmap_t pmap,
1072 int pid,
1073 char *procname), PMAP_SET_PROCESS_INDEX);
1074 #endif
1075
1076 PMAP_SUPPORT_PROTOTYPES(
1077 void,
1078 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 void,
1082 pmap_unnest_options, (pmap_t grand,
1083 addr64_t vaddr,
1084 uint64_t size,
1085 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1086
1087 PMAP_SUPPORT_PROTOTYPES(
1088 void,
1089 phys_attribute_set, (ppnum_t pn,
1090 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1091
1092 PMAP_SUPPORT_PROTOTYPES(
1093 void,
1094 phys_attribute_clear, (ppnum_t pn,
1095 unsigned int bits,
1096 int options,
1097 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1098
1099 #if __ARM_RANGE_TLBI__
1100 PMAP_SUPPORT_PROTOTYPES(
1101 vm_map_address_t,
1102 phys_attribute_clear_range, (pmap_t pmap,
1103 vm_map_address_t start,
1104 vm_map_address_t end,
1105 unsigned int bits,
1106 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1107 #endif /* __ARM_RANGE_TLBI__ */
1108
1109
1110 PMAP_SUPPORT_PROTOTYPES(
1111 void,
1112 pmap_switch, (pmap_t pmap, thread_t thread), PMAP_SWITCH_INDEX);
1113
1114 PMAP_SUPPORT_PROTOTYPES(
1115 void,
1116 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1117
1118 PMAP_SUPPORT_PROTOTYPES(
1119 void,
1120 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1121
1122 PMAP_SUPPORT_PROTOTYPES(
1123 void,
1124 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1125
1126 PMAP_SUPPORT_PROTOTYPES(
1127 void,
1128 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1129
1130 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1131 PMAP_SUPPORT_PROTOTYPES(
1132 void,
1133 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1134 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1135
1136 PMAP_SUPPORT_PROTOTYPES(
1137 void,
1138 pmap_trim, (pmap_t grand,
1139 pmap_t subord,
1140 addr64_t vstart,
1141 uint64_t size), PMAP_TRIM_INDEX);
1142
1143 #if HAS_APPLE_PAC
1144 PMAP_SUPPORT_PROTOTYPES(
1145 void *,
1146 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1147 PMAP_SUPPORT_PROTOTYPES(
1148 void *,
1149 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1150 #endif /* HAS_APPLE_PAC */
1151
1152
1153 void pmap_footprint_suspend(vm_map_t map,
1154 boolean_t suspend);
1155 PMAP_SUPPORT_PROTOTYPES(
1156 void,
1157 pmap_footprint_suspend, (vm_map_t map,
1158 boolean_t suspend),
1159 PMAP_FOOTPRINT_SUSPEND_INDEX);
1160
1161
1162
1163
1164
1165 /*
1166 * The low global vector page is mapped at a fixed alias.
1167  * Since the page size is 16k for H8 and newer, we map the globals to a 16k
1168 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1169 * to check both addresses anyway for backward compatibility. So for now
1170 * we leave H6 and H7 where they were.
1171 */
1172 #if (ARM_PGSHIFT == 14)
1173 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1174 #else
1175 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1176 #endif
1177
1178 static inline void
1179 PMAP_ZINFO_PALLOC(
1180 pmap_t pmap, int bytes)
1181 {
1182 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1183 }
1184
1185 static inline void
1186 PMAP_ZINFO_PFREE(
1187 pmap_t pmap,
1188 int bytes)
1189 {
1190 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1191 }
1192
1193 void
1194 pmap_tt_ledger_credit(
1195 pmap_t pmap,
1196 vm_size_t size)
1197 {
1198 if (pmap != kernel_pmap) {
1199 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1200 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1201 }
1202 }
1203
1204 void
1205 pmap_tt_ledger_debit(
1206 pmap_t pmap,
1207 vm_size_t size)
1208 {
1209 if (pmap != kernel_pmap) {
1210 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1211 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1212 }
1213 }
1214
1215 static inline void
1216 pmap_update_plru(uint16_t asid_index __unused)
1217 {
1218 #if !HAS_16BIT_ASID
1219 if (__probable(pmap_asid_plru)) {
1220 unsigned plru_index = asid_index >> 6;
1221 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1222 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1223 asid_plru_bitmap[plru_index] = ((plru_index == 0) ? ~1ULL : UINT64_MAX);
1224 }
1225 }
1226 #endif /* !HAS_16BIT_ASID */
1227 }
1228
1229 static bool
1230 alloc_asid(pmap_t pmap)
1231 {
1232 int vasid = -1;
1233
1234 pmap_simple_lock(&asid_lock);
1235
1236 #if !HAS_16BIT_ASID
1237 if (__probable(pmap_asid_plru)) {
1238 unsigned plru_index = 0;
1239 uint64_t lowest_gen = asid_plru_generation[0];
1240 uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1241 for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1242 if (asid_plru_generation[i] < lowest_gen) {
1243 plru_index = i;
1244 lowest_gen = asid_plru_generation[i];
1245 lowest_gen_bitmap = asid_plru_bitmap[i];
1246 }
1247 }
1248
1249 for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += (MAX_HW_ASIDS >> 6)) {
1250 uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1251 if (temp_plru) {
1252 vasid = (plru_index << 6) + lsb_first(temp_plru);
1253 #if DEVELOPMENT || DEBUG
1254 ++pmap_asid_hits;
1255 #endif
1256 break;
1257 }
1258 }
1259 }
1260 #else
1261 /**
1262 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
1263 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
1264 * However, we first try to allocate starting from the position of the most-recently allocated
1265 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
1266 * lower bit positions and then re-checking those same lower positions every time we allocate
1267 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
1268 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
1269 * logic, without requiring prohibitively expensive RCTX instructions.
1270 */
1271 vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
1272 #endif /* !HAS_16BIT_ASID */
1273 if (__improbable(vasid < 0)) {
1274 // bitmap_first() returns highest-order bits first, but a 0-based scheme works
1275 // slightly better with the collision detection scheme used by pmap_switch_internal().
1276 vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1277 #if DEVELOPMENT || DEBUG
1278 ++pmap_asid_misses;
1279 #endif
1280 }
1281 if (__improbable(vasid < 0)) {
1282 pmap_simple_unlock(&asid_lock);
1283 return false;
1284 }
1285 assert((uint32_t)vasid < pmap_max_asids);
1286 assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1287 bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1288 const uint16_t hw_asid = (uint16_t)(vasid & (MAX_HW_ASIDS - 1));
1289 #if HAS_16BIT_ASID
1290 last_allocated_asid = hw_asid;
1291 #endif /* HAS_16BIT_ASID */
1292 pmap_simple_unlock(&asid_lock);
1293 assert(hw_asid != 0); // Should never alias kernel ASID
1294 pmap->asid = (uint16_t)vasid;
1295 pmap_update_plru(hw_asid);
1296 return true;
1297 }
1298
1299 static void
1300 free_asid(pmap_t pmap)
1301 {
1302 const uint16_t vasid = os_atomic_xchg(&pmap->asid, 0, relaxed);
1303 if (__improbable(vasid == 0)) {
1304 return;
1305 }
1306
1307 #if !HAS_16BIT_ASID
1308 if (pmap_asid_plru) {
1309 const uint16_t hw_asid = vasid & (MAX_HW_ASIDS - 1);
1310 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1311 }
1312 #endif /* !HAS_16BIT_ASID */
1313 pmap_simple_lock(&asid_lock);
1314 assert(!bitmap_test(&asid_bitmap[0], vasid));
1315 bitmap_set(&asid_bitmap[0], vasid);
1316 pmap_simple_unlock(&asid_lock);
1317 }
1318
1319
1320 boolean_t
1321 pmap_valid_address(
1322 pmap_paddr_t addr)
1323 {
1324 return pa_valid(addr);
1325 }
1326
1327
1328
1329
1330
1331
1332 /*
1333 * Map memory at initialization. The physical addresses being
1334 * mapped are not managed and are never unmapped.
1335 *
1336 * For now, VM is already on, we only need to map the
1337 * specified memory.
1338 */
1339 vm_map_address_t
1340 pmap_map(
1341 vm_map_address_t virt,
1342 vm_offset_t start,
1343 vm_offset_t end,
1344 vm_prot_t prot,
1345 unsigned int flags)
1346 {
1347 kern_return_t kr;
1348 vm_size_t ps;
1349
1350 ps = PAGE_SIZE;
1351 while (start < end) {
1352 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1353 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1354
1355 if (kr != KERN_SUCCESS) {
1356 panic("%s: failed pmap_enter, "
1357 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1358 __FUNCTION__,
1359 (void *) virt, (void *) start, (void *) end, prot, flags);
1360 }
1361
1362 virt += ps;
1363 start += ps;
1364 }
1365
1366
1367 return virt;
1368 }
1369
1370 /**
1371 * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type.
1372 *
1373 * @param paddr The physical address of the page.
1374 * @param tmplate The PTE value to be evaluated.
1375 *
1376 * @return A new PTE value with permission bits modified.
1377 */
1378 static inline
1379 pt_entry_t
1380 pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate)
1381 {
1382 /**
1383 * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade
1384 * the mapping to RO. This is required because IOKit relies on this
1385 * behavior currently in the PPL.
1386 */
1387 const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
1388 if (frame_type == XNU_PROTECTED_IO) {
1389 		/* Downgrade the requested KERN_RW permission to KERN_RO for this SPTM-owned protected I/O frame. */
1390 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1391 switch (xprr_perm) {
1392 case XPRR_KERN_RO_PERM:
1393 break;
1394 case XPRR_KERN_RW_PERM:
1395 tmplate &= ~ARM_PTE_XPRR_MASK;
1396 tmplate |= xprr_perm_to_pte(XPRR_KERN_RO_PERM);
1397 break;
1398 default:
1399 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1400 }
1401 }
1402
1403 return tmplate;
1404 }
1405
1406 vm_map_address_t
1407 pmap_map_bd_with_options(
1408 vm_map_address_t virt,
1409 vm_offset_t start,
1410 vm_offset_t end,
1411 vm_prot_t prot,
1412 int32_t options)
1413 {
1414 pt_entry_t tmplate;
1415 vm_map_address_t vaddr;
1416 vm_offset_t paddr;
1417 pt_entry_t mem_attr;
1418
1419 switch (options & PMAP_MAP_BD_MASK) {
1420 case PMAP_MAP_BD_WCOMB:
1421 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1422 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1423 break;
1424 case PMAP_MAP_BD_POSTED:
1425 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1426 break;
1427 case PMAP_MAP_BD_POSTED_REORDERED:
1428 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1429 break;
1430 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1431 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1432 break;
1433 default:
1434 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1435 break;
1436 }
1437
1438 tmplate = ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1439 mem_attr | ARM_PTE_TYPE_VALID | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1440
1441 #if __ARM_KERNEL_PROTECT__
1442 tmplate |= ARM_PTE_NG;
1443 #endif /* __ARM_KERNEL_PROTECT__ */
1444
1445 vaddr = virt;
1446 paddr = start;
1447 while (paddr < end) {
1448 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, vaddr, pmap_force_pte_kernel_ro_if_protected_io(paddr, tmplate) | pa_to_pte(paddr));
1449 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1450
1451 vaddr += PAGE_SIZE;
1452 paddr += PAGE_SIZE;
1453 }
1454
1455 return vaddr;
1456 }
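/*
 * Illustrative usage (hypothetical names 'virt_base', 'dev_pa_start' and
 * 'dev_pa_end'): map a device register range write-combined during early boot,
 * before kernel_map is available:
 *
 *     vm_map_address_t next_va = pmap_map_bd_with_options(virt_base,
 *         dev_pa_start, dev_pa_end, VM_PROT_READ | VM_PROT_WRITE,
 *         PMAP_MAP_BD_WCOMB);
 */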
1457
1458 /*
1459 * Back-door routine for mapping kernel VM at initialization.
1460 * Useful for mapping memory outside the range
1461 * [vm_first_phys, vm_last_phys] (i.e., devices).
1462 * Otherwise like pmap_map.
1463 */
1464 vm_map_address_t
1465 pmap_map_bd(
1466 vm_map_address_t virt,
1467 vm_offset_t start,
1468 vm_offset_t end,
1469 vm_prot_t prot)
1470 {
1471 return pmap_map_bd_with_options(virt, start, end, prot, 0);
1472 }
1473
1474 /*
1475 * Back-door routine for mapping kernel VM at initialization.
1476 * Useful for mapping memory specific physical addresses in early
1477 * boot (i.e., before kernel_map is initialized).
1478 *
1479 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1480 */
1481
1482 vm_map_address_t
1483 pmap_map_high_window_bd(
1484 vm_offset_t pa_start,
1485 vm_size_t len,
1486 vm_prot_t prot)
1487 {
1488 pt_entry_t *ptep, pte;
1489 vm_map_address_t va_start = VREGION1_START;
1490 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1491 vm_map_address_t va_end;
1492 vm_map_address_t va;
1493 vm_size_t offset;
1494
1495 offset = pa_start & PAGE_MASK;
1496 pa_start -= offset;
1497 len += offset;
1498
1499 if (len > (va_max - va_start)) {
1500 panic("%s: area too large, "
1501 "pa_start=%p, len=%p, prot=0x%x",
1502 __FUNCTION__,
1503 (void*)pa_start, (void*)len, prot);
1504 }
1505
1506 scan:
1507 for (; va_start < va_max; va_start += PAGE_SIZE) {
1508 ptep = pmap_pte(kernel_pmap, va_start);
1509 assert(!pte_is_compressed(*ptep, ptep));
1510 if (!pte_is_valid(*ptep)) {
1511 break;
1512 }
1513 }
1514 if (va_start > va_max) {
1515 panic("%s: insufficient pages, "
1516 "pa_start=%p, len=%p, prot=0x%x",
1517 __FUNCTION__,
1518 (void*)pa_start, (void*)len, prot);
1519 }
1520
1521 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1522 ptep = pmap_pte(kernel_pmap, va_end);
1523 assert(!pte_is_compressed(*ptep, ptep));
1524 if (pte_is_valid(*ptep)) {
1525 va_start = va_end + PAGE_SIZE;
1526 goto scan;
1527 }
1528 }
1529
1530 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1531 ptep = pmap_pte(kernel_pmap, va);
1532 pte = pa_to_pte(pa_start)
1533 | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1534 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1535 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT)
1536 | ARM_PTE_SH(SH_OUTER_MEMORY);
1537 #if __ARM_KERNEL_PROTECT__
1538 pte |= ARM_PTE_NG;
1539 #endif /* __ARM_KERNEL_PROTECT__ */
1540 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, va, pte);
1541 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1542 }
1543 #if KASAN
1544 kasan_notify_address(va_start, len);
1545 #endif
1546 return va_start;
1547 }
1548
1549 /*
1550 * pmap_get_arm64_prot
1551 *
1552 * return effective armv8 VMSA block protections including
1553 * table AP/PXN/XN overrides of a pmap entry
1554 *
1555 */
1556
1557 uint64_t
1558 pmap_get_arm64_prot(
1559 pmap_t pmap,
1560 vm_offset_t addr)
1561 {
1562 tt_entry_t tte = 0;
1563 unsigned int level = 0;
1564 uint64_t effective_prot_bits = 0;
1565 uint64_t aggregate_tte = 0;
1566 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1567 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1568
1569 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1570 tte = *pmap_ttne(pmap, level, addr);
1571
1572 if (!(tte & ARM_TTE_VALID)) {
1573 return 0;
1574 }
1575
1576 if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
1577 /* Block or page mapping; both have the same protection bit layout. */
1578 break;
1579 } else if (tte_is_table(tte)) {
1580 /* All of the table bits we care about are overrides, so just OR them together. */
1581 aggregate_tte |= tte;
1582 }
1583 }
1584
1585 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1586 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1587 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1588
1589 /* Start with the PTE bits. */
1590 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1591
1592 /* Table AP bits mask out block/page AP bits */
1593 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1594
1595 /* XN/PXN bits can be OR'd in. */
1596 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1597 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1598
1599 return effective_prot_bits;
1600 }
1601
1602 /*
1603 * Bootstrap the system enough to run with virtual memory.
1604 *
1605 * The early VM initialization code has already allocated
1606 * the first CPU's translation table and made entries for
1607 * all the one-to-one mappings to be found there.
1608 *
1609 * We must set up the kernel pmap structures, the
1610 * physical-to-virtual translation lookup tables for the
1611 * physical memory to be managed (between avail_start and
1612 * avail_end).
1613 *
1614 * Map the kernel's code and data, and allocate the system page table.
1615 * Page_size must already be set.
1616 *
1617 * Parameters:
1618 * first_avail first available physical page -
1619 * after kernel page tables
1620 * avail_start PA of first managed physical page
1621 * avail_end PA of last managed physical page
1622 */
1623
1624 void
1625 pmap_bootstrap(
1626 vm_offset_t vstart)
1627 {
1628 vm_map_offset_t maxoffset;
1629
1630 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
1631
1632 #if DEVELOPMENT || DEBUG
1633 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
1634 kprintf("Kernel traces for pmap operations enabled\n");
1635 }
1636 #endif
1637
1638 /*
1639 * Initialize the kernel pmap.
1640 */
1641 #if ARM_PARAMETERIZED_PMAP
1642 kernel_pmap->pmap_pt_attr = native_pt_attr;
1643 #endif /* ARM_PARAMETERIZED_PMAP */
1644 #if HAS_APPLE_PAC
1645 kernel_pmap->disable_jop = 0;
1646 #endif /* HAS_APPLE_PAC */
1647 kernel_pmap->tte = cpu_tte;
1648 kernel_pmap->ttep = cpu_ttep;
1649 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
1650 kernel_pmap->max = UINTPTR_MAX;
1651 os_ref_init_count_raw(&kernel_pmap->ref_count, &pmap_refgrp, 1);
1652 kernel_pmap->nx_enabled = TRUE;
1653 kernel_pmap->is_64bit = TRUE;
1654 #if CONFIG_ROSETTA
1655 kernel_pmap->is_rosetta = FALSE;
1656 #endif
1657
1658 #if ARM_PARAMETERIZED_PMAP
1659 kernel_pmap->pmap_pt_attr = native_pt_attr;
1660 #endif /* ARM_PARAMETERIZED_PMAP */
1661
1662 kernel_pmap->nested_region_addr = 0x0ULL;
1663 kernel_pmap->nested_region_size = 0x0ULL;
1664 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
1665 kernel_pmap->type = PMAP_TYPE_KERNEL;
1666
1667 kernel_pmap->asid = 0;
1668
1669 pmap_lock_init(kernel_pmap);
1670
1671 pmap_max_asids = SPTMArgs->num_asids;
1672
1673 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
1674
1675 /**
1676 * Bootstrap the core pmap data structures (e.g., pv_head_table,
1677 * pp_attr_table, etc). This function will use `avail_start` to allocate
1678 * space for these data structures.
1679 */
1680 pmap_data_bootstrap();
1681
1682 /**
1683 * Bootstrap any necessary UAT data structures and values needed from the device tree.
1684 */
1685 uat_bootstrap();
1686
1687 /**
1688 * Don't make any assumptions about the alignment of avail_start before this
1689 * point (i.e., pmap_data_bootstrap() performs allocations).
1690 */
1691 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
1692
1693 const pmap_paddr_t pmap_struct_start = avail_start;
1694
1695 asid_bitmap = (bitmap_t*)phystokv(avail_start);
1696 avail_start = round_page(avail_start + asid_table_size);
1697
1698 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
1699
1700 queue_init(&map_pmap_list);
1701 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
1702
1703 virtual_space_start = vstart;
1704 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
1705
1706 bitmap_full(&asid_bitmap[0], pmap_max_asids);
1707 /* Clear the ASIDs which will alias the reserved kernel ASID of 0. */
1708 for (unsigned int i = 0; i < pmap_max_asids; i += MAX_HW_ASIDS) {
1709 bitmap_clear(&asid_bitmap[0], i);
1710 }
1711
1712
1713 #if !HAS_16BIT_ASID
1714 /**
1715 * Align the range of available hardware ASIDs to a multiple of 64 to enable the
1716 * masking used by the PLRU scheme. This means we must handle the case in which
1717 * the returned hardware ASID is 0, which we do by clearing all vASIDs that will
1718 * alias the kernel ASID.
1719 */
1720 pmap_max_asids = pmap_max_asids & ~63ul;
1721 if (__improbable(pmap_max_asids == 0)) {
1722 panic("%s: insufficient number of ASIDs (%u) supplied by SPTM", __func__, (unsigned int)pmap_max_asids);
1723 }
1724 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
1725 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
1726 _Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
1727 _Static_assert((MAX_HW_ASIDS % 64) == 0, "MAX_HW_ASIDS is not divisible by 64");
1728 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
1729 bitmap_clear(&asid_plru_bitmap[0], 0);
1730 #endif /* !HAS_16BIT_ASID */
1731
1732
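/*
 * Hypothetical example: booting with "arm64_maxoffset=0xfc0000000" lowers the
 * default maximum user VA for 64-bit processes, provided the page-truncated
 * value falls within the [ARM_PMAP_MAX_OFFSET_MIN, ARM_PMAP_MAX_OFFSET_MAX]
 * bounds queried below; "arm_maxoffset" does the same for 32-bit processes.
 * Out-of-range values are silently ignored.
 */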
1733 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
1734 maxoffset = trunc_page(maxoffset);
1735 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
1736 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
1737 arm_pmap_max_offset_default = maxoffset;
1738 }
1739 }
1740 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
1741 maxoffset = trunc_page(maxoffset);
1742 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
1743 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
1744 arm64_pmap_max_offset_default = maxoffset;
1745 }
1746 }
1747
1748 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
1749
1750
1751 #if DEVELOPMENT || DEBUG
1752 PE_parse_boot_argn("vm_footprint_suspend_allowed",
1753 &vm_footprint_suspend_allowed,
1754 sizeof(vm_footprint_suspend_allowed));
1755 #endif /* DEVELOPMENT || DEBUG */
1756
1757 #if KASAN
1758 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
1759 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
1760 #endif /* KASAN */
1761
1762 /**
1763 * Ensure that avail_start is always left on a page boundary. The calling
1764 * code might not perform any alignment before allocating page tables so
1765 * this is important.
1766 */
1767 avail_start = round_page(avail_start);
1768
1769
1770 #if (DEVELOPMENT || DEBUG)
1771 (void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available);
1772 #endif /* (DEVELOPMENT || DEBUG) */
1773
1774 /* Signal that the pmap has been bootstrapped */
1775 pmap_bootstrapped = true;
1776 }
1777
1778 /**
1779 * Helper for creating a populated commpage table
1780 *
1781 * In order to avoid burning extra pages on mapping the commpage, we create a
1782 * dedicated table hierarchy for the commpage. We forcibly nest the translation tables from
1783 * this pmap into other pmaps. The level we will nest at depends on the MMU configuration (page
1784 * size, TTBR range, etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
1785 *
1786 * @note This is NOT "the nested pmap" (which is used to nest the shared cache).
1787 *
1788 * @param rw_va Virtual address at which to insert a mapping to the kernel R/W commpage
1789 * @param ro_va Virtual address at which to insert a mapping to the kernel R/O commpage
1790 * @param rw_pa Physical address of kernel R/W commpage
1791 * @param ro_pa Physical address of kernel R/O commpage, may be 0 if not supported in this
1792 * configuration
1793 * @param rx_pa Physical address of user executable (and kernel R/O) commpage, may be 0 if
1794 * not supported in this configuration
1795 * @param pmap_create_flags Control flags for the temporary pmap created by this function
1796 *
1797 * @return the physical address of the created commpage table, typed as
1798 * XNU_PAGE_TABLE_COMMPAGE and containing all relevant commpage mappings.
1799 */
1800 static pmap_paddr_t
1801 pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va,
1802 pmap_paddr_t rw_pa, pmap_paddr_t ro_pa, pmap_paddr_t rx_pa, unsigned int pmap_create_flags)
1803 {
1804 pmap_t temp_commpage_pmap = pmap_create_options(NULL, 0, pmap_create_flags);
1805 assert(temp_commpage_pmap != NULL);
1806 assert(rw_pa != 0);
1807 const pt_attr_t *pt_attr = pmap_get_pt_attr(temp_commpage_pmap);
1808
1809 /*
1810 * We only use pmap_expand to expand the pmap up to the commpage nesting level. At that level
1811 * and beyond, all the newly created tables will be nested directly into the userspace region
1812 * for each process, and as such they must be of the dedicated SPTM commpage table type so that
1813 * the SPTM can enforce the commpage security model which forbids random replacement of commpage
1814 * mappings.
1815 */
1816 kern_return_t kr = pmap_expand(temp_commpage_pmap, rw_va, 0, pt_attr_commpage_level(pt_attr));
1817 assert(kr == KERN_SUCCESS);
1818
1819 pmap_paddr_t commpage_table_pa = 0;
1820 for (unsigned int i = pt_attr_commpage_level(pt_attr); i < pt_attr_leaf_level(pt_attr); i++) {
1821 pmap_paddr_t new_table = 0;
1822 kr = pmap_page_alloc(&new_table, 0);
1823 assert((kr == KERN_SUCCESS) && (new_table != 0));
1824 if (commpage_table_pa == 0) {
1825 commpage_table_pa = new_table;
1826 }
1827
1828 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1829 retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
1830 sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params);
1831
1832 const sptm_tte_t table_tte = (new_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1833
1834 sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va),
1835 (sptm_pt_level_t)i, table_tte);
1836 }
1837
1838 /*
1839 * Note the lack of ARM_PTE_NG here: commpage mappings are at fixed addresses and
1840 * frequently accessed, so we map them global to avoid unnecessary TLB pressure.
1841 */
1842 static const sptm_pte_t commpage_pte_template = ARM_PTE_TYPE_VALID
1843 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK)
1844 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX
1845 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF;
1846
1847 sptm_return_t sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, rw_va,
1848 commpage_pte_template | ARM_PTE_NX | pa_to_pte(rw_pa));
1849 assert(sptm_ret == SPTM_SUCCESS);
1850
1851 if (ro_pa != 0) {
1852 assert((ro_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1853 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, ro_va,
1854 commpage_pte_template | ARM_PTE_NX | pa_to_pte(ro_pa));
1855 assert(sptm_ret == SPTM_SUCCESS);
1856 }
1857
1858 if (rx_pa != 0) {
1859 assert((commpage_text_user_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1860 assert((commpage_text_user_va != rw_va) && (commpage_text_user_va != ro_va));
1861 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, commpage_text_user_va, commpage_pte_template | pa_to_pte(rx_pa));
1862 assert(sptm_ret == SPTM_SUCCESS);
1863 }
1864
1865
1866 sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va),
1867 (sptm_pt_level_t)pt_attr_commpage_level(pt_attr));
1868 pmap_destroy(temp_commpage_pmap);
1869
1870 return commpage_table_pa;
1871 }
1872
1873 /**
1874 * Helper for creating all commpage tables applicable to the current configuration.
1875 *
1876 * @note This function is intended to be called during bootstrap.
1877 * @note This function assumes that pmap_create_commpages has already executed, and therefore
1878 * the commpage_*_pa variables have been assigned to their final values. commpage_data_pa
1879 * is the kernel RW commpage and is assumed to be present on all configurations, so it
1880 * therefore must be non-zero at this point. The other variables are considered optional
1881 * depending upon configuration and may be zero.
1882 */
1883 void pmap_prepare_commpages(void);
1884 void
1885 pmap_prepare_commpages(void)
1886 {
1887 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1888 assert(commpage_data_pa != 0);
1889 sptm_retype(commpage_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RW, retype_params);
1890 if (commpage_ro_data_pa != 0) {
1891 sptm_retype(commpage_ro_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RO, retype_params);
1892 }
1893 if (commpage_text_pa != 0) {
1894 sptm_retype(commpage_text_pa, XNU_DEFAULT, XNU_COMMPAGE_RX, retype_params);
1895 }
1896
1897 /*
1898 * User mapping of the commpage text section is provided for the 64-bit mapping only.
1899 *
1900 * We don't insert the text commpage into the 32-bit mapping because we don't want
1901 * 32-bit user processes to get this page mapped in; they should never call into
1902 * this page.
1903 */
1904 commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
1905 commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, 0);
1906
1907 /*
1908 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
1909 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
1910 *
1911 * commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
1912 * commpage_data_pa, commpage_ro_data_pa, 0, 0);
1913 */
1914 #if __ARM_MIXED_PAGE_SIZE__
1915 commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
1916 commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
1917
1918 /*
1919 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
1920 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
1921 * commpage32_4k_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
1922 * commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
1923 */
1924 #endif /* __ARM_MIXED_PAGE_SIZE__ */
1925
1926 }
1927
1928 void
1929 pmap_virtual_space(
1930 vm_offset_t *startp,
1931 vm_offset_t *endp
1932 )
1933 {
1934 *startp = virtual_space_start;
1935 *endp = virtual_space_end;
1936 }
1937
1938
1939 boolean_t
1940 pmap_virtual_region(
1941 unsigned int region_select,
1942 vm_map_offset_t *startp,
1943 vm_map_size_t *size
1944 )
1945 {
1946 boolean_t ret = FALSE;
1947 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
1948 if (region_select == 0) {
1949 /*
1950 * In this config, the bootstrap mappings should occupy their own L2
1951 * TTs, as they should be immutable after boot. Having the associated
1952 * TTEs and PTEs in their own pages allows us to lock down those pages,
1953 * while allowing the rest of the kernel address range to be remapped.
1954 */
1955 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
1956 #if defined(ARM_LARGE_MEMORY)
1957 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
1958 #else
1959 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
1960 #endif
1961 ret = TRUE;
1962 }
1963
1964 #if defined(ARM_LARGE_MEMORY)
1965 if (region_select == 1) {
1966 *startp = VREGION1_START;
1967 *size = VREGION1_SIZE;
1968 ret = TRUE;
1969 }
1970 #endif
1971 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
1972 #if defined(ARM_LARGE_MEMORY)
1973 /* For large memory systems with no KTRR/CTRR such as virtual machines */
1974 if (region_select == 0) {
1975 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
1976 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
1977 ret = TRUE;
1978 }
1979
1980 if (region_select == 1) {
1981 *startp = VREGION1_START;
1982 *size = VREGION1_SIZE;
1983 ret = TRUE;
1984 }
1985 #else /* !defined(ARM_LARGE_MEMORY) */
1986 unsigned long low_global_vr_mask = 0;
1987 vm_map_size_t low_global_vr_size = 0;
1988
1989 if (region_select == 0) {
1990 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
1991 if (!TEST_PAGE_SIZE_4K) {
1992 *startp = gVirtBase & 0xFFFFFFFFFE000000;
1993 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
1994 } else {
1995 *startp = gVirtBase & 0xFFFFFFFFFF800000;
1996 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
1997 }
1998 ret = TRUE;
1999 }
2000 if (region_select == 1) {
2001 *startp = VREGION1_START;
2002 *size = VREGION1_SIZE;
2003 ret = TRUE;
2004 }
2005 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2006 if (!TEST_PAGE_SIZE_4K) {
2007 low_global_vr_mask = 0xFFFFFFFFFE000000;
2008 low_global_vr_size = 0x2000000;
2009 } else {
2010 low_global_vr_mask = 0xFFFFFFFFFF800000;
2011 low_global_vr_size = 0x800000;
2012 }
2013
2014 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2015 *startp = LOW_GLOBAL_BASE_ADDRESS;
2016 *size = low_global_vr_size;
2017 ret = TRUE;
2018 }
2019
2020 if (region_select == 3) {
2021 /* In this config, we allow the bootstrap mappings to occupy the same
2022 * page table pages as the heap.
2023 */
2024 *startp = VM_MIN_KERNEL_ADDRESS;
2025 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2026 ret = TRUE;
2027 }
2028 #endif /* defined(ARM_LARGE_MEMORY) */
2029 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2030 return ret;
2031 }
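/*
 * Illustrative usage sketch (not the literal VM bootstrap code): callers
 * typically probe pmap_virtual_region() with increasing selectors until it
 * returns FALSE, registering each advertised kernel VA range.
 *
 *     vm_map_offset_t region_start;
 *     vm_map_size_t region_size;
 *     for (unsigned int sel = 0; pmap_virtual_region(sel, &region_start, &region_size); sel++) {
 *         // record [region_start, region_start + region_size) as usable kernel VA
 *     }
 */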
2032
2033 /*
2034 * Routines to track and allocate physical pages during early boot.
2035 * On most systems that memory runs from first_avail through to avail_end
2036 * with no gaps.
2037 *
2038 * If the system supports ECC and ecc_bad_pages_count > 0, we
2039 * need to skip those pages.
2040 */
2041
2042 static unsigned int avail_page_count = 0;
2043 static bool need_ram_ranges_init = true;
2044
2045
2046 /**
2047 * Checks to see if a given page is in
2048 * the array of known bad pages
2049 *
2050 * @param ppn page number to check
2051 */
2052 bool
2053 pmap_is_bad_ram(__unused ppnum_t ppn)
2054 {
2055 return false;
2056 }
2057
2058 /**
2059 * Prepare bad ram pages to be skipped.
2060 */
2061
2062 /*
2063 * Initialize the count of available pages. No lock needed here,
2064 * as this code is called while kernel boot-up is still single-threaded.
2065 */
2066 static void
2067 initialize_ram_ranges(void)
2068 {
2069 pmap_paddr_t first = first_avail;
2070 pmap_paddr_t end = avail_end;
2071
2072 assert(first <= end);
2073 assert(first == (first & ~PAGE_MASK));
2074 assert(end == (end & ~PAGE_MASK));
2075 avail_page_count = atop(end - first);
2076
2077 need_ram_ranges_init = false;
2078
2079 }
2080
2081 unsigned int
2082 pmap_free_pages(
2083 void)
2084 {
2085 if (need_ram_ranges_init) {
2086 initialize_ram_ranges();
2087 }
2088 return avail_page_count;
2089 }
2090
2091 unsigned int
2092 pmap_free_pages_span(
2093 void)
2094 {
2095 if (need_ram_ranges_init) {
2096 initialize_ram_ranges();
2097 }
2098 return (unsigned int)atop(avail_end - first_avail);
2099 }
2100
2101
2102 boolean_t
2103 pmap_next_page_hi(
2104 ppnum_t * pnum,
2105 __unused boolean_t might_free)
2106 {
2107 return pmap_next_page(pnum);
2108 }
2109
2110
2111 boolean_t
2112 pmap_next_page(
2113 ppnum_t *pnum)
2114 {
2115 if (need_ram_ranges_init) {
2116 initialize_ram_ranges();
2117 }
2118
2119
2120 if (first_avail != avail_end) {
2121 *pnum = (ppnum_t)atop(first_avail);
2122 first_avail += PAGE_SIZE;
2123 assert(avail_page_count > 0);
2124 --avail_page_count;
2125 return TRUE;
2126 }
2127 assert(avail_page_count == 0);
2128 return FALSE;
2129 }
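/*
 * Minimal sketch of how the early-boot VM layer is expected to drain these
 * routines (illustrative only; the real caller lives in the vm_page bootstrap
 * path):
 *
 *     ppnum_t pn;
 *     unsigned int remaining = pmap_free_pages();
 *     while ((remaining-- > 0) && pmap_next_page(&pn)) {
 *         // hand physical page 'pn' to the VM for vm_page_t initialization
 *     }
 */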
2130
2131
2132
2133
2134 /**
2135 * Helper function to check whether the given physical
2136 * page number is a restricted page.
2137 *
2138 * @param pn the physical page number to query.
2139 */
2140 bool
2141 pmap_is_page_restricted(ppnum_t pn)
2142 {
2143 sptm_frame_type_t frame_type = sptm_get_frame_type(ptoa(pn));
2144 return frame_type == XNU_KERNEL_RESTRICTED;
2145 }
2146
2147 /*
2148 * Initialize the pmap module.
2149 * Called by vm_init, to initialize any structures that the pmap
2150 * system needs to map virtual memory.
2151 */
2152 void
2153 pmap_init(
2154 void)
2155 {
2156 /*
2157 * Protect page zero in the kernel map.
2158 * (can be overruled by permanent translation
2159 * table entries at page zero - see arm_vm_init).
2160 */
2161 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2162
2163 pmap_initialized = TRUE;
2164
2165 /*
2166 * Create the zone of physical maps
2167 * and the physical-to-virtual entries.
2168 */
2169 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2170 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2171
2172
2173 /*
2174 * Initialize the pmap object (for tracking the vm_page_t
2175 * structures for pages we allocate to be page tables in
2176 * pmap_expand()).
2177 */
2178 _vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
2179 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2180
2181 /*
2182 * Initialize the TXM VM object in the same way as the
2183 * PMAP VM object.
2184 */
2185 _vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL);
2186 txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2187
2188 /*
2189 * The values of [hard_]maxproc may have been scaled, make sure
2190 * they are still less than the value of pmap_max_asids.
2191 */
2192 if ((uint32_t)maxproc > pmap_max_asids) {
2193 maxproc = pmap_max_asids;
2194 }
2195 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2196 hard_maxproc = pmap_max_asids;
2197 }
2198 }
2199
2200 /**
2201 * Verify that a given physical page contains no mappings (outside of the
2202 * default physical aperture mapping).
2203 *
2204 * @param ppnum Physical page number to check there are no mappings to.
2205 *
2206 * @return True if there are no mappings, false otherwise or if the page is not
2207 * kernel-managed.
2208 */
2209 bool
2210 pmap_verify_free(ppnum_t ppnum)
2211 {
2212 const pmap_paddr_t pa = ptoa(ppnum);
2213
2214 assert(pa != vm_page_fictitious_addr);
2215
2216 /* Only mappings to kernel-managed physical memory are tracked. */
2217 if (!pa_valid(pa)) {
2218 return false;
2219 }
2220
2221 const unsigned int pai = pa_index(pa);
2222
2223 return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL);
2224 }
2225
2226 #if MACH_ASSERT
2227 /**
2228 * Verify that a given physical page contains no mappings (outside of the
2229 * default physical aperture mapping) and if it does, then panic.
2230 *
2231 * @note It's recommended to use pmap_verify_free() directly when operating in
2232 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2233 * normally being called from outside of the PPL, and the pv_head_table
2234 * can't be modified outside of the PPL).
2235 *
2236 * @param ppnum Physical page number to check there are no mappings to.
2237 */
2238 void
2239 pmap_assert_free(ppnum_t ppnum)
2240 {
2241 const pmap_paddr_t pa = ptoa(ppnum);
2242
2243 /* Only mappings to kernel-managed physical memory are tracked. */
2244 if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2245 return;
2246 }
2247
2248 const unsigned int pai = pa_index(pa);
2249 const uintptr_t pvh = pai_to_pvh(pai);
2250
2251 /**
2252 * This function is always called from outside of the PPL. Because of this,
2253 * the PVH entry can't be locked. This function is generally only called
2254 * before the VM reclaims a physical page and shouldn't be creating new
2255 * mappings. Even if a new mapping is created while parsing the hierarchy,
2256 * the worst case is that the system will panic in another way, and we were
2257 * already about to panic anyway.
2258 */
2259
2260 /**
2261 * Since pmap_verify_free() returned false, that means there is at least one
2262 * mapping left. Let's get some extra info on the first mapping we find to
2263 * dump in the panic string (the common case is that there is one spare
2264 * mapping that was never unmapped).
2265 */
2266 pt_entry_t *first_ptep = PT_ENTRY_NULL;
2267
2268 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2269 first_ptep = pvh_ptep(pvh);
2270 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2271 pv_entry_t *pvep = pvh_pve_list(pvh);
2272
2273 /* Each PVE can contain multiple PTEs. Let's find the first one. */
2274 for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2275 first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2276 if (first_ptep != PT_ENTRY_NULL) {
2277 break;
2278 }
2279 }
2280
2281 /* The PVE should have at least one valid PTE. */
2282 assert(first_ptep != PT_ENTRY_NULL);
2283 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2284 panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2285 __func__, (void*)pvh, pai);
2286 } else {
2287 /**
2288 * The mapping disappeared between here and the pmap_verify_free() call.
2289 * The only way that can happen is if the VM was racing this call with
2290 * a call that unmaps PTEs. Operations on this page should not be
2291 * occurring at the same time as this check, and unfortunately we can't
2292 * lock the PVH entry to prevent it, so just panic instead.
2293 */
2294 panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2295 "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2296 __func__, (void*)pvh, pai);
2297 }
2298
2299 /* Panic with a unique string identifying the first bad mapping and owner. */
2300 {
2301 /* First PTE is mapped by the main CPUs. */
2302 pmap_t pmap = ptep_get_pmap(first_ptep);
2303 const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2304
2305 panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2306 "%s CPU mapping (pmap: %p)",
2307 __func__, (uint64_t)pa, first_ptep, type, pmap);
2308 }
2309 }
2310 #endif
2311
2312
2313 static vm_size_t
2314 pmap_root_alloc_size(pmap_t pmap)
2315 {
2316 #pragma unused(pmap)
2317 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2318 unsigned int root_level = pt_attr_root_level(pt_attr);
2319 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2320 }
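/*
 * Worked example with illustrative numbers only: if the root level's index
 * mask, shifted down by the root level's shift, is 0x7 (an 8-entry root
 * table), this returns (0x7 + 1) * sizeof(tt_entry_t) = 64 bytes. The actual
 * value depends on the pmap's page-table geometry.
 */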
2321
2322 /*
2323 * Create and return a physical map.
2324 *
2325 * If the size specified for the map
2326 * is zero, the map is an actual physical
2327 * map, and may be referenced by the
2328 * hardware.
2329 *
2330 * If the size specified is non-zero,
2331 * the map will be used in software only, and
2332 * is bounded by that size.
2333 */
2334 MARK_AS_PMAP_TEXT pmap_t
2335 pmap_create_options_internal(
2336 ledger_t ledger,
2337 vm_map_size_t size,
2338 unsigned int flags,
2339 kern_return_t *kr)
2340 {
2341 pmap_t p;
2342 bool is_64bit = flags & PMAP_CREATE_64BIT;
2343 #if defined(HAS_APPLE_PAC)
2344 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2345 #endif /* defined(HAS_APPLE_PAC) */
2346 kern_return_t local_kr = KERN_SUCCESS;
2347 __unused uint8_t sptm_root_flags = SPTM_ROOT_PT_FLAGS_DEFAULT;
2348 TXMAddressSpaceFlags_t txm_flags = kTXMAddressSpaceFlagInit;
2349 const bool is_stage2 = false;
2350
2351 if (size != 0) {
2352 {
2353 // Size parameter should only be set for stage 2.
2354 return PMAP_NULL;
2355 }
2356 }
2357
2358 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2359 return PMAP_NULL;
2360 }
2361
2362 /*
2363 * Allocate a pmap struct from the pmap_zone. Then allocate
2364 * the translation table of the right size for the pmap.
2365 */
2366 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2367 local_kr = KERN_RESOURCE_SHORTAGE;
2368 goto pmap_create_fail;
2369 }
2370
2371 p->ledger = ledger;
2372
2373
2374 p->pmap_vm_map_cs_enforced = false;
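/* User pmaps start at VA 0; p->max is derived from the page-table geometry below. */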
2375 p->min = 0;
2376
2377
2378 #if CONFIG_ROSETTA
2379 if (flags & PMAP_CREATE_ROSETTA) {
2380 p->is_rosetta = TRUE;
2381 } else {
2382 p->is_rosetta = FALSE;
2383 }
2384 #endif /* CONFIG_ROSETTA */
2385 #if defined(HAS_APPLE_PAC)
2386 p->disable_jop = disable_jop;
2387
2388 if (p->disable_jop) {
2389 sptm_root_flags &= ~SPTM_ROOT_PT_FLAG_JOP;
2390 }
2391 #endif /* defined(HAS_APPLE_PAC) */
2392
2393 p->nested_region_true_start = 0;
2394 p->nested_region_true_end = ~0;
2395
2396 p->nx_enabled = true;
2397 p->is_64bit = is_64bit;
2398 p->nested_pmap = PMAP_NULL;
2399 p->type = PMAP_TYPE_USER;
2400
2401 #if ARM_PARAMETERIZED_PMAP
2402 /* Default to the native pt_attr */
2403 p->pmap_pt_attr = native_pt_attr;
2404 #endif /* ARM_PARAMETERIZED_PMAP */
2405 #if __ARM_MIXED_PAGE_SIZE__
2406 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2407 p->pmap_pt_attr = &pmap_pt_attr_4k;
2408 }
2409 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2410 p->max = pmap_user_va_size(p);
2411
2412 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2413 local_kr = KERN_NO_SPACE;
2414 goto id_alloc_fail;
2415 }
2416
2417 /**
2418 * We expect top level translation tables to always fit into a single
2419 * physical page. This would also catch a misconfiguration if 4K
2420 * concatenated page tables needed more than one physical tt1 page.
2421 */
2422 vm_size_t pmap_root_size = pmap_root_alloc_size(p);
2423 if (__improbable(pmap_root_size > PAGE_SIZE)) {
2424 panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size);
2425 }
2426
2427 pmap_lock_init(p);
2428
2429 p->tte = pmap_tt1_allocate(p, sptm_root_flags);
2430 if (!(p->tte)) {
2431 local_kr = KERN_RESOURCE_SHORTAGE;
2432 goto tt1_alloc_fail;
2433 }
2434
2435 p->ttep = kvtophys_nofail((vm_offset_t)p->tte);
2436 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2437
2438 /*
2439 * initialize the rest of the structure
2440 */
2441 p->nested_region_addr = 0x0ULL;
2442 p->nested_region_size = 0x0ULL;
2443 p->nested_region_unnested_table_bitmap = NULL;
2444
2445 p->nested_has_no_bounds_ref = false;
2446 p->nested_no_bounds_refcnt = 0;
2447 p->nested_bounds_set = false;
2448
2449 p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE;
2450
2451 #if MACH_ASSERT
2452 p->pmap_pid = 0;
2453 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
2454 #endif /* MACH_ASSERT */
2455 #if DEVELOPMENT || DEBUG
2456 p->footprint_was_suspended = FALSE;
2457 #endif /* DEVELOPMENT || DEBUG */
2458
2459 os_ref_init_count_raw(&p->ref_count, &pmap_refgrp, 1);
2460 pmap_simple_lock(&pmaps_lock);
2461 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
2462 pmap_simple_unlock(&pmaps_lock);
2463
2464 /**
2465 * The SPTM pmap's concurrency model can sometimes allow ledger balances to transiently
2466 * go negative. Note that we still check overall ledger balance on pmap destruction.
2467 */
2468 ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
2469 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
2470 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
2471 ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
2472 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
2473 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
2474 ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
2475 ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
2476 ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
2477
2478 if (!is_stage2) {
2479 /*
2480 * Complete initialization for the TXM address space. This needs to be done
2481 * after the SW ASID has been registered with the SPTM.
2482 * TXM enforcement does not apply to virtual machines.
2483 */
2484 if (flags & PMAP_CREATE_TEST) {
2485 txm_flags |= kTXMAddressSpaceFlagTest;
2486 }
2487
2488 pmap_txmlock_init(p);
2489 txm_register_address_space(p, p->asid, txm_flags);
2490 p->txm_trust_level = kCSTrustUntrusted;
2491 }
2492
2493 return p;
2494
2495 tt1_alloc_fail:
2496 pmap_get_pt_ops(p)->free_id(p);
2497 id_alloc_fail:
2498 zfree(pmap_zone, p);
2499 pmap_create_fail:
2500 *kr = local_kr;
2501 return PMAP_NULL;
2502 }
2503
2504 pmap_t
2505 pmap_create_options(
2506 ledger_t ledger,
2507 vm_map_size_t size,
2508 unsigned int flags)
2509 {
2510 pmap_t pmap;
2511 kern_return_t kr = KERN_SUCCESS;
2512
2513 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
2514
2515 ledger_reference(ledger);
2516
2517 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
2518
2519 if (pmap == PMAP_NULL) {
2520 ledger_dereference(ledger);
2521 }
2522
2523 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2524
2525 return pmap;
2526 }
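/*
 * Illustrative usage sketch (hypothetical caller; 'task_ledger' is a
 * placeholder name): pmap_create_options() pairs with pmap_destroy(), and the
 * pmap holds its own ledger reference while it exists.
 *
 *     pmap_t p = pmap_create_options(task_ledger, 0, PMAP_CREATE_64BIT);
 *     if (p == PMAP_NULL) {
 *         // allocation or ASID exhaustion failure
 *     }
 *     ...
 *     pmap_destroy(p);
 */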
2527
2528 #if MACH_ASSERT
2529 MARK_AS_PMAP_TEXT void
2530 pmap_set_process_internal(
2531 __unused pmap_t pmap,
2532 __unused int pid,
2533 __unused char *procname)
2534 {
2535 if (pmap == NULL || pmap->pmap_pid == -1) {
2536 return;
2537 }
2538
2539 validate_pmap_mutable(pmap);
2540
2541 pmap->pmap_pid = pid;
2542 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
2543 }
2544 #endif /* MACH_ASSERT */
2545
2546 #if MACH_ASSERT
2547 void
2548 pmap_set_process(
2549 pmap_t pmap,
2550 int pid,
2551 char *procname)
2552 {
2553 pmap_set_process_internal(pmap, pid, procname);
2554 }
2555 #endif /* MACH_ASSERT */
2556
2557 /*
2558 * pmap_deallocate_all_leaf_tts:
2559 *
2560 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
2561 * removing and deallocating all TTEs.
2562 */
2563 MARK_AS_PMAP_TEXT static void
2564 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_address_t start_va, unsigned level)
2565 {
2566 tt_entry_t tte = ARM_TTE_EMPTY;
2567 tt_entry_t * ttep = NULL;
2568 tt_entry_t * last_ttep = NULL;
2569
2570 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2571 const uint64_t size = pt_attr->pta_level_info[level].size;
2572
2573 assert(level < pt_attr_leaf_level(pt_attr));
2574
2575 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
2576
2577 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
2578 vm_map_address_t va = start_va;
2579 for (ttep = first_ttep; ttep <= last_ttep; ttep += page_ratio, va += (size * page_ratio)) {
2580 if (!(*ttep & ARM_TTE_VALID)) {
2581 continue;
2582 }
2583
2584 for (unsigned i = 0; i < page_ratio; i++) {
2585 tte = ttep[i];
2586
2587 if (!(tte & ARM_TTE_VALID)) {
2588 panic("%s: found unexpectedly invalid tte, ttep=%p, tte=%p, "
2589 "pmap=%p, first_ttep=%p, level=%u",
2590 __FUNCTION__, ttep + i, (void *)tte,
2591 pmap, first_ttep, level);
2592 }
2593
2594 if (tte_is_block(tte)) {
2595 panic("%s: found block mapping, ttep=%p, tte=%p, "
2596 "pmap=%p, first_ttep=%p, level=%u",
2597 __FUNCTION__, ttep + i, (void *)tte,
2598 pmap, first_ttep, level);
2599 }
2600
2601 /* Must be valid, type table */
2602 if (level < pt_attr_twig_level(pt_attr)) {
2603 /* If we haven't reached the twig level, recurse to the next level. */
2604 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK),
2605 va + (size * i), level + 1);
2606 }
2607 }
2608
2609 /* Remove the TTE. */
2610 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
2611 pmap_tte_deallocate(pmap, va, ttep, level);
2612 }
2613 }
2614
2615 /*
2616 * We maintain stats and ledgers so that a task's physical footprint is:
2617 * phys_footprint = ((internal - alternate_accounting)
2618 * + (internal_compressed - alternate_accounting_compressed)
2619 * + iokit_mapped
2620 * + purgeable_nonvolatile
2621 * + purgeable_nonvolatile_compressed
2622 * + page_table)
2623 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
2624 */
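/*
 * Worked example with hypothetical page counts: internal = 100,
 * alternate_accounting = 10, internal_compressed = 20,
 * alternate_accounting_compressed = 5, iokit_mapped = 8,
 * purgeable_nonvolatile = 4, purgeable_nonvolatile_compressed = 2,
 * page_table = 6 gives
 *   phys_footprint = (100 - 10) + (20 - 5) + 8 + 4 + 2 + 6 = 125 pages.
 */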
2625
2626 /*
2627 * Retire the given physical map from service.
2628 * Should only be called if the map contains
2629 * no valid mappings.
2630 */
2631 MARK_AS_PMAP_TEXT void
2632 pmap_destroy_internal(
2633 pmap_t pmap)
2634 {
2635 if (pmap == PMAP_NULL) {
2636 return;
2637 }
2638
2639 validate_pmap(pmap);
2640
2641 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2642 const bool is_stage2_pmap = false;
2643
2644 if (os_ref_release_raw(&pmap->ref_count, &pmap_refgrp) > 0) {
2645 return;
2646 }
2647
2648 if (!is_stage2_pmap) {
2649 /*
2650 * Complete all clean up required for TXM. This needs to happen before the
2651 * SW ASID has been unregistered with the SPTM.
2652 */
2653 txm_unregister_address_space(pmap);
2654 pmap_txmlock_destroy(pmap);
2655 }
2656
2657 /**
2658 * Drain any concurrent retype-sensitive SPTM operations. This is needed to
2659 * ensure that we don't unmap and retype the page tables while those operations
2660 * are still finishing on other CPUs, leading to an SPTM violation. In particular,
2661 * the multipage batched cacheability/attribute update code may issue SPTM calls
2662 * without holding the relevant PVH or pmap locks, so we can't guarantee those
2663 * calls have actually completed despite observing refcnt == 0.
2664 *
2665 * At this point, we CAN guarantee that:
2666 * 1) All prior PTE removals required to empty the pmap have completed and
2667 * been synchronized with DSB, *except* the commpage removal which doesn't
2668 * involve pages that can ever be retyped. Subsequent calls not already
2669 * in the retype epoch will no longer observe these mappings.
2670 * 2) The pmap now has a zero refcount, so in a correctly functioning system
2671 * no further mappings will be requested for it.
2672 */
2673 pmap_retype_epoch_prepare_drain();
2674
2675 if (!is_stage2_pmap) {
2676 pmap_unmap_commpage(pmap);
2677 }
2678
2679 pmap_simple_lock(&pmaps_lock);
2680 queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
2681 pmap_simple_unlock(&pmaps_lock);
2682
2683 pmap_retype_epoch_drain();
2684
2685 pmap_trim_self(pmap);
2686
2687 /*
2688 * Free the memory maps, then the
2689 * pmap structure.
2690 */
2691 pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr));
2692
2693 if (pmap->tte) {
2694 pmap_tt1_deallocate(pmap, pmap->tte);
2695 pmap->tte = (tt_entry_t *) NULL;
2696 pmap->ttep = 0;
2697 }
2698
2699 if (pmap->type != PMAP_TYPE_NESTED) {
2700 /* return its asid to the pool */
2701 pmap_get_pt_ops(pmap)->free_id(pmap);
2702 if (pmap->nested_pmap != NULL) {
2703 /* release the reference we hold on the nested pmap */
2704 pmap_destroy_internal(pmap->nested_pmap);
2705 }
2706 }
2707
2708 pmap_check_ledgers(pmap);
2709
2710 if (pmap->nested_region_unnested_table_bitmap) {
2711 bitmap_free(pmap->nested_region_unnested_table_bitmap, pmap->nested_region_size >> pt_attr_twig_shift(pt_attr));
2712 }
2713
2714 pmap_lock_destroy(pmap);
2715 zfree(pmap_zone, pmap);
2716 }
2717
2718 void
2719 pmap_destroy(
2720 pmap_t pmap)
2721 {
2722 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2723
2724 ledger_t ledger = pmap->ledger;
2725
2726 pmap_destroy_internal(pmap);
2727
2728 ledger_dereference(ledger);
2729
2730 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
2731 }
2732
2733
2734 /*
2735 * Add a reference to the specified pmap.
2736 */
2737 MARK_AS_PMAP_TEXT void
2738 pmap_reference_internal(
2739 pmap_t pmap)
2740 {
2741 if (pmap != PMAP_NULL) {
2742 validate_pmap_mutable(pmap);
2743 os_ref_retain_raw(&pmap->ref_count, &pmap_refgrp);
2744 }
2745 }
2746
2747 void
2748 pmap_reference(
2749 pmap_t pmap)
2750 {
2751 pmap_reference_internal(pmap);
2752 }
2753
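/**
 * Return the SPTM frame type to use for this pmap's non-root page tables:
 * shared page tables for nested pmaps, stage-2 page tables for stage-2 pmaps
 * (not applicable in this configuration), and regular page tables otherwise.
 */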
2754 static sptm_frame_type_t
2755 get_sptm_pt_type(pmap_t pmap)
2756 {
2757 const bool is_stage2_pmap = false;
2758 if (is_stage2_pmap) {
2759 assert(pmap->type != PMAP_TYPE_NESTED);
2760 return XNU_STAGE2_PAGE_TABLE;
2761 } else {
2762 return pmap->type == PMAP_TYPE_NESTED ? XNU_PAGE_TABLE_SHARED : XNU_PAGE_TABLE;
2763 }
2764 }
2765
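/**
 * Allocate a root (TTBR-level) translation table for a user pmap.
 *
 * Descriptive summary: allocates a physical page, accounts for it against the
 * root-table counters and the pmap's ledger, then drains the retype epoch and
 * retypes the page with the SPTM as a user (or stage-2) root table carrying
 * the pmap's geometry ID, root flags, and ASID (or VMID). Returns NULL if the
 * page allocation fails.
 */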
2766 static tt_entry_t *
2767 pmap_tt1_allocate(pmap_t pmap, uint8_t sptm_root_flags)
2768 {
2769 pmap_paddr_t pa = 0;
2770 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2771 const bool is_stage2_pmap = false;
2772
2773 const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL);
2774
2775 if (ret != KERN_SUCCESS) {
2776 return (tt_entry_t *)0;
2777 }
2778
2779 /**
2780 * Drain the epochs to ensure any lingering batched operations that may have taken
2781 * an in-flight reference to this page are complete.
2782 */
2783 pmap_retype_epoch_prepare_drain();
2784
2785 assert(pa);
2786
2787 /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
2788 * Depending on the device, this can vary between 512b and 16K. */
2789 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
2790 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
2791
2792 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2793 retype_params.attr_idx = pt_attr->geometry_id;
2794 retype_params.flags = sptm_root_flags;
2795 if (is_stage2_pmap) {
2796 retype_params.vmid = pmap->vmid;
2797 } else {
2798 retype_params.asid = pmap->asid;
2799 }
2800
2801 pmap_retype_epoch_drain();
2802
2803 sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE,
2804 retype_params);
2805
2806 return (tt_entry_t *) phystokv(pa);
2807 }
2808
2809 static void
2810 pmap_tt1_deallocate(
2811 pmap_t pmap,
2812 tt_entry_t *tt)
2813 {
2814 pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt);
2815 const bool is_stage2_pmap = false;
2816 const sptm_frame_type_t page_type = is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE :
2817 pmap->type == PMAP_TYPE_NESTED ? XNU_SHARED_ROOT_TABLE : XNU_USER_ROOT_TABLE;
2818
2819 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2820 sptm_retype(pa, page_type, XNU_DEFAULT, retype_params);
2821 pmap_page_free(pa);
2822
2823 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
2824 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
2825 }
2826
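/**
 * Allocate a page to serve as a non-root translation table.
 *
 * Descriptive summary: allocates a physical page and a page-table descriptor,
 * publishes the descriptor through the page's PV head entry, updates the
 * in-use page-table counters and ledgers, then drains the retype epoch and
 * retypes the frame to this pmap's SPTM page-table type. Honors
 * PMAP_TT_ALLOCATE_NOWAIT and returns KERN_RESOURCE_SHORTAGE if either
 * allocation fails.
 */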
2827 MARK_AS_PMAP_TEXT static kern_return_t
2828 pmap_tt_allocate(
2829 pmap_t pmap,
2830 tt_entry_t **ttp,
2831 unsigned int level,
2832 unsigned int options)
2833 {
2834 pmap_paddr_t pa;
2835 *ttp = NULL;
2836
2837 if (*ttp == NULL) {
2838 const unsigned int alloc_flags =
2839 (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;
2840
2841 /* Allocate a VM page to be used as the page table. */
2842 if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
2843 return KERN_RESOURCE_SHORTAGE;
2844 }
2845
2846 pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags);
2847 if (ptdp == NULL) {
2848 pmap_page_free(pa);
2849 return KERN_RESOURCE_SHORTAGE;
2850 }
2851
2852 unsigned int pai = pa_index(pa);
2853 locked_pvh_t locked_pvh = pvh_lock(pai);
2854 assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p",
2855 __func__, (void*)locked_pvh.pvh);
2856
2857 /**
2858 * Drain the epochs to ensure any lingering batched operations that may have taken
2859 * an in-flight reference to this page are complete.
2860 */
2861 pmap_retype_epoch_prepare_drain();
2862
2863 if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
2864 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
2865 } else {
2866 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
2867 }
2868
2869 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
2870
2871 PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
2872
2873 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
2874 pvh_unlock(&locked_pvh);
2875
2876 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2877 retype_params.level = (sptm_pt_level_t)level;
2878
2879 /**
2880 * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages
2881 * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above.
2882 */
2883 pmap_retype_epoch_drain();
2884
2885 sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params);
2886
2887 *ttp = (tt_entry_t *)phystokv(pa);
2888 }
2889
2890 assert(*ttp);
2891
2892 return KERN_SUCCESS;
2893 }
2894
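/**
 * Release a page-table page previously obtained from pmap_tt_allocate().
 *
 * Descriptive summary: panics if the SPTM-tracked mapping refcount is not
 * zero, retypes the frame back to XNU_DEFAULT, frees the page-table
 * descriptor, clears the page's PV head entry, frees the page, and reverses
 * the counter and ledger updates made at allocation time.
 */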
2895 static void
2896 pmap_tt_deallocate(
2897 pmap_t pmap,
2898 tt_entry_t *ttp,
2899 unsigned int level)
2900 {
2901 pt_desc_t *ptdp;
2902 vm_offset_t free_page = 0;
2903 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2904
2905 ptdp = ptep_get_ptd(ttp);
2906 ptdp->va = (vm_offset_t)-1;
2907
2908 const uint16_t refcnt = sptm_get_page_table_refcnt(kvtophys_nofail((vm_offset_t)ttp));
2909
2910 if (__improbable(refcnt != 0)) {
2911 panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, refcnt);
2912 }
2913
2914 free_page = (vm_offset_t)ttp & ~PAGE_MASK;
2915 if (free_page != 0) {
2916 pmap_paddr_t pa = kvtophys_nofail(free_page);
2917 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2918 sptm_retype(pa, get_sptm_pt_type(pmap), XNU_DEFAULT, retype_params);
2919 ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
2920
2921 unsigned int pai = pa_index(pa);
2922 locked_pvh_t locked_pvh = pvh_lock(pai);
2923 assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTDP), "%s: non-PTD PVH %p",
2924 __func__, (void*)locked_pvh.pvh);
2925 pvh_update_head(&locked_pvh, NULL, PVH_TYPE_NULL);
2926 pvh_unlock(&locked_pvh);
2927 pmap_page_free(pa);
2928 if (level < pt_attr_leaf_level(pt_attr)) {
2929 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
2930 } else {
2931 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
2932 }
2933 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
2934 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
2935 }
2936 }
2937
2938 /**
2939 * Check table refcounts after clearing the translation table entry that points to that table.
2940 *
2941 * @note If the cleared TTE points to a leaf table, then that leaf table
2942 * must have a refcnt of zero before the TTE can be removed.
2943 *
2944 * @param pmap The pmap containing the page table whose TTE is being removed.
2945 * @param tte Value stored in the TTE prior to clearing it
2946 * @param level The level of the page table that contains the TTE being removed
2947 */
2948 static void
2949 pmap_tte_check_refcounts(
2950 pmap_t pmap,
2951 tt_entry_t tte,
2952 unsigned int level)
2953 {
2954 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2955
2956 /**
2957 * Remember, the passed in "level" parameter refers to the level above the
2958 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
2959 * page table).
2960 */
2961 const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
2962
2963 unsigned short refcnt = 0;
2964
2965 /**
2966 * It's possible that a concurrent pmap_disconnect() operation may need to reference
2967 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
2968 * one or more PTEs on this page but not yet dropped the refcount, which would cause
2969 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
2970 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
2971 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
2972 * drop the pagetable refcount accordingly, without taking any PVH locks that could
2973 * synchronize it against the disconnect operation. If that removal caused the
2974 * refcount to reach zero, the pagetable page could be freed before the disconnect
2975 * operation is finished using the relevant pagetable descriptor.
2976 * Address these cases by waiting until all CPUs have been observed to not be
2977 * executing pmap_disconnect().
2978 */
2979 if (remove_leaf_table) {
2980 bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
2981 const int max_cpu = ml_get_max_cpu_number();
2982 bitmap_full(&active_disconnects[0], max_cpu + 1);
2983 bool inflight_disconnect;
2984
2985 /*
2986 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
2987 * ahead of any prior PTE load which may have observed the effect of a
2988 * concurrent disconnect operation. An acquire fence is required for this;
2989 * a load-acquire operation is insufficient.
2990 */
2991 os_atomic_thread_fence(acquire);
2992 do {
2993 inflight_disconnect = false;
2994 for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
2995 i >= 0;
2996 i = bitmap_next(&active_disconnects[0], i)) {
2997 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
2998 if (cpu_data == NULL) {
2999 continue;
3000 }
3001 if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
3002 __builtin_arm_wfe();
3003 inflight_disconnect = true;
3004 continue;
3005 }
3006 os_atomic_clear_exclusive();
3007 bitmap_clear(&active_disconnects[0], (unsigned int)i);
3008 }
3009 } while (inflight_disconnect);
3010 /* Ensure the refcount is observed after any observation of inflight_disconnect */
3011 os_atomic_thread_fence(acquire);
3012 refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte));
3013 }
3014
3015 #if MACH_ASSERT
3016 /**
3017 * On internal devices, always do the page table consistency check
3018 * regardless of page table level or the actual refcnt value.
3019 */
3020 {
3021 #else /* MACH_ASSERT */
3022 /**
3023 * Only perform the page table consistency check when deleting leaf page
3024 * tables and it seems like there might be valid/compressed mappings
3025 * leftover.
3026 */
3027 if (__improbable(remove_leaf_table && refcnt != 0)) {
3028 #endif /* MACH_ASSERT */
3029
3030 /**
3031 * There are multiple problems that can manifest as a non-zero refcnt:
3032 * 1. A bug in the refcnt management logic.
3033 * 2. A memory stomper or hardware failure.
3034 * 3. The VM forgetting to unmap all of the valid mappings in an address
3035 * space before destroying a pmap.
3036 *
3037 * By looping over the page table and determining how many valid or
3038 * compressed entries there actually are, we can narrow down which of
3039 * these three cases is causing this panic. If the expected refcnt
3040 * (valid + compressed) and the actual refcnt don't match then the
3041 * problem is probably either a memory corruption issue (if the
3042 * non-empty entries don't match valid+compressed, that could also be a
3043 * sign of corruption) or refcnt management bug. Otherwise, there
3044 * actually are leftover mappings and the higher layers of xnu are
3045 * probably at fault.
3046 *
3047 * Note that we use PAGE_SIZE to govern the range of the table check,
3048 * because even for 4K processes we still allocate a 16K page for each
3049 * page table; we simply map it using 4 adjacent TTEs for the 4K case.
3050 */
3051 pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(PAGE_SIZE - 1)));
3052
3053 pt_entry_t *ptep = bpte;
3054 unsigned short wiredcnt = ptep_get_info((pt_entry_t*)ttetokv(tte))->wiredcnt;
3055 unsigned short non_empty = 0, valid = 0, comp = 0;
3056 for (unsigned int i = 0; i < (PAGE_SIZE / sizeof(*ptep)); i++, ptep++) {
3057 /* Keep track of all non-empty entries to detect memory corruption. */
3058 if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3059 non_empty++;
3060 }
3061
3062 if (__improbable(pte_is_compressed(*ptep, ptep))) {
3063 comp++;
3064 } else if (__improbable(pte_is_valid(*ptep))) {
3065 valid++;
3066 }
3067 }
3068
3069 #if MACH_ASSERT
3070 /**
3071 * On internal machines, panic whenever a page table getting deleted has
3072 * leftover mappings (valid or otherwise) or a leaf page table has a
3073 * non-zero refcnt.
3074 */
3075 if (__improbable((non_empty != 0) || (remove_leaf_table && ((refcnt != 0) || (wiredcnt != 0))))) {
3076 #else /* MACH_ASSERT */
3077 /* We already know the leaf page-table has a non-zero refcnt, so panic. */
3078 {
3079 #endif /* MACH_ASSERT */
3080 panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3081 "%d compressed, %d non-empty, refcnt=%d, wiredcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3082 level + 1, valid, comp, non_empty, refcnt, wiredcnt, level, (uint64_t)tte, pmap, bpte);
3083 }
3084 }
3085 }
3086
3087 /**
3088 * Remove a translation table entry pointing to a nested shared-region table.
3089 *
3090 * @note The TTE to clear out is expected to point to a leaf table with a refcnt
3091 * of zero.
3092 *
3093 * @param pmap The user pmap containing the nested page table whose TTE is being removed.
3094 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3095 * @param ttep Pointer to the TTE that should be cleared out.
3096 */
3097 static void
3098 pmap_tte_trim(
3099 pmap_t pmap,
3100 vm_offset_t va_start,
3101 tt_entry_t *ttep)
3102 {
3103 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3104 assert(ttep != NULL);
3105 const tt_entry_t tte = *ttep;
3106 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3107
3108 if (__improbable(tte == ARM_TTE_EMPTY)) {
3109 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3110 "stomper? pmap=%p ttep=%p", __func__, pt_attr_twig_level(pt_attr), pmap, ttep);
3111 }
3112
3113 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3114 sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift);
3115
3116 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3117
3118 pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr));
3119 }
3120
3121 /**
3122 * Remove a translation table entry.
3123 *
3124 * @note If the TTE to clear out points to a leaf table, then that leaf table
3125 * must have a mapping refcount of zero before the TTE can be removed.
3126 * @note This function expects to be called with pmap locked exclusive, and will
3127 * return with pmap unlocked.
3128 *
3129 * @param pmap The pmap containing the page table whose TTE is being removed.
3130 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3131 * @param ttep Pointer to the TTE that should be cleared out.
3132 * @param level The level of the page table that contains the TTE to be removed.
3133 */
3134 static void
3135 pmap_tte_remove(
3136 pmap_t pmap,
3137 vm_offset_t va_start,
3138 tt_entry_t *ttep,
3139 unsigned int level)
3140 {
3141 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3142 assert(ttep != NULL);
3143 const tt_entry_t tte = *ttep;
3144
3145 if (__improbable(tte == ARM_TTE_EMPTY)) {
3146 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3147 "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3148 }
3149
3150 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level);
3151
3152 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3153
3154 pmap_tte_check_refcounts(pmap, tte, level);
3155 }
3156
3157 /**
3158 * Given a pointer to an entry within a `level` page table, delete the
3159 * page table at `level` + 1 that is represented by that entry. For instance,
3160 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3161 * contains the PA of the L3 table, and `level` would be "2".
3162 *
3163 * @note If the table getting deallocated is a leaf table, then that leaf table
3164 * must have a mapping refcount of zero before getting deallocated.
3165 * @note This function expects to be called with pmap locked exclusive and will
3166 * return with pmap unlocked.
3167 *
3168 * @param pmap The pmap that owns the page table to be deallocated.
3169 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3170 * @param ttep Pointer to the `level` TTE to remove.
3171 * @param level The level of the table that contains an entry pointing to the
3172 * table to be removed. The deallocated page table will be a
3173 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3174 * deleted).
3175 */
3176 void
3177 pmap_tte_deallocate(
3178 pmap_t pmap,
3179 vm_offset_t va_start,
3180 tt_entry_t *ttep,
3181 unsigned int level)
3182 {
3183 tt_entry_t tte;
3184
3185 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3186
3187 tte = *ttep;
3188
3189 if (tte_get_ptd(tte)->pmap != pmap) {
3190 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3191 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3192 }
3193
3194 assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
3195 (unsigned long long)tte);
3196
3197 /* pmap_tte_remove() will drop the pmap lock */
3198 pmap_tte_remove(pmap, va_start, ttep, level);
3199
3200 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3201 }
3202
3203 /*
3204 * Remove a range of hardware page-table entries.
3205 * The range is given as the first (inclusive)
3206 * and last (exclusive) virtual addresses mapped by
3207 * the PTE region to be removed.
3208 *
3209 * The pmap must be locked shared.
3210 * If the pmap is not the kernel pmap, the range must lie
3211 * entirely within one pte-page. Assumes that the pte-page exists.
3212 *
3213 * Returns the number of PTEs changed
3214 */
3215 MARK_AS_PMAP_TEXT static void
3216 pmap_remove_range(
3217 pmap_t pmap,
3218 vm_map_address_t va,
3219 vm_map_address_t end)
3220 {
3221 pmap_remove_range_options(pmap, va, end, PMAP_OPTIONS_REMOVE);
3222 }
3223
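/**
 * Core of pmap_remove_range(): issues sptm_unmap_region() in batches of at
 * most SPTM_MAPPING_LIMIT mappings, then walks the per-CPU prev_ptes snapshot
 * to update wired/compressed accounting and remove the PV list entries for
 * each managed physical page that was mapped.
 */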
3224 MARK_AS_PMAP_TEXT void
3225 pmap_remove_range_options(
3226 pmap_t pmap,
3227 vm_map_address_t start,
3228 vm_map_address_t end,
3229 int options)
3230 {
3231 const unsigned int sptm_flags = ((options & PMAP_OPTIONS_REMOVE) ? SPTM_REMOVE_COMPRESSED : 0);
3232 unsigned int num_removed = 0;
3233 unsigned int num_external = 0, num_internal = 0, num_reusable = 0;
3234 unsigned int num_alt_internal = 0;
3235 unsigned int num_compressed = 0, num_alt_compressed = 0;
3236 unsigned short num_unwired = 0;
3237 bool need_strong_sync = false;
3238
	/*
	 * The pmap lock must be held here; in most, if not all, cases it will be held in shared mode.
	 */
3242 pmap_assert_locked(pmap, PMAP_LOCK_HELD);
3243
3244 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3245 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
3246 const uint64_t pmap_page_shift = pt_attr_leaf_shift(pt_attr);
3247 vm_map_address_t va = start;
3248 pt_entry_t *cpte = pmap_pte(pmap, va);
3249 assert(cpte != NULL);
3250
3251 while (va < end) {
3252 /**
3253 * We may need to sleep when taking the PVH lock below, and our pmap_pv_remove()
3254 * call below may also place the lock in sleep mode if processing a large PV list.
3255 * We therefore can't leave preemption disabled across that code, which means we
3256 * can't directly use the per-CPU prev_ptes array in that code. Since that code
3257 * only cares about the physical address stored in each prev_ptes entry, we'll
3258 * use a local array to stash off only the 4-byte physical address index in order
3259 * to reduce stack usage.
3260 */
3261 unsigned int pai_list[SPTM_MAPPING_LIMIT];
3262 _Static_assert(SPTM_MAPPING_LIMIT <= 64,
3263 "SPTM_MAPPING_LIMIT value causes excessive stack usage for pai_list");
3264
3265 unsigned int num_mappings = (end - va) >> pmap_page_shift;
3266 if (num_mappings > SPTM_MAPPING_LIMIT) {
3267 num_mappings = SPTM_MAPPING_LIMIT;
3268 }
3269
3270 /**
3271 * Disable preemption to ensure that we can safely access per-CPU mapping data after
3272 * issuing the SPTM call.
3273 */
3274 disable_preemption();
3275 /**
3276 * Enter the retype epoch for the batched unmap operation. This is necessary because we
3277 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
3278 * call, so a concurrent pmap_page_protect() operation against one of those pages may
3279 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
3280 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
3281 * if it does not first drain our epoch.
3282 */
3283 pmap_retype_epoch_enter();
3284 sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags);
3285 pmap_retype_epoch_exit();
3286
3287 sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes;
3288 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3289 const pt_entry_t prev_pte = prev_ptes[i];
3290
3291 if (pte_is_compressed(prev_pte, cpte)) {
3292 if (options & PMAP_OPTIONS_REMOVE) {
3293 ++num_compressed;
3294 if (prev_pte & ARM_PTE_COMPRESSED_ALT) {
3295 ++num_alt_compressed;
3296 }
3297 }
3298 pai_list[i] = INVALID_PAI;
3299 continue;
3300 } else if (!pte_is_valid(prev_pte)) {
3301 pai_list[i] = INVALID_PAI;
3302 continue;
3303 }
3304
3305 if (pte_is_wired(prev_pte)) {
3306 num_unwired++;
3307 }
3308
3309 const pmap_paddr_t pa = pte_to_pa(prev_pte);
3310
3311 if (__improbable(!pa_valid(pa))) {
3312 pai_list[i] = INVALID_PAI;
3313 continue;
3314 }
3315 pai_list[i] = pa_index(pa);
3316 }
3317
3318 enable_preemption();
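		/*
		 * Rewind cpte so the PV accounting pass below revisits the same batch of
		 * PTEs, this time with preemption enabled (the PVH locks below may sleep).
		 */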
3319 cpte -= num_mappings;
3320
3321 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3322 if (pai_list[i] == INVALID_PAI) {
3323 continue;
3324 }
3325 locked_pvh_t locked_pvh;
3326 if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
3327 locked_pvh = pvh_lock_nopreempt(pai_list[i]);
3328 } else {
3329 locked_pvh = pvh_lock(pai_list[i]);
3330 }
3331
3332 bool is_internal, is_altacct;
3333 pv_remove_return_t remove_status = pmap_remove_pv(pmap, cpte, &locked_pvh, &is_internal, &is_altacct);
3334
3335 switch (remove_status) {
3336 case PV_REMOVE_SUCCESS:
3337 ++num_removed;
3338 if (is_altacct) {
3339 assert(is_internal);
3340 num_internal++;
3341 num_alt_internal++;
3342 } else if (is_internal) {
3343 if (ppattr_test_reusable(pai_list[i])) {
3344 num_reusable++;
3345 } else {
3346 num_internal++;
3347 }
3348 } else {
3349 num_external++;
3350 }
3351 break;
3352 default:
3353 /*
3354 * PVE already removed; this can happen due to a concurrent pmap_disconnect()
3355 * executing before we grabbed the PVH lock.
3356 */
3357 break;
3358 }
3359
3360 pvh_unlock(&locked_pvh);
3361 }
3362
3363 va += (num_mappings << pmap_page_shift);
3364 }
3365
3366 if (__improbable(need_strong_sync)) {
3367 arm64_sync_tlb(true);
3368 }
3369
3370 /*
3371 * Update the counts
3372 */
3373 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
3374
3375 if (pmap != kernel_pmap) {
3376 if (num_unwired != 0) {
3377 ptd_info_t * const ptd_info = ptep_get_info(cpte - 1);
3378 if (__improbable(os_atomic_sub_orig(&ptd_info->wiredcnt, num_unwired, relaxed) < num_unwired)) {
3379 panic("%s: pmap %p VA [0x%llx, 0x%llx) (ptd info %p) wired count underflow", __func__, pmap,
3380 (unsigned long long)start, (unsigned long long)end, ptd_info);
3381 }
3382 }
3383
3384 /* update ledgers */
3385 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
3386 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
3387 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
3388 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
3389 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
3390 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
3391 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
3392 /* make needed adjustments to phys_footprint */
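		/*
		 * Internal pages and compressed markers count toward phys_footprint unless
		 * they are alternate-accounted (tracked via the alternate_accounting ledgers
		 * above) or reusable, so debit only the non-alt-accounted portions here.
		 */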
3393 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
3394 ((num_internal -
3395 num_alt_internal) +
3396 (num_compressed -
3397 num_alt_compressed)) * pmap_page_size);
3398 }
3399 }
3400
3401
3402 /*
3403 * Remove the given range of addresses
3404 * from the specified map.
3405 *
3406 * It is assumed that the start and end are properly
3407 * rounded to the hardware page size.
3408 */
3409 void
3410 pmap_remove(
3411 pmap_t pmap,
3412 vm_map_address_t start,
3413 vm_map_address_t end)
3414 {
3415 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
3416 }
3417
3418 MARK_AS_PMAP_TEXT vm_map_address_t
3419 pmap_remove_options_internal(
3420 pmap_t pmap,
3421 vm_map_address_t start,
3422 vm_map_address_t end,
3423 int options)
3424 {
3425 vm_map_address_t eva = end;
3426 tt_entry_t *tte_p;
3427 bool unlock = true;
3428
3429 if (__improbable(end < start)) {
3430 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
3431 }
3432 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3433 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
3434 }
3435
3436 validate_pmap_mutable(pmap);
3437
3438 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3439
3440 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
3441 pmap_lock(pmap, lock_mode);
3442
3443 tte_p = pmap_tte(pmap, start);
3444
3445 if ((tte_p == NULL) || ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_FAULT)) {
3446 goto done;
3447 }
3448
3449 assertf(tte_is_table(*tte_p), "%s: invalid TTE %p (0x%llx) for pmap %p va 0x%llx",
3450 __func__, tte_p, (unsigned long long)*tte_p, pmap, (unsigned long long)start);
3451
3452 pmap_remove_range_options(pmap, start, end, options);
3453
3454 if (pmap->type != PMAP_TYPE_USER) {
3455 goto done;
3456 }
3457
3458 uint16_t refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3459 if (__improbable(refcnt == 0)) {
3460 ptd_info_t *ptd_info = ptep_get_info((pt_entry_t*)ttetokv(*tte_p));
3461 os_atomic_inc(&ptd_info->wiredcnt, relaxed); // Prevent someone else from freeing the table if we need to drop the lock
3462 if (!pmap_lock_shared_to_exclusive(pmap)) {
3463 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3464 }
3465 lock_mode = PMAP_LOCK_EXCLUSIVE;
3466 refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3467 if ((os_atomic_dec(&ptd_info->wiredcnt, relaxed) == 0) && (refcnt == 0)) {
3468 /**
3469 * Drain any concurrent retype-sensitive SPTM operations. This is needed to
3470 * ensure that we don't unmap the page table and retype it while those operations
3471 * are still finishing on other CPUs, leading to an SPTM violation. In particular,
3472 * the multipage batched cacheability/attribute update code may issue SPTM calls
3473 * without holding the relevant PVH or pmap locks, so we can't guarantee those
3474 * calls have actually completed despite observing refcnt == 0.
3475 *
3476 * At this point, we CAN guarantee that:
3477 * 1) All prior PTE removals required to produce refcnt == 0 have
3478 * completed and been synchronized for all observers by DSB, and the
3479 * relevant PV list entries removed. Subsequent calls not already in the
3480 * retype epoch will no longer observe these mappings.
3481 * 2) We now hold the pmap lock exclusive, so there will be no further attempt
3482 * to enter mappings in this page table before it is unmapped.
3483 */
3484 pmap_retype_epoch_prepare_drain();
3485 pmap_retype_epoch_drain();
3486 pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr));
3487 unlock = false; // pmap_tte_deallocate() has dropped the lock
3488 }
3489 }
3490 done:
3491 if (unlock) {
3492 pmap_unlock(pmap, lock_mode);
3493 }
3494
3495 return eva;
3496 }
3497
3498 void
3499 pmap_remove_options(
3500 pmap_t pmap,
3501 vm_map_address_t start,
3502 vm_map_address_t end,
3503 int options)
3504 {
3505 vm_map_address_t va;
3506
3507 if (pmap == PMAP_NULL) {
3508 return;
3509 }
3510
3511 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3512
3513 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
3514 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
3515 VM_KERNEL_ADDRHIDE(end));
3516
3517 #if MACH_ASSERT
3518 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
3519 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
3520 pmap, (uint64_t)start, (uint64_t)end);
3521 }
3522 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
3523 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
3524 pmap, (uint64_t)start, (uint64_t)end);
3525 }
3526 #endif
3527
3528 /*
3529 * We allow single-page requests to execute non-preemptibly,
3530 * as it doesn't make sense to sample AST_URGENT for a single-page
3531 * operation, and there are a couple of special use cases that
3532 * require a non-preemptible single-page operation.
3533 */
3534 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
3535 pmap_verify_preemptible();
3536 }
3537
3538 /*
3539 * Invalidate the translation buffer first
3540 */
3541 va = start;
3542 while (va < end) {
3543 vm_map_address_t l;
3544
3545 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
3546 if (l > end) {
3547 l = end;
3548 }
3549
3550 va = pmap_remove_options_internal(pmap, va, l, options);
3551 }
3552
3553 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
3554 }
3555
3556
3557 /*
3558 * Remove phys addr if mapped in specified map
3559 */
3560 void
3561 pmap_remove_some_phys(
3562 __unused pmap_t map,
3563 __unused ppnum_t pn)
3564 {
3565 /* Implement to support working set code */
3566 }
3567
3568 /*
3569 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
3570 * switch a thread onto a new vm_map.
3571 */
3572 void
3573 pmap_switch_user(thread_t thread, vm_map_t new_map)
3574 {
3575 pmap_t new_pmap = new_map->pmap;
3576
3577
3578 thread->map = new_map;
3579 pmap_set_pmap(new_pmap, thread);
3580
}

void
3583 pmap_set_pmap(
3584 pmap_t pmap,
3585 thread_t thread)
3586 {
3587 pmap_switch(pmap, thread);
3588 }
3589
3590 MARK_AS_PMAP_TEXT void
3591 pmap_switch_internal(
3592 pmap_t pmap,
3593 thread_t thread)
3594 {
3595 validate_pmap_mutable(pmap);
3596 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = PMAP_HWASID(pmap);
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap %p with invalid ASID", __func__, pmap);
3600 }
3601
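	/*
	 * Under __ARM_KERNEL_PROTECT__, each address space is assigned a pair of
	 * hardware ASIDs (one for EL0, one for EL1), so shift the hardware ASID
	 * down to recover the allocation index used for the PLRU bookkeeping below.
	 */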
3602 #if __ARM_KERNEL_PROTECT__
3603 asid_index >>= 1;
3604 #endif
3605
3606 if (asid_index > 0) {
3607 pmap_update_plru(asid_index);
3608 }
3609
3610 __unused sptm_return_t sptm_return;
3611 #pragma unused(thread)
3612 if (0) {
3613 } else {
3614 sptm_return = sptm_switch_root(pmap->ttep, 0, 0);
3615 }
3616
3617 #if DEVELOPMENT || DEBUG
3618 if (__improbable(sptm_return & SPTM_SWITCH_ASID_TLBI_FLUSH)) {
3619 os_atomic_inc(&pmap_asid_flushes, relaxed);
3620 }
3621
3622 if (__improbable(sptm_return & SPTM_SWITCH_RCTX_FLUSH)) {
3623 os_atomic_inc(&pmap_speculation_restrictions, relaxed);
3624 }
3625 #endif /* DEVELOPMENT || DEBUG */
3626 }
3627
3628 void
3629 pmap_switch(
3630 pmap_t pmap,
3631 thread_t thread)
3632 {
3633 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
3634 pmap_switch_internal(pmap, thread);
3635 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
3636 }
3637
3638 void
3639 pmap_page_protect(
3640 ppnum_t ppnum,
3641 vm_prot_t prot)
3642 {
3643 pmap_page_protect_options(ppnum, prot, 0, NULL);
3644 }
3645
3646 /**
3647 * Helper function for performing per-mapping accounting following an SPTM disjoint unmap request.
3648 *
3649 * @note [pmap] cannot be the kernel pmap. This is because we do not maintain a ledger in the
3650 * kernel pmap.
3651 *
3652 * @param pmap The pmap that contained the mapping
3653 * @param pai The physical page index mapped by the mapping
3654 * @param is_compressed Indicates whether the operation was an unmap-to-compress vs. a full unmap
3655 * @param is_internal Indicates whether the mapping was for an internal (aka anonymous) VM page
3656 * @param is_altacct Indicates whether the mapping was subject to alternate accounting.
3657 */
3658 static void
3659 pmap_disjoint_unmap_accounting(pmap_t pmap, unsigned int pai, bool is_compressed, bool is_internal, bool is_altacct)
3660 {
3661 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3662 pvh_assert_locked(pai);
3663
3664 assert(pmap != kernel_pmap);
3665
3666 if (is_internal &&
3667 !is_altacct &&
3668 ppattr_test_reusable(pai)) {
3669 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3670 } else if (!is_internal) {
3671 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3672 }
3673
3674 if (is_altacct) {
3675 assert(is_internal);
3676 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3677 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3678 if (is_compressed) {
3679 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3680 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3681 }
3682 } else if (ppattr_test_reusable(pai)) {
3683 assert(is_internal);
3684 if (is_compressed) {
3685 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3686 /* was not in footprint, but is now */
3687 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3688 }
3689 } else if (is_internal) {
3690 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3691
3692 /*
3693 * Update all stats related to physical footprint, which only
3694 * deals with internal pages.
3695 */
3696 if (is_compressed) {
3697 /*
3698 * This removal is only being done so we can send this page to
3699 * the compressor; therefore it mustn't affect total task footprint.
3700 */
3701 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3702 } else {
3703 /*
3704 * This internal page isn't going to the compressor, so adjust stats to keep
3705 * phys_footprint up to date.
3706 */
3707 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3708 }
3709 } else {
3710 /* external page: no impact on ledgers */
3711 }
3712 }
3713
3714 /**
3715 * Helper function for issuing a disjoint unmap request to the SPTM and performing
3716 * related accounting. This function uses the 'prev_ptes' list generated by
3717 * the sptm_unmap_disjoint() call to determine whether said call altered the
3718 * relevant PTEs in a manner that would require accounting updates.
3719 *
3720 * @param pa The physical address against which the disjoint unmap will be issued.
3721 * @param num_mappings The number of disjoint mappings for the SPTM to update.
3722 * The per-CPU sptm_ops array should contain the same number
3723 * of individual disjoint requests.
3724 */
3725 static void
3726 pmap_disjoint_unmap(pmap_paddr_t pa, unsigned int num_mappings)
3727 {
3728 const unsigned int pai = pa_index(pa);
3729
3730 pvh_assert_locked(pai);
3731
3732 assert(num_mappings <= SPTM_MAPPING_LIMIT);
3733
3734 assert(get_preemption_level() > 0);
3735 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
3736
3737 sptm_unmap_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings);
3738
3739 for (unsigned int cur_mapping = 0; cur_mapping < num_mappings; ++cur_mapping) {
3740 pt_entry_t prev_pte = sptm_pcpu->sptm_prev_ptes[cur_mapping];
3741
3742 pt_desc_t * const ptdp = sptm_pcpu->sptm_ptds[cur_mapping];
3743 const pmap_t pmap = ptdp->pmap;
3744
3745 assertf(!pte_is_valid(prev_pte) ||
3746 ((pte_to_pa(prev_pte) & ~PAGE_MASK) == pa), "%s: prev_pte 0x%llx does not map pa 0x%llx",
3747 __func__, (unsigned long long)prev_pte, (unsigned long long)pa);
3748
3749 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3750 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3751
3752 if (pmap != kernel_pmap) {
3753 /*
3754 * If the prior PTE is invalid (which may happen due to a concurrent remove operation),
3755 * the compressed marker won't be written so we shouldn't account the mapping as compressed.
3756 */
3757 const bool is_compressed = (pte_is_valid(prev_pte) &&
3758 ((sptm_pcpu->sptm_ops[cur_mapping].pte_template & ARM_PTE_COMPRESSED_MASK) != 0));
3759 const bool is_internal = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_INTERNAL) != 0;
3760 const bool is_altacct = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_ALTACCT) != 0;
3761
3762 /*
3763 * The rule is that accounting related to PTE contents (wired, PTD refcount)
3764 * must be updated by whoever clears the PTE, while accounting related to physical page
3765 * attributes must be updated by whoever clears the PVE. We therefore always call
3766 * pmap_disjoint_unmap_accounting() here since we're removing the PVE, but only update
3767 * wired/PTD accounting if the prior PTE was valid.
3768 */
3769 pmap_disjoint_unmap_accounting(pmap, pai, is_compressed, is_internal, is_altacct);
3770
3771 if (!pte_is_valid(prev_pte)) {
3772 continue;
3773 }
3774
3775 if (pte_is_wired(prev_pte)) {
3776 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3777 if (__improbable(os_atomic_dec_orig(&sptm_pcpu->sptm_ptd_info[cur_mapping]->wiredcnt, relaxed) == 0)) {
3778 panic("%s: over-unwire of ptdp %p, ptd info %p", __func__,
3779 ptdp, sptm_pcpu->sptm_ptd_info[cur_mapping]);
3780 }
3781 }
3782 }
3783 }
3784 }
3785
3786 /**
3787 * The following two functions, pmap_multipage_op_submit_disjoint() and
3788 * pmap_multipage_op_add_page(), are intended to allow callers to manage batched SPTM
3789 * operations that may span multiple physical pages. They are intended to operate in
3790 * a way that allows callers such as pmap_page_protect_options_with_flush_range() to
3791 * insert mappings into the per-CPU SPTM disjoint ops array in the same manner that
3792 * they would for an ordinary single-page operation.
3793 * Functions such as pmap_page_protect_options_with_flush_range() operate on a single
3794 * physical page but may be passed a non-NULL flush_range object to indicate that the
3795 * call is part of a larger batched operation which may span multiple physical pages.
3796 * In that scenario, these functions are intended to be used as follows:
3797 * 1) Call pmap_multipage_op_add_page() to insert a "header" for the page into the per-
3798 * CPU SPTM ops array. Use the return value from this call as the starting index
3799 * at which to add ordinary mapping entries into the same array.
3800 * 2) Insert sptm_disjoint_op_t entries into the ops array in the normal manner until
3801 * the array is full, the SPTM options required for the upcoming sequence of pages
3802 * need to change, or the current mapping matches flush_range->current_ptep.
3803 * In the latter case, pmap_insert_flush_range_template() may instead be used
3804 * to insert the mapping into the per-CPU SPTM region templates array. See the
3805 * documentation for pmap_insert_flush_range_template() below.
3806 * 3) If the array is full, call pmap_multipage_op_submit_disjoint() and return to step 1).
3807 * 4) If the SPTM options need to change, call pmap_multipage_op_add_page() to insert
3808 * a new header with the updated options and, using the return value as the new
3809 * insertion point for the ops array, resume step 2).
3810 * 5) Upon completion, if there are any pending not-yet-submitted mappings, do not
3811 * submit those mappings to the SPTM as would ordinarily be done for a single-page
3812 * call. These trailing mappings will be submitted as part of the next batch,
3813 * or by the next-higher caller if the range operation is complete.
3814 *
3815 * Note that, as a performance optimization, the caller may track the insertion
 * point in the disjoint ops array locally (i.e. without incrementing
 * flush_range->pending_disjoint_entries on every iteration), as long as it takes care to do the
3818 * following:
3819 * 1) Initialize and update that insertion point as described in steps 1) and 4) above.
3820 * 2) Pass the updated insertion point as the 'pending_disjoint_entries' parameter into the calls
3821 * in steps 3) and 4) above.
3822 * 3) Update flush_range->pending_disjoint_entries with the locally-maintained value along with
3823 * step 5) above.
3824 */
3825
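/*
 * Illustrative sketch (not compiled): one way a caller such as
 * pmap_page_protect_options_with_flush_range() can drive the batching interface
 * described above. The local variables (phys, va, pte_template, sptm_ops,
 * options, num_mappings) are placeholders for whatever state the real caller
 * maintains; the numbered comments refer to the steps above.
 *
 *	unsigned int num_mappings = 0;	// locally-tracked insertion point
 *	for each CPU mapping of the physical page {
 *		// Steps 1) and 4): add a paddr header when starting a new page, or
 *		// when the SPTM options for the upcoming run of mappings change.
 *		if ((num_mappings == 0) || (options changed)) {
 *			if (pmap_multipage_op_add_page(phys, &num_mappings, options, flush_range)) {
 *				// Ops array was full and has been submitted; take any
 *				// post-submission action (e.g. re-enable preemption) and retry.
 *				continue;
 *			}
 *		}
 *		// Step 2): append a disjoint op at the current insertion point.
 *		sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
 *		sptm_ops[num_mappings].vaddr = va;
 *		sptm_ops[num_mappings].pte_template = pte_template;
 *		++num_mappings;
 *		// Step 3): submit once the per-CPU array is full.
 *		if (num_mappings == SPTM_MAPPING_LIMIT) {
 *			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
 *			num_mappings = 0;
 *		}
 *	}
 *	// Step 5): leave any trailing mappings pending for a later batch; just
 *	// resync the flush_range with the locally-tracked count.
 *	flush_range->pending_disjoint_entries = num_mappings;
 */
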
3826 /**
3827 * Submit any pending disjoint multi-page mapping updates to the SPTM.
3828 *
3829 * @note This function must be called with preemption disabled, and will drop
3830 * the preemption-disable count upon submitting to the SPTM.
3831 * @note [pending_disjoint_entries] must include *all* pending entries in the SPTM ops array,
3832 * including physical address "header" entries.
3833 * @note This function automatically updates the per_paddr_header.num_mappings field
3834 * for the most recent physical address header in the SPTM ops array to its final
3835 * value.
3836 *
3837 * @param pending_disjoint_entries The number of not-yet-submitted mappings according to the caller.
3838 * This value may be greater than [flush_range]->pending_disjoint_entries if
3839 * the caller has inserted mappings into the ops array without
3840 * updating [flush_range]->pending_disjoint_entries, in which case this
3841 * function will update [flush_range]->pending_disjoint_entries with the
3842 * caller's value.
3843 * @param flush_range The object tracking the current state of the multipage disjoint
3844 * operation.
3845 */
3846 static inline void
3847 pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tlb_flush_range_t *flush_range)
3848 {
3849 /**
3850 * Reconcile the number of pending entries as tracked by the caller with the
3851 * number of pending entries tracked by flush_range. If the caller's value is
3852 * greater, we assume the caller has inserted locally-tracked mappings into the
3853 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
3854 * assume the caller has no locally-tracked mappings and is simply trying to
3855 * purge any pending mappings from a prior call sequence.
3856 */
3857 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
3858 flush_range->pending_disjoint_entries = pending_disjoint_entries;
3859 } else {
3860 assert(pending_disjoint_entries == 0);
3861 }
3862 if (flush_range->pending_disjoint_entries != 0) {
3863 assert(get_preemption_level() > 0);
3864 /**
3865 * Compute the correct number of mappings for the most recent paddr
3866 * header based on the current position in the SPTM ops array.
3867 */
3868 flush_range->current_header->per_paddr_header.num_mappings =
3869 flush_range->pending_disjoint_entries - flush_range->current_header_first_mapping_index;
3870 const sptm_return_t sptm_return = sptm_update_disjoint_multipage(
3871 PERCPU_GET(pmap_sptm_percpu)->sptm_ops_pa, flush_range->pending_disjoint_entries);
3872
3873 /**
3874 * We may be submitting the batch and exiting the epoch partway through
3875 * processing the PV list for a page. That's fine, because in that case we'll
3876 * hold the PV lock for that page, which will prevent mappings of that page from
3877 * being disconnected and will prevent the completion of pmap_remove() against
3878 * any of those mappings, thus also guaranteeing the relevant page table pages
3879 * can't be freed. The epoch still protects mappings for any prior page in
3880 * the batch, whose PV locks are no longer held.
3881 */
3882 pmap_retype_epoch_exit();
3883 enable_preemption();
3884 if (flush_range->pending_region_entries != 0) {
3885 flush_range->processed_entries += flush_range->pending_disjoint_entries;
3886 } else {
3887 flush_range->processed_entries = 0;
3888 }
3889 flush_range->pending_disjoint_entries = 0;
3890 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
3891 flush_range->ptfr_flush_needed = true;
3892 }
3893 }
3894 }
3895
3896 /**
3897 * Insert a new physical address "header" entry into the per-CPU SPTM ops array for a
3898 * multi-page SPTM operation. It is expected that the caller will subsequently add
3899 * mapping entries for this physical address into the array.
3900 *
3901 * @note This function will disable preemption upon creation of the first paddr header
3902 * (index 0 in the per-CPU SPTM ops array) and it is expected that
3903 * pmap_multipage_op_submit() will subsequently be called on the same CPU.
3904 * @note Before inserting the new header, this function automatically updates the
3905 * per_paddr_header.num_mappings field for the previous physical address header
3906 * (if present) in the SPTM ops array to its final value.
3907 *
3908 * @param phys The physical address for which to insert a header entry.
3909 * @param inout_pending_disjoint_entries
3910 * [input] The number of not-yet-submitted mappings according to the caller.
3911 * This value may be greater than [flush_range]->pending_disjoint_entries if
3912 * the caller has inserted mappings into the ops array without
3913 * updating [flush_range]->pending_disjoint_entries, in which case this
3914 * function will update [flush_range]->pending_disjoint_entries with the
3915 * caller's value.
3916 * [output] Returns the starting index at which the caller should insert mapping
3917 * entries into the per-CPU SPTM ops array.
3918 * @param sptm_update_options SPTM_UPDATE_* flags to pass to the SPTM call.
3919 * SPTM_UPDATE_SKIP_PAPT is automatically inserted by this
3920 * function.
3921 * @param flush_range The object tracking the current state of the multipage operation.
3922 *
 * @return True if the pending disjoint operations were submitted to the SPTM due to the ops array already
3924 * being full, false otherwise. In the former case, the new header will not be added
3925 * to the array; the caller will need to re-invoke this function after taking any
3926 * necessary post-submission action (such as enabling preemption).
3927 */
3928 static inline bool
3929 pmap_multipage_op_add_page(
3930 pmap_paddr_t phys,
3931 unsigned int *inout_pending_disjoint_entries,
3932 uint32_t sptm_update_options,
3933 pmap_tlb_flush_range_t *flush_range)
3934 {
3935 unsigned int pending_disjoint_entries = *inout_pending_disjoint_entries;
3936
3937 /**
3938 * Reconcile the number of pending entries as tracked by the caller with the
3939 * number of pending entries tracked by flush_range. If the caller's value is
3940 * greater, we assume the caller has inserted locally-tracked mappings into the
3941 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
3942 * assume the caller has no locally-tracked mappings and is adding its paddr
3943 * header for the first time.
3944 */
3945 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
3946 flush_range->pending_disjoint_entries = pending_disjoint_entries;
3947 } else {
3948 assert(pending_disjoint_entries == 0);
3949 }
3950 if (flush_range->pending_disjoint_entries >= (SPTM_MAPPING_LIMIT - 1)) {
3951 /**
3952 * If the SPTM ops array is either full or only has space for the paddr
3953 * header, there won't be room for mapping entries, so submit the pending
3954 * mappings to the SPTM now, and return to allow the caller to take
3955 * any necessary post-submission action.
3956 */
3957 pmap_multipage_op_submit_disjoint(pending_disjoint_entries, flush_range);
3958 *inout_pending_disjoint_entries = 0;
3959 return true;
3960 }
3961 pending_disjoint_entries = flush_range->pending_disjoint_entries;
3962
3963 sptm_update_options |= SPTM_UPDATE_SKIP_PAPT;
3964 if (pending_disjoint_entries == 0) {
3965 disable_preemption();
3966 /**
3967 * Enter the retype epoch while we gather the disjoint update arguments
3968 * and issue the SPTM call. Since this operation may cover multiple physical
3969 * pages, we may construct the argument array and invoke the SPTM without holding
3970 * all relevant PVH locks or pmap locks. We therefore need to record that we are
3971 * collecting and modifying mapping state so that e.g. pmap_page_protect() does
3972 * not attempt to retype the underlying pages and pmap_remove() does not attempt
3973 * to free the page tables used for these mappings without first draining our epoch.
3974 */
3975 pmap_retype_epoch_enter();
3976 flush_range->pending_disjoint_entries = 1;
3977 } else {
3978 /**
3979 * Before inserting the new header, update the prior header's number
3980 * of paddr-specific mappings to its final value.
3981 */
3982 assert(flush_range->current_header != NULL);
3983 flush_range->current_header->per_paddr_header.num_mappings =
3984 pending_disjoint_entries - flush_range->current_header_first_mapping_index;
3985 }
3986 sptm_disjoint_op_t *sptm_ops = PERCPU_GET(pmap_sptm_percpu)->sptm_ops;
3987 flush_range->current_header = (sptm_update_disjoint_multipage_op_t*)&sptm_ops[pending_disjoint_entries];
3988 flush_range->current_header_first_mapping_index = ++pending_disjoint_entries;
3989 flush_range->current_header->per_paddr_header.paddr = phys;
3990 flush_range->current_header->per_paddr_header.num_mappings = 0;
3991 flush_range->current_header->per_paddr_header.options = sptm_update_options;
3992
3993 *inout_pending_disjoint_entries = pending_disjoint_entries;
3994 return false;
3995 }
3996
3997 /**
3998 * The following two functions, pmap_multipage_op_submit_region() and
3999 * pmap_insert_flush_range_template(), are meant to be used in a similar fashion
4000 * to pmap_multipage_op_submit_disjoint() and pmap_multipage_op_add_page(),
4001 * but for the specific case in which a given mapping within a PV list happens
4002 * to map the current VA within a VA region being operated on by
4003 * phys_attribute_clear_range(). This allows the pmap to further optimize
4004 * the SPTM calls by using sptm_update_region() to modify all mappings within
4005 * the VA region, which requires far fewer table walks than a disjoint operation.
4006 * Since the starting VA of the region, the owning pmap, and the insertion point
4007 * within the per-CPU region templates array are already known, these functions
4008 * don't require the special "header" entry or the complex array position tracking
4009 * of their disjoint equivalents above.
4010 * Note that these functions may be used together with the disjoint functions above;
4011 * these functions can be used for the "primary" mappings corresponding to the VA
4012 * region being manipulated by the VM layer, while the disjoint functions can be
4013 * used for any alias mappings of the underlying pages which fall outside that
4014 * VA region.
4015 */
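
/*
 * Illustrative sketch (not compiled): the per-mapping decision between the
 * region path and the disjoint path described above. The variable names are
 * placeholders; the real logic lives in
 * pmap_page_protect_options_with_flush_range() below.
 *
 *	if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
 *		// Mapping sits at the flush range's current VA: queue a region template.
 *		if (pmap_insert_flush_range_template(pte_template, flush_range)) {
 *			// Region array filled up and was submitted; submit any pending
 *			// disjoint ops as well so that preemption can briefly be re-enabled.
 *			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
 *		}
 *	} else {
 *		// Alias mapping outside the VA region (or owned by a different pmap):
 *		// fall back to adding a disjoint op entry for this mapping.
 *	}
 */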
4016
4017 /**
4018 * Submit any pending region-based templates for the specified flush_range.
4019 *
4020 * @note This function must be called with preemption disabled, and will drop
4021 * the preemption-disable count upon submitting to the SPTM.
4022 *
4023 * @param flush_range The object tracking the current state of the region operation.
4024 */
4025 static inline void
4026 pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range)
4027 {
4028 if (flush_range->pending_region_entries != 0) {
4029 assert(get_preemption_level() > 0);
4030 pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED);
4031 /**
4032 * If there are any pending disjoint entries, we're already in a retype epoch.
4033 * For disjoint entries, we need to hold the epoch during the entire time we
4034 * construct the disjoint ops array because those ops may point to some arbitrary
4035 * pmap and we need to ensure the relevant page tables and even the pmap itself
4036 * aren't concurrently reclaimed while our ops array points to them.
4037 * But for a region op like this, we know we already hold the relevant pmap lock
4038 * so none of the above can happen concurrently. We therefore only need to hold
4039 * the epoch across the SPTM call itself to prevent a concurrent unmap operation
4040 * from attempting to retype the mapped pages while our SPTM call has them in-
4041 * flight.
4042 */
4043 if (flush_range->pending_disjoint_entries == 0) {
4044 pmap_retype_epoch_enter();
4045 }
4046 const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep,
4047 flush_range->pending_region_start, flush_range->pending_region_entries,
4048 PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa,
4049 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI);
4050 if (flush_range->pending_disjoint_entries == 0) {
4051 pmap_retype_epoch_exit();
4052 }
4053 enable_preemption();
4054 if (flush_range->pending_disjoint_entries != 0) {
4055 flush_range->processed_entries += flush_range->pending_region_entries;
4056 } else {
4057 flush_range->processed_entries = 0;
4058 }
4059 flush_range->pending_region_start += (flush_range->pending_region_entries <<
4060 pmap_get_pt_attr(flush_range->ptfr_pmap)->pta_page_shift);
4061 flush_range->pending_region_entries = 0;
4062 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4063 flush_range->ptfr_flush_needed = true;
4064 }
4065 }
4066 }
4067
4068 /**
4069 * Insert a PTE template into the per-CPU SPTM region ops array.
4070 * This is meant to be used as a performance optimization for the case in which a given
4071 * mapping being processed by a function such as pmap_page_protect_options_with_flush_range()
4072 * happens to map the current iteration position within [flush_range]'s VA region.
4073 * In this case the mapping can be inserted as a region-based template rather than a disjoint
4074 * operation as would be done in the general case. The idea is that region-based SPTM
4075 * operations are significantly less expensive than disjoint operations, because each region
4076 * operation only requires a single page table walk at the beginning vs. a table walk for
4077 * each mapping in the disjoint case. Since the majority of mappings processed by a flush
4078 * range operation belong to the main flush range VA region (i.e. alias mappings outside
4079 * the region are less common), the performance improvement can be significant.
4080 *
4081 * @note This function will disable preemption upon inserting the first entry into the
4082 * per-CPU templates array, and will re-enable preemption upon submitting the region
4083 * operation to the SPTM.
4084 *
4085 * @param template The PTE template to insert into the per-CPU templates array.
4086 * @param flush_range The object tracking the current state of the region operation.
4087 *
4088 * @return True if the region operation was submitted to the SPTM, false otherwise.
4089 */
4090 static inline bool
4091 pmap_insert_flush_range_template(pt_entry_t template, pmap_tlb_flush_range_t *flush_range)
4092 {
4093 if (flush_range->pending_region_entries == 0) {
4094 disable_preemption();
4095 }
4096 flush_range->region_entry_added = true;
4097 PERCPU_GET(pmap_sptm_percpu)->sptm_templates[flush_range->pending_region_entries++] = template;
4098 if (flush_range->pending_region_entries == SPTM_MAPPING_LIMIT) {
4099 pmap_multipage_op_submit_region(flush_range);
4100 return true;
4101 }
4102 return false;
4103 }
4104
4105 /**
4106 * Wrapper function for submitting any pending operations, region-based or disjoint,
4107 * tracked by a flush range object. This is meant to be used by the top-level caller that
4108 * iterates over the flush range's VA region and calls functions such as
4109 * pmap_page_protect_options_with_flush_range() or arm_force_fast_fault_with_flush_range()
4110 * to construct the relevant SPTM operations arrays.
4111 *
4112 * @param flush_range The object tracking the current state of region and/or disjoint operations.
4113 */
4114 static inline void
4115 pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range)
4116 {
4117 pmap_multipage_op_submit_disjoint(0, flush_range);
4118 pmap_multipage_op_submit_region(flush_range);
4119 }
4120
4121 /**
4122 * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range()
4123 * is removing/updating all mappings in preparation for a retype operation. In this case
4124 * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held
 * by the caller, and will perform the necessary retype epoch drain prior to returning.
4126 */
4127 #define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000
4128 _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK,
4129 "PMAP_OPTIONS_PPO_PENDING_RETYPE outside reserved encoding space");
4130
4131 /**
4132 * Lower the permission for all mappings to a given page. If VM_PROT_NONE is specified,
4133 * the mappings will be removed.
4134 *
4135 * @param ppnum Page number to lower the permission of.
4136 * @param prot The permission to lower to.
4137 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
4138 * PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is
 * already locked and a retype epoch drain should be performed.
4140 * PMAP_OPTIONS_COMPRESSOR indicates the function is called by the
4141 * VM compressor.
4142 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
4143 * by the caller. This is an input/output parameter which may be updated
4144 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
4145 * @param flush_range When present, this function will skip the TLB flush for the
4146 * mappings that are covered by the range, leaving that to be
4147 * done later by the caller. It may also avoid submitting mapping
4148 * updates directly to the SPTM, instead accumulating them in a
4149 * per-CPU array to be submitted later by the caller.
4150 *
4151 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4152 */
4153 MARK_AS_PMAP_TEXT static void
4154 pmap_page_protect_options_with_flush_range(
4155 ppnum_t ppnum,
4156 vm_prot_t prot,
4157 unsigned int options,
4158 locked_pvh_t *locked_pvh,
4159 pmap_tlb_flush_range_t *flush_range)
4160 {
4161 pmap_paddr_t phys = ptoa(ppnum);
4162 locked_pvh_t local_locked_pvh = {.pvh = 0};
4163 pv_entry_t *pve_p = NULL;
4164 pv_entry_t *pveh_p = NULL;
4165 pv_entry_t *pvet_p = NULL;
4166 pt_entry_t *pte_p = NULL;
4167 pv_entry_t *new_pve_p = NULL;
4168 pt_entry_t *new_pte_p = NULL;
4169
4170 bool remove = false;
4171 unsigned int pvh_cnt = 0;
4172 unsigned int num_mappings = 0, num_skipped_mappings = 0;
4173
4174 assert(ppnum != vm_page_fictitious_addr);
4175
4176 /**
4177 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4178 *
 * PMAP_OPTIONS_NOFLUSH indicates there is no need to flush the TLB at all during the operation, while
 * flush_range indicates the caller requests deferral of the TLB flush. Fundamentally, the two
 * semantics conflict with each other, so assert that they are not both specified.
4182 */
4183 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
4184
4185 /* Only work with managed pages. */
4186 if (!pa_valid(phys)) {
4187 return;
4188 }
4189
4190 /*
4191 * Determine the new protection.
4192 */
4193 switch (prot) {
4194 case VM_PROT_ALL:
4195 return; /* nothing to do */
4196 case VM_PROT_READ:
4197 case VM_PROT_READ | VM_PROT_EXECUTE:
4198 break;
4199 default:
4200 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4201 options = options & ~PMAP_OPTIONS_NOFLUSH;
4202 remove = true;
4203 break;
4204 }
4205
4206 /**
4207 * We don't support cross-page batching (indicated by flush_range being non-NULL) for removals,
4208 * as removals must use the SPTM prev_ptes array for accounting, which isn't supported for cross-
4209 * page batches.
4210 */
4211 assert((flush_range == NULL) || !remove);
4212
4213 unsigned int pai = pa_index(phys);
4214 if (__probable(locked_pvh == NULL)) {
4215 if (flush_range != NULL) {
4216 /**
4217 * If we're partway through processing a multi-page batched call,
4218 * preemption will already be disabled so we can't simply call
4219 * pvh_lock() which may block. Instead, we first try to acquire
4220 * the lock without waiting, which in most cases should succeed.
4221 * If it fails, we submit the pending batched operations to re-
4222 * enable preemption and then acquire the lock normally.
4223 */
4224 local_locked_pvh = pvh_try_lock(pai);
4225 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
4226 pmap_multipage_op_submit(flush_range);
4227 local_locked_pvh = pvh_lock(pai);
4228 }
4229 } else {
4230 local_locked_pvh = pvh_lock(pai);
4231 }
4232 } else {
4233 local_locked_pvh = *locked_pvh;
4234 assert(pai == local_locked_pvh.pai);
4235 }
4236 assert(local_locked_pvh.pvh != 0);
4237 pvh_assert_locked(pai);
4238
4239 bool pvh_lock_sleep_mode_needed = false;
4240
4241 /*
4242 * PVH should be locked before accessing per-CPU data, as we're relying on the lock
4243 * to disable preemption.
4244 */
4245 pmap_cpu_data_t *pmap_cpu_data = NULL;
4246 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4247 sptm_disjoint_op_t *sptm_ops = NULL;
4248 pt_desc_t **sptm_ptds = NULL;
4249 ptd_info_t **sptm_ptd_info = NULL;
4250
4251 /* BEGIN IGNORE CODESTYLE */
4252
4253 /**
4254 * This would also work as a block, with the above variables declared using the
4255 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
4256 * dereferencing __block variables through stack forwarding pointers) isn't needed
4257 * here, as we never need to use this code sequence as a closure.
4258 */
4259 #define PPO_PERCPU_INIT() do { \
4260 disable_preemption(); \
4261 pmap_cpu_data = pmap_get_cpu_data(); \
4262 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
4263 sptm_ops = sptm_pcpu->sptm_ops; \
4264 sptm_ptds = sptm_pcpu->sptm_ptds; \
4265 sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \
4266 if (remove) { \
4267 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed); \
4268 /* \
4269 * Ensure the store to inflight_disconnect will be observed before any of the
4270 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4271 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4272 * another CPU, in between this function's clearing a PTE and dropping the
4273 * corresponding pagetable refcount. That can lead to a panic if the
4274 * destroying thread observes a non-zero refcount. For this we need a store-
4275 * store barrier; a store-release operation would not be sufficient.
4276 */ \
4277 os_atomic_thread_fence(release); \
4278 } \
4279 } while (0)
4280
4281 /* END IGNORE CODESTYLE */
4282
4283
4284 PPO_PERCPU_INIT();
4285
4286 pv_entry_t **pve_pp = NULL;
4287
4288 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
4289 pte_p = pvh_ptep(local_locked_pvh.pvh);
4290 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4291 pve_p = pvh_pve_list(local_locked_pvh.pvh);
4292 pveh_p = pve_p;
4293 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
4294 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
4295 }
4296
4297 int pve_ptep_idx = 0;
4298 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4299
4300 /*
4301 * We need to keep track of whether a particular PVE list contains IOMMU
4302 * mappings when removing entries, because we should only remove CPU
4303 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4304 * it around.
4305 */
4306 bool iommu_mapping_in_pve = false;
4307
4308 /**
4309 * With regard to TLBI, there are three cases:
4310 *
4311 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
4312 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
4313 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
4314 * mapping is out of the range.
4315 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
4316 * let SPTM handle TLBI flushing.
4317 */
4318 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
4319 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
4320
4321 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4322 if (__improbable(pvh_lock_sleep_mode_needed)) {
4323 assert((num_mappings == 0) && (num_skipped_mappings == 0));
4324 if (remove) {
4325 /**
4326 * Clear the in-flight disconnect indicator for the current CPU, as we've
4327 * already submitted any prior pending SPTM operations, and we're about to
4328 * briefly re-enable preemption which may cause this thread to be migrated.
4329 */
4330 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
4331 }
4332 /**
 * Undo the explicit preemption disable done in the last call to PPO_PERCPU_INIT().
4334 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
4335 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
4336 * core while processing SPTM per-CPU data. At the same time, we also want preemption
4337 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
4338 * urgent ASTs can be handled.
4339 */
4340 enable_preemption();
4341 pvh_lock_enter_sleep_mode(&local_locked_pvh);
4342 pvh_lock_sleep_mode_needed = false;
4343 PPO_PERCPU_INIT();
4344 }
4345
4346 if (pve_p != PV_ENTRY_NULL) {
4347 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4348 if (pte_p == PT_ENTRY_NULL) {
4349 goto protect_skip_pve;
4350 }
4351 }
4352
4353 #ifdef PVH_FLAG_IOMMU
4354 if (pvh_ptep_is_iommu(pte_p)) {
4355 iommu_mapping_in_pve = true;
4356 if (__improbable(remove && (options & PMAP_OPTIONS_COMPRESSOR))) {
4357 const iommu_instance_t iommu = ptep_get_iommu(pte_p);
4358 panic("%s: attempt to compress ppnum 0x%x owned by iommu driver "
4359 "%u (token: %#x), pve_p=%p", __func__, ppnum, GET_IOMMU_ID(iommu),
4360 GET_IOMMU_TOKEN(iommu), pve_p);
4361 }
4362 if (remove && (pve_p == PV_ENTRY_NULL)) {
4363 /*
4364 * We've found an IOMMU entry and it's the only entry in the PV list.
4365 * We don't discard IOMMU entries, so simply set up the new PV list to
4366 * contain the single IOMMU PTE and exit the loop.
4367 */
4368 new_pte_p = pte_p;
4369 break;
4370 }
4371 ++num_skipped_mappings;
4372 goto protect_skip_pve;
4373 }
4374 #endif
4375
4376 const pt_entry_t spte = os_atomic_load(pte_p, relaxed);
4377
4378 if (__improbable(!remove && !pte_is_valid(spte))) {
4379 ++num_skipped_mappings;
4380 goto protect_skip_pve;
4381 }
4382
4383 pt_desc_t *ptdp = NULL;
4384 pmap_t pmap = NULL;
4385 vm_map_address_t va = 0;
4386
4387 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
4388 /**
4389 * If the current mapping matches the flush range's current iteration position,
4390 * there's no need to do the work of getting the PTD. We already know the pmap,
4391 * and the VA is implied by flush_range->pending_region_start.
4392 */
4393 pmap = flush_range->ptfr_pmap;
4394 } else {
4395 ptdp = ptep_get_ptd(pte_p);
4396 pmap = ptdp->pmap;
4397 va = ptd_get_va(ptdp, pte_p);
4398 }
4399
4400 /**
4401 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
4402 * pending disjoint ops, so we don't need to do flush range disjoint op management.
4403 */
4404 if ((flush_range != NULL) && (ptdp != NULL)) {
4405 /**
4406 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
4407 * We do this in three cases:
4408 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
4409 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
4410 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
4411 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
4412 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
4413 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
4414 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
4415 */
4416 uint32_t per_mapping_sptm_update_options = sptm_update_options;
4417 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4418 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
4419 }
4420 if ((num_mappings == 0) ||
4421 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
4422 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
4423 /**
4424 * If we needed to submit the pending disjoint ops to make room for the new page,
4425 * flush any pending region ops to reenable preemption and restart the loop with
4426 * the lock in sleep mode. This prevents preemption from being held disabled
4427 * for an arbitrary amount of time in the pathological case in which we have
4428 * both pending region ops and an excessively long PV list that repeatedly
4429 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
4430 */
4431 pmap_multipage_op_submit_region(flush_range);
4432 assert(num_mappings == 0);
4433 num_skipped_mappings = 0;
4434 pvh_lock_sleep_mode_needed = true;
4435 continue;
4436 }
4437 }
4438 }
4439
4440 if (__improbable((pmap == NULL) ||
4441 (pte_is_valid(spte) && (atop(pte_to_pa(spte)) != ppnum)))) {
4442 #if MACH_ASSERT
4443 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4444 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4445 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4446 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4447
4448 pv_entry_t *check_pvep = pve_p;
4449
4450 do {
4451 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4452 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4453 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, (void*)local_locked_pvh.pvh, pve_p, pai);
4454 }
4455 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4456
4457 /* Restore previous PTEP value. */
4458 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4459 }
4460 #endif
4461 panic("%s: bad PVE pte_p=%p pmap=%p prot=%d options=%u, pvh=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4462 __func__, pte_p, pmap, prot, options, (void*)local_locked_pvh.pvh, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4463 }
4464
4465 pt_entry_t pte_template = ARM_PTE_EMPTY;
4466
4467 if (ptdp != NULL) {
4468 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
4469 sptm_ops[num_mappings].vaddr = va;
4470 }
4471
4472 /* Remove the mapping if new protection is NONE */
4473 if (remove) {
4474 sptm_ptds[num_mappings] = ptdp;
4475 sptm_ptd_info[num_mappings] = ptd_get_info(ptdp);
4476 sptm_pcpu->sptm_acct_flags[num_mappings] = 0;
4477 if (pmap != kernel_pmap) {
4478 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4479 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4480
4481 if (is_internal) {
4482 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_INTERNAL;
4483 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4484 }
4485 if (is_altacct) {
4486 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_ALTACCT;
4487 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4488 }
4489 if (compress && is_internal) {
4490 pte_template = ARM_PTE_COMPRESSED;
4491 if (is_altacct) {
4492 pte_template |= ARM_PTE_COMPRESSED_ALT;
4493 }
4494 }
4495 }
4496 /* Remove this CPU mapping from PVE list. */
4497 if (pve_p != PV_ENTRY_NULL) {
4498 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4499 }
4500 } else {
4501 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4502
4503 if (pmap == kernel_pmap) {
4504 pte_template = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4505 } else {
4506 pte_template = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4507 }
4508
4509 /*
 * We must at least clear the 'was writeable' flag, since we are revoking write access,
4511 * meaning that the VM is effectively requesting that subsequent write accesses to these mappings
4512 * go through vm_fault() instead of being handled by arm_fast_fault().
4513 */
4514 pte_set_was_writeable(pte_template, false);
4515
4516 /*
4517 * While the naive implementation of this would serve to add execute
4518 * permission, this is not how the VM uses this interface, or how
4519 * x86_64 implements it. So ignore requests to add execute permissions.
4520 */
4521 #if DEVELOPMENT || DEBUG
4522 if ((!(prot & VM_PROT_EXECUTE) && nx_enabled && pmap->nx_enabled) ||
4523 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4524 #else
4525 if (!(prot & VM_PROT_EXECUTE) ||
4526 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4527 #endif
4528 {
4529 pte_template |= pt_attr_leaf_xn(pt_attr);
4530 }
4531 }
4532
4533 if (ptdp != NULL) {
4534 sptm_ops[num_mappings].pte_template = pte_template;
4535 ++num_mappings;
4536 } else if (pmap_insert_flush_range_template(pte_template, flush_range)) {
4537 /**
4538 * We submit both the pending disjoint and pending region ops whenever
4539 * either category reaches the mapping limit. Having pending operations
4540 * in either category will keep preemption disabled, and we want to ensure
4541 * that we can at least temporarily re-enable preemption roughly every
4542 * SPTM_MAPPING_LIMIT mappings.
4543 */
4544 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4545 pvh_lock_sleep_mode_needed = true;
4546 num_mappings = num_skipped_mappings = 0;
4547 }
4548
4549 protect_skip_pve:
4550 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
4551 if (flush_range != NULL) {
4552 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
4553 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4554 pmap_multipage_op_submit_region(flush_range);
4555 } else if (num_mappings > 0) {
4556 if (remove) {
4557 pmap_disjoint_unmap(phys, num_mappings);
4558 } else {
4559 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4560 }
4561 }
4562 pvh_lock_sleep_mode_needed = true;
4563 num_mappings = num_skipped_mappings = 0;
4564 }
4565 pte_p = PT_ENTRY_NULL;
4566 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4567 pve_ptep_idx = 0;
4568
4569 if (remove) {
4570 /**
4571 * If there are any IOMMU mappings in the PVE list, preserve
4572 * those mappings in a new PVE list (new_pve_p) which will later
4573 * become the new PVH entry. Keep track of the CPU mappings in
4574 * pveh_p/pvet_p so they can be deallocated later.
4575 */
4576 if (iommu_mapping_in_pve) {
4577 iommu_mapping_in_pve = false;
4578 pv_entry_t *temp_pve_p = pve_next(pve_p);
4579 pve_remove(&local_locked_pvh, pve_pp, pve_p);
4580 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4581 pveh_p = pvh_pve_list(local_locked_pvh.pvh);
4582 } else {
4583 assert(pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL));
4584 pveh_p = PV_ENTRY_NULL;
4585 }
4586 pve_p->pve_next = new_pve_p;
4587 new_pve_p = pve_p;
4588 pve_p = temp_pve_p;
4589 continue;
4590 } else {
4591 pvet_p = pve_p;
4592 pvh_cnt++;
4593 }
4594 }
4595
4596 pve_pp = pve_next_ptr(pve_p);
4597 pve_p = pve_next(pve_p);
4598 iommu_mapping_in_pve = false;
4599 }
4600 }
4601
4602 if (num_mappings != 0) {
4603 if (remove) {
4604 pmap_disjoint_unmap(phys, num_mappings);
4605 } else if (flush_range == NULL) {
4606 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4607 } else {
4608 /* Resync the pending mapping state in flush_range with our local state. */
4609 assert(num_mappings >= flush_range->pending_disjoint_entries);
4610 flush_range->pending_disjoint_entries = num_mappings;
4611 }
4612 }
4613
4614 if (remove) {
4615 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
4616 }
4617
4618 /**
4619 * Undo the explicit disable_preemption() done in PPO_PERCPU_INIT().
4620 * Note that enable_preemption() decrements a per-thread counter, so if
4621 * we happen to still hold the PVH lock in spin mode then preemption won't
4622 * actually be re-enabled until we drop the lock (which also decrements
4623 * the per-thread counter).
4624 */
4625 enable_preemption();
4626
4627 /* if we removed a bunch of entries, take care of them now */
4628 if (remove) {
4629 /**
4630 * If we (or our caller as indicated by PMAP_OPTIONS_PPO_PENDING_RETYPE) will
4631 * be retyping the page, we need to drain the epochs to ensure that concurrent
4632 * calls to batched operations such as pmap_remove() and the various multipage
4633 * attribute update functions have finished consuming mappings of this page.
4634 */
4635 const bool needs_retyping = pmap_prepare_unmapped_page_for_retype(phys);
4636 if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !needs_retyping) {
4637 /**
4638 * pmap_prepare_unmapped_page_for_retype() will only return true if
4639 * the page belongs to a certain set of types that need to be auto-
4640 * retyped back to XNU_DEFAULT when they are unmapped. But if the
4641 * caller indicated that it's going to retype the page, we need
4642 * to drain the epochs regardless of the current page type.
4643 */
4644 pmap_retype_epoch_prepare_drain();
4645 }
4646 if (new_pve_p != PV_ENTRY_NULL) {
4647 pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP);
4648 } else if (new_pte_p != PT_ENTRY_NULL) {
4649 pvh_update_head(&local_locked_pvh, new_pte_p, PVH_TYPE_PTEP);
4650 } else {
4651 pvh_set_flags(&local_locked_pvh, 0);
4652 pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
4653 }
4654
4655 /* If removing the last mapping to a specially-protected page, retype the page back to XNU_DEFAULT. */
4656 const bool retype_needed = pmap_retype_unmapped_page(phys);
4657 if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !retype_needed) {
4658 pmap_retype_epoch_drain();
4659 }
4660 }
4661
4662 if (__probable(locked_pvh == NULL)) {
4663 pvh_unlock(&local_locked_pvh);
4664 } else {
4665 *locked_pvh = local_locked_pvh;
4666 }
4667
4668 if (remove && (pvet_p != PV_ENTRY_NULL)) {
4669 assert(pveh_p != PV_ENTRY_NULL);
4670 pv_list_free(pveh_p, pvet_p, pvh_cnt);
4671 }
4672
4673 if ((flush_range != NULL) && !preemption_enabled()) {
4674 flush_range->processed_entries += num_skipped_mappings;
4675 }
4676 }
4677
4678 MARK_AS_PMAP_TEXT void
4679 pmap_page_protect_options_internal(
4680 ppnum_t ppnum,
4681 vm_prot_t prot,
4682 unsigned int options,
4683 void *arg)
4684 {
4685 if (arg != NULL) {
4686 /*
4687 * This is a legacy argument from the pre-ARM era that the VM layer passes in to hint that it will call
4688 * pmap_flush() later to flush the TLB. On ARM platforms, however, pmap_flush() is not implemented,
4689 * as it's typically more efficient to perform the TLB flushing inline with the page table updates
4690 * themselves. Therefore, if the argument is non-NULL, pmap will take care of TLB flushing itself
4691 * by clearing PMAP_OPTIONS_NOFLUSH.
4692 */
4693 options &= ~PMAP_OPTIONS_NOFLUSH;
4694 }
4695 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL, NULL);
4696 }
4697
4698 void
4699 pmap_page_protect_options(
4700 ppnum_t ppnum,
4701 vm_prot_t prot,
4702 unsigned int options,
4703 void *arg)
4704 {
4705 pmap_paddr_t phys = ptoa(ppnum);
4706
4707 assert(ppnum != vm_page_fictitious_addr);
4708
4709 /* Only work with managed pages. */
4710 if (!pa_valid(phys)) {
4711 return;
4712 }
4713
4714 /*
4715 * Determine the new protection.
4716 */
4717 if (prot == VM_PROT_ALL) {
4718 return; /* nothing to do */
4719 }
4720
4721 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
4722
4723 pmap_page_protect_options_internal(ppnum, prot, options, arg);
4724
4725 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
4726 }
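/*
 * For example (illustrative values), downgrading every existing mapping of a
 * managed page to read-only versus disconnecting the page entirely:
 *
 *     pmap_page_protect_options(pn, VM_PROT_READ, 0, NULL);   // strip write access
 *     pmap_page_protect_options(pn, VM_PROT_NONE, 0, NULL);   // remove all mappings
 *
 * Passing VM_PROT_ALL is a no-op, since this interface only ever reduces
 * permissions and ignores requests to add execute access.
 */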
4727
4728
4729 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
4730 MARK_AS_PMAP_TEXT void
4731 pmap_disable_user_jop_internal(pmap_t pmap)
4732 {
4733 if (pmap == kernel_pmap) {
4734 panic("%s: called with kernel_pmap", __func__);
4735 }
4736 validate_pmap_mutable(pmap);
4737 sptm_configure_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_JOP);
4738 pmap->disable_jop = true;
4739 }
4740
4741 void
4742 pmap_disable_user_jop(pmap_t pmap)
4743 {
4744 pmap_disable_user_jop_internal(pmap);
4745 }
4746 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
4747
4748 /*
4749 * Indicates whether the pmap layer enforces some additional restrictions on the
4750 * given set of protections.
4751 */
4752 bool
4753 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
4754 {
4755 return false;
4756 }
4757
4758 /*
4759 * Set the physical protection on the
4760 * specified range of this map as requested.
4761 * VERY IMPORTANT: Will not increase permissions.
4762 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
4763 */
4764 void
4765 pmap_protect(
4766 pmap_t pmap,
4767 vm_map_address_t b,
4768 vm_map_address_t e,
4769 vm_prot_t prot)
4770 {
4771 pmap_protect_options(pmap, b, e, prot, 0, NULL);
4772 }
4773
4774 static bool
4775 pmap_protect_strong_sync(unsigned int num_mappings __unused)
4776 {
4777 return false;
4778 }
4779
4780 MARK_AS_PMAP_TEXT vm_map_address_t
4781 pmap_protect_options_internal(
4782 pmap_t pmap,
4783 vm_map_address_t start,
4784 vm_map_address_t end,
4785 vm_prot_t prot,
4786 unsigned int options,
4787 __unused void *args)
4788 {
4789 pt_entry_t *pte_p;
4790 bool set_NX = true;
4791 bool set_XO = false;
4792 bool should_have_removed = false;
4793 bool need_strong_sync = false;
4794
4795 /* Validate the pmap input before accessing its data. */
4796 validate_pmap_mutable(pmap);
4797
4798 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4799
4800 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
4801 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4802 }
4803
4804 #if DEVELOPMENT || DEBUG
4805 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
4806 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
4807 should_have_removed = true;
4808 }
4809 } else
4810 #endif
4811 {
4812 /* Determine the new protection. */
4813 switch (prot) {
4814 case VM_PROT_EXECUTE:
4815 set_XO = true;
4816 OS_FALLTHROUGH;
4817 case VM_PROT_READ:
4818 case VM_PROT_READ | VM_PROT_EXECUTE:
4819 break;
4820 case VM_PROT_READ | VM_PROT_WRITE:
4821 case VM_PROT_ALL:
4822 return end; /* nothing to do */
4823 default:
4824 should_have_removed = true;
4825 }
4826 }
4827
4828 if (__improbable(should_have_removed)) {
4829 panic("%s: should have been a remove operation, "
4830 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
4831 __FUNCTION__,
4832 pmap, (void *)start, (void *)end, prot, options, args);
4833 }
4834
4835 #if DEVELOPMENT || DEBUG
4836 bool force_write = false;
4837 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
4838 force_write = true;
4839 }
4840 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4841 #else
4842 if ((prot & VM_PROT_EXECUTE))
4843 #endif
4844 {
4845 set_NX = false;
4846 } else {
4847 set_NX = true;
4848 }
4849
4850 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4851 vm_map_address_t va = start;
4852 vm_map_address_t sptm_start_va = start;
4853 unsigned int num_mappings = 0;
4854
4855 pmap_lock(pmap, PMAP_LOCK_SHARED);
4856
4857 pte_p = pmap_pte(pmap, start);
4858
4859 if (pte_p == NULL) {
4860 pmap_unlock(pmap, PMAP_LOCK_SHARED);
4861 return end;
4862 }
4863
4864 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4865 #if DEVELOPMENT || DEBUG
4866 if (!force_write)
4867 #endif
4868 {
4869 disable_preemption();
4870 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
4871 }
4872
4873 pt_entry_t tmplate = ARM_PTE_EMPTY;
4874
4875 if (pmap == kernel_pmap) {
4876 #if DEVELOPMENT || DEBUG
4877 if (force_write) {
4878 tmplate = ARM_PTE_AP(AP_RWNA);
4879 } else
4880 #endif
4881 {
4882 tmplate = ARM_PTE_AP(AP_RONA);
4883 }
4884 } else {
4885 #if DEVELOPMENT || DEBUG
4886 if (force_write) {
4887 assert(pmap->type != PMAP_TYPE_NESTED);
4888 tmplate = pt_attr_leaf_rw(pt_attr);
4889 } else
4890 #endif
4891 if (set_XO) {
4892 tmplate = pt_attr_leaf_rona(pt_attr);
4893 } else {
4894 tmplate = pt_attr_leaf_ro(pt_attr);
4895 }
4896 }
4897
4898 if (set_NX) {
4899 tmplate |= pt_attr_leaf_xn(pt_attr);
4900 }
4901
4902 while (va < end) {
4903 pt_entry_t spte = ARM_PTE_EMPTY;
4904
4905 /**
4906 * Removing "NX" would grant "execute" access immediately, bypassing any
4907 * checks VM might want to do in its soft fault path.
4908 * pmap_protect() and co. are not allowed to increase access permissions,
4909 * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case.
4910 * Therefore, if we are not explicitly clearing execute permissions, inherit
4911 * the existing permissions.
4912 */
4913 if (!set_NX) {
4914 spte = os_atomic_load(pte_p, relaxed);
4915 if (__improbable(!pte_is_valid(spte))) {
4916 tmplate |= pt_attr_leaf_xn(pt_attr);
4917 } else {
4918 tmplate |= (spte & ARM_PTE_XMASK);
4919 }
4920 }
4921
4922 #if DEVELOPMENT || DEBUG
4923 /*
4924 * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to
4925 * provide a "backdoor" to allow normally write-protected compressor pages to
4926 * be temporarily written without triggering expensive write faults.
4927 */
4928 while (force_write) {
4929 if (spte == ARM_PTE_EMPTY) {
4930 spte = os_atomic_load(pte_p, relaxed);
4931 }
4932 const pt_entry_t prev_pte = spte;
4933
4934 /* A concurrent disconnect may have cleared the PTE. */
4935 if (__improbable(!pte_is_valid(spte))) {
4936 break;
4937 }
4938
4939 /* Inherit permissions and "was_writeable" from the template. */
4940 spte = (spte & ~(ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE)) |
4941 (tmplate & (ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE));
4942
4943 /* Access flag should be set for any immediate change in protections */
4944 spte |= ARM_PTE_AF;
4945 const pmap_paddr_t pa = pte_to_pa(spte);
4946 const unsigned int pai = pa_index(pa);
4947 locked_pvh_t locked_pvh;
4948 if (pa_valid(pa)) {
4949 locked_pvh = pvh_lock(pai);
4950
4951 /**
4952 * The VM may concurrently call pmap_disconnect() on the compressor
4953 * page in question, e.g. if relocating the page to satisfy a precious
4954 * allocation. Now that we hold the PVH lock, re-check the PTE and
4955 * restart the loop if it's different from the value we read before
4956 * we held the lock.
4957 */
4958 if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) {
4959 pvh_unlock(&locked_pvh);
4960 spte = ARM_PTE_EMPTY;
4961 continue;
4962 }
4963 ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT,
4964 PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
4965 }
4966
4967 __assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte);
4968
4969 /**
4970 * We don't expect the VM to be concurrently calling pmap_remove() against these
4971 * compressor mappings. If it does for some reason, that could cause the above
4972 * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING.
4973 */
4974 assert3u(sptm_status, ==, SPTM_MAP_VALID);
4975
4976 if (pa_valid(pa)) {
4977 pvh_unlock(&locked_pvh);
4978 }
4979 break;
4980 }
4981
4982 #endif /* DEVELOPMENT || DEBUG */
4983
4984 va += pmap_page_size;
4985 ++pte_p;
4986
4987 #if DEVELOPMENT || DEBUG
4988 if (!force_write)
4989 #endif
4990 {
4991 sptm_pcpu->sptm_templates[num_mappings] = tmplate;
4992 ++num_mappings;
4993 if (num_mappings == SPTM_MAPPING_LIMIT) {
4994 /**
4995 * Enter the retype epoch for the batched update operation. This is necessary because we
4996 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
4997 * call, so a concurrent pmap_page_protect() operation against one of those pages may
4998 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
4999 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
5000 * if it does not first drain our epoch.
5001 */
5002 pmap_retype_epoch_enter();
5003 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5004 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5005 pmap_retype_epoch_exit();
5006 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5007
5008 /* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */
5009 enable_preemption();
5010 num_mappings = 0;
5011 sptm_start_va = va;
5012 disable_preemption();
5013 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5014 }
5015 }
5016 }
5017
5018 /* This won't happen in the force_write case as we should never increment num_mappings. */
5019 if (num_mappings != 0) {
5020 pmap_retype_epoch_enter();
5021 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5022 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5023 pmap_retype_epoch_exit();
5024 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5025 }
5026
5027 #if DEVELOPMENT || DEBUG
5028 if (!force_write)
5029 #endif
5030 {
5031 enable_preemption();
5032 }
5033 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5034 if (__improbable(need_strong_sync)) {
5035 arm64_sync_tlb(true);
5036 }
5037 return va;
5038 }
5039
5040 void
5041 pmap_protect_options(
5042 pmap_t pmap,
5043 vm_map_address_t b,
5044 vm_map_address_t e,
5045 vm_prot_t prot,
5046 unsigned int options,
5047 __unused void *args)
5048 {
5049 vm_map_address_t l, beg;
5050
5051 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5052
5053 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5054 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5055 pmap, (uint64_t)b, (uint64_t)e);
5056 }
5057
5058 /*
5059 * We allow single-page requests to execute non-preemptibly,
5060 * as it doesn't make sense to sample AST_URGENT for a single-page
5061 * operation, and there are a couple of special use cases that
5062 * require a non-preemptible single-page operation.
5063 */
5064 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5065 pmap_verify_preemptible();
5066 }
5067
5068 #if DEVELOPMENT || DEBUG
5069 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5070 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5071 pmap_remove_options(pmap, b, e, options);
5072 return;
5073 }
5074 } else
5075 #endif
5076 {
5077 /* Determine the new protection. */
5078 switch (prot) {
5079 case VM_PROT_EXECUTE:
5080 case VM_PROT_READ:
5081 case VM_PROT_READ | VM_PROT_EXECUTE:
5082 break;
5083 case VM_PROT_READ | VM_PROT_WRITE:
5084 case VM_PROT_ALL:
5085 return; /* nothing to do */
5086 default:
5087 pmap_remove_options(pmap, b, e, options);
5088 return;
5089 }
5090 }
5091
5092 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5093 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5094 VM_KERNEL_ADDRHIDE(e));
5095
5096 beg = b;
5097
5098 while (beg < e) {
5099 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5100
5101 if (l > e) {
5102 l = e;
5103 }
5104
5105 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5106 }
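/*
 * Each pass above covers at most one twig-aligned chunk. As an illustrative
 * example, with 16KB pages pt_attr_twig_size() is 32MB, so protecting a
 * twig-aligned 96MB range results in three calls to
 * pmap_protect_options_internal().
 */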
5107
5108
5109 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5110 }
5111
5112 /**
5113 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5114 *
5115 * @param pmap pmap to insert the pages into.
5116 * @param va virtual address to map the pages into.
5117 * @param pa page number of the first physical page to map.
5118 * @param size block size, in number of pages.
5119 * @param prot mapping protection attributes.
5120 * @param attr flags to pass to pmap_enter().
5121 * @param flags additional flags, passed through to pmap_map_block_addr().
5122 * @return KERN_SUCCESS.
5123 */
5124 kern_return_t
5125 pmap_map_block(
5126 pmap_t pmap,
5127 addr64_t va,
5128 ppnum_t pa,
5129 uint32_t size,
5130 vm_prot_t prot,
5131 int attr,
5132 unsigned int flags)
5133 {
5134 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5135 }
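/*
 * Illustrative use (hypothetical values): a caller with a physically
 * contiguous, page-aligned buffer starting at page number pn could map it
 * read/write into the kernel pmap with default cacheability as:
 *
 *     kern_return_t kr = pmap_map_block(kernel_pmap, va, pn, npages,
 *         VM_PROT_READ | VM_PROT_WRITE, VM_WIMG_USE_DEFAULT, 0);
 *
 * Each page is entered individually through pmap_enter_addr(), and any
 * individual failure panics, so the call only ever returns KERN_SUCCESS.
 */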
5136
5137 /**
5138 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5139 * As opposed to pmap_map_block(), this function takes
5140 * a physical address as an input and operates using the
5141 * page size associated with the input pmap.
5142 *
5143 * @param pmap pmap to insert the pages into.
5144 * @param va virtual address to map the pages into.
5145 * @param pa physical address of the first physical page to map.
5146 * @param size block size, in number of pages.
5147 * @param prot mapping protection attributes.
5148 * @param attr flags to pass to pmap_enter().
5149 * @param flags additional flags; only used in the panic diagnostic here.
5150 * @return KERN_SUCCESS.
5151 */
5152 kern_return_t
5153 pmap_map_block_addr(
5154 pmap_t pmap,
5155 addr64_t va,
5156 pmap_paddr_t pa,
5157 uint32_t size,
5158 vm_prot_t prot,
5159 int attr,
5160 unsigned int flags)
5161 {
5162 #if __ARM_MIXED_PAGE_SIZE__
5163 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5164 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5165 #else
5166 const uint64_t pmap_page_size = PAGE_SIZE;
5167 #endif
5168
5169 for (ppnum_t page = 0; page < size; page++) {
5170 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER) != KERN_SUCCESS) {
5171 panic("%s: failed pmap_enter_addr, "
5172 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5173 __FUNCTION__,
5174 pmap, va, (uint64_t)pa, size, prot, flags);
5175 }
5176
5177 va += pmap_page_size;
5178 pa += pmap_page_size;
5179 }
5180
5181
5182 return KERN_SUCCESS;
5183 }
5184
5185 kern_return_t
5186 pmap_enter_addr(
5187 pmap_t pmap,
5188 vm_map_address_t v,
5189 pmap_paddr_t pa,
5190 vm_prot_t prot,
5191 vm_prot_t fault_type,
5192 unsigned int flags,
5193 boolean_t wired,
5194 pmap_mapping_type_t mapping_type)
5195 {
5196 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, mapping_type);
5197 }
5198
5199 /*
5200 * Insert the given physical page (pn) at
5201 * the specified virtual address (v) in the
5202 * target physical map with the protection requested.
5203 *
5204 * If specified, the page will be wired down, meaning
5205 * that the related pte cannot be reclaimed.
5206 *
5207 * NB: This is the only routine which MAY NOT lazy-evaluate
5208 * or lose information. That is, this routine must actually
5209 * insert this page into the given map eventually (must make
5210 * forward progress eventually).
5211 */
5212 kern_return_t
5213 pmap_enter(
5214 pmap_t pmap,
5215 vm_map_address_t v,
5216 ppnum_t pn,
5217 vm_prot_t prot,
5218 vm_prot_t fault_type,
5219 unsigned int flags,
5220 boolean_t wired,
5221 pmap_mapping_type_t mapping_type)
5222 {
5223 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type);
5224 }
5225
5226 /*
5227 * Attempt to update a PTE constructed by pmap_enter_options().
5228 *
5229 * @note performs no page table or accounting modifications, nor any lasting SPTM page type modification, on failure.
5230 * @note expects to be called with preemption disabled to guarantee safe access to SPTM per-CPU data.
5231 *
5232 * @param pmap The pmap representing the address space in which to store the new PTE
5233 * @param pte_p The physical aperture KVA of the PTE to store
5234 * @param new_pte The new value to store in *pte_p
5235 * @param locked_pvh Input/Output parameter pointing to a wrapped pv_head_table entry returned by
5236 *        a previous call to pvh_lock(). *locked_pvh will be updated if existing mappings
5237 *        need to be disconnected prior to retyping.
5238 * @param old_pte Returns the prior PTE contents, iff the PTE is successfully updated
5239 * @param v The virtual address mapped by pte_p
5240 * @param options bitmask of PMAP_OPTIONS_* flags passed to pmap_enter_options().
5241 * @param mapping_type The type of the new mapping; this defines which SPTM frame type to use.
5242 *
5243 * @return SPTM_SUCCESS iff able to successfully update *pte_p to new_pte via sptm_map_page(),
5244 *         SPTM_MAP_VALID if an existing mapping was successfully upgraded via sptm_map_page(),
5245 *         SPTM_MAP_FLUSH_PENDING if the TLB flush of a previous mapping is still in-flight and
5246 *         the mapping operation should be retried, or if the mapping operation should be retried
5247 *         because we had to temporarily re-enable preemption which would invalidate caller-held
5248 *         per-CPU data.
5249 *         Otherwise an appropriate SPTM or TXM error code; in these cases the mapping should not be
5250 *         retried and the caller should return an error.
5251 */
5252 static inline sptm_return_t
5253 pmap_enter_pte(
5254 pmap_t pmap,
5255 pt_entry_t *pte_p,
5256 pt_entry_t new_pte,
5257 locked_pvh_t *locked_pvh,
5258 pt_entry_t *old_pte,
5259 vm_map_address_t v,
5260 unsigned int options,
5261 pmap_mapping_type_t mapping_type)
5262 {
5263 sptm_pte_t prev_pte;
5264 bool changed_wiring = false;
5265
5266 assert(pte_p != NULL);
5267 assert(old_pte != NULL);
5268
5269 /* SPTM TODO: handle PAGE_RATIO_4 configurations if those devices remain supported. */
5270
5271 assert(get_preemption_level() > 0);
5272 const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK;
5273 sptm_frame_type_t prev_frame_type = XNU_DEFAULT;
5274 sptm_frame_type_t new_frame_type = XNU_DEFAULT;
5275
5276 /*
5277 * If the caller specified a mapping type of PMAP_MAPPING_TYPE_INFER, then we
5278 * keep the existing logic of deriving the SPTM frame type from the XPRR permissions.
5279 *
5280 * If the caller specified another mapping type, we simply follow that. This refactor was
5281 * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise about
5282 * what we want. It's better to let the caller specify the mapping type rather than infer it
5283 * from the permissions.
5284 *
5285 * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323.
5286 */
5287 if (mapping_type != PMAP_MAPPING_TYPE_INFER) {
5288 switch (mapping_type) {
5289 case PMAP_MAPPING_TYPE_DEFAULT:
5290 new_frame_type = (sptm_frame_type_t)mapping_type;
5291 break;
5292 case PMAP_MAPPING_TYPE_ROZONE:
5293 assert(((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pmap_get_pt_attr(pmap)))));
5294 new_frame_type = (sptm_frame_type_t)mapping_type;
5295 break;
5296 case PMAP_MAPPING_TYPE_RESTRICTED:
5297 if (use_xnu_restricted) {
5298 new_frame_type = (sptm_frame_type_t)mapping_type;
5299 } else {
5300 new_frame_type = XNU_DEFAULT;
5301 }
5302 break;
5303 default:
5304 panic("invalid mapping type: %d", mapping_type);
5305 }
5306 } else if (__improbable(pte_to_xprr_perm(new_pte) == XPRR_USER_JIT_PERM)) {
5307 /*
5308 * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using
5309 * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other
5310 * flags which the VM may have provided.
5311 *
5312 * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering
5313 * this case. We can't do this for now because this might trigger on some macOS
5314 * systems where applications use MAP_JIT with RW/RX permissions, and then later
5315 * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG
5316 * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can
5317 * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application
5318 * switches to RWX, then we can start asserting this requirement.
5319 */
5320 new_frame_type = XNU_USER_JIT;
5321 } else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) {
5322 /*
5323 * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must
5324 * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the
5325 * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type.
5326 */
5327 new_frame_type = XNU_USER_DEBUG;
5328 } else if (pte_to_xprr_perm(new_pte) == XPRR_USER_RX_PERM) {
5329 new_frame_type = XNU_USER_EXEC;
5330 }
5331
5332 if (__improbable(new_frame_type != XNU_DEFAULT)) {
5333 prev_frame_type = sptm_get_frame_type(pa);
5334 }
5335
5336 if (__improbable(new_frame_type != prev_frame_type)) {
5337 /**
5338 * Remove all existing mappings prior to retyping, so that we can safely retype without having to worry
5339 * about a concurrent operation on one of those mappings triggering an SPTM violation. In particular,
5340 * pmap_remove() may clear a mapping to this page without holding its PVH lock. This approach works
5341 * because we hold the PVH lock during this call, and any attempt to enter a new mapping for the page
5342 * will also need to grab the PVH lock and call this function.
5343 */
5344 pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
5345 PMAP_OPTIONS_PPO_PENDING_RETYPE, locked_pvh, NULL);
5346 /**
5347 * In the unlikely event that pmap_page_protect_options_with_flush_range() had to process
5348 * an excessively long PV list, it will have enabled preemption by placing the PVH lock
5349 * in sleep mode. In this case, we may have been migrated to a different CPU, and caller
5350 * assumptions about the state of per-CPU data (such as per-CPU PVE availability) will no
5351 * longer hold true. Ask the caller to retry by pretending we encountered a pending flush.
5352 */
5353 if (__improbable(preemption_enabled())) {
5354 return SPTM_MAP_FLUSH_PENDING;
5355 }
5356 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5357 /* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */
5358 prev_frame_type = sptm_get_frame_type(pa);
5359 sptm_retype(pa, prev_frame_type, new_frame_type, retype_params);
5360 }
5361
5362 const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte);
5363 if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) {
5364 /*
5365 * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as
5366 * opposed to a TXM error. In the case of SPTM_MAP_FLUSH_PENDING, pmap_enter() will drop the PVH
5367 * lock before turning around to retry the mapping operation. It may then be possible for the
5368 * mapping state of the page to change such that our next attempt to map it will fail with a TXM
5369 * error, so if we were to leave the new type in place here we would then have lost our record
5370 * of the previous type and would effectively leave the page in an inconsistent state.
5371 */
5372 if (__improbable(new_frame_type != prev_frame_type)) {
5373 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5374 sptm_retype(pa, new_frame_type, prev_frame_type, retype_params);
5375 }
5376 return sptm_status;
5377 }
5378
5379 *old_pte = prev_pte = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes[0];
5380
5381 if (prev_pte != new_pte) {
5382 changed_wiring = pte_is_compressed(prev_pte, pte_p) ?
5383 (new_pte & ARM_PTE_WIRED) != 0 :
5384 (new_pte & ARM_PTE_WIRED) != (prev_pte & ARM_PTE_WIRED);
5385
5386 if ((pmap != kernel_pmap) && changed_wiring) {
5387 pte_update_wiredcnt(pmap, pte_p, (new_pte & ARM_PTE_WIRED) != 0);
5388 }
5389
5390 PMAP_TRACE(4 + pt_attr_leaf_level(pmap_get_pt_attr(pmap)), PMAP_CODE(PMAP__TTE),
5391 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v),
5392 VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)), new_pte);
5393 }
5394
5395 return sptm_status;
5396 }
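/*
 * Sketch of the expected caller pattern (mirroring pmap_enter_options_internal()
 * below): preemption must be disabled around the call so the per-CPU
 * sptm_prev_ptes slot read above remains valid, and SPTM_MAP_FLUSH_PENDING
 * means "retry the whole transaction", e.g.:
 *
 *     disable_preemption();
 *     sptm_return_t ret = pmap_enter_pte(pmap, pte_p, pte, NULL, &spte, v,
 *         options, mapping_type);
 *     enable_preemption();
 *     if (ret == SPTM_MAP_FLUSH_PENDING) {
 *         // rebuild the PTE template and retry the transaction
 *     }
 */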
5397
5398 MARK_AS_PMAP_TEXT static pt_entry_t
5399 wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
5400 {
5401 pt_entry_t pte;
5402
5403 switch (wimg & (VM_WIMG_MASK)) {
5404 case VM_WIMG_IO:
5405 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5406 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5407 // AP, while preserving the security benefits of using device
5408 // mapping against side-channel attacks. On pre-H14 platforms,
5409 // the accesses will still be strongly ordered.
5410 if (is_dram_addr(pa)) {
5411 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5412 } else {
5413 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5414 #if HAS_FEAT_XS
5415 pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
5416 if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
5417 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
5418 }
5419 #endif /* HAS_FEAT_XS */
5420 }
5421 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5422 break;
5423 case VM_WIMG_RT:
5424 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
5425 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5426 break;
5427 case VM_WIMG_POSTED:
5428 if (is_dram_addr(pa)) {
5429 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5430 } else {
5431 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5432 }
5433 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5434 break;
5435 case VM_WIMG_POSTED_REORDERED:
5436 if (is_dram_addr(pa)) {
5437 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5438 } else {
5439 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5440 }
5441 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5442 break;
5443 case VM_WIMG_POSTED_COMBINED_REORDERED:
5444 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5445 #if HAS_FEAT_XS
5446 if (!is_dram_addr(pa)) {
5447 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5448 }
5449 #endif /* HAS_FEAT_XS */
5450 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5451 break;
5452 case VM_WIMG_WCOMB:
5453 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5454 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5455 break;
5456 case VM_WIMG_WTHRU:
5457 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5458 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5459 break;
5460 case VM_WIMG_COPYBACK:
5461 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5462 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5463 break;
5464 case VM_WIMG_INNERWBACK:
5465 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5466 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5467 break;
5468 default:
5469 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5470 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5471 }
5472
5473 return pte;
5474 }
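/*
 * Callers OR the returned attribute/shareability/XN bits into a PTE template.
 * For example (illustrative only; real callers go through
 * pmap_get_pt_ops(pmap)->wimg_to_pte()), a write-combined device mapping
 * would be built roughly as:
 *
 *     pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID | ARM_PTE_AF;
 *     pte |= wimg_to_pte(VM_WIMG_WCOMB, pa);  // CACHE_ATTRINDX_WRITECOMB plus NX/PNX
 */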
5475
5476
5477 /*
5478 * Construct a PTE (and the physical page attributes) for the given virtual to
5479 * physical mapping.
5480 *
5481 * This function has no side effects, so it is safe to call while attempting
5482 * a pmap_enter transaction.
5483 */
5484 MARK_AS_PMAP_TEXT static pt_entry_t
5485 pmap_construct_pte(
5486 const pmap_t pmap,
5487 vm_map_address_t va,
5488 pmap_paddr_t pa,
5489 vm_prot_t prot,
5490 vm_prot_t fault_type,
5491 boolean_t wired,
5492 const pt_attr_t* const pt_attr,
5493 uint16_t *pp_attr_bits /* OUTPUT */
5494 )
5495 {
5496 bool set_NX = false, set_XO = false;
5497 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
5498 assert(pp_attr_bits != NULL);
5499 *pp_attr_bits = 0;
5500
5501 if (wired) {
5502 pte |= ARM_PTE_WIRED;
5503 }
5504
5505 #if DEVELOPMENT || DEBUG
5506 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5507 #else
5508 if ((prot & VM_PROT_EXECUTE))
5509 #endif
5510 {
5511 set_NX = false;
5512 } else {
5513 set_NX = true;
5514 }
5515
5516 if (prot == VM_PROT_EXECUTE) {
5517 set_XO = true;
5518
5519 }
5520
5521 if (set_NX) {
5522 pte |= pt_attr_leaf_xn(pt_attr);
5523 } else {
5524 if (pmap == kernel_pmap) {
5525 pte |= ARM_PTE_NX;
5526 } else {
5527 pte |= pt_attr_leaf_x(pt_attr);
5528 }
5529 }
5530
5531 if (pmap == kernel_pmap) {
5532 #if __ARM_KERNEL_PROTECT__
5533 pte |= ARM_PTE_NG;
5534 #endif /* __ARM_KERNEL_PROTECT__ */
5535 if (prot & VM_PROT_WRITE) {
5536 pte |= ARM_PTE_AP(AP_RWNA);
5537 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5538 } else {
5539 pte |= ARM_PTE_AP(AP_RONA);
5540 *pp_attr_bits |= PP_ATTR_REFERENCED;
5541 }
5542 } else {
5543 if (pmap->type != PMAP_TYPE_NESTED) {
5544 pte |= ARM_PTE_NG;
5545 } else if ((pmap->nested_region_unnested_table_bitmap)
5546 && (va >= pmap->nested_region_addr)
5547 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5548 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5549
5550 if ((pmap->nested_region_unnested_table_bitmap)
5551 && bitmap_test(pmap->nested_region_unnested_table_bitmap, index)) {
5552 pte |= ARM_PTE_NG;
5553 }
5554 }
5555 if (prot & VM_PROT_WRITE) {
5556 assert(pmap->type != PMAP_TYPE_NESTED);
5557 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5558 if (fault_type & VM_PROT_WRITE) {
5559 pte |= pt_attr_leaf_rw(pt_attr);
5560 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5561 } else {
5562 pte |= pt_attr_leaf_ro(pt_attr);
5563 /*
5564 * Mark the page as MODFAULT so that a subsequent write
5565 * may be handled through arm_fast_fault().
5566 */
5567 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5568 pte_set_was_writeable(pte, true);
5569 }
5570 } else {
5571 pte |= pt_attr_leaf_rw(pt_attr);
5572 *pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5573 }
5574 } else {
5575 if (set_XO) {
5576 pte |= pt_attr_leaf_rona(pt_attr);
5577 } else {
5578 pte |= pt_attr_leaf_ro(pt_attr);
5579 }
5580 *pp_attr_bits |= PP_ATTR_REFERENCED;
5581 }
5582 }
5583
5584 pte |= ARM_PTE_AF;
5585 return pte;
5586 }
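/*
 * Worked example (release kernel, illustrative, ignoring __ARM_KERNEL_PROTECT__):
 * for a wired kernel mapping with prot == VM_PROT_READ | VM_PROT_WRITE, the
 * path above yields
 *
 *     pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID | ARM_PTE_WIRED |
 *           pt_attr_leaf_xn(pt_attr) | ARM_PTE_AP(AP_RWNA) | ARM_PTE_AF;
 *     *pp_attr_bits == (PP_ATTR_MODIFIED | PP_ATTR_REFERENCED);
 *
 * since the absence of VM_PROT_EXECUTE sets NX, and writable kernel mappings
 * are marked referenced and modified up front.
 */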
5587
5588 MARK_AS_PMAP_TEXT kern_return_t
5589 pmap_enter_options_internal(
5590 pmap_t pmap,
5591 vm_map_address_t v,
5592 pmap_paddr_t pa,
5593 vm_prot_t prot,
5594 vm_prot_t fault_type,
5595 unsigned int flags,
5596 boolean_t wired,
5597 unsigned int options,
5598 pmap_mapping_type_t mapping_type)
5599 {
5600 ppnum_t pn = (ppnum_t)atop(pa);
5601 pt_entry_t *pte_p;
5602 unsigned int wimg_bits;
5603 bool committed = false;
5604 kern_return_t kr = KERN_SUCCESS;
5605 uint16_t pp_attr_bits;
5606 volatile uint16_t *wiredcnt = NULL;
5607 pv_free_list_t *local_pv_free;
5608
5609 validate_pmap_mutable(pmap);
5610
5611 /**
5612 * Prepare for the SPTM call early by prefetching the relevant FTEs. Cache misses
5613 * taken by the SPTM when accessing these contribute a large portion of the delay on
5614 * the critical path. Technically, sptm_prefetch_fte() may not find an FTE associated
5615 * with pa and may return LIBSPTM_FAILURE. However, we are okay with that as it's only
5616 * a best-effort performance optimization.
5617 */
5618 sptm_prefetch_fte(pmap->ttep);
5619 sptm_prefetch_fte(pa);
5620
5621 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5622
5623 if ((v) & pt_attr_leaf_offmask(pt_attr)) {
5624 panic("pmap_enter_options() pmap %p v 0x%llx",
5625 pmap, (uint64_t)v);
5626 }
5627
5628 if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
5629 panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
5630 pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
5631 }
5632
5633 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
5634 panic("pmap_enter_options() pmap %p pa 0x%llx",
5635 pmap, (uint64_t)pa);
5636 }
5637
5638 /* The PA should not extend beyond the architected physical address space */
5639 pa &= ARM_PTE_PAGE_MASK;
5640
5641 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
5642 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
5643 extern vm_offset_t ctrr_test_page;
5644 if (__probable(v != ctrr_test_page))
5645 #endif
5646 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
5647 }
5648 assert(pn != vm_page_fictitious_addr);
5649
5650 pmap_lock(pmap, PMAP_LOCK_SHARED);
5651
5652 /*
5653 * Expand pmap to include this pte. Assume that
5654 * pmap is always expanded to include enough hardware
5655 * pages to map one VM page.
5656 */
5657 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
5658 /* Must unlock to expand the pmap. */
5659 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5660
5661 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
5662
5663 if (kr != KERN_SUCCESS) {
5664 return kr;
5665 }
5666
5667 pmap_lock(pmap, PMAP_LOCK_SHARED);
5668 }
5669
5670 if (options & PMAP_OPTIONS_NOENTER) {
5671 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5672 return KERN_SUCCESS;
5673 }
5674
5675 /*
5676 * Since we may not hold the pmap lock exclusively, updating the pte is
5677 * done via a cmpxchg loop.
5678 * We need to be careful about modifying non-local data structures before committing
5679 * the new pte since we may need to re-do the transaction.
5680 */
5681 const pt_entry_t prev_pte = os_atomic_load(pte_p, relaxed);
5682
5683 if (pte_is_valid(prev_pte) && (pte_to_pa(prev_pte) != pa)) {
5684 /*
5685 * There is already a mapping here & it's for a different physical page.
5686 * First remove that mapping.
5687 * We assume that we can leave the pmap lock held for shared access rather
5688 * than exclusive access here, because we assume that the VM won't try to
5689 * simultaneously map the same VA to multiple different physical pages.
5690 * If that assumption is violated, sptm_map_page() will panic as the architecture
5691 * does not allow the output address of a mapping to be changed without a break-
5692 * before-make sequence.
5693 */
5694 pmap_remove_range(pmap, v, v + PAGE_SIZE);
5695 }
5696
5697 if (pmap != kernel_pmap) {
5698 ptd_info_t *ptd_info = ptep_get_info(pte_p);
5699 wiredcnt = &ptd_info->wiredcnt;
5700 }
5701
5702 while (!committed) {
5703 pt_entry_t spte = ARM_PTE_EMPTY;
5704 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
5705 bool skip_footprint_debit = false;
5706
5707 /*
5708 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
5709 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
5710 * read-write protection. The pmap layer still needs to use the right
5711 * index, which is the older XO index (now repurposed for TPRO) and is
5712 * selected here via PMAP_OPTIONS_MAP_TPRO.
5713 *
5714 * Note that pmap_construct_pte() may check the nested region unnested-table bitmap,
5715 * which needs to happen at every iteration of the commit loop in case we
5716 * previously dropped the pmap lock.
5717 */
5718 pt_entry_t pte = pmap_construct_pte(pmap, v, pa,
5719 ((options & PMAP_OPTIONS_MAP_TPRO) ? VM_PROT_RORW_TP : prot), fault_type, wired, pt_attr, &pp_attr_bits);
5720
5721 if (pa_valid(pa)) {
5722 unsigned int pai;
5723 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
5724
5725 is_internal = FALSE;
5726 is_altacct = FALSE;
5727
5728 pai = pa_index(pa);
5729 locked_pvh_t locked_pvh;
5730
5731 if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
5732 locked_pvh = pvh_lock_nopreempt(pai);
5733 } else {
5734 locked_pvh = pvh_lock(pai);
5735 }
5736
5737 /*
5738 * Make sure that the current per-cpu PV free list has
5739 * enough entries (2 in the worst-case scenario) to handle the enter_pv
5740 * if the transaction succeeds. At this point, preemption has either
5741 * been disabled by the caller or by pvh_lock() above.
5742 * Note that we can still be interrupted, but a primary
5743 * interrupt handler can never enter the pmap.
5744 */
5745 assert(get_preemption_level() > 0);
5746 local_pv_free = &pmap_get_cpu_data()->pv_free;
5747 const bool allocation_required = !pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL) &&
5748 !(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP) && pvh_ptep(locked_pvh.pvh) == pte_p);
5749
5750 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
5751 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
5752 int new_allocated_pves = 0;
5753
5754 while (new_allocated_pves < 2) {
5755 local_pv_free = &pmap_get_cpu_data()->pv_free;
5756 pv_status = pv_alloc(pmap, PMAP_LOCK_SHARED, options, &new_pve_p[new_allocated_pves], &locked_pvh, wiredcnt);
5757 if (pv_status == PV_ALLOC_FAIL) {
5758 break;
5759 } else if (pv_status == PV_ALLOC_RETRY) {
5760 /*
5761 * In the case that pv_alloc() had to grab a new page of PVEs,
5762 * it will have dropped the pmap lock while doing so.
5763 * On non-PPL devices, dropping the lock re-enables preemption so we may
5764 * be on a different CPU now.
5765 */
5766 local_pv_free = &pmap_get_cpu_data()->pv_free;
5767 } else {
5768 /* If we've gotten this far then a node should've been allocated. */
5769 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
5770
5771 new_allocated_pves++;
5772 }
5773 }
5774
5775 for (int i = 0; i < new_allocated_pves; i++) {
5776 pv_free(new_pve_p[i]);
5777 }
5778 }
5779
5780 if (pv_status == PV_ALLOC_FAIL) {
5781 pvh_unlock(&locked_pvh);
5782 kr = KERN_RESOURCE_SHORTAGE;
5783 break;
5784 } else if (pv_status == PV_ALLOC_RETRY) {
5785 pvh_unlock(&locked_pvh);
5786 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
5787 continue;
5788 }
5789
5790 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
5791 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
5792 } else {
5793 wimg_bits = pmap_cache_attributes(pn);
5794 }
5795
5796 /**
5797 * We may be retrying this operation after dropping the PVH lock.
5798 * Cache attributes for the physical page may have changed while the lock
5799 * was dropped, so update PTE cache attributes on each loop iteration.
5800 */
5801 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
5802
5803
5804 const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, &locked_pvh, &spte, v, options, mapping_type);
5805 assert(committed == false);
5806 if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
5807 committed = true;
5808 } else if (sptm_status == SPTM_MAP_FLUSH_PENDING) {
5809 pvh_unlock(&locked_pvh);
5810 continue;
5811 } else if (sptm_status == SPTM_MAP_CODESIGN_ERROR) {
5812 pvh_unlock(&locked_pvh);
5813 kr = KERN_CODESIGN_ERROR;
5814 break;
5815 } else {
5816 pvh_unlock(&locked_pvh);
5817 kr = KERN_FAILURE;
5818 break;
5819 }
5820 const bool had_valid_mapping = (sptm_status == SPTM_MAP_VALID);
5821 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
5822 if (!had_valid_mapping) {
5823 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
5824 int pve_ptep_idx = 0;
5825 pv_status = pmap_enter_pv(pmap, pte_p, options, PMAP_LOCK_SHARED, &locked_pvh, &new_pve_p, &pve_ptep_idx);
5826 /* We did all the allocations up top. So this shouldn't be able to fail. */
5827 if (pv_status != PV_ALLOC_SUCCESS) {
5828 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
5829 __func__, pv_status, new_pve_p, pmap);
5830 }
5831
5832 if (pmap != kernel_pmap) {
5833 if (options & PMAP_OPTIONS_INTERNAL) {
5834 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
5835 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
5836 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
5837 /*
5838 * Make a note to ourselves that this
5839 * mapping is using alternative
5840 * accounting. We'll need this in order
5841 * to know which ledger to debit when
5842 * the mapping is removed.
5843 *
5844 * The altacct bit must be set while
5845 * the pv head is locked. Defer the
5846 * ledger accounting until after we've
5847 * dropped the lock.
5848 */
5849 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
5850 is_altacct = TRUE;
5851 }
5852 }
5853 if (ppattr_test_reusable(pai) &&
5854 !is_altacct) {
5855 is_reusable = TRUE;
5856 } else if (options & PMAP_OPTIONS_INTERNAL) {
5857 is_internal = TRUE;
5858 } else {
5859 is_external = TRUE;
5860 }
5861 }
5862 }
5863
5864 pvh_unlock(&locked_pvh);
5865
5866 if (pp_attr_bits != 0) {
5867 ppattr_pa_set_bits(pa, pp_attr_bits);
5868 }
5869
5870 if (!had_valid_mapping && (pmap != kernel_pmap)) {
5871 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5872
5873 if (is_internal) {
5874 /*
5875 * Make corresponding adjustments to
5876 * phys_footprint statistics.
5877 */
5878 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5879 if (is_altacct) {
5880 /*
5881 * If this page is internal and
5882 * in an IOKit region, credit
5883 * the task's total count of
5884 * dirty, internal IOKit pages.
5885 * It should *not* count towards
5886 * the task's total physical
5887 * memory footprint, because
5888 * this entire region was
5889 * already billed to the task
5890 * at the time the mapping was
5891 * created.
5892 *
5893 * Put another way, this is
5894 * internal++ and
5895 * alternate_accounting++, so
5896 * net effect on phys_footprint
5897 * is 0. That means: don't
5898 * touch phys_footprint here.
5899 */
5900 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5901 } else {
5902 if (pte_is_compressed(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
5903 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
5904 skip_footprint_debit = true;
5905 } else {
5906 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5907 }
5908 }
5909 }
5910 if (is_reusable) {
5911 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5912 } else if (is_external) {
5913 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5914 }
5915 }
5916 } else {
5917 if (prot & VM_PROT_EXECUTE) {
5918 kr = KERN_FAILURE;
5919 break;
5920 }
5921
5922 wimg_bits = pmap_cache_attributes(pn);
5923 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
5924 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
5925 }
5926
5927 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
5928
5929
5930 /**
5931 * pmap_enter_pte() expects to be called with preemption disabled so it can access
5932 * the per-CPU prev_ptes array.
5933 */
5934 disable_preemption();
5935 const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, NULL, &spte, v, options, mapping_type);
5936 enable_preemption();
5937 assert(committed == false);
5938 if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
5939 committed = true;
5940
5941 /**
5942 * If there was already a valid pte here then we reuse its
5943 * reference on the ptd and drop the one that we took above.
5944 */
5945 } else if (__improbable(sptm_status != SPTM_MAP_FLUSH_PENDING)) {
5946 panic("%s: Unexpected SPTM return code %u for non-managed PA 0x%llx", __func__, (unsigned int)sptm_status, (unsigned long long)pa);
5947 }
5948 }
5949 if (committed) {
5950 if (pte_is_compressed(spte, pte_p)) {
5951 assert(pmap != kernel_pmap);
5952
5953 /* One less "compressed" */
5954 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
5955 pt_attr_page_size(pt_attr) * PAGE_RATIO);
5956
5957 if (spte & ARM_PTE_COMPRESSED_ALT) {
5958 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5959 } else if (!skip_footprint_debit) {
5960 /* Was part of the footprint */
5961 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5962 }
5963 }
5964 }
5965 }
5966
5967 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5968
5969 if (kr == KERN_CODESIGN_ERROR) {
5970 /* Print any logs from TXM */
5971 txm_print_logs();
5972 }
5973 return kr;
5974 }
5975
5976 kern_return_t
5977 pmap_enter_options_addr(
5978 pmap_t pmap,
5979 vm_map_address_t v,
5980 pmap_paddr_t pa,
5981 vm_prot_t prot,
5982 vm_prot_t fault_type,
5983 unsigned int flags,
5984 boolean_t wired,
5985 unsigned int options,
5986 __unused void *arg,
5987 pmap_mapping_type_t mapping_type)
5988 {
5989 kern_return_t kr = KERN_FAILURE;
5990
5991
5992 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
5993 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
5994
5995 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options, mapping_type);
5996
5997 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
5998
5999 return kr;
6000 }
6001
6002 kern_return_t
6003 pmap_enter_options(
6004 pmap_t pmap,
6005 vm_map_address_t v,
6006 ppnum_t pn,
6007 vm_prot_t prot,
6008 vm_prot_t fault_type,
6009 unsigned int flags,
6010 boolean_t wired,
6011 unsigned int options,
6012 __unused void *arg,
6013 pmap_mapping_type_t mapping_type)
6014 {
6015 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot,
6016 fault_type, flags, wired, options, arg, mapping_type);
6017 }
6018
6019 /*
6020 * Routine: pmap_change_wiring
6021 * Function: Change the wiring attribute for a map/virtual-address
6022 * pair.
6023 * In/out conditions:
6024 * The mapping must already exist in the pmap.
6025 */
6026 MARK_AS_PMAP_TEXT void
6027 pmap_change_wiring_internal(
6028 pmap_t pmap,
6029 vm_map_address_t v,
6030 boolean_t wired)
6031 {
6032 pt_entry_t *pte_p, prev_pte;
6033
6034 validate_pmap_mutable(pmap);
6035
6036 pmap_lock(pmap, PMAP_LOCK_SHARED);
6037
6038 const pt_entry_t new_wiring = (wired ? ARM_PTE_WIRED : 0);
6039
6040 pte_p = pmap_pte(pmap, v);
6041 if (pte_p == PT_ENTRY_NULL) {
6042 if (!wired) {
6043 /*
6044 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6045 * may have been freed by a remove operation.
6046 */
6047 goto pmap_change_wiring_return;
6048 } else {
6049 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6050 }
6051 }
6052
6053 disable_preemption();
6054 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
6055 sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring;
6056
6057 pmap_retype_epoch_enter();
6058 sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED);
6059 pmap_retype_epoch_exit();
6060
6061 prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed);
6062 enable_preemption();
6063
6064 if (!pte_is_valid(prev_pte)) {
6065 goto pmap_change_wiring_return;
6066 }
6067
6068 if ((pmap != kernel_pmap) && (wired != pte_is_wired(prev_pte))) {
6069 pte_update_wiredcnt(pmap, pte_p, wired);
6070 }
6071
6072 pmap_change_wiring_return:
6073 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6074 }
6075
6076 void
6077 pmap_change_wiring(
6078 pmap_t pmap,
6079 vm_map_address_t v,
6080 boolean_t wired)
6081 {
6082 pmap_change_wiring_internal(pmap, v, wired);
6083 }
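/*
 * For example (illustrative), a caller wiring an existing mapping would do:
 *
 *     pmap_change_wiring(map_pmap, va, TRUE);
 *
 * which sets ARM_PTE_WIRED on the mapping (and bumps the page table's wired
 * count for user pmaps). Passing FALSE simply clears the bit, and is tolerated
 * even if the mapping has since been removed.
 */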
6084
6085 MARK_AS_PMAP_TEXT pmap_paddr_t
6086 pmap_find_pa_internal(
6087 pmap_t pmap,
6088 addr64_t va)
6089 {
6090 pmap_paddr_t pa = 0;
6091
6092 validate_pmap(pmap);
6093
6094 if (pmap != kernel_pmap) {
6095 pmap_lock(pmap, PMAP_LOCK_SHARED);
6096 }
6097
6098 pa = pmap_vtophys(pmap, va);
6099
6100 if (pmap != kernel_pmap) {
6101 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6102 }
6103
6104 return pa;
6105 }
6106
6107 pmap_paddr_t
6108 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6109 {
6110 pmap_paddr_t pa = 0;
6111
6112 if (pmap == kernel_pmap) {
6113 pa = mmu_kvtop(va);
6114 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6115 /*
6116 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6117 * translation even if PAN would prevent kernel access through the translation.
6118 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6119 */
6120 pa = mmu_uvtop(va);
6121 }
6122 return pa;
6123 }
6124
6125 pmap_paddr_t
6126 pmap_find_pa(
6127 pmap_t pmap,
6128 addr64_t va)
6129 {
6130 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6131
6132 if (pa != 0) {
6133 return pa;
6134 }
6135
6136 if (not_in_kdp) {
6137 return pmap_find_pa_internal(pmap, va);
6138 } else {
6139 return pmap_vtophys(pmap, va);
6140 }
6141 }
6142
6143 ppnum_t
6144 pmap_find_phys_nofault(
6145 pmap_t pmap,
6146 addr64_t va)
6147 {
6148 ppnum_t ppn;
6149 ppn = atop(pmap_find_pa_nofault(pmap, va));
6150 return ppn;
6151 }
6152
6153 ppnum_t
6154 pmap_find_phys(
6155 pmap_t pmap,
6156 addr64_t va)
6157 {
6158 ppnum_t ppn;
6159 ppn = atop(pmap_find_pa(pmap, va));
6160 return ppn;
6161 }
6162
6163 /**
6164 * Translate a kernel virtual address into a physical address.
6165 *
6166 * @param va The kernel virtual address to translate. Does not work on user
6167 *           virtual addresses.
6168 *
6169 * @return The physical address if the translation was successful, or zero if
6170 *         no valid mappings were found for the given virtual address.
6171 */
6172 pmap_paddr_t
6173 kvtophys(vm_offset_t va)
6174 {
6175 sptm_paddr_t pa;
6176
6177 if (sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS) {
6178 return 0;
6179 }
6180
6181 return pa;
6182 }
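/*
 * Typical use (illustrative): translate a kernel VA and treat 0 as "unmapped":
 *
 *     pmap_paddr_t pa = kvtophys((vm_offset_t)ptr);
 *     if (pa == 0) {
 *         // no valid mapping exists for this kernel VA
 *     }
 */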
6183
6184 /**
6185 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6186 * points to a non-kernel-managed physical page, then this call will panic().
6187 *
6188 * @note The output of this function is guaranteed to be a kernel-managed
6189 *       physical page, which means it's safe to pass the output directly to
6190 *       pa_index() to create a physical address index for various pmap data
6191 *       structures.
6192 *
6193 * @param va The kernel virtual address to translate. Does not work on user
6194 *           virtual addresses.
6195 *
6196 * @return The translated physical address for the given virtual address.
6197 */
6198 pmap_paddr_t
6199 kvtophys_nofail(vm_offset_t va)
6200 {
6201 pmap_paddr_t pa;
6202
6203 if (__improbable(sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS)) {
6204 panic("%s: VA->PA translation failed for va %p", __func__, (void *)va);
6205 }
6206
6207 return pa;
6208 }
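/*
 * Because the result is guaranteed to be a kernel-managed page, it can be fed
 * straight into pa_index(), as pmap_init_pte_page() does below:
 *
 *     unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
 */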
6209
6210 pmap_paddr_t
6211 pmap_vtophys(
6212 pmap_t pmap,
6213 addr64_t va)
6214 {
6215 if ((va < pmap->min) || (va >= pmap->max)) {
6216 return 0;
6217 }
6218
6219 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6220
6221 tt_entry_t * ttp = NULL;
6222 tt_entry_t * ttep = NULL;
6223 tt_entry_t tte = ARM_TTE_EMPTY;
6224 pmap_paddr_t pa = 0;
6225 unsigned int cur_level;
6226
6227 ttp = pmap->tte;
6228
6229 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6230 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6231
6232 tte = *ttep;
6233
6234 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6235 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6236 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6237 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6238
6239 if ((tte & valid_mask) != valid_mask) {
6240 return (pmap_paddr_t) 0;
6241 }
6242
6243 /* This detects both leaf entries and intermediate block mappings. */
6244 if ((tte & type_mask) == type_block) {
6245 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6246 break;
6247 }
6248
6249 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6250 }
6251
6252 return pa;
6253 }
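/*
 * Example of the block-mapping case above (illustrative, assuming a 16KB-page
 * pmap): an L2 block entry covers 32MB, so for a VA inside that block the walk
 * stops at L2 and returns
 *
 *     pa = (tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask);
 *
 * with offmask == 0x1ffffff, i.e. the block base plus the 32MB offset.
 */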
6254
6255 /*
6256 * pmap_init_pte_page - Initialize a page table page.
6257 */
6258 MARK_AS_PMAP_TEXT void
6259 pmap_init_pte_page(
6260 pmap_t pmap,
6261 pt_entry_t *pte_p,
6262 vm_offset_t va,
6263 unsigned int ttlevel,
6264 boolean_t alloc_ptd)
6265 {
6266 pt_desc_t *ptdp = NULL;
6267 unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
6268 const uintptr_t pvh = pai_to_pvh(pai);
6269
6270 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6271 if (alloc_ptd) {
6272 /*
6273 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6274 * on 4KB hardware, we may already have allocated a page table descriptor for a
6275 * bootstrap request, so we check for an existing PTD here.
6276 */
6277 ptdp = ptd_alloc(pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
6278 if (ptdp == NULL) {
6279 panic("%s: unable to allocate PTD", __func__);
6280 }
6281 locked_pvh_t locked_pvh = pvh_lock(pai);
6282 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
6283 pvh_unlock(&locked_pvh);
6284 } else {
6285 panic("pmap_init_pte_page(): no PTD for pte_p %p", pte_p);
6286 }
6287 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6288 ptdp = pvh_ptd(pvh);
6289 } else {
6290 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6291 }
6292
6293 // pagetable zero-fill and barrier should be guaranteed by the SPTM
6294 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6295 }
6296
6297 /*
6298 * This function guarantees that a pmap has the necessary page tables in place
6299 * to map the specified VA. If necessary, it will allocate new tables at any
6300 * non-root level in the hierarchy (the root table is always already allocated
6301 * and stored in the pmap).
6302 *
6303 * @note This function is expected to be called without any pmap or PVH lock
6304 * held.
6305 *
6306 * @note It is possible for an L3 table newly allocated by this function to be
6307 * deleted by another thread before control returns to the caller, iff that
6308 * table is an ordinary userspace table. Callers that use this function
6309 * to allocate new user L3 tables are therefore expected to keep calling
6310 * this function until they observe a successful L3 PTE lookup with the pmap
6311 * lock held. As long as it does not drop the pmap lock, the caller may
6312 * then safely use the looked-up L3 table. See the use of this function in
6313 * pmap_enter_options_internal() for an example.
6314 *
6315 * @param pmap The pmap for which to ensure mapping space is present.
6316 * @param vaddr The virtual address for which to ensure mapping space is present
6317 * in [pmap].
6318 * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be
6319 * allocated. The only valid option is PMAP_OPTIONS_NOWAIT, which
6320 * specifies that the allocation must not block.
6321 * @param level The maximum paging level for which to ensure a table is present.
6322 *
6323 * @return KERN_INVALID_ADDRESS if [vaddr] is outside the pmap's mappable range,
6324 * KERN_RESOURCE_SHORTAGE if a new table can't be allocated,
6325 * KERN_SUCCESS otherwise.
6326 */
6327 MARK_AS_PMAP_TEXT static kern_return_t
6328 pmap_expand(
6329 pmap_t pmap,
6330 vm_map_address_t vaddr,
6331 unsigned int options,
6332 unsigned int level)
6333 {
6334 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6335
6336 if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) {
6337 return KERN_INVALID_ADDRESS;
6338 }
6339 pmap_paddr_t pa;
6340 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6341 const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1;
6342 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6343 tt_entry_t *table_ttep = pmap->tte;
6344 tt_entry_t *ttep;
6345 tt_entry_t old_tte = ARM_TTE_EMPTY;
6346
6347 pa = 0x0ULL;
6348
6349 for (; ttlevel < level; ttlevel++) {
6350 /**
6351 * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE.
6352 * Doing this step at the beginning of the loop instead of the end (which would make it part of
6353 * the prior iteration) avoids the possibility of executing this step to extract an L3 table KVA
6354 * from an L2 TTE, which would be useless because there would be no next iteration to make use
6355 * of the table KVA.
6356 */
6357 if (table_ttep == NULL) {
6358 assert(tte_is_valid_table(old_tte));
6359 table_ttep = (tt_entry_t*)phystokv(old_tte & ARM_TTE_TABLE_MASK);
6360 }
6361
6362 vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr);
6363
6364 /**
6365 * We don't need to hold the pmap lock while walking the paging hierarchy. Only L3 tables are
6366 * allowed to be dynamically removed, and only for regular user pmaps at that. We may allocate
6367 * a new L3 table below, but we will only access L0-L2 tables, so there's no risk of a table
6368 * being deleted while we are using it for the next level(s) of lookup.
6369 */
6370 ttep = &table_ttep[ttn_index(pt_attr, vaddr, ttlevel)];
6371 old_tte = os_atomic_load(ttep, relaxed);
6372 table_ttep = NULL;
6373 if (!tte_is_valid_table(old_tte)) {
6374 tt_entry_t new_tte, *new_ttep;
6375 while (pmap_tt_allocate(pmap, &new_ttep, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) {
6376 if (options & PMAP_OPTIONS_NOWAIT) {
6377 return KERN_RESOURCE_SHORTAGE;
6378 }
6379 VM_PAGE_WAIT();
6380 }
6381 /* Grab the pmap lock to ensure we don't try to concurrently map different tables at the same TTE. */
6382 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6383 old_tte = os_atomic_load(ttep, relaxed);
6384 if (!tte_is_valid_table(old_tte)) {
6385 pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE);
6386 pa = kvtophys_nofail((vm_offset_t)new_ttep);
6387 /*
6388 * If the table is going to map a kernel RO zone VA region, then we must
6389 * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE. The SPTM's type system
6390 * requires the table to be transitioned through XNU_DEFAULT for refcount
6391 * enforcement, which is fine since this path is expected to execute only
6392 * once during boot.
6393 */
6394 if (__improbable(ttlevel == pt_attr_twig_level(pt_attr)) &&
6395 (pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE)) {
6396 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
6397 sptm_retype(pa, XNU_PAGE_TABLE, XNU_DEFAULT, retype_params);
6398 retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
6399 sptm_retype(pa, XNU_DEFAULT, XNU_PAGE_TABLE_ROZONE, retype_params);
6400 }
6401 new_tte = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6402 sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte);
6403 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6404 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte);
6405 /**
6406 * If we need to set up multiple TTEs mapping different parts of the same page
6407 * (e.g. because we're carving multiple 4K page tables out of a 16K native page),
6408 * determine which of the grouped TTEs is the one that we need to follow for the
6409 * next level of the table walk.
6410 */
6411 table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) *
6412 (pmap_page_size / sizeof(tt_entry_t)));
6413 pa = 0x0ULL;
6414 new_ttep = (tt_entry_t *)NULL;
6415 }
6416 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6417
6418 if (new_ttep != (tt_entry_t *)NULL) {
6419 pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1);
6420 new_ttep = (tt_entry_t *)NULL;
6421 }
6422 }
6423 }
6424
6425 return KERN_SUCCESS;
6426 }
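
/*
 * Illustrative sketch (not part of the build) of the caller retry pattern
 * described in the block comment above: a newly allocated user L3 table may
 * be removed by another thread before the caller re-acquires the pmap lock,
 * so callers are expected to loop until they observe a valid leaf PTE
 * pointer with the lock held (see pmap_enter_options_internal() for the
 * real usage). The lock mode below is chosen for illustration only.
 *
 *     const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 *     pt_entry_t *pte_p = PT_ENTRY_NULL;
 *
 *     while (true) {
 *         pmap_lock(pmap, PMAP_LOCK_SHARED);
 *         pte_p = pmap_pte(pmap, vaddr);
 *         if (pte_p != PT_ENTRY_NULL) {
 *             break;  // leaf table present; safe to use while the lock is held
 *         }
 *         pmap_unlock(pmap, PMAP_LOCK_SHARED);
 *         kern_return_t kr = pmap_expand(pmap, vaddr, options,
 *             pt_attr_leaf_level(pt_attr));
 *         if (kr != KERN_SUCCESS) {
 *             return kr;  // e.g. KERN_RESOURCE_SHORTAGE with PMAP_OPTIONS_NOWAIT
 *         }
 *     }
 */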
6427
6428 /*
6429 * Routine: pmap_gc
6430 * Function:
6431 * Pmap garbage collection
6432 * Called by the pageout daemon when pages are scarce.
6433 *
6434 */
6435 void
6436 pmap_gc(void)
6437 {
6438 /*
6439 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
6440 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6441 * or may contain wired mappings. However, it may make sense to scan the pmap VM
6442 * object here, and for each page consult the SPTM frame table and if necessary
6443 * the PTD in the PV head table. If the frame table indicates the page is a leaf
6444 * page table page and the PTD indicates it has no wired mappings, we can call
6445 * pmap_remove() on the VA region mapped by the page and therein return the page
6446 * to the VM.
6447 */
6448 }
6449
6450 /*
6451 * By default, don't attempt pmap GC more frequently
6452 * than once per minute.
6453 */
6454
6455 void
6456 compute_pmap_gc_throttle(
6457 void *arg __unused)
6458 {
6459 }
6460
6461 /*
6462 * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
6463 *
6464 * Invalidates all of the instruction cache on a physical page and
6465 * pushes any dirty data from the data cache for the same physical page.
6466 */
6467
6468 kern_return_t
6469 pmap_attribute_cache_sync(
6470 ppnum_t pp,
6471 vm_size_t size,
6472 __unused vm_machine_attribute_t attribute,
6473 __unused vm_machine_attribute_val_t * value)
6474 {
6475 if (size > PAGE_SIZE) {
6476 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6477 } else {
6478 cache_sync_page(pp);
6479 }
6480
6481 return KERN_SUCCESS;
6482 }
6483
6484 /*
6485 * pmap_sync_page_data_phys(ppnum_t pp)
6486 *
6487 * Invalidates all of the instruction cache on a physical page and
6488 * pushes any dirty data from the data cache for the same physical page.
6489 * Not required on SPTM systems, because the SPTM automatically performs
6490 * the invalidate operation when retyping to one of the types that allow
6491 * for executable permissions.
6492 */
6493 void
6494 pmap_sync_page_data_phys(
6495 __unused ppnum_t pp)
6496 {
6497 return;
6498 }
6499
6500 /*
6501 * pmap_sync_page_attributes_phys(ppnum_t pp)
6502 *
6503 * Write back and invalidate all cachelines on a physical page.
6504 */
6505 void
6506 pmap_sync_page_attributes_phys(
6507 ppnum_t pp)
6508 {
6509 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6510 }
6511
6512 #if CONFIG_COREDUMP
6513 /* temporary workaround */
6514 boolean_t
6515 coredumpok(
6516 vm_map_t map,
6517 mach_vm_offset_t va)
6518 {
6519 pt_entry_t *pte_p;
6520 pt_entry_t spte;
6521
6522 pte_p = pmap_pte(map->pmap, va);
6523 if (0 == pte_p) {
6524 return FALSE;
6525 }
6526 if (vm_map_entry_has_device_pager(map, va)) {
6527 return FALSE;
6528 }
6529 spte = *pte_p;
6530 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
6531 }
6532 #endif
6533
6534 void
6535 fillPage(
6536 ppnum_t pn,
6537 unsigned int fill)
6538 {
6539 unsigned int *addr;
6540 int count;
6541
6542 addr = (unsigned int *) phystokv(ptoa(pn));
6543 count = PAGE_SIZE / sizeof(unsigned int);
6544 while (count--) {
6545 *addr++ = fill;
6546 }
6547 }
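
/*
 * Illustrative usage (not part of the build): fillPage() stamps a 32-bit
 * pattern across an entire physical page via its physical-aperture mapping,
 * e.g. to make a page recognizable while debugging. 'pn' is a placeholder
 * page number.
 *
 *     fillPage(pn, 0xDEADBEEF);
 */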
6548
6549 extern void mapping_set_mod(ppnum_t pn);
6550
6551 void
6552 mapping_set_mod(
6553 ppnum_t pn)
6554 {
6555 pmap_set_modify(pn);
6556 }
6557
6558 extern void mapping_set_ref(ppnum_t pn);
6559
6560 void
6561 mapping_set_ref(
6562 ppnum_t pn)
6563 {
6564 pmap_set_reference(pn);
6565 }
6566
6567 /*
6568 * Clear specified attribute bits.
6569 *
6570 * Try to force an arm_fast_fault() for all mappings of
6571 * the page - to force attributes to be set again at fault time.
6572 * If the forcing succeeds, clear the cached bits at the head.
6573 * Otherwise, something must have been wired, so leave the cached
6574 * attributes alone.
6575 */
6576 MARK_AS_PMAP_TEXT static void
6577 phys_attribute_clear_with_flush_range(
6578 ppnum_t pn,
6579 unsigned int bits,
6580 int options,
6581 void *arg,
6582 pmap_tlb_flush_range_t *flush_range)
6583 {
6584 pmap_paddr_t pa = ptoa(pn);
6585 vm_prot_t allow_mode = VM_PROT_ALL;
6586
6587 if ((arg != NULL) || (flush_range != NULL)) {
6588 options = options & ~PMAP_OPTIONS_NOFLUSH;
6589 }
6590
6591 if (__improbable((options & PMAP_OPTIONS_FF_WIRED) != 0)) {
6592 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6593 "invalid options",
6594 pn, bits, options, arg, flush_range);
6595 }
6596
6597 if (__improbable((bits & PP_ATTR_MODIFIED) &&
6598 (options & PMAP_OPTIONS_NOFLUSH))) {
6599 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6600 "should not clear 'modified' without flushing TLBs",
6601 pn, bits, options, arg, flush_range);
6602 }
6603
6604 assert(pn != vm_page_fictitious_addr);
6605
6606 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
6607 assert(bits == PP_ATTR_MODIFIED);
6608
6609 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, NULL, flush_range);
6610 /*
6611 * We short circuit this case; it should not need to
6612 * invoke arm_force_fast_fault, so just clear the modified bit.
6613 * pmap_page_protect has taken care of resetting
6614 * the state so that we'll see the next write as a fault to
6615 * the VM (i.e. we don't want a fast fault).
6616 */
6617 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6618 return;
6619 }
6620 if (bits & PP_ATTR_REFERENCED) {
6621 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
6622 }
6623 if (bits & PP_ATTR_MODIFIED) {
6624 allow_mode &= ~VM_PROT_WRITE;
6625 }
6626
6627 if (bits == PP_ATTR_NOENCRYPT) {
6628 /*
6629 * We short circuit this case; it should not need to
6630 * invoke arm_force_fast_fault, so just clear and
6631 * return. On ARM, this bit is just a debugging aid.
6632 */
6633 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6634 return;
6635 }
6636
6637 arm_force_fast_fault_with_flush_range(pn, allow_mode, options, NULL, (pp_attr_t)bits, flush_range);
6638 }
6639
6640 MARK_AS_PMAP_TEXT void
6641 phys_attribute_clear_internal(
6642 ppnum_t pn,
6643 unsigned int bits,
6644 int options,
6645 void *arg)
6646 {
6647 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
6648 }
6649
6650 #if __ARM_RANGE_TLBI__
6651
6652 MARK_AS_PMAP_TEXT static vm_map_address_t
6653 phys_attribute_clear_twig_internal(
6654 pmap_t pmap,
6655 vm_map_address_t start,
6656 vm_map_address_t end,
6657 unsigned int bits,
6658 unsigned int options,
6659 pmap_tlb_flush_range_t *flush_range)
6660 {
6661 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
6662 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6663 assert(end >= start);
6664 assert((end - start) <= pt_attr_twig_size(pt_attr));
6665 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6666 vm_map_address_t va = start;
6667 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
6668 tt_entry_t *tte_p;
6669 tte_p = pmap_tte(pmap, start);
6670
6671 /**
6672 * It's possible that this portion of our VA region has never been paged in, in which case
6673 * there may not be a valid twig or leaf table here.
6674 */
6675 if ((tte_p == (tt_entry_t *) NULL) || !tte_is_valid_table(*tte_p)) {
6676 assert(flush_range->pending_region_entries == 0);
6677 return end;
6678 }
6679
6680 pte_p = (pt_entry_t *) ttetokv(*tte_p);
6681
6682 start_pte_p = &pte_p[pte_index(pt_attr, start)];
6683 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
6684 assert(end_pte_p >= start_pte_p);
6685 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
6686 if (flush_range->pending_region_entries == 0) {
6687 flush_range->pending_region_start = va;
6688 } else {
6689 assertf((flush_range->pending_region_start +
6690 (flush_range->pending_region_entries * pmap_page_size)) == va,
6691 "pending_region_start 0x%llx + 0x%lx pages != va 0x%llx",
6692 (unsigned long long)flush_range->pending_region_start,
6693 (unsigned long)flush_range->pending_region_entries,
6694 (unsigned long long)va);
6695 }
6696 flush_range->current_ptep = curr_pte_p;
6697 const pt_entry_t spte = os_atomic_load(curr_pte_p, relaxed);
6698 const pmap_paddr_t pa = pte_to_pa(spte);
6699 if (pte_is_valid(spte) && pa_valid(pa)) {
6700 /* The PTE maps a managed page, so do the appropriate PV list-based permission changes. */
6701 const ppnum_t pn = (ppnum_t) atop(pa);
6702 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
6703 if (__probable(flush_range->region_entry_added)) {
6704 flush_range->region_entry_added = false;
6705 } else {
6706 /**
6707 * It's possible that some other thread removed the mapping between our check
6708 * of the PTE above and taking the PVH lock in the
6709 * phys_attribute_clear_with_flush_range() path. In that case we have a
6710 * discontinuity in the region to update, so just submit any pending region
6711 * templates and start a new region op on the next iteration.
6712 */
6713 pmap_multipage_op_submit_region(flush_range);
6714 }
6715 } else if (__improbable(!pte_is_valid(spte))) {
6716 /**
6717 * We've found an invalid mapping, so we have a discontinuity in the region to
6718 * update. Handle this by submitting any pending region templates and starting a new
6719 * region on the next iteration. In theory we could instead handle this by installing
6720 * a "safe" (AF bit cleared, minimal permissions) PTE template; the SPTM would just
6721 * ignore the update on finding an invalid mapping in the PTE. But we don't know
6722 * what a "safe" template will be in all cases: for example, JIT regions require all
6723 * mappings to either be invalid or to have full RWX permissions.
6724 */
6725 pmap_multipage_op_submit_region(flush_range);
6726 } else if (pmap_insert_flush_range_template(spte, flush_range)) {
6727 /**
6728 * We've found a mapping to a non-managed page, so just insert the existing
6729 * PTE into the pending region ops since we don't manage attributes for non-managed
6730 * pages.
6731 * If pmap_insert_flush_range_template() returns true, indicating that it reached
6732 * the mapping limit and submitted the SPTM call, then we also submit any pending
6733 * disjoint ops. Having pending operations in either category will keep preemption
6734 * disabled, and we want to ensure that we can at least temporarily
6735 * re-enable preemption every SPTM_MAPPING_LIMIT mappings.
6736 */
6737 pmap_multipage_op_submit_disjoint(0, flush_range);
6738 }
6739
6740 /**
6741 * If the total number of pending + processed entries exceeds the mapping threshold,
6742 * we may need to submit all pending operations to avoid excessive preemption latency.
6743 * Otherwise, a small number of pending disjoint or region ops can hold preemption
6744 * disabled across an arbitrary number of total processed entries.
6745 * As an optimization, we may be able to avoid submitting if no urgent AST is
6746 * pending on the local CPU, but only if we aren't currently in an epoch. If we are
6747 * in an epoch, failure to submit in a timely manner can cause another CPU to wait
6748 * too long for our epoch to drain.
6749 */
6750 if (((flush_range->processed_entries + flush_range->pending_disjoint_entries +
6751 flush_range->pending_region_entries) >= SPTM_MAPPING_LIMIT) &&
6752 (pmap_in_epoch() || pmap_pending_preemption())) {
6753 pmap_multipage_op_submit(flush_range);
6754 assert(preemption_enabled());
6755 }
6756 }
6757
6758 /* SPTM region ops can't span L3 table boundaries, so submit any pending region templates now. */
6759 pmap_multipage_op_submit_region(flush_range);
6760 return end;
6761 }
6762
6763 MARK_AS_PMAP_TEXT vm_map_address_t
6764 phys_attribute_clear_range_internal(
6765 pmap_t pmap,
6766 vm_map_address_t start,
6767 vm_map_address_t end,
6768 unsigned int bits,
6769 unsigned int options)
6770 {
6771 if (__improbable(end < start)) {
6772 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
6773 }
6774 validate_pmap_mutable(pmap);
6775
6776 vm_map_address_t va = start;
6777 pmap_tlb_flush_range_t flush_range = {
6778 .ptfr_pmap = pmap,
6779 .ptfr_start = start,
6780 .ptfr_end = end,
6781 .current_ptep = NULL,
6782 .pending_region_start = 0,
6783 .pending_region_entries = 0,
6784 .region_entry_added = false,
6785 .current_header = NULL,
6786 .current_header_first_mapping_index = 0,
6787 .processed_entries = 0,
6788 .pending_disjoint_entries = 0,
6789 .ptfr_flush_needed = false
6790 };
6791
6792 pmap_lock(pmap, PMAP_LOCK_SHARED);
6793 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6794
6795 while (va < end) {
6796 vm_map_address_t curr_end;
6797
6798 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
6799 if (curr_end > end) {
6800 curr_end = end;
6801 }
6802
6803 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
6804 }
6805 pmap_multipage_op_submit(&flush_range);
6806 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6807 assert((flush_range.pending_disjoint_entries == 0) && (flush_range.pending_region_entries == 0));
6808 if (flush_range.ptfr_flush_needed) {
6809 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
6810 flush_range.ptfr_start,
6811 flush_range.ptfr_end - flush_range.ptfr_start,
6812 flush_range.ptfr_pmap,
6813 true);
6814 sync_tlb_flush();
6815 }
6816 return va;
6817 }
6818
6819 static void
6820 phys_attribute_clear_range(
6821 pmap_t pmap,
6822 vm_map_address_t start,
6823 vm_map_address_t end,
6824 unsigned int bits,
6825 unsigned int options)
6826 {
6827 /*
6828 * We allow single-page requests to execute non-preemptibly,
6829 * as it doesn't make sense to sample AST_URGENT for a single-page
6830 * operation, and there are a couple of special use cases that
6831 * require a non-preemptible single-page operation.
6832 */
6833 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
6834 pmap_verify_preemptible();
6835 }
6836 __assert_only const int preemption_level = get_preemption_level();
6837
6838 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
6839
6840 phys_attribute_clear_range_internal(pmap, start, end, bits, options);
6841
6842 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
6843
6844 assert(preemption_level == get_preemption_level());
6845 }
6846 #endif /* __ARM_RANGE_TLBI__ */
6847
6848 static void
6849 phys_attribute_clear(
6850 ppnum_t pn,
6851 unsigned int bits,
6852 int options,
6853 void *arg)
6854 {
6855 /*
6856 * Do we really want this tracepoint? It will be extremely chatty.
6857 * Also, should we have a corresponding trace point for the set path?
6858 */
6859 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
6860
6861 phys_attribute_clear_internal(pn, bits, options, arg);
6862
6863 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
6864 }
6865
6866 /*
6867 * Set specified attribute bits.
6868 *
6869 * Set cached value in the pv head because we have
6870 * no per-mapping hardware support for referenced and
6871 * modify bits.
6872 */
6873 MARK_AS_PMAP_TEXT void
6874 phys_attribute_set_internal(
6875 ppnum_t pn,
6876 unsigned int bits)
6877 {
6878 pmap_paddr_t pa = ptoa(pn);
6879 assert(pn != vm_page_fictitious_addr);
6880
6881 ppattr_pa_set_bits(pa, (uint16_t)bits);
6882
6883 return;
6884 }
6885
6886 static void
6887 phys_attribute_set(
6888 ppnum_t pn,
6889 unsigned int bits)
6890 {
6891 phys_attribute_set_internal(pn, bits);
6892 }
6893
6894
6895 /*
6896 * Check specified attribute bits.
6897 *
6898 * use the software cached bits (since no hw support).
6899 */
6900 static boolean_t
6901 phys_attribute_test(
6902 ppnum_t pn,
6903 unsigned int bits)
6904 {
6905 pmap_paddr_t pa = ptoa(pn);
6906 assert(pn != vm_page_fictitious_addr);
6907 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
6908 }
6909
6910
6911 /*
6912 * Set the modify/reference bits on the specified physical page.
6913 */
6914 void
6915 pmap_set_modify(ppnum_t pn)
6916 {
6917 phys_attribute_set(pn, PP_ATTR_MODIFIED);
6918 }
6919
6920
6921 /*
6922 * Clear the modify bits on the specified physical page.
6923 */
6924 void
6925 pmap_clear_modify(
6926 ppnum_t pn)
6927 {
6928 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
6929 }
6930
6931
6932 /*
6933 * pmap_is_modified:
6934 *
6935 * Return whether or not the specified physical page is modified
6936 * by any physical maps.
6937 */
6938 boolean_t
6939 pmap_is_modified(
6940 ppnum_t pn)
6941 {
6942 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
6943 }
6944
6945
6946 /*
6947 * Set the reference bit on the specified physical page.
6948 */
6949 static void
6950 pmap_set_reference(
6951 ppnum_t pn)
6952 {
6953 phys_attribute_set(pn, PP_ATTR_REFERENCED);
6954 }
6955
6956 /*
6957 * Clear the reference bits on the specified physical page.
6958 */
6959 void
6960 pmap_clear_reference(
6961 ppnum_t pn)
6962 {
6963 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
6964 }
6965
6966
6967 /*
6968 * pmap_is_referenced:
6969 *
6970 * Return whether or not the specified physical page is referenced
6971 * by any physical maps.
6972 */
6973 boolean_t
6974 pmap_is_referenced(
6975 ppnum_t pn)
6976 {
6977 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
6978 }
6979
6980 /*
6981 * pmap_get_refmod(phys)
6982 * returns the referenced and modified bits of the specified
6983 * physical page.
6984 */
6985 unsigned int
6986 pmap_get_refmod(
6987 ppnum_t pn)
6988 {
6989 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
6990 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
6991 }
6992
6993 static inline unsigned int
6994 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
6995 {
6996 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
6997 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
6998 }
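
/*
 * Worked example (illustration only): the VM expresses ref/mod state with
 * VM_MEM_REFERENCED / VM_MEM_MODIFIED, while the pmap caches it as
 * PP_ATTR_REFERENCED / PP_ATTR_MODIFIED. The helper above translates one
 * namespace into the other:
 *
 *     pmap_clear_refmod_mask_to_modified_bits(VM_MEM_MODIFIED)
 *         == PP_ATTR_MODIFIED
 *     pmap_clear_refmod_mask_to_modified_bits(VM_MEM_MODIFIED | VM_MEM_REFERENCED)
 *         == (PP_ATTR_MODIFIED | PP_ATTR_REFERENCED)
 */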
6999
7000 /*
7001 * pmap_clear_refmod_options(phys, mask, options, arg)
7002 * clears the referenced and modified bits as specified by the mask
7003 * of the specified physical page.
7004 */
7005 void
7006 pmap_clear_refmod_options(
7007 ppnum_t pn,
7008 unsigned int mask,
7009 unsigned int options,
7010 void *arg)
7011 {
7012 unsigned int bits;
7013
7014 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7015 phys_attribute_clear(pn, bits, options, arg);
7016 }
7017
7018 /*
7019 * Perform pmap_clear_refmod_options on a virtual address range.
7020 * The operation will be performed in bulk & tlb flushes will be coalesced
7021 * if possible.
7022 *
7023 * Returns true if the operation is supported on this platform.
7024 * If this function returns false, the operation is not supported and
7025 * nothing has been modified in the pmap.
7026 */
7027 bool
7028 pmap_clear_refmod_range_options(
7029 pmap_t pmap __unused,
7030 vm_map_address_t start __unused,
7031 vm_map_address_t end __unused,
7032 unsigned int mask __unused,
7033 unsigned int options __unused)
7034 {
7035 #if __ARM_RANGE_TLBI__
7036 unsigned int bits;
7037 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7038 phys_attribute_clear_range(pmap, start, end, bits, options);
7039 return true;
7040 #else /* __ARM_RANGE_TLBI__ */
7041 #pragma unused(pmap, start, end, mask, options)
7042 /*
7043 * This operation allows the VM to bulk modify refmod bits on a virtually
7044 * contiguous range of addresses. This is a large performance improvement on
7045 * platforms that support ranged tlbi instructions. But on older platforms,
7046 * we can only flush per-page or the entire asid, so we currently
7047 * only support this operation on platforms that support ranged tlbi
7048 * instructions. On other platforms, we require that
7049 * the VM modify the bits on a per-page basis.
7050 */
7051 return false;
7052 #endif /* __ARM_RANGE_TLBI__ */
7053 }
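
/*
 * Illustrative sketch (not part of the build) of the fallback contract
 * described above: when the bulk range operation is unsupported (no ranged
 * TLBI), the caller is expected to clear the bits page by page instead.
 * 'pn_for_va' is a hypothetical helper standing in for however the caller
 * maps a VA in [start, end) to its resident physical page.
 *
 *     if (!pmap_clear_refmod_range_options(pmap, start, end, mask, options)) {
 *         for (vm_map_address_t va = start; va < end; va += PAGE_SIZE) {
 *             ppnum_t pn = pn_for_va(va);
 *             if (pn != 0) {
 *                 pmap_clear_refmod_options(pn, mask, options, NULL);
 *             }
 *         }
 *     }
 */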
7054
7055 void
7056 pmap_clear_refmod(
7057 ppnum_t pn,
7058 unsigned int mask)
7059 {
7060 pmap_clear_refmod_options(pn, mask, 0, NULL);
7061 }
7062
7063 unsigned int
7064 pmap_disconnect_options(
7065 ppnum_t pn,
7066 unsigned int options,
7067 void *arg)
7068 {
7069 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7070 /*
7071 * On ARM, the "modified" bit is managed by software, so
7072 * we know up-front if the physical page is "modified",
7073 * without having to scan all the PTEs pointing to it.
7074 * The caller should have made the VM page "busy" so no one
7075 * should be able to establish any new mapping and "modify"
7076 * the page behind us.
7077 */
7078 if (pmap_is_modified(pn)) {
7079 /*
7080 * The page has been modified and will be sent to
7081 * the VM compressor.
7082 */
7083 options |= PMAP_OPTIONS_COMPRESSOR;
7084 } else {
7085 /*
7086 * The page hasn't been modified and will be freed
7087 * instead of compressed.
7088 */
7089 }
7090 }
7091
7092 /* disconnect the page */
7093 pmap_page_protect_options(pn, 0, options, arg);
7094
7095 /* return ref/chg status */
7096 return pmap_get_refmod(pn);
7097 }
7098
7099 /*
7100 * Routine:
7101 * pmap_disconnect
7102 *
7103 * Function:
7104 * Disconnect all mappings for this page and return reference and change status
7105 * in generic format.
7106 *
7107 */
7108 unsigned int
7109 pmap_disconnect(
7110 ppnum_t pn)
7111 {
7112 pmap_page_protect(pn, 0); /* disconnect the page */
7113 return pmap_get_refmod(pn); /* return ref/chg status */
7114 }
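
/*
 * Illustrative sketch (not part of the build): a hypothetical caller using
 * pmap_disconnect() and interpreting the returned ref/chg status, which is
 * expressed in the generic VM_MEM_* namespace (see pmap_get_refmod() above).
 *
 *     unsigned int refmod = pmap_disconnect(pn);
 *     if (refmod & VM_MEM_MODIFIED) {
 *         // the page was written through some mapping; its contents
 *         // must be preserved (e.g. compressed or paged out)
 *     }
 *     if (refmod & VM_MEM_REFERENCED) {
 *         // the page was recently accessed; relevant to pageout heuristics
 *     }
 */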
7115
7116 boolean_t
7117 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7118 {
7119 if (ptoa(first) >= vm_last_phys) {
7120 return FALSE;
7121 }
7122 if (ptoa(last) < vm_first_phys) {
7123 return FALSE;
7124 }
7125
7126 return TRUE;
7127 }
7128
7129 /*
7130 * The state maintained by the noencrypt functions is used as a
7131 * debugging aid on ARM. This incurs some overhead on the part
7132 * of the caller. A special case check in phys_attribute_clear
7133 * (the most expensive path) currently minimizes this overhead,
7134 * but stubbing these functions out on RELEASE kernels yields
7135 * further wins.
7136 */
7137 boolean_t
7138 pmap_is_noencrypt(
7139 ppnum_t pn)
7140 {
7141 #if DEVELOPMENT || DEBUG
7142 boolean_t result = FALSE;
7143
7144 if (!pa_valid(ptoa(pn))) {
7145 return FALSE;
7146 }
7147
7148 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7149
7150 return result;
7151 #else
7152 #pragma unused(pn)
7153 return FALSE;
7154 #endif
7155 }
7156
7157 void
7158 pmap_set_noencrypt(
7159 ppnum_t pn)
7160 {
7161 #if DEVELOPMENT || DEBUG
7162 if (!pa_valid(ptoa(pn))) {
7163 return;
7164 }
7165
7166 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7167 #else
7168 #pragma unused(pn)
7169 #endif
7170 }
7171
7172 void
7173 pmap_clear_noencrypt(
7174 ppnum_t pn)
7175 {
7176 #if DEVELOPMENT || DEBUG
7177 if (!pa_valid(ptoa(pn))) {
7178 return;
7179 }
7180
7181 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7182 #else
7183 #pragma unused(pn)
7184 #endif
7185 }
7186
7187 void
7188 pmap_lock_phys_page(ppnum_t pn)
7189 {
7190 unsigned int pai;
7191 pmap_paddr_t phys = ptoa(pn);
7192
7193 if (pa_valid(phys)) {
7194 pai = pa_index(phys);
7195 __unused const locked_pvh_t locked_pvh = pvh_lock(pai);
7196 } else {
7197 simple_lock(&phys_backup_lock, LCK_GRP_NULL);
7198 }
7199 }
7200
7201
7202 void
7203 pmap_unlock_phys_page(ppnum_t pn)
7204 {
7205 unsigned int pai;
7206 pmap_paddr_t phys = ptoa(pn);
7207
7208 if (pa_valid(phys)) {
7209 pai = pa_index(phys);
7210 locked_pvh_t locked_pvh = {.pvh = pai_to_pvh(pai), .pai = pai};
7211 pvh_unlock(&locked_pvh);
7212 } else {
7213 simple_unlock(&phys_backup_lock);
7214 }
7215 }
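
/*
 * Illustrative usage (not part of the build): pmap_lock_phys_page() and
 * pmap_unlock_phys_page() are intended to bracket a short critical section
 * on a physical page. Managed pages take the per-page PVH lock; non-managed
 * pages fall back to the global phys_backup_lock.
 *
 *     pmap_lock_phys_page(pn);
 *     // ... inspect or update state associated with this physical page ...
 *     pmap_unlock_phys_page(pn);
 */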
7216
7217 MARK_AS_PMAP_TEXT void
7218 pmap_clear_user_ttb_internal(void)
7219 {
7220 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7221 }
7222
7223 void
7224 pmap_clear_user_ttb(void)
7225 {
7226 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7227 pmap_clear_user_ttb_internal();
7228 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7229 }
7230
7231 /**
7232 * Set up a "fast fault", or a page fault that won't go through the VM layer on
7233 * a page. This is primarily used to manage ref/mod bits in software. Depending
7234 * on the value of allow_mode, the next read and/or write of the page will fault
7235 * and the ref/mod bits will be updated.
7236 *
7237 * @param ppnum Page number to set up a fast fault on.
7238 * @param allow_mode VM_PROT_NONE will cause the next read and write access to
7239 * fault.
7240 * VM_PROT_READ will only cause the next write access to fault.
7241 * Other values are undefined.
7242 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
7243 * PMAP_OPTIONS_FF_WIRED forces a fast fault even on wired pages.
7244 * PMAP_OPTIONS_SET_REUSABLE/PMAP_OPTIONS_CLEAR_REUSABLE updates
7245 * the global reusable bit of the page.
7246 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
7247 * by the caller. This is an input/output parameter which may be updated
7248 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
7249 * @param bits_to_clear Mask of additional pp_attr_t bits to clear for the physical
7250 * page, iff this function completes successfully and returns
7251 * TRUE. This is typically some combination of
7252 * the referenced, modified, and noencrypt bits.
7253 * @param flush_range When present, this function will skip the TLB flush for the
7254 * mappings that are covered by the range, leaving that to be
7255 * done later by the caller. It may also avoid submitting mapping
7256 * updates directly to the SPTM, instead accumulating them in a
7257 * per-CPU array to be submitted later by the caller.
7258 *
7259 * @return TRUE if the fast fault was successfully configured for all mappings
7260 * of the page, FALSE otherwise (e.g. if wired mappings are present and
7261 * PMAP_OPTIONS_FF_WIRED was not passed).
7262 *
7263 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7264 *
7265 * @warning PMAP_OPTIONS_FF_WIRED should only be used with pages accessible from
7266 * EL0. The kernel may assume that accesses to wired, kernel-owned pages
7267 * won't fault.
7268 */
7269 MARK_AS_PMAP_TEXT static boolean_t
7270 arm_force_fast_fault_with_flush_range(
7271 ppnum_t ppnum,
7272 vm_prot_t allow_mode,
7273 int options,
7274 locked_pvh_t *locked_pvh,
7275 pp_attr_t bits_to_clear,
7276 pmap_tlb_flush_range_t *flush_range)
7277 {
7278 pmap_paddr_t phys = ptoa(ppnum);
7279 pv_entry_t *pve_p;
7280 pt_entry_t *pte_p;
7281 unsigned int pai;
7282 boolean_t result;
7283 unsigned int num_mappings = 0, num_skipped_mappings = 0;
7284 bool ref_fault;
7285 bool mod_fault;
7286 bool clear_write_fault = false;
7287 bool ref_aliases_mod = false;
7288
7289 assert(ppnum != vm_page_fictitious_addr);
7290
7291 /**
7292 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7293 *
7294 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
7295 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
7296 * semantics conflict with each other, so assert they are not both true.
7297 */
7298 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
7299
7300 if (!pa_valid(phys)) {
7301 return FALSE; /* Not a managed page. */
7302 }
7303
7304 result = TRUE;
7305 ref_fault = false;
7306 mod_fault = false;
7307 pai = pa_index(phys);
7308 locked_pvh_t local_locked_pvh = {.pvh = 0};
7309 if (__probable(locked_pvh == NULL)) {
7310 if (flush_range != NULL) {
7311 /**
7312 * If we're partway through processing a multi-page batched call,
7313 * preemption will already be disabled so we can't simply call
7314 * pvh_lock() which may block. Instead, we first try to acquire
7315 * the lock without waiting, which in most cases should succeed.
7316 * If it fails, we submit the pending batched operations to
7317 * re-enable preemption and then acquire the lock normally.
7318 */
7319 local_locked_pvh = pvh_try_lock(pai);
7320 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
7321 pmap_multipage_op_submit(flush_range);
7322 local_locked_pvh = pvh_lock(pai);
7323 }
7324 } else {
7325 local_locked_pvh = pvh_lock(pai);
7326 }
7327 } else {
7328 local_locked_pvh = *locked_pvh;
7329 assert(pai == local_locked_pvh.pai);
7330 }
7331 assert(local_locked_pvh.pvh != 0);
7332 pvh_assert_locked(pai);
7333
7334 pte_p = PT_ENTRY_NULL;
7335 pve_p = PV_ENTRY_NULL;
7336 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
7337 pte_p = pvh_ptep(local_locked_pvh.pvh);
7338 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
7339 pve_p = pvh_pve_list(local_locked_pvh.pvh);
7340 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
7341 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
7342 }
7343
7344 const bool is_reusable = ppattr_test_reusable(pai);
7345
7346 bool pvh_lock_sleep_mode_needed = false;
7347 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
7348 sptm_disjoint_op_t *sptm_ops = NULL;
7349
7350 /**
7351 * This would also work as a block, with the above variables declared using the
7352 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
7353 * dereferencing __block variables through stack forwarding pointers) isn't needed
7354 * here, as we never need to use this code sequence as a closure.
7355 */
7356 #define FFF_PERCPU_INIT() do { \
7357 disable_preemption(); \
7358 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
7359 sptm_ops = sptm_pcpu->sptm_ops; \
7360 } while (0)
7361
7362 FFF_PERCPU_INIT();
7363
7364 int pve_ptep_idx = 0;
7365
7366 /**
7367 * With regard to TLBI, there are three cases:
7368 *
7369 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
7370 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
7371 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
7372 * mapping is out of the range.
7373 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
7374 * let SPTM handle TLBI flushing.
7375 */
7376 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
7377 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
7378
7379 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7380 pt_entry_t spte;
7381 pt_entry_t tmplate;
7382
7383 if (__improbable(pvh_lock_sleep_mode_needed)) {
7384 assert((num_mappings == 0) && (num_skipped_mappings == 0));
7385 /**
7386 * Undo the explicit preemption disable done in the last call to FFF_PERCPU_INIT().
7387 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
7388 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
7389 * core while processing SPTM per-CPU data. At the same time, we also want preemption
7390 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
7391 * urgent ASTs can be handled.
7392 */
7393 enable_preemption();
7394 pvh_lock_enter_sleep_mode(&local_locked_pvh);
7395 pvh_lock_sleep_mode_needed = false;
7396 FFF_PERCPU_INIT();
7397 }
7398
7399 if (pve_p != PV_ENTRY_NULL) {
7400 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7401 if (pte_p == PT_ENTRY_NULL) {
7402 goto fff_skip_pve;
7403 }
7404 }
7405
7406 #ifdef PVH_FLAG_IOMMU
7407 if (pvh_ptep_is_iommu(pte_p)) {
7408 ++num_skipped_mappings;
7409 goto fff_skip_pve;
7410 }
7411 #endif
7412 spte = os_atomic_load(pte_p, relaxed);
7413 if (pte_is_compressed(spte, pte_p)) {
7414 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
7415 }
7416
7417 pt_desc_t *ptdp = NULL;
7418 pmap_t pmap = NULL;
7419 vm_map_address_t va = 0;
7420
7421 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
7422 /**
7423 * If the current mapping matches the flush range's current iteration position,
7424 * there's no need to do the work of getting the PTD. We already know the pmap,
7425 * and the VA is implied by flush_range->pending_region_start.
7426 */
7427 pmap = flush_range->ptfr_pmap;
7428 } else {
7429 ptdp = ptep_get_ptd(pte_p);
7430 pmap = ptdp->pmap;
7431 va = ptd_get_va(ptdp, pte_p);
7432 assert(va >= pmap->min && va < pmap->max);
7433 }
7434
7435 bool skip_pte = pte_is_wired(spte) &&
7436 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
7437
7438 if (skip_pte) {
7439 result = FALSE;
7440 }
7441
7442 // A concurrent pmap_remove() may have cleared the PTE
7443 if (__improbable(!pte_is_valid(spte))) {
7444 skip_pte = true;
7445 }
7446
7447 /**
7448 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
7449 * pending disjoint ops, so we don't need to do flush range disjoint op management.
7450 */
7451 if ((flush_range != NULL) && (ptdp != NULL) && !skip_pte) {
7452 /**
7453 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
7454 * We do this in three cases:
7455 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
7456 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
7457 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
7458 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
7459 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
7460 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
7461 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
7462 */
7463 uint32_t per_mapping_sptm_update_options = sptm_update_options;
7464 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
7465 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
7466 }
7467 if ((num_mappings == 0) ||
7468 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
7469 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
7470 /**
7471 * If we needed to submit the pending disjoint ops to make room for the new page,
7472 * flush any pending region ops to reenable preemption and restart the loop with
7473 * the lock in sleep mode. This prevents preemption from being held disabled
7474 * for an arbitrary amount of time in the pathological case in which we have
7475 * both pending region ops and an excessively long PV list that repeatedly
7476 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
7477 */
7478 pmap_multipage_op_submit_region(flush_range);
7479 assert(num_mappings == 0);
7480 num_skipped_mappings = 0;
7481 pvh_lock_sleep_mode_needed = true;
7482 continue;
7483 }
7484 }
7485 }
7486
7487 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7488
7489 /* update pmap stats and ledgers */
7490 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
7491 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
7492 if (is_altacct) {
7493 /*
7494 * We do not track "reusable" status for
7495 * "alternate accounting" mappings.
7496 */
7497 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
7498 is_reusable &&
7499 is_internal &&
7500 pmap != kernel_pmap) {
7501 /* one less "reusable" */
7502 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7503 /* one more "internal" */
7504 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7505 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7506
7507 /*
7508 * Since the page is being marked non-reusable, we assume that it will be
7509 * modified soon. Avoid the cost of another trap to handle the fast
7510 * fault when we next write to this page.
7511 */
7512 clear_write_fault = true;
7513 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
7514 !is_reusable &&
7515 is_internal &&
7516 pmap != kernel_pmap) {
7517 /* one more "reusable" */
7518 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7519 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7520 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7521 }
7522
7523 if (skip_pte) {
7524 ++num_skipped_mappings;
7525 goto fff_skip_pve;
7526 }
7527
7528 tmplate = spte;
7529
7530 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
7531 /* read protection sets the pte to fault */
7532 tmplate = tmplate & ~ARM_PTE_AF;
7533 ref_fault = true;
7534 }
7535 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
7536 /* take away write permission if set */
7537 if (pmap == kernel_pmap) {
7538 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
7539 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
7540 pte_set_was_writeable(tmplate, true);
7541 mod_fault = true;
7542 }
7543 } else {
7544 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
7545 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
7546 pte_set_was_writeable(tmplate, true);
7547 mod_fault = true;
7548 }
7549 }
7550 }
7551
7552 if (ptdp != NULL) {
7553 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
7554 sptm_ops[num_mappings].vaddr = va;
7555 sptm_ops[num_mappings].pte_template = tmplate;
7556 ++num_mappings;
7557 } else if (pmap_insert_flush_range_template(tmplate, flush_range)) {
7558 /**
7559 * We submit both the pending disjoint and pending region ops whenever
7560 * either category reaches the mapping limit. Having pending operations
7561 * in either category will keep preemption disabled, and we want to ensure
7562 * that we can at least temporarily re-enable preemption roughly every
7563 * SPTM_MAPPING_LIMIT mappings.
7564 */
7565 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7566 pvh_lock_sleep_mode_needed = true;
7567 num_mappings = num_skipped_mappings = 0;
7568 }
7569 fff_skip_pve:
7570 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
7571 if (flush_range != NULL) {
7572 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
7573 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7574 pmap_multipage_op_submit_region(flush_range);
7575 } else if (num_mappings > 0) {
7576 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7577 }
7578 pvh_lock_sleep_mode_needed = true;
7579 num_mappings = num_skipped_mappings = 0;
7580 }
7581 pte_p = PT_ENTRY_NULL;
7582 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7583 pve_ptep_idx = 0;
7584 pve_p = pve_next(pve_p);
7585 }
7586 }
7587
7588 if (num_mappings != 0) {
7589 sptm_return_t sptm_ret;
7590
7591 if (flush_range == NULL) {
7592 sptm_ret = sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7593 } else {
7594 /* Resync the pending mapping state in flush_range with our local state. */
7595 assert(num_mappings >= flush_range->pending_disjoint_entries);
7596 flush_range->pending_disjoint_entries = num_mappings;
7597 }
7598 }
7599
7600 /**
7601 * Undo the explicit disable_preemption() done in FFF_PERCPU_INIT().
7602 * Note that enable_preemption() decrements a per-thread counter, so if
7603 * we happen to still hold the PVH lock in spin mode then preemption won't
7604 * actually be re-enabled until we drop the lock (which also decrements
7605 * the per-thread counter).
7606 */
7607 enable_preemption();
7608
7609 /*
7610 * If we are using the same approach for ref and mod
7611 * faults on this PTE, do not clear the write fault;
7612 * this would cause both ref and mod to be set on the
7613 * page again, and prevent us from taking ANY read/write
7614 * fault on the mapping.
7615 */
7616 if (clear_write_fault && !ref_aliases_mod) {
7617 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, local_locked_pvh.pvh, PT_ENTRY_NULL, 0);
7618 }
7619
7620 pp_attr_t attrs_to_clear = (result ? bits_to_clear : 0);
7621 pp_attr_t attrs_to_set = 0;
7622 /* update global "reusable" status for this page */
7623 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
7624 attrs_to_clear |= PP_ATTR_REUSABLE;
7625 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
7626 attrs_to_set |= PP_ATTR_REUSABLE;
7627 }
7628
7629 if (mod_fault) {
7630 attrs_to_set |= PP_ATTR_MODFAULT;
7631 }
7632 if (ref_fault) {
7633 attrs_to_set |= PP_ATTR_REFFAULT;
7634 }
7635
7636 if (attrs_to_set | attrs_to_clear) {
7637 ppattr_modify_bits(pai, attrs_to_clear, attrs_to_set);
7638 }
7639
7640 if (__probable(locked_pvh == NULL)) {
7641 pvh_unlock(&local_locked_pvh);
7642 } else {
7643 *locked_pvh = local_locked_pvh;
7644 }
7645 if ((flush_range != NULL) && !preemption_enabled()) {
7646 flush_range->processed_entries += num_skipped_mappings;
7647 }
7648 return result;
7649 }
7650
7651 MARK_AS_PMAP_TEXT boolean_t
7652 arm_force_fast_fault_internal(
7653 ppnum_t ppnum,
7654 vm_prot_t allow_mode,
7655 int options)
7656 {
7657 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7658 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7659 }
7660 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL, 0, NULL);
7661 }
7662
7663 /*
7664 * Routine: arm_force_fast_fault
7665 *
7666 * Function:
7667 * Force all mappings for this page to fault according
7668 * to the access modes allowed, so we can gather ref/modify
7669 * bits again.
7670 */
7671
7672 boolean_t
7673 arm_force_fast_fault(
7674 ppnum_t ppnum,
7675 vm_prot_t allow_mode,
7676 int options,
7677 __unused void *arg)
7678 {
7679 pmap_paddr_t phys = ptoa(ppnum);
7680
7681 assert(ppnum != vm_page_fictitious_addr);
7682
7683 if (!pa_valid(phys)) {
7684 return FALSE; /* Not a managed page. */
7685 }
7686
7687 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
7688 }
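
/*
 * Illustrative sketch (not part of the build): how the attribute-clearing
 * paths above use the force-fast-fault machinery to regather ref/mod state.
 * Passing VM_PROT_NONE arms a fault on the next read or write of the page;
 * passing VM_PROT_READ arms a fault only on the next write.
 *
 *     // Force the next read and write to fault so that both the referenced
 *     // and modified bits are re-derived at fault time.
 *     if (arm_force_fast_fault(pn, VM_PROT_NONE, 0, NULL)) {
 *         // all mappings were successfully armed; cached attribute bits may
 *         // now be cleared (see phys_attribute_clear_with_flush_range())
 *     }
 */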
7689
7690 /**
7691 * Clear pending force fault for at most SPTM_MAPPING_LIMIT mappings for this
7692 * page based on the observed fault type, and update the appropriate ref/modify
7693 * bits for the physical page. This typically involves adding write permissions
7694 * back for write faults and setting the Access Flag for both read/write faults
7695 * (since the lack of those things is what caused the fault in the first place).
7696 *
7697 * @note Only SPTM_MAPPING_LIMIT number of mappings can be modified in a single
7698 * arm_clear_fast_fault() call to prevent excessive PVH lock contention as
7699 * the PVH lock should be held for `ppnum` already. If a fault is
7700 * subsequently taken on a mapping we haven't processed, arm_fast_fault()
7701 * will call this function with a non-NULL pte_p to perform a targeted
7702 * fixup.
7703 *
7704 * @param ppnum Page number of the page to clear a pending force fault on.
7705 * @param fault_type The type of access/fault that triggered us wanting to clear
7706 * the pending force fault status. This determines how we
7707 * modify the PTE to not cause a fault in the future and also
7708 * whether we mark the PTE as referenced or modified.
7709 * Typically a write fault would cause the page to be marked
7710 * as referenced and modified, and a read fault would only
7711 * cause the page to be marked as referenced.
7712 * @param pvh pv_head_table entry value for [ppnum] returned by a previous call
7713 * to pvh_lock().
7714 * @param pte_p If this value is non-PT_ENTRY_NULL then only this specified PTE
7715 * will be modified. If it is PT_ENTRY_NULL, then every mapping to
7716 * `ppnum` will be modified.
7717 * @param attrs_to_clear Mask of additional pp_attr_t bits to clear for the physical
7718 * page upon completion of this function. This is typically
7719 * some combination of the REFFAULT and MODFAULT bits.
7720 *
7721 * @return TRUE if any PTEs were modified, FALSE otherwise.
7722 */
7723 MARK_AS_PMAP_TEXT static boolean_t
7724 arm_clear_fast_fault(
7725 ppnum_t ppnum,
7726 vm_prot_t fault_type,
7727 uintptr_t pvh,
7728 pt_entry_t *pte_p,
7729 pp_attr_t attrs_to_clear)
7730 {
7731 const pmap_paddr_t pa = ptoa(ppnum);
7732 pv_entry_t *pve_p;
7733 boolean_t result;
7734 unsigned int num_mappings = 0, num_skipped_mappings = 0;
7735 pp_attr_t attrs_to_set = 0;
7736
7737 assert(ppnum != vm_page_fictitious_addr);
7738
7739 if (!pa_valid(pa)) {
7740 return FALSE; /* Not a managed page. */
7741 }
7742
7743 result = FALSE;
7744 pve_p = PV_ENTRY_NULL;
7745 if (pte_p == PT_ENTRY_NULL) {
7746 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
7747 pte_p = pvh_ptep(pvh);
7748 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
7749 pve_p = pvh_pve_list(pvh);
7750 } else if (__improbable(!pvh_test_type(pvh, PVH_TYPE_NULL))) {
7751 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)pvh, (uint64_t)pa);
7752 }
7753 }
7754
7755 disable_preemption();
7756 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
7757 sptm_disjoint_op_t *sptm_ops = sptm_pcpu->sptm_ops;
7758
7759 int pve_ptep_idx = 0;
7760
7761 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7762 pt_entry_t spte;
7763 pt_entry_t tmplate;
7764
7765 if (pve_p != PV_ENTRY_NULL) {
7766 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7767 if (pte_p == PT_ENTRY_NULL) {
7768 goto cff_skip_pve;
7769 }
7770 }
7771
7772 #ifdef PVH_FLAG_IOMMU
7773 if (pvh_ptep_is_iommu(pte_p)) {
7774 ++num_skipped_mappings;
7775 goto cff_skip_pve;
7776 }
7777 #endif
7778 spte = os_atomic_load(pte_p, relaxed);
7779 // A concurrent pmap_remove() may have cleared the PTE
7780 if (__improbable(!pte_is_valid(spte))) {
7781 ++num_skipped_mappings;
7782 goto cff_skip_pve;
7783 }
7784
7785 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
7786 const pmap_t pmap = ptdp->pmap;
7787 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
7788
7789 assert(va >= pmap->min && va < pmap->max);
7790
7791 tmplate = spte;
7792
7793 if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
7794 {
7795 if (pmap == kernel_pmap) {
7796 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
7797 } else {
7798 assert(pmap->type != PMAP_TYPE_NESTED);
7799 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
7800 }
7801 }
7802
7803 tmplate |= ARM_PTE_AF;
7804
7805 pte_set_was_writeable(tmplate, false);
7806 attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
7807 } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
7808 tmplate = spte | ARM_PTE_AF;
7809
7810 {
7811 attrs_to_set |= PP_ATTR_REFERENCED;
7812 }
7813 }
7814
7815 assert(spte != ARM_PTE_EMPTY);
7816
7817 if (spte != tmplate) {
7818 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
7819 sptm_ops[num_mappings].vaddr = va;
7820 sptm_ops[num_mappings].pte_template = tmplate;
7821 ++num_mappings;
7822 result = TRUE;
7823 }
7824
7825 cff_skip_pve:
7826 if ((num_mappings + num_skipped_mappings) == SPTM_MAPPING_LIMIT) {
7827 if (num_mappings != 0) {
7828 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
7829 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
7830 num_mappings = 0;
7831 }
7832 /*
7833 * We've reached the limit of mappings that can be processed in a single arm_clear_fast_fault()
7834 * call. Bail out here to avoid excessive PVH lock duration on the fault path. If a fault is
7835 * subsequently taken on a mapping we haven't processed, arm_fast_fault() will call this
7836 * function with a non-NULL pte_p to perform a targeted fixup.
7837 */
7838 break;
7839 }
7840
7841 pte_p = PT_ENTRY_NULL;
7842 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7843 pve_ptep_idx = 0;
7844 pve_p = pve_next(pve_p);
7845 }
7846 }
7847
7848 if (num_mappings != 0) {
7849 assert(result == TRUE);
7850 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
7851 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
7852 }
7853
7854 if (attrs_to_set | attrs_to_clear) {
7855 ppattr_modify_bits(pa_index(pa), attrs_to_clear, attrs_to_set);
7856 }
7857 enable_preemption();
7858
7859 return result;
7860 }
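
/*
 * Illustrative sketch (not part of the build): the targeted-fixup path
 * mentioned in the comment above. If a fault persists on a mapping that an
 * earlier, limit-bounded pass did not reach, arm_fast_fault() can pass an
 * explicit PTE pointer so only that one mapping is updated. 'faulting_ptep'
 * is a placeholder for the PTE that took the fault.
 *
 *     arm_clear_fast_fault(ppnum, fault_type, locked_pvh.pvh, faulting_ptep, 0);
 */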
7861
7862 /*
7863 * Determine if the fault was induced by software tracking of
7864 * modify/reference bits. If so, re-enable the mapping (and set
7865 * the appropriate bits).
7866 *
7867 * Returns KERN_SUCCESS if the fault was induced and was
7868 * successfully handled.
7869 *
7870 * Returns KERN_FAILURE if the fault was not induced and
7871 * the function was unable to deal with it.
7872 *
7873 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
7874 * disallows this type of access.
7875 */
7876 MARK_AS_PMAP_TEXT kern_return_t
7877 arm_fast_fault_internal(
7878 pmap_t pmap,
7879 vm_map_address_t va,
7880 vm_prot_t fault_type,
7881 __unused bool was_af_fault,
7882 __unused bool from_user)
7883 {
7884 kern_return_t result = KERN_FAILURE;
7885 pt_entry_t *ptep;
7886 pt_entry_t spte = ARM_PTE_EMPTY;
7887 locked_pvh_t locked_pvh = {.pvh = 0};
7888 unsigned int pai;
7889 pmap_paddr_t pa;
7890 validate_pmap_mutable(pmap);
7891
7892 if (__probable(preemption_enabled())) {
7893 pmap_lock(pmap, PMAP_LOCK_SHARED);
7894 } else if (__improbable(!pmap_try_lock(pmap, PMAP_LOCK_SHARED))) {
7895 /**
7896 * In certain cases, arm_fast_fault() may be invoked with preemption disabled
7897 * on the copyio path. In these cases the (in-kernel) caller expects that any
7898 * faults taken against the user address may not be handled successfully
7899 * (vm_fault() allows non-preemptible callers with the possibility that the
7900 * fault may not be successfully handled) and will result in the copyio operation
7901 * returning EFAULT. It is then the caller's responsibility to retry the copyio
7902 * operation in a preemptible context.
7903 *
7904 * For these cases attempting to acquire the sleepable lock will panic, so
7905 * we simply make a best effort and return failure just as the VM does if we
7906 * can't acquire the lock without sleeping.
7907 */
7908 return result;
7909 }
7910
7911 /*
7912 * If the entry doesn't exist, is completely invalid, or is already
7913 * valid, we can't fix it here.
7914 */
7915
7916 const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
7917 ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
7918 if (ptep != PT_ENTRY_NULL) {
7919 while (true) {
7920 spte = os_atomic_load(ptep, relaxed);
7921
7922 pa = pte_to_pa(spte);
7923
7924 if ((spte == ARM_PTE_EMPTY) || pte_is_compressed(spte, ptep)) {
7925 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7926 return result;
7927 }
7928
7929 if (!pa_valid(pa)) {
7930 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
7931 if (frame_type == XNU_PROTECTED_IO) {
7932 result = KERN_PROTECTION_FAILURE;
7933 }
7934 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7935 return result;
7936 }
7937 pai = pa_index(pa);
7938 /**
7939 * If preemption is disabled, use pvh_try_lock() here for the same reason
7940 * we use pmap_try_lock() above.
7941 */
7942 if (__probable(preemption_enabled())) {
7943 locked_pvh = pvh_lock(pai);
7944 } else {
7945 locked_pvh = pvh_try_lock(pai);
7946 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
7947 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7948 return result;
7949 }
7950 }
7951 assert(locked_pvh.pvh != 0);
7952 if (os_atomic_load(ptep, relaxed) == spte) {
7953 /*
7954 * Double-check the spte value, as we care about the AF bit.
7955 * It's also possible that pmap_page_protect() transitioned the
7956 * PTE to compressed/empty before we grabbed the PVH lock.
7957 */
7958 break;
7959 }
7960 pvh_unlock(&locked_pvh);
7961 }
7962 } else {
7963 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7964 return result;
7965 }
7966
7967
7968 if (result == KERN_SUCCESS) {
7969 goto ff_cleanup;
7970 }
7971
7972 pp_attr_t attrs = os_atomic_load(&pp_attr_table[pai], relaxed);
7973 if ((attrs & PP_ATTR_REFFAULT) || ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT))) {
7974 /*
7975 * An attempted access will always clear ref/mod fault state, as
7976 * appropriate for the fault type. arm_clear_fast_fault will
7977 * update the associated PTEs for the page as appropriate; if
7978 * any PTEs are updated, we redrive the access. If the mapping
7979 * does not actually allow for the attempted access, the
7980 * following fault will (hopefully) fail to update any PTEs, and
7981 * thus cause arm_fast_fault to decide that it failed to handle
7982 * the fault.
7983 */
7984 pp_attr_t attrs_to_clear = 0;
7985 if (attrs & PP_ATTR_REFFAULT) {
7986 attrs_to_clear |= PP_ATTR_REFFAULT;
7987 }
7988 if ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT)) {
7989 attrs_to_clear |= PP_ATTR_MODFAULT;
7990 }
7991
7992 if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, PT_ENTRY_NULL, attrs_to_clear)) {
7993 /*
7994 * Should this preserve KERN_PROTECTION_FAILURE? The
7995 * cost of not doing so is another fault in a case
7996 * that should already result in an exception.
7997 */
7998 result = KERN_SUCCESS;
7999 }
8000 }
8001
8002 /*
8003 * If the PTE already has sufficient permissions, we can report the fault as handled.
8004 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8005 * on mappings of the same page.
8006 */
8007 if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8008 uintptr_t ap_ro, ap_rw, ap_x;
8009 if (pmap == kernel_pmap) {
8010 ap_ro = ARM_PTE_AP(AP_RONA);
8011 ap_rw = ARM_PTE_AP(AP_RWNA);
8012 ap_x = ARM_PTE_NX;
8013 } else {
8014 ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8015 ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8016 ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8017 }
8018 /*
8019 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8020 * hardware they may be xPRR-protected, in which case they'll be handled
8021 * by the is_pte_xprr_protected() case above. Additionally, the exception
8022 * handling path currently does not call arm_fast_fault() without at least
8023 * VM_PROT_READ in fault_type.
8024 */
8025 if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8026 (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8027 if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8028 result = KERN_SUCCESS;
8029 }
8030 }
8031 }
8032
8033 if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, ptep, 0)) {
8034 /*
8035 * A prior arm_clear_fast_fault() operation may have returned early due to
8036 * another pending PV list operation or an excessively large PV list.
8037 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8038 * taking a fault on the same mapping.
8039 */
8040 result = KERN_SUCCESS;
8041 }
8042
8043 ff_cleanup:
8044
8045 pvh_unlock(&locked_pvh);
8046 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8047 return result;
8048 }
8049
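/*
 * Preemptible wrapper for arm_fast_fault_internal(): rejects addresses outside
 * the pmap's VA range and brackets the handler with PMAP_TRACE events.
 */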
8050 kern_return_t
8051 arm_fast_fault(
8052 pmap_t pmap,
8053 vm_map_address_t va,
8054 vm_prot_t fault_type,
8055 bool was_af_fault,
8056 __unused bool from_user)
8057 {
8058 kern_return_t result = KERN_FAILURE;
8059
8060 if (va < pmap->min || va >= pmap->max) {
8061 return result;
8062 }
8063
8064 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8065 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8066 from_user);
8067
8068
8069 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8070
8071 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8072
8073 return result;
8074 }
8075
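/*
 * pmap_copy_page copies the specified (machine independent) page from psrc
 * to pdst, honoring the given bcopy_phys options.
 */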
8076 void
8077 pmap_copy_page(
8078 ppnum_t psrc,
8079 ppnum_t pdst,
8080 int options)
8081 {
8082 bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8083 (addr64_t) (ptoa(pdst)),
8084 PAGE_SIZE,
8085 options);
8086 }
8087
8088
8089 /*
8090 * pmap_copy_part_page copies part of the specified (machine independent) pages.
8091 */
8092 void
8093 pmap_copy_part_page(
8094 ppnum_t psrc,
8095 vm_offset_t src_offset,
8096 ppnum_t pdst,
8097 vm_offset_t dst_offset,
8098 vm_size_t len)
8099 {
8100 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8101 (addr64_t) (ptoa(pdst) + dst_offset),
8102 len);
8103 }
8104
8105
8106 /*
8107 * pmap_zero_page zeros the specified (machine independent) page.
8108 */
8109 void
8110 pmap_zero_page(
8111 ppnum_t pn)
8112 {
8113 assert(pn != vm_page_fictitious_addr);
8114 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8115 }
8116
8117 /*
8118 * pmap_zero_page_with_options allows the caller to specify additional
8119 * options to apply while zeroing.
8120 */
8121 void
8122 pmap_zero_page_with_options(
8123 ppnum_t pn,
8124 int options)
8125 {
8126 assert(pn != vm_page_fictitious_addr);
8127 bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8128 }
8129
8130 /*
8131 * pmap_zero_part_page
8132 * zeros the specified (machine independent) part of a page.
8133 */
8134 void
8135 pmap_zero_part_page(
8136 ppnum_t pn,
8137 vm_offset_t offset,
8138 vm_size_t len)
8139 {
8140 assert(pn != vm_page_fictitious_addr);
8141 assert(offset + len <= PAGE_SIZE);
8142 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8143 }
8144
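/*
 * Establish the LOWGLOBAL_ALIAS mapping of the lowGlo page in the kernel pmap:
 * read-only, non-executable, outer-shareable, and write-back cacheable.
 */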
8145 void
8146 pmap_map_globals(
8147 void)
8148 {
8149 pt_entry_t pte;
8150
8151 pte = pa_to_pte(kvtophys_nofail((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX |
8152 ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
8153 #if __ARM_KERNEL_PROTECT__
8154 pte |= ARM_PTE_NG;
8155 #endif /* __ARM_KERNEL_PROTECT__ */
8156 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8157 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8158 sptm_map_page(kernel_pmap->ttep, LOWGLOBAL_ALIAS, pte);
8159
8160
8161 #if KASAN
8162 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8163 #endif
8164 }
8165
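/*
 * Return the kernel VA of per-CPU copy window 'index' for CPU 'cpu_num'.
 * Panics if 'index' is not a valid window index.
 */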
8166 vm_offset_t
8167 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8168 {
8169 if (__improbable(index >= CPUWINDOWS_MAX)) {
8170 panic("%s: invalid index %u", __func__, index);
8171 }
8172 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8173 }
8174
8175 MARK_AS_PMAP_TEXT unsigned int
8176 pmap_map_cpu_windows_copy_internal(
8177 ppnum_t pn,
8178 vm_prot_t prot,
8179 unsigned int wimg_bits)
8180 {
8181 pt_entry_t *ptep = NULL, pte;
8182 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8183 unsigned int cpu_num;
8184 unsigned int cpu_window_index;
8185 vm_offset_t cpu_copywindow_vaddr = 0;
8186 bool need_strong_sync = false;
8187
8188 assert(get_preemption_level() > 0);
8189 cpu_num = pmap_cpu_data->cpu_number;
8190
8191 for (cpu_window_index = 0; cpu_window_index < CPUWINDOWS_MAX; cpu_window_index++) {
8192 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, cpu_window_index);
8193 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8194 assert(!pte_is_compressed(*ptep, ptep));
8195 if (!pte_is_valid(*ptep)) {
8196 break;
8197 }
8198 }
8199 if (__improbable(cpu_window_index == CPUWINDOWS_MAX)) {
8200 panic("%s: out of windows", __func__);
8201 }
8202
8203 pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8204 #if __ARM_KERNEL_PROTECT__
8205 pte |= ARM_PTE_NG;
8206 #endif /* __ARM_KERNEL_PROTECT__ */
8207 pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8208
8209 if (prot & VM_PROT_WRITE) {
8210 pte |= ARM_PTE_AP(AP_RWNA);
8211 } else {
8212 pte |= ARM_PTE_AP(AP_RONA);
8213 }
8214
8215 /*
8216 * It's expected to be safe for an interrupt handler to nest copy-window usage with the
8217 * active thread on a CPU, as long as a sufficient number of copy windows are available.
8218 * --If the interrupt handler executes before the active thread creates the per-CPU mapping,
8219 * or after the active thread completely removes the mapping, it may use the same mapping
8220 * but will finish execution and tear down the mapping without the thread needing to know.
8221 * --If the interrupt handler executes after the active thread creates the per-CPU mapping,
8222 * it will observe the valid mapping and use a different copy window.
8223 * --If the interrupt handler executes after the active thread clears the PTE in
8224 * pmap_unmap_cpu_windows_copy() but before the active thread flushes the TLB, the code
8225 * for computing cpu_window_index above will observe the PTE_INVALID_IN_FLIGHT token set
8226 * by the SPTM, and will select a different index.
8227 */
8228 const sptm_return_t sptm_status = sptm_map_page(kernel_pmap->ttep, cpu_copywindow_vaddr, pte);
8229 if (__improbable(sptm_status != SPTM_SUCCESS)) {
8230 panic("%s: failed to map CPU copy-window VA 0x%llx with SPTM status %d",
8231 __func__, (unsigned long long)cpu_copywindow_vaddr, sptm_status);
8232 }
8233
8234
8235 /*
8236 * Clean up any pending strong TLB flush for the same window in a thread we may have
8237 * interrupted.
8238 */
8239 if (__improbable(pmap_cpu_data->copywindow_strong_sync[cpu_window_index])) {
8240 arm64_sync_tlb(true);
8241 }
8242 pmap_cpu_data->copywindow_strong_sync[cpu_window_index] = need_strong_sync;
8243
8244 return cpu_window_index;
8245 }
8246
8247 unsigned int
8248 pmap_map_cpu_windows_copy(
8249 ppnum_t pn,
8250 vm_prot_t prot,
8251 unsigned int wimg_bits)
8252 {
8253 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8254 }
8255
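/*
 * Tear down copy window 'index' on the current CPU: drain prior accesses made
 * through the window with a DSB, unmap the page via the SPTM, and perform a
 * strong TLB sync if the window's mapping requested one.
 */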
8256 MARK_AS_PMAP_TEXT void
8257 pmap_unmap_cpu_windows_copy_internal(
8258 unsigned int index)
8259 {
8260 unsigned int cpu_num;
8261 vm_offset_t cpu_copywindow_vaddr = 0;
8262 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8263
8264 assert(index < CPUWINDOWS_MAX);
8265 assert(get_preemption_level() > 0);
8266
8267 cpu_num = pmap_cpu_data->cpu_number;
8268
8269 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8270 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8271 * (which are likely to have been on I/O memory) are complete before
8272 * tearing down the mapping. */
8273 __builtin_arm_dsb(DSB_SY);
8274 sptm_unmap_region(kernel_pmap->ttep, cpu_copywindow_vaddr, 1, 0);
8275 if (__improbable(pmap_cpu_data->copywindow_strong_sync[index])) {
8276 arm64_sync_tlb(true);
8277 pmap_cpu_data->copywindow_strong_sync[index] = false;
8278 }
8279 }
8280
8281 void
8282 pmap_unmap_cpu_windows_copy(
8283 unsigned int index)
8284 {
8285 return pmap_unmap_cpu_windows_copy_internal(index);
8286 }
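/*
 * Illustrative copy-window usage (a hypothetical sketch, not taken from an
 * actual caller; 'pn' and 'buf' are assumed to be provided by the caller).
 * Preemption must remain disabled for the lifetime of the window, since the
 * mapping is tied to the current CPU:
 *
 *	disable_preemption();
 *	const unsigned int index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ, VM_WIMG_DEFAULT);
 *	const void *src = (const void *)pmap_cpu_windows_copy_addr(cpu_number(), index);
 *	bcopy(src, buf, PAGE_SIZE);
 *	pmap_unmap_cpu_windows_copy(index);
 *	enable_preemption();
 */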
8287
8288 /*
8289 * Indicate that a pmap is intended to be used as a nested pmap
8290 * within one or more larger address spaces. This must be set
8291 * before pmap_nest() is called with this pmap as the 'subordinate'.
8292 */
8293 MARK_AS_PMAP_TEXT void
8294 pmap_set_nested_internal(
8295 pmap_t pmap)
8296 {
8297 validate_pmap_mutable(pmap);
8298 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8299 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8300 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8301 __func__, pmap, pmap->type);
8302 }
8303 pmap->type = PMAP_TYPE_NESTED;
8304 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
8305 retype_params.attr_idx = (pt_attr_page_size(pt_attr) == 4096) ? SPTM_PT_GEOMETRY_4K : SPTM_PT_GEOMETRY_16K;
8306 pmap_txm_acquire_exclusive_lock(pmap);
8307 sptm_retype(pmap->ttep, XNU_USER_ROOT_TABLE, XNU_SHARED_ROOT_TABLE, retype_params);
8308 pmap_txm_release_exclusive_lock(pmap);
8309 pmap_get_pt_ops(pmap)->free_id(pmap);
8310 }
8311
8312 void
8313 pmap_set_nested(
8314 pmap_t pmap)
8315 {
8316 pmap_set_nested_internal(pmap);
8317 }
8318
8319 bool
8320 pmap_is_nested(
8321 pmap_t pmap)
8322 {
8323 return pmap->type == PMAP_TYPE_NESTED;
8324 }
8325
8326 /*
8327 * pmap_trim_range(pmap, start, end)
8328 *
8329 * pmap = pmap to operate on
8330 * start = start of the range
8331 * end = end of the range
8332 *
8333 * Attempts to deallocate TTEs for the given range in the nested region.
8334 */
8335 MARK_AS_PMAP_TEXT static void
8336 pmap_trim_range(
8337 pmap_t pmap,
8338 addr64_t start,
8339 addr64_t end)
8340 {
8341 addr64_t cur;
8342 addr64_t nested_region_start;
8343 addr64_t nested_region_end;
8344 addr64_t adjusted_start;
8345 addr64_t adjusted_end;
8346 addr64_t adjust_offmask;
8347 tt_entry_t * tte_p;
8348 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8349
8350 if (__improbable(end < start)) {
8351 panic("%s: invalid address range, "
8352 "pmap=%p, start=%p, end=%p",
8353 __func__,
8354 pmap, (void*)start, (void*)end);
8355 }
8356
8357 nested_region_start = pmap->nested_region_addr;
8358 nested_region_end = nested_region_start + pmap->nested_region_size;
8359
8360 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8361 panic("%s: range outside nested region %p-%p, "
8362 "pmap=%p, start=%p, end=%p",
8363 __func__, (void *)nested_region_start, (void *)nested_region_end,
8364 pmap, (void*)start, (void*)end);
8365 }
8366
8367 /* Contract the range to TT page boundaries. */
8368 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8369
8370 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8371 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8372 adjusted_end = end & ~adjust_offmask;
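	/*
	 * For example, assuming 16KB translation pages and a matching kernel page
	 * size (page_ratio == 1), a leaf table spans 32MB, so adjusted_start is
	 * 'start' rounded up to the next 32MB boundary and adjusted_end is 'end'
	 * rounded down to a 32MB boundary.
	 */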
8373
8374 /* Iterate over the range, trying to remove TTEs. */
8375 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) {
8376 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8377
8378 tte_p = pmap_tte(pmap, cur);
8379
8380 if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
8381 /* pmap_tte_deallocate()/pmap_tte_trim() will drop the pmap lock */
8382 if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) {
8383 /* Deallocate for the nested map. */
8384 pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr));
8385 } else if (pmap->type == PMAP_TYPE_USER) {
8386 /**
8387 * Just remove for the parent map. If the leaf table pointed
8388 * to by the TTE being removed (owned by the nested pmap)
8389 * has any mappings, then this call will panic. This
8390 * enforces the policy that tables being trimmed must be
8391 * empty to prevent possible use-after-free attacks.
8392 */
8393 pmap_tte_trim(pmap, cur, tte_p);
8394 } else {
8395 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8396 }
8397 } else {
8398 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8399 }
8400 }
8401 }
8402
8403 /*
8404 * pmap_trim_internal(grand, subord, vstart, size)
8405 *
8406 * grand = pmap subord is nested in
8407 * subord = nested pmap
8408 * vstart = start of the used range in grand
8409 * size = size of the used range
8410 *
8411 * Attempts to trim the shared region page tables down to only cover the given
8412 * range in subord and grand.
8413 */
8414 MARK_AS_PMAP_TEXT void
8415 pmap_trim_internal(
8416 pmap_t grand,
8417 pmap_t subord,
8418 addr64_t vstart,
8419 uint64_t size)
8420 {
8421 addr64_t vend;
8422 addr64_t adjust_offmask;
8423
8424 if (__improbable(os_add_overflow(vstart, size, &vend))) {
8425 panic("%s: grand addr wraps around, "
8426 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8427 __func__, grand, subord, (void*)vstart, size);
8428 }
8429
8430 validate_pmap_mutable(grand);
8431 validate_pmap(subord);
8432
8433 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8434
8435 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8436
8437 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8438 panic("%s: subord is of non-nestable type 0x%hhx, "
8439 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8440 __func__, subord->type, grand, subord, (void*)vstart, size);
8441 }
8442
8443 if (__improbable(grand->type != PMAP_TYPE_USER)) {
8444 panic("%s: grand is of unsupported type 0x%hhx for nesting, "
8445 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8446 __func__, grand->type, grand, subord, (void*)vstart, size);
8447 }
8448
8449 if (__improbable(grand->nested_pmap != subord)) {
8450 panic("%s: grand->nested != subord, "
8451 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8452 __func__, grand, subord, (void*)vstart, size);
8453 }
8454
8455 if (__improbable((size != 0) &&
8456 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
8457 panic("%s: grand range not in nested region, "
8458 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8459 __func__, grand, subord, (void*)vstart, size);
8460 }
8461
8462
8463 if (!grand->nested_has_no_bounds_ref) {
8464 assert(subord->nested_bounds_set);
8465
8466 if (!grand->nested_bounds_set) {
8467 /* Inherit the bounds from subord. */
8468 grand->nested_region_true_start = subord->nested_region_true_start;
8469 grand->nested_region_true_end = subord->nested_region_true_end;
8470 grand->nested_bounds_set = true;
8471 }
8472
8473 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8474 return;
8475 }
8476
8477 if ((!subord->nested_bounds_set) && size) {
8478 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8479 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8480
8481 subord->nested_region_true_start = vstart;
8482 subord->nested_region_true_end = vend;
8483 subord->nested_region_true_start &= ~adjust_offmask;
8484
8485 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
8486 panic("%s: padded true end wraps around, "
8487 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8488 __func__, grand, subord, (void*)vstart, size);
8489 }
8490
8491 subord->nested_region_true_end &= ~adjust_offmask;
8492 subord->nested_bounds_set = true;
8493 }
8494
8495 if (subord->nested_bounds_set) {
8496 /* Inherit the bounds from subord. */
8497 grand->nested_region_true_start = subord->nested_region_true_start;
8498 grand->nested_region_true_end = subord->nested_region_true_end;
8499 grand->nested_bounds_set = true;
8500
8501 /* If we know the bounds, we can trim the pmap. */
8502 grand->nested_has_no_bounds_ref = false;
8503 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8504 } else {
8505 /* Don't trim if we don't know the bounds. */
8506 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8507 return;
8508 }
8509
8510 /* Trim grand to only cover the given range. */
8511 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
8512 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
8513
8514 /* Try to trim subord. */
8515 pmap_trim_subord(subord);
8516 }
8517
8518 MARK_AS_PMAP_TEXT static void
8519 pmap_trim_self(pmap_t pmap)
8520 {
8521 if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
8522 /* If we have a no bounds ref, we need to drop it. */
8523 pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
8524 pmap->nested_has_no_bounds_ref = false;
8525 boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
8526 vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
8527 vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
8528 pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
8529
8530 if (nested_bounds_set) {
8531 pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
8532 pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
8533 }
8534 /*
8535 * Try trimming the nested pmap, in case we had the
8536 * last reference.
8537 */
8538 pmap_trim_subord(pmap->nested_pmap);
8539 }
8540 }
8541
8542 /*
8543 * pmap_trim_subord(subord)
8544 *
8545 * subord = nested pmap, shared in one or more 'grand' pmaps, that we are
8546 * attempting to trim
8547 *
8548 * Trims subord if possible.
8549 */
8550 MARK_AS_PMAP_TEXT static void
8551 pmap_trim_subord(pmap_t subord)
8552 {
8553 bool contract_subord = false;
8554
8555 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8556
8557 subord->nested_no_bounds_refcnt--;
8558
8559 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
8560 /* If this was the last no bounds reference, trim subord. */
8561 contract_subord = true;
8562 }
8563
8564 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8565
8566 if (contract_subord) {
8567 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
8568 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
8569 }
8570 }
8571
8572 void
8573 pmap_trim(
8574 pmap_t grand,
8575 pmap_t subord,
8576 addr64_t vstart,
8577 uint64_t size)
8578 {
8579 pmap_trim_internal(grand, subord, vstart, size);
8580 }
8581
8582 #if HAS_APPLE_PAC
8583
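/*
 * Sign the user pointer 'value' with the given ptrauth key and discriminator
 * under the supplied user JOP key. The user JOP key is enabled only while
 * interrupts are disabled and is restored before returning.
 */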
8584 void *
8585 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8586 {
8587 void *res = NULL;
8588 const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8589
8590 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8591 __compiler_materialize_and_prevent_reordering_on(value);
8592 res = sptm_sign_user_pointer(value, key, discriminator, jop_key);
8593 __compiler_materialize_and_prevent_reordering_on(res);
8594 ml_disable_user_jop_key(jop_key, saved_jop_state);
8595
8596 ml_set_interrupts_enabled(current_intr_state);
8597
8598 return res;
8599 }
8600
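/*
 * Authenticate the user pointer 'value' with the given ptrauth key and
 * discriminator under the supplied user JOP key, returning a poisoned pointer
 * if authentication fails.
 */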
8601 void *
8602 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8603 {
8604 void *res = NULL;
8605 const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8606
8607 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8608 __compiler_materialize_and_prevent_reordering_on(value);
8609 res = sptm_auth_user_pointer(value, key, discriminator, jop_key);
8610 __compiler_materialize_and_prevent_reordering_on(res);
8611 ml_disable_user_jop_key(jop_key, saved_jop_state);
8612
8613 if (res == SPTM_AUTH_FAILURE) {
8614 res = ml_poison_ptr(value, key);
8615 }
8616
8617 ml_set_interrupts_enabled(current_intr_state);
8618
8619 return res;
8620 }
8621 #endif /* HAS_APPLE_PAC */
8622
8623 /*
8624 * kern_return_t pmap_nest(grand, subord, vstart, size)
8625 *
8626 * grand = the pmap that we will nest subord into
8627 * subord = the pmap that goes into the grand
8628 * vstart = start of range in pmap to be inserted
8629 * size = Size of nest area (up to 16TB)
8630 *
8631 * Inserts a pmap into another. This is used to implement shared segments.
8632 *
8633 */
8634
8635 /**
8636 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
8637 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
8638 * This function operates in 4 main phases:
8639 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
8640 * 2. Expansion of subord to ensure the required leaf-level page table pages for
8641 * the mapping range are present in subord.
8642 * 3. Expansion of grand to ensure the required twig-level page table pages for
8643 * the mapping range are present in grand.
8644 * 4. Invoke sptm_nest_region() to copy the relevant TTEs from subord to grand.
8645 *
8646 * This function may return early due to pending AST_URGENT preemption; if so
8647 * it will indicate the need to be re-entered.
8648 *
8649 * @param grand pmap to insert the TTEs into. Must be a user pmap.
8650 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
8651 * @param vstart twig-aligned virtual address for the beginning of the nesting range
8652 * @param size twig-aligned size of the nesting range
8653 *
8654 * @return KERN_RESOURCE_SHORTAGE on allocation failure, KERN_SUCCESS otherwise
8655 */
8656 MARK_AS_PMAP_TEXT kern_return_t
8657 pmap_nest_internal(
8658 pmap_t grand,
8659 pmap_t subord,
8660 addr64_t vstart,
8661 uint64_t size)
8662 {
8663 kern_return_t kr = KERN_SUCCESS;
8664 vm_map_offset_t vaddr;
8665 tt_entry_t *stte_p;
8666 tt_entry_t *gtte_p;
8667 bitmap_t *nested_region_unnested_table_bitmap;
8668 int expand_options = 0;
8669 bool deref_subord = true;
8670
8671 addr64_t vend;
8672 if (__improbable(os_add_overflow(vstart, size, &vend))) {
8673 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
8674 }
8675
8676 validate_pmap_mutable(grand);
8677 validate_pmap(subord);
8678 os_ref_retain_raw(&subord->ref_count, &pmap_refgrp);
8679
8680 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8681 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
8682 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
8683 }
8684
8685 if (__improbable(((size | vstart) &
8686 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
8687 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx",
8688 grand, vstart, size);
8689 }
8690
8691 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8692 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
8693 }
8694
8695 if (__improbable(grand->type != PMAP_TYPE_USER)) {
8696 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
8697 }
8698
8699 /**
8700 * Use an acquire barrier to ensure that subsequent loads of nested_region_* fields are not
8701 * speculated ahead of the load of nested_region_unnested_table_bitmap, so that if we observe a non-NULL
8702 * nested_region_unnested_table_bitmap then we can be sure the other fields have been initialized as well.
8703 */
8704 if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
8705 uint64_t nested_region_unnested_table_bits = size >> pt_attr_twig_shift(pt_attr);
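		/*
		 * One bit is needed per twig (leaf table) entry in the nested region.
		 * For example, with 16KB translation pages each twig entry covers 32MB,
		 * so a 1GB nested region needs 1GB / 32MB = 32 bits.
		 */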
8706
8707 if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) {
8708 panic("%s: bitmap allocation size %llu will truncate, "
8709 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
8710 __func__, nested_region_unnested_table_bits,
8711 grand, subord, vstart, size);
8712 }
8713
8714 nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits);
8715
8716 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8717 if (subord->nested_region_unnested_table_bitmap == NULL) {
8718 subord->nested_region_addr = vstart;
8719 subord->nested_region_size = (mach_vm_offset_t) size;
8720 sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift);
8721
8722 /**
8723 * Ensure that the rest of the subord->nested_region_* fields are
8724 * initialized and visible before setting the nested_region_unnested_table_bitmap
8725 * field (which is used as the flag to say that the rest are initialized).
8726 */
8727 os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
8728 nested_region_unnested_table_bitmap = NULL;
8729 }
8730 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8731 if (nested_region_unnested_table_bitmap != NULL) {
8732 bitmap_free(nested_region_unnested_table_bitmap, nested_region_unnested_table_bits);
8733 }
8734 }
8735
8736 assertf(subord->nested_region_addr == vstart, "%s: pmap %p nested region addr 0x%llx doesn't match vstart 0x%llx",
8737 __func__, subord, (unsigned long long)subord->nested_region_addr, (unsigned long long)vstart);
8738 assertf(subord->nested_region_size == size, "%s: pmap %p nested region size 0x%llx doesn't match size 0x%llx",
8739 __func__, subord, (unsigned long long)subord->nested_region_size, (unsigned long long)size);
8740
8741 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8742
8743 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
8744 /*
8745 * If this is grand's first nesting operation, keep the reference on subord.
8746 * It will be released by pmap_destroy_internal() when grand is destroyed.
8747 */
8748 deref_subord = false;
8749
8750 if (!subord->nested_bounds_set) {
8751 /*
8752 * We are nesting without the shared region's bounds
8753 * being known. We'll have to trim the pmap later.
8754 */
8755 grand->nested_has_no_bounds_ref = true;
8756 subord->nested_no_bounds_refcnt++;
8757 }
8758
8759 grand->nested_region_addr = vstart;
8760 grand->nested_region_size = (mach_vm_offset_t) size;
8761 } else {
8762 if (__improbable(grand->nested_pmap != subord)) {
8763 panic("pmap_nest() pmap %p has a nested pmap", grand);
8764 } else if (__improbable(grand->nested_region_addr > vstart)) {
8765 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
8766 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
8767 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
8768 }
8769 }
8770
8771 vaddr = vstart;
8772 if (vaddr < subord->nested_region_true_start) {
8773 vaddr = subord->nested_region_true_start;
8774 }
8775
8776 addr64_t true_end = vend;
8777 if (true_end > subord->nested_region_true_end) {
8778 true_end = subord->nested_region_true_end;
8779 }
8780
8781 while (vaddr < true_end) {
8782 stte_p = pmap_tte(subord, vaddr);
8783 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
8784 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8785 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
8786
8787 if (kr != KERN_SUCCESS) {
8788 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8789 goto done;
8790 }
8791
8792 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8793 }
8794 vaddr += pt_attr_twig_size(pt_attr);
8795 }
8796
8797 /*
8798 * copy TTEs from subord pmap into grand pmap
8799 */
8800
8801 vaddr = (vm_map_offset_t) vstart;
8802 if (vaddr < subord->nested_region_true_start) {
8803 vaddr = subord->nested_region_true_start;
8804 }
8805
8806 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8807 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8808
8809 while (vaddr < true_end) {
8810 gtte_p = pmap_tte(grand, vaddr);
8811 if (gtte_p == PT_ENTRY_NULL) {
8812 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
8813 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
8814 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8815
8816 if (kr != KERN_SUCCESS) {
8817 goto done;
8818 }
8819 }
8820
8821 vaddr += pt_attr_twig_size(pt_attr);
8822 }
8823
8824 vaddr = (vm_map_offset_t) vstart;
8825
8826 /*
8827 * It is possible to have a preempted nest operation execute concurrently
8828 * with a trim operation that sets nested_region_true_start. In this case,
8829 * update the nesting bounds. This is useful both as a performance
8830 * optimization and to prevent an attempt to nest a just-trimmed TTE,
8831 * which will trigger an SPTM violation.
8832 * Note that pmap_trim() may concurrently update grand's bounds as we are
8833 * making these checks, but in that case pmap_trim_range() has not yet
8834 * been called on grand and will wait for us to drop grand's lock, so it
8835 * should see any TTEs we've nested here and clear them appropriately.
8836 */
8837 if (vaddr < subord->nested_region_true_start) {
8838 vaddr = subord->nested_region_true_start;
8839 }
8840 if (vaddr < grand->nested_region_true_start) {
8841 vaddr = grand->nested_region_true_start;
8842 }
8843 if (true_end > subord->nested_region_true_end) {
8844 true_end = subord->nested_region_true_end;
8845 }
8846 if (true_end > grand->nested_region_true_end) {
8847 true_end = grand->nested_region_true_end;
8848 }
8849
8850 while (vaddr < true_end) {
8851 /*
8852 * The SPTM requires the run of TTE updates to all reside within the same L2 page, so the region
8853 * we supply to the SPTM can't span multiple L1 TTEs.
8854 */
8855 vm_map_offset_t vlim = ((vaddr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
8856 if (vlim > true_end) {
8857 vlim = true_end;
8858 }
8859 pmap_txm_acquire_exclusive_lock(grand);
8860 pmap_txm_acquire_shared_lock(subord);
8861 sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift);
8862 pmap_txm_release_shared_lock(subord);
8863 pmap_txm_release_exclusive_lock(grand);
8864 vaddr = vlim;
8865 }
8866
8867 done:
8868 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
8869 if (deref_subord) {
8870 pmap_destroy_internal(subord);
8871 }
8872
8873 return kr;
8874 }
8875
8876 kern_return_t
8877 pmap_nest(
8878 pmap_t grand,
8879 pmap_t subord,
8880 addr64_t vstart,
8881 uint64_t size)
8882 {
8883 kern_return_t kr = KERN_SUCCESS;
8884
8885 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
8886 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
8887 VM_KERNEL_ADDRHIDE(vstart));
8888
8889 pmap_verify_preemptible();
8890 kr = pmap_nest_internal(grand, subord, vstart, size);
8891
8892 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
8893
8894 return kr;
8895 }
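/*
 * Illustrative ordering for a hypothetical caller (not taken from actual VM
 * code; 'shared_region_pmap' and 'task_pmap' are assumed names): the subordinate
 * pmap must be marked nested before its first nesting, and the range must be
 * twig-aligned for the pmap's page-table geometry.
 *
 *	pmap_set_nested(shared_region_pmap);
 *	kern_return_t kr = pmap_nest(task_pmap, shared_region_pmap,
 *	    SHARED_REGION_BASE_ARM64, SHARED_REGION_SIZE_ARM64);
 */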
8896
8897 /*
8898 * kern_return_t pmap_unnest(grand, vaddr, size)
8899 *
8900 * grand = the pmap that will have the virtual range unnested
8901 * vaddr = start of range in pmap to be unnested
8902 * size = size of range in pmap to be unnested
8903 *
8904 */
8905
8906 kern_return_t
8907 pmap_unnest(
8908 pmap_t grand,
8909 addr64_t vaddr,
8910 uint64_t size)
8911 {
8912 return pmap_unnest_options(grand, vaddr, size, 0);
8913 }
8914
8915 /**
8916 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
8917 * from a top-level pmap ('grand'). The corresponding mappings in the nested
8918 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
8919 * still have the region nested. The mappings in 'grand' will be left empty
8920 * with the assumption that they will be demand-filled by subsequent access faults.
8921 *
8922 * This function operates in 2 main phases:
8923 * 1. Iteration over the nested pmap's mappings for the specified range to mark
8924 * them non-global.
8925 * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand.
8926 *
8927 * This function may return early due to pending AST_URGENT preemption; if so
8928 * it will indicate the need to be re-entered.
8929 *
8930 * @param grand pmap from which to unnest mappings
8931 * @param vaddr twig-aligned virtual address for the beginning of the nested range
8932 * @param size twig-aligned size of the nested range
8933 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
8934 * grand is being torn down and step 1) above is not needed.
8935 */
8936 MARK_AS_PMAP_TEXT void
8937 pmap_unnest_options_internal(
8938 pmap_t grand,
8939 addr64_t vaddr,
8940 uint64_t size,
8941 unsigned int option)
8942 {
8943 vm_map_offset_t start;
8944 vm_map_offset_t addr;
8945 unsigned int current_index;
8946 unsigned int start_index;
8947 unsigned int max_index;
8948
8949 addr64_t vend;
8950 addr64_t true_end;
8951 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
8952 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
8953 }
8954
8955 validate_pmap_mutable(grand);
8956
8957 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8958
8959 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
8960 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
8961 (unsigned long long)vaddr, (unsigned long long)size);
8962 }
8963
8964 if (__improbable(grand->nested_pmap == NULL)) {
8965 panic("%s: %p has no nested pmap", __func__, grand);
8966 }
8967
8968 true_end = vend;
8969 if (true_end > grand->nested_pmap->nested_region_true_end) {
8970 true_end = grand->nested_pmap->nested_region_true_end;
8971 }
8972
8973 if ((option & PMAP_UNNEST_CLEAN) == 0) {
8974 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
8975 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
8976 }
8977
8978 /*
8979 * SPTM TODO: I suspect we may be able to hold the nested pmap lock shared here.
8980 * We would need to use atomic_bitmap_set below where we currently use bitmap_test + bitmap_set.
8981 * The risk is that a concurrent pmap_enter() against the nested pmap could observe the relevant
8982 * bit in the nested region bitmap to be clear, but could then create the (global) mapping after
8983 * we've made our SPTM sweep below to set NG. In that case we could end up with a mix of global
8984 * and non-global mappings for the same VA region and thus a TLB conflict. I'm uncertain if the
8985 * VM would allow these operation to happen concurrently. Even if it does, we could still do
8986 * something fancier here such as waiting for concurrent pmap_enter() to drain after updating
8987 * the bitmap.
8988 */
8989 pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
8990
8991 disable_preemption();
8992 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8993 unsigned int num_mappings = 0;
8994 start = vaddr;
8995 if (start < grand->nested_pmap->nested_region_true_start) {
8996 start = grand->nested_pmap->nested_region_true_start;
8997 }
8998 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
8999 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9000
9001 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9002 pt_entry_t *bpte, *cpte;
9003
9004 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9005
9006 bpte = pmap_pte(grand->nested_pmap, addr);
9007
9008 if (!bitmap_test(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index)) {
9009 /*
9010 * We've marked the 'twig' region as being unnested. Every mapping entered within
9011 * the nested pmap in this region will now be marked non-global.
9012 */
9013 bitmap_set(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index);
9014 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
9015 pt_entry_t spte = os_atomic_load(cpte, relaxed);
9016
9017 if (pte_is_valid(spte)) {
9018 spte |= ARM_PTE_NG;
9019 }
9020
9021 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9022
9023 sptm_pcpu->sptm_templates[num_mappings] = spte;
9024 ++num_mappings;
9025
9026 if (num_mappings == SPTM_MAPPING_LIMIT) {
9027 pmap_retype_epoch_enter();
9028 sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9029 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9030 pmap_retype_epoch_exit();
9031 enable_preemption();
9032 num_mappings = 0;
9033 start = addr;
9034 disable_preemption();
9035 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9036 }
9037 }
9038 }
9039 /**
9040 * The SPTM does not allow region updates to span multiple leaf page tables, so request
9041 * any remaining updates up to vlim before moving to the next page table page.
9042 */
9043 if (num_mappings != 0) {
9044 pmap_retype_epoch_enter();
9045 sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9046 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9047 pmap_retype_epoch_exit();
9048 enable_preemption();
9049 num_mappings = 0;
9050 disable_preemption();
9051 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9052 }
9053 addr = start = vlim;
9054 }
9055
9056 if (num_mappings != 0) {
9057 pmap_retype_epoch_enter();
9058 sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9059 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9060 pmap_retype_epoch_exit();
9061 }
9062
9063 enable_preemption();
9064 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9065 }
9066
9067 /*
9068 * invalidate all pdes for segment at vaddr in pmap grand
9069 */
9070 addr = vaddr;
9071
9072 pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9073
9074 if (addr < grand->nested_pmap->nested_region_true_start) {
9075 addr = grand->nested_pmap->nested_region_true_start;
9076 }
9077
9078 if (true_end > grand->nested_pmap->nested_region_true_end) {
9079 true_end = grand->nested_pmap->nested_region_true_end;
9080 }
9081
9082 while (addr < true_end) {
9083 vm_map_offset_t vlim = ((addr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9084 if (vlim > true_end) {
9085 vlim = true_end;
9086 }
9087 sptm_unnest_region(grand->ttep, grand->nested_pmap->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift);
9088 addr = vlim;
9089 }
9090
9091 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9092 }
9093
9094 kern_return_t
9095 pmap_unnest_options(
9096 pmap_t grand,
9097 addr64_t vaddr,
9098 uint64_t size,
9099 unsigned int option)
9100 {
9101 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9102 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9103
9104 pmap_verify_preemptible();
9105 pmap_unnest_options_internal(grand, vaddr, size, option);
9106
9107 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9108
9109 return KERN_SUCCESS;
9110 }
9111
9112 boolean_t
9113 pmap_adjust_unnest_parameters(
9114 __unused pmap_t p,
9115 __unused vm_map_offset_t *s,
9116 __unused vm_map_offset_t *e)
9117 {
9118 return TRUE; /* to get to log_unnest_badness()... */
9119 }
9120
9121 #if PMAP_FORK_NEST
9122 /**
9123 * Perform any necessary pre-nesting of the parent's shared region at fork()
9124 * time.
9125 *
9126 * @note This should only be called from vm_map_fork().
9127 *
9128 * @param old_pmap The pmap of the parent task.
9129 * @param new_pmap The pmap of the child task.
9130 * @param nesting_start An output parameter that is updated with the start
9131 * address of the range that was pre-nested
9132 * @param nesting_end An output parameter that is updated with the end
9133 * address of the range that was pre-nested
9134 *
9135 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
9136 * KERN_INVALID_ARGUMENT if the arguments were not valid.
9137 */
9138 kern_return_t
9139 pmap_fork_nest(
9140 pmap_t old_pmap,
9141 pmap_t new_pmap,
9142 vm_map_offset_t *nesting_start,
9143 vm_map_offset_t *nesting_end)
9144 {
9145 if (old_pmap == NULL || new_pmap == NULL) {
9146 return KERN_INVALID_ARGUMENT;
9147 }
9148 if (old_pmap->nested_pmap == NULL) {
9149 return KERN_SUCCESS;
9150 }
9151 pmap_nest(new_pmap,
9152 old_pmap->nested_pmap,
9153 old_pmap->nested_region_addr,
9154 old_pmap->nested_region_size);
9155 assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
9156 new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
9157 new_pmap->nested_region_size == old_pmap->nested_region_size,
9158 "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
9159 new_pmap->nested_pmap,
9160 new_pmap->nested_region_addr,
9161 new_pmap->nested_region_size,
9162 old_pmap->nested_pmap,
9163 old_pmap->nested_region_addr,
9164 old_pmap->nested_region_size);
9165 *nesting_start = old_pmap->nested_region_addr;
9166 *nesting_end = *nesting_start + old_pmap->nested_region_size;
9167 return KERN_SUCCESS;
9168 }
9169 #endif /* PMAP_FORK_NEST */
9170
9171 /*
9172 * disable no-execute capability on
9173 * the specified pmap
9174 */
9175 #if DEVELOPMENT || DEBUG
9176 void
9177 pmap_disable_NX(
9178 pmap_t pmap)
9179 {
9180 pmap->nx_enabled = FALSE;
9181 }
9182 #else
9183 void
9184 pmap_disable_NX(
9185 __unused pmap_t pmap)
9186 {
9187 }
9188 #endif
9189
9190 /*
9191 * flush a range of hardware TLB entries.
9192 * NOTE: assumes the smallest TLB entry in use will be for
9193 * an ARM small page (4K).
9194 */
9195
9196 #if __ARM_RANGE_TLBI__
9197 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9198 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
9199 #else
9200 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
9201 #endif // __ARM_RANGE_TLBI__
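/*
 * Ranges larger than ARM64_FULL_TLB_FLUSH_THRESHOLD pages fall back to flushing
 * the entire ASID (or the whole TLB for nested pmaps and ASID 0). When range
 * TLBI is available, ranges larger than ARM64_RANGE_TLB_FLUSH_THRESHOLD pages
 * use a single ranged invalidate; otherwise entries are invalidated one
 * (pmap) page at a time.
 */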
9202
9203 static void
9204 flush_mmu_tlb_region_asid_async(
9205 vm_offset_t va,
9206 size_t length,
9207 pmap_t pmap,
9208 bool last_level_only __unused)
9209 {
9210 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
9211 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
9212 ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
9213 const uint16_t asid = PMAP_HWASID(pmap);
9214
9215 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
9216 boolean_t flush_all = FALSE;
9217
9218 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
9219 flush_all = TRUE;
9220 }
9221 if (flush_all) {
9222 flush_mmu_tlb_async();
9223 } else {
9224 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, false);
9225 }
9226 return;
9227 }
9228 #if __ARM_RANGE_TLBI__
9229 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
9230 va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
9231 if (pmap->type == PMAP_TYPE_NESTED) {
9232 flush_mmu_tlb_allrange_async(va, last_level_only, false);
9233 } else {
9234 flush_mmu_tlb_range_async(va, last_level_only, false);
9235 }
9236 return;
9237 }
9238 #endif
9239 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
9240 va = tlbi_asid(asid) | tlbi_addr(va);
9241
9242 if (pmap->type == PMAP_TYPE_NESTED) {
9243 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, false);
9244 } else {
9245 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, false);
9246 }
9247 }
9248
9249 void
9250 flush_mmu_tlb_region(
9251 vm_offset_t va,
9252 unsigned length)
9253 {
9254 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
9255 sync_tlb_flush();
9256 }
9257
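/*
 * Return the VM_WIMG cache attributes for physical page 'pn'. Pages outside
 * the kernel-managed range are looked up in the I/O attribute table (defaulting
 * to VM_WIMG_IO); managed pages report the WIMG bits recorded in pp_attr_table,
 * or VM_WIMG_DEFAULT if none are set.
 */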
9258 unsigned int
9259 pmap_cache_attributes(
9260 ppnum_t pn)
9261 {
9262 pmap_paddr_t paddr;
9263 unsigned int pai;
9264 unsigned int result;
9265 pp_attr_t pp_attr_current;
9266
9267 paddr = ptoa(pn);
9268
9269 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
9270
9271 if (!pa_valid(paddr)) {
9272 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
9273 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
9274 }
9275
9276 result = VM_WIMG_DEFAULT;
9277
9278 pai = pa_index(paddr);
9279
9280 pp_attr_current = pp_attr_table[pai];
9281 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9282 result = pp_attr_current & PP_ATTR_WIMG_MASK;
9283 }
9284 return result;
9285 }
9286
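/*
 * Perform any cache maintenance required after page 'pn' transitions from
 * wimg_bits_prev to wimg_bits_new, including a forced data-cache clean when
 * the page becomes VM_WIMG_RT.
 */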
9287 MARK_AS_PMAP_TEXT static void
9288 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
9289 {
9290 if ((wimg_bits_prev != wimg_bits_new)
9291 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
9292 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
9293 && (wimg_bits_new != VM_WIMG_COPYBACK))
9294 || ((wimg_bits_prev == VM_WIMG_WTHRU)
9295 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
9296 pmap_sync_page_attributes_phys(pn);
9297 }
9298
9299 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
9300 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
9301 }
9302 }
9303
9304 MARK_AS_PMAP_TEXT __unused void
9305 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
9306 {
9307 pmap_paddr_t paddr = ptoa(pn);
9308
9309 if (__improbable(!pa_valid(paddr))) {
9310 panic("%s called on non-managed page 0x%08x", __func__, pn);
9311 }
9312
9313 pmap_set_cache_attributes_internal(pn, new_cacheattr, false);
9314
9315 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
9316 }
9317
9318 static inline bool
9319 cacheattr_supports_compressor(unsigned int cacheattr)
9320 {
9321 switch (cacheattr) {
9322 case VM_WIMG_DEFAULT:
9323 return true;
9324 default:
9325 return false;
9326 }
9327 }
9328
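/*
 * Return a kernel virtual address through which the compressor may access page
 * 'pn', first switching the page to a compressor-supported cache attribute
 * (VM_WIMG_DEFAULT) if its current attribute is not supported.
 */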
9329 void *
9330 pmap_map_compressor_page(ppnum_t pn)
9331 {
9332 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9333 if (!cacheattr_supports_compressor(cacheattr)) {
9334 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
9335 }
9336
9337 return (void*)phystokv(ptoa(pn));
9338 }
9339
9340 void
9341 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
9342 {
9343 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9344 if (!cacheattr_supports_compressor(cacheattr)) {
9345 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
9346 }
9347 }
9348
9349 /**
9350 * Flushes TLB entries associated with the page specified by paddr, but does not
9351 * issue barriers yet.
9352 *
9353 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
9354 */
9355 static void
9356 pmap_flush_tlb_for_paddr_async(pmap_paddr_t paddr)
9357 {
9358 /* Flush the physical aperture mappings. */
9359 const vm_offset_t kva = phystokv(paddr);
9360 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
9361
9362 /* Flush the mappings tracked in the ptes. */
9363 const unsigned int pai = pa_index(paddr);
9364 locked_pvh_t locked_pvh = pvh_lock(pai);
9365
9366 pt_entry_t *pte_p = PT_ENTRY_NULL;
9367 pv_entry_t *pve_p = PV_ENTRY_NULL;
9368
9369 if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
9370 pte_p = pvh_ptep(locked_pvh.pvh);
9371 } else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
9372 pve_p = pvh_pve_list(locked_pvh.pvh);
9373 pte_p = PT_ENTRY_NULL;
9374 }
9375
9376 unsigned int nptes = 0;
9377 int pve_ptep_idx = 0;
9378 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
9379 if (pve_p != PV_ENTRY_NULL) {
9380 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
9381 if (pte_p == PT_ENTRY_NULL) {
9382 goto flush_tlb_skip_pte;
9383 }
9384 }
9385
9386 if (__improbable(nptes == SPTM_MAPPING_LIMIT)) {
9387 pvh_lock_enter_sleep_mode(&locked_pvh);
9388 }
9389 ++nptes;
9390 #ifdef PVH_FLAG_IOMMU
9391 if (pvh_ptep_is_iommu(pte_p)) {
9392 goto flush_tlb_skip_pte;
9393 }
9394 #endif /* PVH_FLAG_IOMMU */
9395 const pmap_t pmap = ptep_get_pmap(pte_p);
9396 const vm_map_address_t va = ptep_get_va(pte_p);
9397
9398 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
9399
9400 flush_tlb_skip_pte:
9401 pte_p = PT_ENTRY_NULL;
9402 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9403 pve_ptep_idx = 0;
9404 pve_p = pve_next(pve_p);
9405 }
9406 }
9407 pvh_unlock(&locked_pvh);
9408 }
9409
9410 /**
9411 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
9412 *
9413 * @param pai The Physical Address Index of the entry.
9414 * @param cacheattr The new cache attribute.
9415 */
9416 MARK_AS_PMAP_TEXT static void
9417 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
9418 {
9419 pvh_assert_locked(pai);
9420
9421 pp_attr_t pp_attr_current, pp_attr_template;
9422 do {
9423 pp_attr_current = pp_attr_table[pai];
9424 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
9425
9426 /**
9427 * WIMG bits should only be updated under the PVH lock, but we should do
9428 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
9429 */
9430 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
9431 }
9432
9433 /**
9434 * Structure for tracking where we are during the collection of mappings for batch
9435 * cache attribute updates.
9436 *
9437 * @note We need to track where in the per-cpu ops table the next mappings will be filled in,
9438 * because the collection routine can return with a partially filled ops table when
9439 * it exhausts the PV list for a page. In that case, the remaining slots in the ops table
9440 * will be used for mappings of the next page.
9441 *
9442 * @note We also need to record where we are in the PV list, because the collection routine can
9443 * also return when the ops table is filled but it's still in the middle of the PV list.
9444 * Those remaining items in the PV list need to be handled by the next batch operation in
9445 * a new ops table.
9446 */
9447 typedef struct {
9448 /* Where we are in the sptm ops table. */
9449 unsigned int sptm_ops_index;
9450
9451 /**
9452 * The last collected physical address from the previous full ops array (and in turn, SPTM
9453 * call). This is used to know whether the SPTM call for the latest full ops table should
9454 * skip updating the PAPT mapping (seeing as the last call would have handled updating it).
9455 */
9456 pmap_paddr_t last_table_last_papt_pa;
9457
9458 /**
9459 * Where we are in the pv list.
9460 *
9461 * When ptep is non-null, there's only one mapping to the page and the ptep is the address
9462 * of it.
9463 *
9464 * When pvep is non-null, there's more than one mapping and the mappings are tracked by the
9465 * PV list.
9466 *
9467 * When they are both null, it indicates we are collecting for a new page and the collection
9468 * function will initialize them to be one of the two states above.
9469 *
9470 * It is undefined when they are both non-null.
9471 */
9472 pt_entry_t *ptep;
9473 pv_entry_t *pvep;
9474 unsigned int pve_ptep_idx;
9475 } pmap_sptm_update_cache_attr_ops_collect_state_t;
9476
9477 /**
9478 * Reports whether there are any pending ops in an SPTM cache attr ops table.
9479 *
9480 * @param state A pmap_sptm_update_cache_attr_ops_collect_state_t structure.
9481 *
9482 * @return True if there's any outstanding cache attr op.
9483 * False otherwise.
9484 */
9485 static inline bool
9486 pmap_is_sptm_update_cache_attr_ops_pending(pmap_sptm_update_cache_attr_ops_collect_state_t state)
9487 {
9488 return state.sptm_ops_index > 0;
9489 }
9490
9491 /**
9492 * Enum encoding the collection status in pmap_sptm_update_cache_attr_ops_collect()'s
9493 * return value, indicating what action the caller needs to take next.
9494 */
9495 typedef enum {
9496 OPS_COLLECT_NOTHING = 0x0,
9497
9498 /* The ops table is full, and the caller should commit the table to SPTM. */
9499 OPS_COLLECT_RETURN_FULL_TABLE = 0x1,
9500
9501 /**
9502 * The page has its mappings completely collected, and the caller should
9503 * pass in a new page next time.
9504 */
9505 OPS_COLLECT_RETURN_COMPLETED_PAGE = 0x2,
9506 } pmap_sptm_update_cache_attr_ops_collect_return_t;
9507
9508 /**
9509 * Collects mappings of a physical page into an SPTM ops table for cache attribute updates.
9510 *
9511 * @note This routine returns either when the ops table is full or the page represented by
9512 * pa has no more mappings to collect. The caller should call this routine again with
9513 * a fresh ops table, or a new page, or both, depending on the return code.
9514 *
9515 * @note The PVH lock needs to be held for pa.
9516 *
9517 * @param state Tracks the state of PV list traversal and SPTM ops table filling. It is used
9518 * by this routine to save the progress of the collection.
9519 * @param sptm_ops Pointer to the SPTM ops table.
9520 * @param pa The physical address whose mappings are to be collected.
9521 * @param attributes The new cache attributes.
9522 *
9523 * @return A pmap_sptm_update_cache_attr_ops_collect_return_t that encodes what the caller
9524 * should do before calling this routine again. See the inline comments around
9525 * pmap_sptm_update_cache_attr_ops_collect_return_t for details.
9526 */
9527 static pmap_sptm_update_cache_attr_ops_collect_return_t
9528 pmap_sptm_update_cache_attr_ops_collect(
9529 pmap_sptm_update_cache_attr_ops_collect_state_t *state,
9530 sptm_update_disjoint_multipage_op_t *sptm_ops,
9531 pmap_paddr_t pa,
9532 unsigned int attributes)
9533 {
9534 if (state == NULL || sptm_ops == NULL) {
9535 panic("%s: unexpected null arguments - state: %p, sptm_ops: %p", __func__, state, sptm_ops);
9536 }
9537
9538 PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_START, pa, attributes, state->sptm_ops_index);
9539
9540 /* Copy the states into local variables. */
9541 unsigned int sptm_ops_index = state->sptm_ops_index;
9542 pmap_paddr_t last_table_last_papt_pa = state->last_table_last_papt_pa;
9543 pv_entry_t *pvep = state->pvep;
9544 pt_entry_t *ptep = state->ptep;
9545 unsigned int pve_ptep_idx = state->pve_ptep_idx;
9546
9547 unsigned int pai = pa_index(pa);
9548
9549 /* We should at least have one free slot in the ops table. */
9550 assert(sptm_ops_index < SPTM_MAPPING_LIMIT);
9551
9552 /* The PVH lock for pa must be held. */
9553 pvh_assert_locked(pai);
9554
9555 /* If pvep and ptep are both null in the state, it's a new page. Initialize the states. */
9556 if (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL) {
9557 const uintptr_t pvh = pai_to_pvh(pai);
9558 if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
9559 ptep = PT_ENTRY_NULL;
9560 pvep = pvh_pve_list(pvh);
9561 pve_ptep_idx = 0;
9562 } else if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
9563 ptep = pvh_ptep(pvh);
9564 pvep = PV_ENTRY_NULL;
9565 pve_ptep_idx = 0;
9566 }
9567 }
9568
9569 /**
9570 * The first entry filled in is always the PAPT header entry:
9571 *
9572 * 1) In the case of a fresh ops table, the first entry has to be a PAPT header.
9573 * 2) In the case of a fresh page, we need to insert a new PAPT header to request
9574 * SPTM to operate on a new page.
9575 *
9576 * Remember the index of the PAPT header here so that we can update the number
9577 * of mappings field later when we finish collecting.
9578 */
9579 const unsigned int papt_sptm_ops_index = sptm_ops_index;
9580 unsigned int num_mappings = 0;
9581
9582 /* Assemble the PTE template for the PAPT mapping. */
9583 const vm_address_t kva = phystokv(pa);
9584 const pt_entry_t *papt_ptep = pmap_pte(kernel_pmap, kva);
9585
9586 pt_entry_t template = os_atomic_load(papt_ptep, relaxed);
9587 template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9588 template |= wimg_to_pte(attributes, pa);
9589
9590 /* Fill in the PAPT header entry. */
9591 sptm_ops[papt_sptm_ops_index].per_paddr_header.paddr = pa;
9592 sptm_ops[papt_sptm_ops_index].per_paddr_header.papt_pte_template = template;
9593 sptm_ops[papt_sptm_ops_index].per_paddr_header.options = SPTM_UPDATE_SH | SPTM_UPDATE_MAIR | SPTM_UPDATE_DEFER_TLBI;
9594
9595 if ((papt_sptm_ops_index == 0) && (pa == last_table_last_papt_pa)) {
9596 /**
9597 * If the previous SPTM call was made with an ops table that already included
9598 * updating the PA of the page that this table starts with, then we can assume
9599 * that call already updated the PAPT and we can safely skip it in this
9600 * upcoming one.
9601 */
9602 sptm_ops[0].per_paddr_header.options |= SPTM_UPDATE_SKIP_PAPT;
9603 }
9604
9605 sptm_ops_index++;
9606
9607 /**
9608 * Main loop for collecting the mappings into the ops table. It terminates either
9609 * when the ops table is full or the PV list is exhausted.
9610 */
9611 while ((sptm_ops_index < SPTM_MAPPING_LIMIT) && (pvep != PV_ENTRY_NULL || ptep != PT_ENTRY_NULL)) {
9612 /**
9613 * Update ptep. There are really two cases here:
9614 *
9615 * 1) pvep is PV_ENTRY_NULL. In this case, ptep holds the pointer to
9616 * the only mapping to the page.
9617 * 2) pvep is not PV_ENTRY_NULL. In that case, ptep is updated according to
9618 * pvep and pve_ptep_idx.
9619 */
9620 if (pvep != PV_ENTRY_NULL) {
9621 ptep = pve_get_ptep(pvep, pve_ptep_idx);
9622
9623 /* This pve slot is empty, so skip to the next one. */
9624 if (ptep == PT_ENTRY_NULL) {
9625 goto sucaoc_skip_pte;
9626 }
9627 }
9628
9629 #ifdef PVH_FLAG_IOMMU
9630 /* Skip IOMMU pteps. */
9631 if (pvh_ptep_is_iommu(ptep)) {
9632 goto sucaoc_skip_pte;
9633 }
9634 #endif
9635 /* Assemble the PTE template for the mapping. */
9636 const vm_address_t va = ptep_get_va(ptep);
9637 const pmap_t pmap = ptep_get_pmap(ptep);
9638
9639 template = os_atomic_load(ptep, relaxed);
9640 template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9641 template |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, pa);
9642
9643 /* Fill the mapping into the ops table. */
9644 sptm_ops[sptm_ops_index].disjoint_op.root_pt_paddr = pmap->ttep;
9645 sptm_ops[sptm_ops_index].disjoint_op.vaddr = va;
9646 sptm_ops[sptm_ops_index].disjoint_op.pte_template = template;
9647
9648 /* Move the sptm ops table cursor. */
9649 sptm_ops_index++;
9650
9651 /* Increment the mappings counter. */
9652 num_mappings++;
9653
9654 sucaoc_skip_pte:
9655 /**
9656 * Reset ptep to PT_ENTRY_NULL to preserve the loop precondition that exactly
9657 * one of ptep and pvep is non-null (not both, not neither).
9658 */
9659 ptep = PT_ENTRY_NULL;
9660
9661 /* Advance to next pvep if we have exhausted the pteps in it. */
9662 if ((pvep != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9663 pve_ptep_idx = 0;
9664 pvep = pve_next(pvep);
9665 }
9666 }
9667
9668 /* Update the PAPT header for the number of mappings. */
9669 sptm_ops[papt_sptm_ops_index].per_paddr_header.num_mappings = num_mappings;
9670
9671 const bool full_table = (sptm_ops_index >= SPTM_MAPPING_LIMIT);
9672 const bool collection_done_for_page = (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL);
9673
9674 /**
9675 * The ops table is full, so the caller should now invoke the SPTM before calling
9676 * into this function again.
9677 */
9678 if (full_table) {
9679 /* Update last_table_last_papt_pa to be the pa collected in this call. */
9680 last_table_last_papt_pa = pa;
9681
9682 /* Reset sptm_ops_index. */
9683 sptm_ops_index = 0;
9684 }
9685
9686 /* Copy the updated collection states back to the parameter structure. */
9687 state->sptm_ops_index = sptm_ops_index;
9688 state->last_table_last_papt_pa = last_table_last_papt_pa;
9689 state->pvep = pvep;
9690 state->ptep = ptep;
9691 state->pve_ptep_idx = pve_ptep_idx;
9692
9693 /* Assemble the return value. */
9694 pmap_sptm_update_cache_attr_ops_collect_return_t retval = OPS_COLLECT_NOTHING;
9695
9696 if (full_table) {
9697 retval |= OPS_COLLECT_RETURN_FULL_TABLE;
9698 }
9699
9700 if (collection_done_for_page) {
9701 retval |= OPS_COLLECT_RETURN_COMPLETED_PAGE;
9702 }
9703
9704 PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_END, pa, attributes, sptm_ops_index);
9705
9706 return retval;
9707 }
9708
9709 /* At least one PAPT header plus one mapping. */
9710 static_assert(SPTM_MAPPING_LIMIT >= 2);
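
/**
 * Illustrative sketch (not compiled): the intended driver pattern for
 * pmap_sptm_update_cache_attr_ops_collect(). The real caller is
 * pmap_batch_set_cache_attributes_internal() below; this simplified loop only
 * shows how the two return bits are meant to be consumed, and omits the PVH
 * locking, preemption management and retype-epoch handling that the real
 * caller performs around the SPTM call.
 *
 *   pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
 *   bool page_done = false;
 *
 *   do {
 *       pmap_sptm_update_cache_attr_ops_collect_return_t ret =
 *           pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
 *
 *       page_done = ret & OPS_COLLECT_RETURN_COMPLETED_PAGE;
 *
 *       if (ret & OPS_COLLECT_RETURN_FULL_TABLE) {
 *           // The table is full: commit it to the SPTM, then keep collecting.
 *           sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
 *       }
 *   } while (!page_done);
 */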
9711
9712 /**
9713 * Returns whether a cache attribute is allowed on managed pages.
9714 *
9715 * @param attributes A 32-bit value whose VM_WIMG_MASK bits represent the
9716 * cache attribute.
9717 *
9718 * @return True if the cache attribute is allowed on managed pages.
9719 * False otherwise.
9720 */
9721 static bool
9722 pmap_is_cache_attribute_allowed(unsigned int attributes)
9723 {
9724 if (pmap_panic_dev_wimg_on_managed) {
9725 switch (attributes & VM_WIMG_MASK) {
9726 /* supported on DRAM, but slow, so we disallow */
9727 case VM_WIMG_IO: // nGnRnE
9728 case VM_WIMG_POSTED: // nGnRE
9729
9730 /* unsupported on DRAM */
9731 case VM_WIMG_POSTED_REORDERED: // nGRE
9732 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
9733 return false;
9734
9735 default:
9736 return true;
9737 }
9738 }
9739
9740 return true;
9741 }
9742
9743 /**
9744 * Batch updates the cache attributes of a list of pages in three passes.
9745 *
9746 * In pass one, the pp_attr_table and the pte are updated (by SPTM) for the pages in the list.
9747 * In pass two, TLB entries are flushed for each page in the list if necessary.
9748 * In pass three, caches are cleaned for each page in the list if necessary.
9749 *
9750 * @param page_list List of pages to be updated.
9751 * @param cacheattr The new cache attributes.
9752 * @param update_attr_table Whether the pp_attr_table should be updated. This is useful for compressor
9753 * pages where it's desired to keep the old WIMG bits.
9754 */
9755 void
9756 pmap_batch_set_cache_attributes_internal(
9757 const unified_page_list_t *page_list,
9758 unsigned int cacheattr,
9759 bool update_attr_table)
9760 {
9761 bool tlb_flush_pass_needed = false;
9762 bool rt_cache_flush_pass_needed = false;
9763 bool preemption_disabled = false;
9764
9765 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE1);
9766
9767 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
9768 sptm_update_disjoint_multipage_op_t *sptm_ops = NULL;
9769
9770 pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
9771
9772 unified_page_list_iterator_t iter;
9773
9774 for (unified_page_list_iterator_init(page_list, &iter);
9775 !unified_page_list_iterator_end(&iter);
9776 unified_page_list_iterator_next(&iter)) {
9777 bool is_fictitious = false;
9778 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
9779 const pmap_paddr_t paddr = ptoa(pn);
9780
9781 /**
9782 * Skip if the page is not managed.
9783 *
9784 * We don't panic here because sometimes the caller just blindly passes in
9785 * pages that are not managed. We need to handle that gracefully.
9786 */
9787 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
9788 continue;
9789 }
9790
9791 const unsigned int pai = pa_index(paddr);
9792 locked_pvh_t locked_pvh = {.pvh = 0};
9793
9794 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
9795 /**
9796 * If we're partway through processing a multi-page batched call,
9797 * preemption will already be disabled so we can't simply call
9798 * pvh_lock() which may block. Instead, we first try to acquire
9799 * the lock without waiting, which in most cases should succeed.
9800 * If it fails, we submit the pending batched operations to re-
9801 * enable preemption and then acquire the lock normally.
9802 */
9803 locked_pvh = pvh_try_lock(pai);
9804 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
9805 assert(preemption_disabled);
9806 const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
9807 pmap_retype_epoch_exit();
9808 enable_preemption();
9809 preemption_disabled = false;
9810 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9811 tlb_flush_pass_needed = true;
9812 }
9813 state.sptm_ops_index = 0;
9814 locked_pvh = pvh_lock(pai);
9815 }
9816 } else {
9817 locked_pvh = pvh_lock(pai);
9818 }
9819 assert(locked_pvh.pvh != 0);
9820
9821 const pp_attr_t pp_attr_current = pp_attr_table[pai];
9822
9823 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
9824 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9825 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
9826 }
9827
9828 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
9829
9830 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
9831 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
9832 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
9833 }
9834
9835 /**
9836 * When update_attr_table is false, we know that the wimg_bits_prev value read from pp_attr_table
9837 * cannot be trusted, so we should force an update of the cache attributes.
9838 */
9839 const bool force_update = !update_attr_table;
9840 /* Update the cache attributes in PTE and PP_ATTR table. */
9841 if ((wimg_bits_new != wimg_bits_prev) || force_update) {
9842 if (!pmap_is_cache_attribute_allowed(cacheattr)) {
9843 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, pn=%#x",
9844 __func__, cacheattr & VM_WIMG_MASK, pn);
9845 }
9846
9847 /* Update PP_ATTR_TABLE */
9848 if (update_attr_table) {
9849 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
9850 }
9851
9852 bool mapping_collection_done = false;
9853 bool pvh_lock_sleep_mode_needed = false;
9854 do {
9855 if (__improbable(pvh_lock_sleep_mode_needed)) {
9856 assert(!preemption_disabled);
9857 pvh_lock_enter_sleep_mode(&locked_pvh);
9858 pvh_lock_sleep_mode_needed = false;
9859 }
9860
9861 /* Disable preemption to use the per-CPU structure safely. */
9862 if (!preemption_disabled) {
9863 preemption_disabled = true;
9864 disable_preemption();
9865 /**
9866 * Enter the retype epoch while we gather the disjoint update arguments
9867 * and issue the SPTM call. Since this operation may cover multiple physical
9868 * pages, we may construct the argument array and invoke the SPTM without holding
9869 * all relevant PVH locks. We therefore need to record that we are collecting and modifying
9870 * mapping state so that e.g. pmap_page_protect() does not attempt to retype the
9871 * underlying pages and pmap_remove() does not attempt to free the page tables
9872 * used for these mappings without first draining our epoch.
9873 */
9874 pmap_retype_epoch_enter();
9875
9876 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9877 sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops;
9878 }
9879
9880 /* The return value indicates if we should call into SPTM in this iteration. */
9881 pmap_sptm_update_cache_attr_ops_collect_return_t retval =
9882 pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
9883
9884 /* The collection routine should only return when the caller needs to take some action. */
9885 assert(retval != OPS_COLLECT_NOTHING);
9886
9887 /* Gather information for next step from the return value. */
9888 mapping_collection_done = retval & OPS_COLLECT_RETURN_COMPLETED_PAGE;
9889 const bool call_sptm = retval & OPS_COLLECT_RETURN_FULL_TABLE;
9890
9891 if (call_sptm) {
9892 /* Call into SPTM with this SPTM ops table. */
9893 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
9894 /**
9895 * We may be submitting the batch and exiting the epoch partway through
9896 * processing the PV list for a page. That's fine, because in that case we'll
9897 * hold the PV lock for that page, which will prevent mappings of that page from
9898 * being disconnected and will prevent the completion of pmap_remove() against
9899 * any of those mappings, thus also guaranteeing the relevant page table pages
9900 * can't be freed. The epoch still protects mappings for any prior page in
9901 * the batch, whose PV locks are no longer held.
9902 */
9903 pmap_retype_epoch_exit();
9904 /**
9905 * Balance out the explicit disable_preemption() made either at the beginning of
9906 * the function or on a prior iteration of the loop that placed the PVH lock in
9907 * sleep mode. Note that enable_preemption() decrements a per-thread counter,
9908 * so if we still happen to hold the PVH lock in spin mode preemption won't
9909 * actually be re-enabled until we switch the lock over to sleep mode on
9910 * the next iteration.
9911 */
9912 enable_preemption();
9913 preemption_disabled = false;
9914 pvh_lock_sleep_mode_needed = true;
9915
9916 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9917 tlb_flush_pass_needed = true;
9918 }
9919 }
9920
9921 /* We must have either called into the SPTM or finished walking the PV list (or both). */
9922 assert(call_sptm || mapping_collection_done);
9923 } while (!mapping_collection_done);
9924
9925 /**
9926 * We could technically force the cache flush pass here when force_update is true, but
9927 * since the compressor mapping/unmapping path handles cache flushing itself, it's fine
9928 * leaving this as is.
9929 */
9930 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
9931 rt_cache_flush_pass_needed = true;
9932 }
9933 }
9934
9935 pvh_unlock(&locked_pvh);
9936 }
9937
9938 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
9939 assert(preemption_disabled);
9940 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
9941 pmap_retype_epoch_exit();
9942 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9943 tlb_flush_pass_needed = true;
9944 }
9945
9946 /**
9947 * This is the last sptm_update_disjoint_multipage() call in this function,
9948 * so it's okay not to update the state variables.
9949 */
9950
9951 enable_preemption();
9952 } else if (preemption_disabled) {
9953 pmap_retype_epoch_exit();
9954 enable_preemption();
9955 }
9956
9957 if (tlb_flush_pass_needed) {
9958 /* Sync the PTE writes before potential TLB/Cache flushes. */
9959 FLUSH_PTE_STRONG();
9960
9961 /**
9962 * Pass 2: for each physical page and for each mapping, we need to flush
9963 * the TLB for it.
9964 */
9965 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE2);
9966 for (unified_page_list_iterator_init(page_list, &iter);
9967 !unified_page_list_iterator_end(&iter);
9968 unified_page_list_iterator_next(&iter)) {
9969 bool is_fictitious = false;
9970 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
9971 const pmap_paddr_t paddr = ptoa(pn);
9972
9973 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
9974 continue;
9975 }
9976
9977 pmap_flush_tlb_for_paddr_async(paddr);
9978 }
9979
9980 #if HAS_FEAT_XS
9981 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
9982 arm64_sync_tlb(false);
9983 #else
9984 /**
9985 * For targets that distinguish between mild and strong DSB, mild DSB
9986 * will not drain the prefetcher. This can lead to prefetch-driven
9987 * cache fills that defeat the uncacheable requirement of the RT memory type.
9988 * In those cases, strong DSB must instead be employed to drain the prefetcher.
9989 */
9990 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
9991 #endif
9992 }
9993
9994 if (rt_cache_flush_pass_needed) {
9995 /* Pass 3: Flush the cache for each page that was just set to RT. */
9996 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE3);
9997 /**
9998 * We disable preemption to ensure we are not preempted
9999 * while DC-by-VA instructions remain enabled.
10000 */
10001 disable_preemption();
10002
10003 assert(get_preemption_level() > 0);
10004
10005 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10006 /**
10007 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10008 * and the host will handle cache maintenance for it. So we don't need to
10009 * worry about enabling the ops here for AVP.
10010 */
10011 enable_dc_mva_ops();
10012 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10013 /**
10014 * DMB should be sufficient to ensure prior accesses to the memory in question are
10015 * correctly ordered relative to the upcoming cache maintenance operations.
10016 */
10017 __builtin_arm_dmb(DMB_SY);
10018
10019 for (unified_page_list_iterator_init(page_list, &iter);
10020 !unified_page_list_iterator_end(&iter);) {
10021 bool is_fictitious = false;
10022 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10023 const pmap_paddr_t paddr = ptoa(pn);
10024
10025 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10026 unified_page_list_iterator_next(&iter);
10027 continue;
10028 }
10029
10030 CleanPoC_DcacheRegion_Force_nopreempt_nohid_nobarrier(phystokv(paddr), PAGE_SIZE);
10031
10032 unified_page_list_iterator_next(&iter);
10033 if (__improbable(pmap_pending_preemption() && !unified_page_list_iterator_end(&iter))) {
10034 __builtin_arm_dsb(DSB_SY);
10035 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10036 disable_dc_mva_ops();
10037 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10038 enable_preemption();
10039 assert(preemption_enabled());
10040 disable_preemption();
10041 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10042 enable_dc_mva_ops();
10043 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10044 }
10045 }
10046
10047 /* Issue DSB to ensure cache maintenance is fully complete before subsequent accesses. */
10048 __builtin_arm_dsb(DSB_SY);
10049 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10050 disable_dc_mva_ops();
10051 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10052
10053 enable_preemption();
10054 }
10055
10056 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE4);
10057 }
10058
10059 /**
10060 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10061 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10062 *
10063 * @param page_list List of pages to be updated.
10064 * @param cacheattr The new cache attribute.
10065 */
10066 void
10067 pmap_batch_set_cache_attributes(
10068 const unified_page_list_t *page_list,
10069 unsigned int cacheattr)
10070 {
10071 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10072
10073 /* Verify we are being called from a preemptible context. */
10074 pmap_verify_preemptible();
10075
10076 pmap_batch_set_cache_attributes_internal(page_list, cacheattr, true);
10077
10078 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10079 }
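
/**
 * Illustrative sketch (not compiled): how a caller might batch-update cache
 * attributes for a small set of pages using a UPL-array style unified page
 * list, mirroring the single-page construction in
 * pmap_set_cache_attributes_internal() below. The page numbers and the chosen
 * attribute here are hypothetical.
 *
 *   upl_page_info_t pages[2] = {
 *       { .phys_addr = pn0 },
 *       { .phys_addr = pn1 },
 *   };
 *   const unified_page_list_t page_list = {
 *       .upl = { .upl_info = pages, .upl_size = 2 },
 *       .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
 *   };
 *   pmap_batch_set_cache_attributes(&page_list, VM_WIMG_WCOMB);
 */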
10080
10081 MARK_AS_PMAP_TEXT void
10082 pmap_set_cache_attributes_internal(
10083 ppnum_t pn,
10084 unsigned int cacheattr,
10085 bool update_attr_table)
10086 {
10087 upl_page_info_t single_page_upl = { .phys_addr = pn };
10088 const unified_page_list_t page_list = {
10089 .upl = {.upl_info = &single_page_upl, .upl_size = 1},
10090 .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
10091 };
10092
10093 pmap_batch_set_cache_attributes_internal(&page_list, cacheattr, update_attr_table);
10094 }
10095
10096 void
10097 pmap_set_cache_attributes(
10098 ppnum_t pn,
10099 unsigned int cacheattr)
10100 {
10101 pmap_set_cache_attributes_internal(pn, cacheattr, true);
10102 }
10103
10104 void
10105 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10106 vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10107 {
10108 pmap_paddr_t data_pa = 0; // data address
10109 pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10110 pmap_paddr_t text_pa = 0; // text address
10111
10112 *kernel_data_addr = 0;
10113 *kernel_text_addr = 0;
10114 *user_text_addr = 0;
10115
10116 kern_return_t kr = pmap_page_alloc(&data_pa, PMAP_PAGE_ALLOCATE_NONE);
10117 assert(kr == KERN_SUCCESS);
10118
10119 kr = pmap_page_alloc(&ro_data_pa, PMAP_PAGE_ALLOCATE_NONE);
10120 assert(kr == KERN_SUCCESS);
10121
10122 #if CONFIG_ARM_PFZ
10123 kr = pmap_page_alloc(&text_pa, PMAP_PAGE_ALLOCATE_NONE);
10124 assert(kr == KERN_SUCCESS);
10125
10126 /**
10127 * User mapping of comm page text section for 64 bit mapping only
10128 *
10129 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10130 * user processes to get this page mapped in; they should never call into
10131 * this page.
10132 *
10133 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10134 * is slid in the same L3 as the data commpage. It is either outside the
10135 * max of user VA or is pre-reserved in vm_map_exec(). This means that
10136 * it is reserved and unavailable to mach VM for future mappings.
10137 */
10138 const int num_ptes = pt_attr_leaf_size(native_pt_attr) >> PTE_SHIFT;
10139
10140 do {
10141 const int text_leaf_index = random() % num_ptes;
10142
10143 /**
10144 * Generate a VA for the commpage text with the same root and twig index as the data
10145 * commpage, but with the new leaf index we've just generated.
10146 */
10147 commpage_text_user_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr));
10148 commpage_text_user_va |= (text_leaf_index << pt_attr_leaf_shift(native_pt_attr));
10149 } while ((commpage_text_user_va == _COMM_PAGE64_BASE_ADDRESS) ||
10150 (commpage_text_user_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10151
10152 *user_text_addr = commpage_text_user_va;
10153 *kernel_text_addr = phystokv(text_pa);
10154 #endif
10155
10156 /* For manipulation in kernel, go straight to physical page */
10157 commpage_data_pa = data_pa;
10158 *kernel_data_addr = phystokv(data_pa);
10159 assert(commpage_ro_data_pa == 0);
10160 commpage_ro_data_pa = ro_data_pa;
10161 *kernel_ro_data_addr = phystokv(ro_data_pa);
10162 assert(commpage_text_pa == 0);
10163 commpage_text_pa = text_pa;
10164 }
10165
10166
10167 /*
10168 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10169 * with user controlled TTEs for regions that aren't explicitly reserved by the
10170 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10171 */
10172 #if (ARM_PGSHIFT == 14)
10173 /**
10174 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10175 * commpage completely above the maximum 32-bit userspace VA.
10176 */
10177 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10178 static_assert(_COMM_PAGE64_NESTING_START == SPTM_ARM64_COMMPAGE_REGION_START);
10179 static_assert(_COMM_PAGE64_NESTING_SIZE == SPTM_ARM64_COMMPAGE_REGION_SIZE);
10180
10181 /**
10182 * Normally there'd be an assert to check that 64-bit devices with 64-bit
10183 * userspace VAs can nest the commpage completely above the maximum 64-bit
10184 * userspace VA, but that technically isn't true on macOS. On those systems, the
10185 * commpage lives within the userspace VA range, but is protected by the VM as
10186 * a reserved region (see vm_reserved_regions[] definition for more info).
10187 */
10188
10189 #elif (ARM_PGSHIFT == 12)
10190 /**
10191 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10192 * above the maximum userspace VA.
10193 */
10194 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10195 #else
10196 #error Nested shared page mapping is unsupported on this config
10197 #endif
10198
10199 MARK_AS_PMAP_TEXT kern_return_t
10200 pmap_insert_commpage_internal(
10201 pmap_t pmap)
10202 {
10203 kern_return_t kr = KERN_SUCCESS;
10204 vm_offset_t commpage_vaddr;
10205 pt_entry_t *ttep;
10206 pmap_paddr_t commpage_table = commpage_default_table;
10207
10208 /* Validate the pmap input before accessing its data. */
10209 validate_pmap_mutable(pmap);
10210
10211 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10212 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10213
10214 #if __ARM_MIXED_PAGE_SIZE__
10215 #if !__ARM_16K_PG__
10216 /* The following code assumes that commpage_pmap_default is a 16KB pmap. */
10217 #error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10218 #endif /* !__ARM_16K_PG__ */
10219
10220 /* Choose the correct shared page pmap to use. */
10221 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
10222 if (pmap_page_size == 4096) {
10223 if (pmap_is_64bit(pmap)) {
10224 commpage_table = commpage_4k_table;
10225 } else {
10226 panic("32-bit commpage not currently supported for SPTM configurations");
10227 //commpage_table = commpage32_4k_table;
10228 }
10229 } else if (pmap_page_size != 16384) {
10230 panic("No commpage table exists for the wanted page size: %llu", pmap_page_size);
10231 } else
10232 #endif /* __ARM_MIXED_PAGE_SIZE__ */
10233 {
10234 if (pmap_is_64bit(pmap)) {
10235 commpage_table = commpage_default_table;
10236 } else {
10237 panic("32-bit commpage not currently supported for SPTM configurations");
10238 //commpage_table = commpage32_default_table;
10239 }
10240 }
10241
10242 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
10243 #error We assume a single page.
10244 #endif
10245
10246 if (pmap_is_64bit(pmap)) {
10247 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10248 } else {
10249 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10250 }
10251
10252
10253 pmap_lock(pmap, PMAP_LOCK_SHARED);
10254
10255 /*
10256 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
10257 * two (2MB) depending on the address space layout. For 16KB pages, each level
10258 * one entry is 64GB, so we must go to the second level entry (32MB) in order
10259 * to "nest".
10260 *
10261 * Note: This is not "nesting" in the shared cache sense. This definition of
10262 * nesting just means inserting pointers to pre-allocated tables inside of
10263 * the passed in pmap to allow us to share page tables (which map the shared
10264 * page) for every task. This saves at least one page of memory per process
10265 * compared to creating new page tables in every process for mapping the
10266 * shared page.
10267 */
10268
10269 /**
10270 * Allocate the twig page tables if needed, and slam a pointer to the shared
10271 * page's tables into place.
10272 */
10273 while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
10274 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10275
10276 kr = pmap_expand(pmap, commpage_vaddr, 0, commpage_level);
10277
10278 if (kr != KERN_SUCCESS) {
10279 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
10280 }
10281
10282 pmap_lock(pmap, PMAP_LOCK_SHARED);
10283 }
10284
10285 if (*ttep != ARM_PTE_EMPTY) {
10286 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
10287 }
10288
10289 sptm_map_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level,
10290 (commpage_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID);
10291
10292 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10293
10294 return kr;
10295 }
10296
10297 static void
10298 pmap_unmap_commpage(
10299 pmap_t pmap)
10300 {
10301 pt_entry_t *ptep;
10302 vm_offset_t commpage_vaddr;
10303
10304 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10305 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10306 __assert_only pmap_paddr_t commpage_pa = commpage_data_pa;
10307
10308 if (pmap_is_64bit(pmap)) {
10309 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10310 } else {
10311 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10312 }
10313
10314
10315 ptep = pmap_pte(pmap, commpage_vaddr);
10316
10317 if (ptep == NULL) {
10318 return;
10319 }
10320
10321 /* It had better be mapped to the shared page. */
10322 if (pte_to_pa(*ptep) != commpage_pa) {
10323 panic("%s: non-commpage PA 0x%llx mapped at VA 0x%llx in pmap %p; expected 0x%llx",
10324 __func__, (unsigned long long)pte_to_pa(*ptep), (unsigned long long)commpage_vaddr,
10325 pmap, (unsigned long long)commpage_pa);
10326 }
10327
10328 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level);
10329 }
10330
10331 void
10332 pmap_insert_commpage(
10333 pmap_t pmap)
10334 {
10335 pmap_insert_commpage_internal(pmap);
10336 }
10337
10338 static boolean_t
10339 pmap_is_64bit(
10340 pmap_t pmap)
10341 {
10342 return pmap->is_64bit;
10343 }
10344
10345 bool
10346 pmap_is_exotic(
10347 pmap_t pmap __unused)
10348 {
10349 return false;
10350 }
10351
10352
10353 /* ARMTODO -- an implementation that accounts for
10354 * holes in the physical map, if any.
10355 */
10356 boolean_t
10357 pmap_valid_page(
10358 ppnum_t pn)
10359 {
10360 return pa_valid(ptoa(pn));
10361 }
10362
10363 boolean_t
10364 pmap_bootloader_page(
10365 ppnum_t pn)
10366 {
10367 pmap_paddr_t paddr = ptoa(pn);
10368
10369 if (pa_valid(paddr)) {
10370 return FALSE;
10371 }
10372 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10373 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
10374 }
10375
10376 MARK_AS_PMAP_TEXT boolean_t
10377 pmap_is_empty_internal(
10378 pmap_t pmap,
10379 vm_map_offset_t va_start,
10380 vm_map_offset_t va_end)
10381 {
10382 vm_map_offset_t block_start, block_end;
10383 tt_entry_t *tte_p;
10384
10385 if (pmap == NULL) {
10386 return TRUE;
10387 }
10388
10389 validate_pmap(pmap);
10390
10391 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10392 unsigned int initial_not_in_kdp = not_in_kdp;
10393
10394 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10395 pmap_lock(pmap, PMAP_LOCK_SHARED);
10396 }
10397
10398
10399 /* TODO: This will be faster if we increment ttep at each level. */
10400 block_start = va_start;
10401
10402 while (block_start < va_end) {
10403 pt_entry_t *bpte_p, *epte_p;
10404 pt_entry_t *pte_p;
10405
10406 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10407 if (block_end > va_end) {
10408 block_end = va_end;
10409 }
10410
10411 tte_p = pmap_tte(pmap, block_start);
10412 if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
10413 pte_p = (pt_entry_t *) ttetokv(*tte_p);
10414 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
10415 epte_p = &pte_p[pte_index(pt_attr, block_end)];
10416
10417 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
10418 if (*pte_p != ARM_PTE_EMPTY) {
10419 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10420 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10421 }
10422 return FALSE;
10423 }
10424 }
10425 }
10426 block_start = block_end;
10427 }
10428
10429 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10430 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10431 }
10432
10433 return TRUE;
10434 }
10435
10436 boolean_t
10437 pmap_is_empty(
10438 pmap_t pmap,
10439 vm_map_offset_t va_start,
10440 vm_map_offset_t va_end)
10441 {
10442 return pmap_is_empty_internal(pmap, va_start, va_end);
10443 }
10444
10445 vm_map_offset_t
10446 pmap_max_offset(
10447 boolean_t is64,
10448 unsigned int option)
10449 {
10450 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
10451 }
10452
10453 vm_map_offset_t
10454 pmap_max_64bit_offset(
10455 __unused unsigned int option)
10456 {
10457 vm_map_offset_t max_offset_ret = 0;
10458
10459 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
10460 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10461 max_offset_ret = arm64_pmap_max_offset_default;
10462 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10463 max_offset_ret = min_max_offset;
10464 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10465 max_offset_ret = MACH_VM_MAX_ADDRESS;
10466 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10467 if (arm64_pmap_max_offset_default) {
10468 max_offset_ret = arm64_pmap_max_offset_default;
10469 } else if (max_mem > 0xC0000000) {
10470 // devices with > 3GB of memory
10471 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
10472 } else if (max_mem > 0x40000000) {
10473 // devices with > 1GB and <= 3GB of memory
10474 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
10475 } else {
10476 // devices with <= 1 GB of memory
10477 max_offset_ret = min_max_offset;
10478 }
10479 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10480 if (arm64_pmap_max_offset_default) {
10481 // Allow the boot-arg to override jumbo size
10482 max_offset_ret = arm64_pmap_max_offset_default;
10483 } else {
10484 max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
10485 }
10486 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
10487 } else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
10488 max_offset_ret = MACH_VM_MAX_ADDRESS;
10489 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
10490 } else {
10491 panic("pmap_max_64bit_offset illegal option 0x%x", option);
10492 }
10493
10494 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10495 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
10496 assert(max_offset_ret >= min_max_offset);
10497 }
10498
10499 return max_offset_ret;
10500 }
10501
10502 vm_map_offset_t
10503 pmap_max_32bit_offset(
10504 unsigned int option)
10505 {
10506 vm_map_offset_t max_offset_ret = 0;
10507
10508 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10509 max_offset_ret = arm_pmap_max_offset_default;
10510 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10511 max_offset_ret = VM_MAX_ADDRESS;
10512 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10513 max_offset_ret = VM_MAX_ADDRESS;
10514 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10515 if (arm_pmap_max_offset_default) {
10516 max_offset_ret = arm_pmap_max_offset_default;
10517 } else if (max_mem > 0x20000000) {
10518 max_offset_ret = VM_MAX_ADDRESS;
10519 } else {
10520 max_offset_ret = VM_MAX_ADDRESS;
10521 }
10522 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10523 max_offset_ret = VM_MAX_ADDRESS;
10524 } else {
10525 panic("pmap_max_32bit_offset illegal option 0x%x", option);
10526 }
10527
10528 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10529 return max_offset_ret;
10530 }
10531
10532 #if CONFIG_DTRACE
10533 /*
10534 * Constrain DTrace copyin/copyout actions
10535 */
10536 extern kern_return_t dtrace_copyio_preflight(addr64_t);
10537 extern kern_return_t dtrace_copyio_postflight(addr64_t);
10538
10539 kern_return_t
10540 dtrace_copyio_preflight(
10541 __unused addr64_t va)
10542 {
10543 if (current_map() == kernel_map) {
10544 return KERN_FAILURE;
10545 } else {
10546 return KERN_SUCCESS;
10547 }
10548 }
10549
10550 kern_return_t
10551 dtrace_copyio_postflight(
10552 __unused addr64_t va)
10553 {
10554 return KERN_SUCCESS;
10555 }
10556 #endif /* CONFIG_DTRACE */
10557
10558
10559 void
10560 pmap_flush_context_init(__unused pmap_flush_context *pfc)
10561 {
10562 }
10563
10564
10565 void
10566 pmap_flush(
10567 __unused pmap_flush_context *cpus_to_flush)
10568 {
10569 /* not implemented yet */
10570 return;
10571 }
10572
10573 /**
10574 * Perform basic validation checks on the destination (and its corresponding
10575 * offset/sizes) only, prior to writing to a read only allocation.
10576 *
10577 * @note Should be called before writing to an allocation from the read
10578 * only allocator.
10579 *
10580 * @param zid The ID of the zone the allocation belongs to.
10581 * @param va VA of element being modified (destination).
10582 * @param offset Offset being written to, in the element.
10583 * @param new_data_size Size of modification.
10584 *
10585 */
10586
10587 MARK_AS_PMAP_TEXT static void
10588 pmap_ro_zone_validate_element_dst(
10589 zone_id_t zid,
10590 vm_offset_t va,
10591 vm_offset_t offset,
10592 vm_size_t new_data_size)
10593 {
10594 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
10595 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
10596 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
10597 }
10598
10599 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
10600
10601 /* Check element is from correct zone and properly aligned */
10602 zone_require_ro(zid, elem_size, (void*)va);
10603
10604 if (__improbable(new_data_size > (elem_size - offset))) {
10605 panic("%s: New data size %lu too large for elem size %lu at addr %p",
10606 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
10607 }
10608 if (__improbable(offset >= elem_size)) {
10609 panic("%s: Offset %lu too large for elem size %lu at addr %p",
10610 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
10611 }
10612 }
10613
10614
10615 /**
10616 * Perform basic validation checks on the source, destination and
10617 * corresponding offset/sizes prior to writing to a read only allocation.
10618 *
10619 * @note Should be called before writing to an allocation from the read
10620 * only allocator.
10621 *
10622 * @param zid The ID of the zone the allocation belongs to.
10623 * @param va VA of element being modified (destination).
10624 * @param offset Offset being written to, in the element.
10625 * @param new_data Pointer to new data (source).
10626 * @param new_data_size Size of modification.
10627 *
10628 */
10629
10630 MARK_AS_PMAP_TEXT static void
10631 pmap_ro_zone_validate_element(
10632 zone_id_t zid,
10633 vm_offset_t va,
10634 vm_offset_t offset,
10635 const vm_offset_t new_data,
10636 vm_size_t new_data_size)
10637 {
10638 vm_offset_t sum = 0;
10639
10640 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
10641 panic("%s: Integer addition overflow %p + %lu = %lu",
10642 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
10643 }
10644
10645 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
10646 }
10647
10648 /**
10649 * Function to configure RO zone access permissions for a forthcoming write operation.
10650 */
10651 static void
10652 pmap_ro_zone_prepare_write(void)
10653 {
10654 }
10655
10656 /**
10657 * Function to indicate that a preceding RO zone write operation is complete.
10658 */
10659 static void
10660 pmap_ro_zone_complete_write(void)
10661 {
10662 }
10663
10664 /**
10665 * Function to align an address or size to the required RO zone mapping alignment.
10666 *
10667 * For the SPTM the RO zone region must be aligned on a twig boundary so that at least
10668 * the last-level kernel pagetable can be of the appropriate SPTM RO zone table type,
10669 * which allows the SPTM to enforce RO zone mapping permission restrictions.
10670 *
10671 * @param value the address or size to be aligned.
10672 *
10673 * @return the aligned value
10674 */
10675 vm_offset_t
10676 pmap_ro_zone_align(vm_offset_t value)
10677 {
10678 const pt_attr_t * const pt_attr = pmap_get_pt_attr(kernel_pmap);
10679 return PMAP_ALIGN(value, pt_attr_twig_size(pt_attr));
10680 }
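
/**
 * Worked example (a sketch; the actual value depends on the kernel page-table
 * geometry): with 16KB kernel pages the twig (L2) size is 32MB, so
 * pmap_ro_zone_align() rounds its argument up to the next 32MB boundary:
 *
 *   vm_offset_t aligned = pmap_ro_zone_align(0x1230000);   // 0x2000000 with 16KB pages
 */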
10681
10682 /**
10683 * Function to copy kauth_cred from new_data to kv.
10684 * Function defined in "kern_prot.c"
10685 *
10686 * @note Will be removed upon completion of
10687 * <rdar://problem/72635194> Compiler PAC support for memcpy.
10688 *
10689 * @param kv Address to copy new data to.
10690 * @param new_data Pointer to new data.
10691 *
10692 */
10693
10694 extern void
10695 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
10696
10697 /**
10698 * Zalloc-specific memcpy that writes through the physical aperture
10699 * and ensures the element being modified is from a read-only zone.
10700 *
10701 * @note Designed to work only with the zone allocator's read-only submap.
10702 *
10703 * @param zid The ID of the zone to allocate from.
10704 * @param va VA of element to be modified.
10705 * @param offset Offset from element.
10706 * @param new_data Pointer to new data.
10707 * @param new_data_size Size of modification.
10708 *
10709 */
10710
10711 void
10712 pmap_ro_zone_memcpy(
10713 zone_id_t zid,
10714 vm_offset_t va,
10715 vm_offset_t offset,
10716 const vm_offset_t new_data,
10717 vm_size_t new_data_size)
10718 {
10719 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
10720 }
10721
10722 MARK_AS_PMAP_TEXT void
10723 pmap_ro_zone_memcpy_internal(
10724 zone_id_t zid,
10725 vm_offset_t va,
10726 vm_offset_t offset,
10727 const vm_offset_t new_data,
10728 vm_size_t new_data_size)
10729 {
10730 if (!new_data || new_data_size == 0) {
10731 return;
10732 }
10733
10734 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
10735 const bool istate = ml_set_interrupts_enabled(FALSE);
10736 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
10737 pmap_ro_zone_prepare_write();
10738 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
10739 pmap_ro_zone_complete_write();
10740 ml_set_interrupts_enabled(istate);
10741 }
10742
10743 /**
10744 * Zalloc-specific function to atomically mutate fields of an element that
10745 * belongs to a read-only zone, via the physical aperture.
10746 *
10747 * @note Designed to work only with the zone allocator's read-only submap.
10748 *
10749 * @param zid The ID of the zone the element belongs to.
10750 * @param va VA of element to be modified.
10751 * @param offset Offset in element.
10752 * @param op Atomic operation to perform.
10753 * @param value Mutation value.
10754 *
10755 */
10756
10757 uint64_t
10758 pmap_ro_zone_atomic_op(
10759 zone_id_t zid,
10760 vm_offset_t va,
10761 vm_offset_t offset,
10762 zro_atomic_op_t op,
10763 uint64_t value)
10764 {
10765 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
10766 }
10767
10768 MARK_AS_PMAP_TEXT uint64_t
10769 pmap_ro_zone_atomic_op_internal(
10770 zone_id_t zid,
10771 vm_offset_t va,
10772 vm_offset_t offset,
10773 zro_atomic_op_t op,
10774 uint64_t value)
10775 {
10776 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
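/* The low 4 bits of the op encoding appear to carry the operand size in bytes (see zro_atomic_op_t). */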
10777 vm_size_t value_size = op & 0xf;
10778 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
10779
10780 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
10781 pmap_ro_zone_prepare_write();
10782 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
10783 pmap_ro_zone_complete_write();
10784 ml_set_interrupts_enabled(istate);
10785
10786 return value;
10787 }
10788
10789 /**
10790 * bzero for allocations from read only zones that writes through the
10791 * physical aperture.
10792 *
10793 * @note This is called by the zfree path of all allocations from read
10794 * only zones.
10795 *
10796 * @param zid The ID of the zone the allocation belongs to.
10797 * @param va VA of element to be zeroed.
10798 * @param offset Offset in the element.
10799 * @param size Size of allocation.
10800 *
10801 */
10802
10803 void
10804 pmap_ro_zone_bzero(
10805 zone_id_t zid,
10806 vm_offset_t va,
10807 vm_offset_t offset,
10808 vm_size_t size)
10809 {
10810 pmap_ro_zone_bzero_internal(zid, va, offset, size);
10811 }
10812
10813 MARK_AS_PMAP_TEXT void
10814 pmap_ro_zone_bzero_internal(
10815 zone_id_t zid,
10816 vm_offset_t va,
10817 vm_offset_t offset,
10818 vm_size_t size)
10819 {
10820 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
10821 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
10822 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
10823 pmap_ro_zone_prepare_write();
10824 bzero((void*)phystokv(pa), size);
10825 pmap_ro_zone_complete_write();
10826 ml_set_interrupts_enabled(istate);
10827 }
10828
10829 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
10830
10831 MARK_AS_PMAP_TEXT mach_vm_size_t
10832 pmap_query_resident_internal(
10833 pmap_t pmap,
10834 vm_map_address_t start,
10835 vm_map_address_t end,
10836 mach_vm_size_t *compressed_bytes_p)
10837 {
10838 mach_vm_size_t resident_bytes = 0;
10839 mach_vm_size_t compressed_bytes = 0;
10840
10841 pt_entry_t *bpte, *epte;
10842 pt_entry_t *pte_p;
10843 tt_entry_t *tte_p;
10844
10845 if (pmap == NULL) {
10846 return PMAP_RESIDENT_INVALID;
10847 }
10848
10849 validate_pmap(pmap);
10850
10851 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10852
10853 /* Ensure that this request is valid, and addresses exactly one TTE. */
10854 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
10855 (end % pt_attr_page_size(pt_attr)))) {
10856 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
10857 }
10858
10859 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
10860 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
10861 }
10862
10863 pmap_lock(pmap, PMAP_LOCK_SHARED);
10864 tte_p = pmap_tte(pmap, start);
10865 if (tte_p == (tt_entry_t *) NULL) {
10866 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10867 return PMAP_RESIDENT_INVALID;
10868 }
10869 if (tte_is_valid_table(*tte_p)) {
10870 pte_p = (pt_entry_t *) ttetokv(*tte_p);
10871 bpte = &pte_p[pte_index(pt_attr, start)];
10872 epte = &pte_p[pte_index(pt_attr, end)];
10873
10874 for (; bpte < epte; bpte++) {
10875 if (pte_is_compressed(*bpte, bpte)) {
10876 compressed_bytes += pt_attr_page_size(pt_attr);
10877 } else if (pa_valid(pte_to_pa(*bpte))) {
10878 resident_bytes += pt_attr_page_size(pt_attr);
10879 }
10880 }
10881 }
10882 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10883
10884 if (compressed_bytes_p) {
10885 *compressed_bytes_p += compressed_bytes;
10886 }
10887
10888 return resident_bytes;
10889 }
10890
10891 mach_vm_size_t
10892 pmap_query_resident(
10893 pmap_t pmap,
10894 vm_map_address_t start,
10895 vm_map_address_t end,
10896 mach_vm_size_t *compressed_bytes_p)
10897 {
10898 mach_vm_size_t total_resident_bytes;
10899 mach_vm_size_t compressed_bytes;
10900 vm_map_address_t va;
10901
10902
10903 if (pmap == PMAP_NULL) {
10904 if (compressed_bytes_p) {
10905 *compressed_bytes_p = 0;
10906 }
10907 return 0;
10908 }
10909
10910 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10911
10912 total_resident_bytes = 0;
10913 compressed_bytes = 0;
10914
10915 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
10916 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
10917 VM_KERNEL_ADDRHIDE(end));
10918
10919 va = start;
10920 while (va < end) {
10921 vm_map_address_t l;
10922 mach_vm_size_t resident_bytes;
10923
10924 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
10925
10926 if (l > end) {
10927 l = end;
10928 }
10929 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
10930 if (resident_bytes == PMAP_RESIDENT_INVALID) {
10931 break;
10932 }
10933
10934 total_resident_bytes += resident_bytes;
10935
10936 va = l;
10937 }
10938
10939 if (compressed_bytes_p) {
10940 *compressed_bytes_p = compressed_bytes;
10941 }
10942
10943 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
10944 total_resident_bytes);
10945
10946 return total_resident_bytes;
10947 }
10948
10949 #if MACH_ASSERT
10950 static void
10951 pmap_check_ledgers(
10952 pmap_t pmap)
10953 {
10954 int pid;
10955 char *procname;
10956
10957 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
10958 /*
10959 * This pmap was not or is no longer fully associated
10960 * with a task (e.g. the old pmap after a fork()/exec() or
10961 * spawn()). Its "ledger" still points at a task that is
10962 * now using a different (and active) address space, so
10963 * we can't check that all the pmap ledgers are balanced here.
10964 *
10965 * If the "pid" is set, that means that we went through
10966 * pmap_set_process() in task_terminate_internal(), so
10967 * this task's ledger should not have been re-used and
10968 * all the pmap ledgers should be back to 0.
10969 */
10970 return;
10971 }
10972
10973 pid = pmap->pmap_pid;
10974 procname = pmap->pmap_procname;
10975
10976 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
10977 }
10978 #endif /* MACH_ASSERT */
10979
10980 void
10981 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
10982 {
10983 }
10984
10985 /**
10986 * The minimum shared region nesting size is used by the VM to determine when to
10987 * break up large mappings to nested regions. The smallest size that these
10988 * mappings can be broken into is determined by the page table level at which those
10989 * regions are nested and by the size of the page tables.
10990 *
10991 * For instance, if a nested region is nested at L2 for a process utilizing
10992 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
10993 * block entry).
10994 *
10995 * @param pmap The target pmap to determine the block size based on whether it's
10996 * using 16KB or 4KB page tables.
10997 */
10998 uint64_t
10999 pmap_shared_region_size_min(__unused pmap_t pmap)
11000 {
11001 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11002
11003 /**
11004 * We always nest the shared region at L2 (32MB for 16KB pages, 8MB for
11005 * 4KB pages). This means that a target pmap will contain L2 entries that
11006 * point to shared L3 page tables in the shared region pmap.
11007 */
11008 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
11009 return pt_attr_twig_size(pt_attr) * page_ratio;
11010 }
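
/**
 * Worked example (assuming a 16KB kernel page size): for a native 16KB pmap,
 * page_ratio is 1 and the twig size is 32MB, so the result is 32MB. For a 4KB
 * pmap on the same kernel, page_ratio is 16KB / 4KB = 4 and the twig size is
 * 2MB, giving 4 * 2MB = 8MB, matching the values quoted above.
 */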
11011
11012 boolean_t
11013 pmap_enforces_execute_only(
11014 pmap_t pmap)
11015 {
11016 return pmap != kernel_pmap;
11017 }
11018
11019 MARK_AS_PMAP_TEXT void
11020 pmap_set_vm_map_cs_enforced_internal(
11021 pmap_t pmap,
11022 bool new_value)
11023 {
11024 validate_pmap_mutable(pmap);
11025 pmap->pmap_vm_map_cs_enforced = new_value;
11026 }
11027
11028 void
11029 pmap_set_vm_map_cs_enforced(
11030 pmap_t pmap,
11031 bool new_value)
11032 {
11033 pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
11034 }
11035
11036 extern int cs_process_enforcement_enable;
11037 bool
11038 pmap_get_vm_map_cs_enforced(
11039 pmap_t pmap)
11040 {
11041 if (cs_process_enforcement_enable) {
11042 return true;
11043 }
11044 return pmap->pmap_vm_map_cs_enforced;
11045 }
11046
11047 MARK_AS_PMAP_TEXT void
11048 pmap_set_jit_entitled_internal(
11049 __unused pmap_t pmap)
11050 {
11051 }
11052
11053 void
11054 pmap_set_jit_entitled(
11055 pmap_t pmap)
11056 {
11057 pmap_set_jit_entitled_internal(pmap);
11058 }
11059
11060 bool
11061 pmap_get_jit_entitled(
11062 __unused pmap_t pmap)
11063 {
11064 return false;
11065 }
11066
11067 MARK_AS_PMAP_TEXT void
11068 pmap_set_tpro_internal(
11069 __unused pmap_t pmap)
11070 {
11071 return;
11072 }
11073
11074 void
11075 pmap_set_tpro(
11076 pmap_t pmap)
11077 {
11078 pmap_set_tpro_internal(pmap);
11079 }
11080
11081 bool
11082 pmap_get_tpro(
11083 __unused pmap_t pmap)
11084 {
11085 return false;
11086 }
11087
11088
11089 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
11090
11091 MARK_AS_PMAP_TEXT kern_return_t
11092 pmap_query_page_info_internal(
11093 pmap_t pmap,
11094 vm_map_offset_t va,
11095 int *disp_p)
11096 {
11097 pmap_paddr_t pa;
11098 int disp;
11099 unsigned int pai;
11100 pt_entry_t *pte_p;
11101 pv_entry_t *pve_p;
11102
11103 if (pmap == PMAP_NULL || pmap == kernel_pmap) {
11104 *disp_p = 0;
11105 return KERN_INVALID_ARGUMENT;
11106 }
11107
11108 validate_pmap(pmap);
11109 pmap_lock(pmap, PMAP_LOCK_SHARED);
11110
11111 try_again:
11112 disp = 0;
11113
11114 pte_p = pmap_pte(pmap, va);
11115 if (pte_p == PT_ENTRY_NULL) {
11116 goto done;
11117 }
11118
11119 const pt_entry_t pte = os_atomic_load(pte_p, relaxed);
11120 pa = pte_to_pa(pte);
11121 if (pa == 0) {
11122 if (pte_is_compressed(pte, pte_p)) {
11123 disp |= PMAP_QUERY_PAGE_COMPRESSED;
11124 if (pte & ARM_PTE_COMPRESSED_ALT) {
11125 disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
11126 }
11127 }
11128 } else {
11129 disp |= PMAP_QUERY_PAGE_PRESENT;
11130 pai = pa_index(pa);
11131 if (!pa_valid(pa)) {
11132 goto done;
11133 }
11134 locked_pvh_t locked_pvh = pvh_lock(pai);
11135 if (__improbable(pte != os_atomic_load(pte_p, relaxed))) {
11136 /* something changed: try again */
11137 pvh_unlock(&locked_pvh);
11138 pmap_query_page_info_retries++;
11139 goto try_again;
11140 }
11141 pve_p = PV_ENTRY_NULL;
11142 int pve_ptep_idx = 0;
11143 if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
11144 unsigned int npves = 0;
11145 pve_p = pvh_pve_list(locked_pvh.pvh);
11146 while (pve_p != PV_ENTRY_NULL &&
11147 (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
11148 if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
11149 pvh_lock_enter_sleep_mode(&locked_pvh);
11150 }
11151 pve_p = pve_next(pve_p);
11152 npves++;
11153 }
11154 }
11155
11156 if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
11157 disp |= PMAP_QUERY_PAGE_ALTACCT;
11158 } else if (ppattr_test_reusable(pai)) {
11159 disp |= PMAP_QUERY_PAGE_REUSABLE;
11160 } else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
11161 disp |= PMAP_QUERY_PAGE_INTERNAL;
11162 }
11163 pvh_unlock(&locked_pvh);
11164 }
11165
11166 done:
11167 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11168 *disp_p = disp;
11169 return KERN_SUCCESS;
11170 }
11171
11172 kern_return_t
11173 pmap_query_page_info(
11174 pmap_t pmap,
11175 vm_map_offset_t va,
11176 int *disp_p)
11177 {
11178 return pmap_query_page_info_internal(pmap, va, disp_p);
11179 }
11180
11181
11182
11183 uint32_t
11184 pmap_user_va_bits(pmap_t pmap __unused)
11185 {
11186 #if __ARM_MIXED_PAGE_SIZE__
11187 uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
11188 return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
11189 #else
11190 return 64 - T0SZ_BOOT;
11191 #endif
11192 }
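
/**
 * Worked example: a translation regime configured with T0SZ = 25 covers
 * 64 - 25 = 39 bits of user VA (a 512GB address space). The actual value here
 * depends on the boot-time TCR configuration (T0SZ_BOOT or the per-pmap
 * pta_tcr_value).
 */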
11193
11194 uint32_t
11195 pmap_kernel_va_bits(void)
11196 {
11197 return 64 - T1SZ_BOOT;
11198 }
11199
11200 static vm_map_size_t
11201 pmap_user_va_size(pmap_t pmap)
11202 {
11203 return 1ULL << pmap_user_va_bits(pmap);
11204 }
11205
11206
11207
11208 bool
11209 pmap_in_ppl(void)
11210 {
11211 return false;
11212 }
11213
11214 MARK_AS_PMAP_TEXT void
11215 pmap_footprint_suspend_internal(
11216 vm_map_t map,
11217 boolean_t suspend)
11218 {
11219 #if DEVELOPMENT || DEBUG
11220 if (suspend) {
11221 current_thread()->pmap_footprint_suspended = TRUE;
11222 map->pmap->footprint_was_suspended = TRUE;
11223 } else {
11224 current_thread()->pmap_footprint_suspended = FALSE;
11225 }
11226 #else /* DEVELOPMENT || DEBUG */
11227 (void) map;
11228 (void) suspend;
11229 #endif /* DEVELOPMENT || DEBUG */
11230 }
11231
11232 void
11233 pmap_footprint_suspend(
11234 vm_map_t map,
11235 boolean_t suspend)
11236 {
11237 pmap_footprint_suspend_internal(map, suspend);
11238 }
11239
11240 void
11241 pmap_nop(pmap_t pmap)
11242 {
11243 validate_pmap_mutable(pmap);
11244 }
11245
11246 pmap_t
11247 pmap_txm_kernel_pmap(void)
11248 {
11249 return kernel_pmap;
11250 }
11251
11252 TXMAddressSpace_t*
11253 pmap_txm_addr_space(const pmap_t pmap)
11254 {
11255 if (pmap) {
11256 return pmap->txm_addr_space;
11257 }
11258
11259 /*
11260 * When the passed in PMAP is NULL, it means the caller wishes to operate
11261 * on the current_pmap(). We could resolve and return that, but it is actually
11262 * safer to return NULL since these TXM interfaces also accept NULL inputs
11263 * which causes TXM to resolve to the current_pmap() equivalent internally.
11264 */
11265 return NULL;
11266 }
11267
11268 void
11269 pmap_txm_set_addr_space(
11270 pmap_t pmap,
11271 TXMAddressSpace_t *txm_addr_space)
11272 {
11273 assert(pmap != NULL);
11274
11275 if (pmap->txm_addr_space && txm_addr_space) {
11276 /* Attempted to overwrite the address space in the PMAP */
11277 panic("attempted ovewrite of TXM address space: %p | %p | %p",
11278 pmap, pmap->txm_addr_space, txm_addr_space);
11279 } else if (!pmap->txm_addr_space && !txm_addr_space) {
11280 /* This should never happen */
11281 panic("attempted NULL overwrite of TXM address space: %p", pmap);
11282 }
11283
11284 pmap->txm_addr_space = txm_addr_space;
11285 }
11286
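/**
 * Record the code-signing trust level computed for this address space. The
 * trust level can only be set once; overwriting a trust level other than
 * kCSTrustUntrusted is fatal.
 *
 * @param pmap The pmap to update.
 * @param trust_level The trust level to record.
 */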
11287 void
11288 pmap_txm_set_trust_level(
11289 pmap_t pmap,
11290 CSTrust_t trust_level)
11291 {
11292 assert(pmap != NULL);
11293
11294 CSTrust_t current_trust = pmap->txm_trust_level;
11295 if (current_trust != kCSTrustUntrusted) {
11296 panic("attempted to overwrite TXM trust on the pmap: %p", pmap);
11297 }
11298
11299 pmap->txm_trust_level = trust_level;
11300 }
11301
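/**
 * Debugger-safe accessor for a pmap's TXM trust level. The pmap pointer is
 * validated without taking faults so this can be called from KDP context.
 *
 * @param pmap The pmap to query.
 * @param trust_level Optional output pointer for the trust level.
 *
 * @return KERN_SUCCESS if the pmap is valid, KERN_INVALID_ARGUMENT otherwise.
 */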
11302 kern_return_t
11303 pmap_txm_get_trust_level_kdp(
11304 pmap_t pmap,
11305 CSTrust_t *trust_level)
11306 {
11307 if (pmap == NULL) {
11308 return KERN_INVALID_ARGUMENT;
11309 } else if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11310 return KERN_INVALID_ARGUMENT;
11311 }
11312
11313 if (trust_level != NULL) {
11314 *trust_level = pmap->txm_trust_level;
11315 }
11316 return KERN_SUCCESS;
11317 }
11318
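/**
 * Debugger-safe accessor for the JIT region recorded in a pmap's TXM address
 * space, if any. All pointers are validated without taking faults.
 *
 * @param pmap The pmap to query.
 * @param jit_region_start Output pointer for the start address of the JIT region.
 * @param jit_region_end Output pointer for the end address of the JIT region.
 *
 * @return KERN_SUCCESS if a JIT region was found, KERN_NOT_FOUND if none is
 *         registered, or KERN_INVALID_ARGUMENT if validation fails.
 */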
11319 kern_return_t
11320 pmap_txm_get_jit_address_range_kdp(
11321 pmap_t pmap,
11322 uintptr_t *jit_region_start,
11323 uintptr_t *jit_region_end)
11324 {
11325 if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11326 return KERN_INVALID_ARGUMENT;
11327 }
11328 TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
11329 if (NULL == txm_addr_space) {
11330 return KERN_INVALID_ARGUMENT;
11331 }
11332 if (ml_validate_nofault((vm_offset_t)txm_addr_space, sizeof(*txm_addr_space)) == false) {
11333 return KERN_INVALID_ARGUMENT;
11334 }
11335 /**
11336 * It's a bit gross that we're dereferencing what is supposed to be an abstract type.
11337 * If we were running in the TXM, we would always perform additional checks on txm_addr_space,
11338 * but this isn't necessary here, since we are running in the kernel and only using the results for
11339 * diagnostic purposes, rather than any policy enforcement.
11340 */
11341 if (txm_addr_space->jitRegion) {
		if (ml_validate_nofault((vm_offset_t)txm_addr_space->jitRegion, sizeof(*txm_addr_space->jitRegion)) == false) {
11343 return KERN_INVALID_ARGUMENT;
11344 }
11345 if (txm_addr_space->jitRegion->addr && txm_addr_space->jitRegion->addrEnd) {
11346 *jit_region_start = txm_addr_space->jitRegion->addr;
11347 *jit_region_end = txm_addr_space->jitRegion->addrEnd;
11348 return KERN_SUCCESS;
11349 }
11350 }
11351 return KERN_NOT_FOUND;
11352 }
11353
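/**
 * Resolve a possibly-NULL pmap argument for the TXM locking helpers below:
 * NULL means the current pmap, and a current pmap that is the kernel pmap
 * resolves to NULL so that callers skip the lock operation.
 */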
11354 static pmap_t
11355 _pmap_txm_resolve_pmap(pmap_t pmap)
11356 {
11357 if (pmap == NULL) {
11358 pmap = current_pmap();
11359 if (pmap == kernel_pmap) {
11360 return NULL;
11361 }
11362 }
11363
11364 return pmap;
11365 }
11366
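/**
 * Acquire/release helpers for the per-pmap TXM lock. A NULL pmap refers to
 * the current pmap; when that resolves to the kernel pmap, the lock
 * operation is skipped entirely.
 */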
11367 void
11368 pmap_txm_acquire_shared_lock(pmap_t pmap)
11369 {
11370 pmap = _pmap_txm_resolve_pmap(pmap);
11371 if (!pmap) {
11372 return;
11373 }
11374
11375 lck_rw_lock_shared(&pmap->txm_lck);
11376 }
11377
11378 void
11379 pmap_txm_release_shared_lock(pmap_t pmap)
11380 {
11381 pmap = _pmap_txm_resolve_pmap(pmap);
11382 if (!pmap) {
11383 return;
11384 }
11385
11386 lck_rw_unlock_shared(&pmap->txm_lck);
11387 }
11388
11389 void
11390 pmap_txm_acquire_exclusive_lock(pmap_t pmap)
11391 {
11392 pmap = _pmap_txm_resolve_pmap(pmap);
11393 if (!pmap) {
11394 return;
11395 }
11396
11397 lck_rw_lock_exclusive(&pmap->txm_lck);
11398 }
11399
11400 void
11401 pmap_txm_release_exclusive_lock(pmap_t pmap)
11402 {
11403 pmap = _pmap_txm_resolve_pmap(pmap);
11404 if (!pmap) {
11405 return;
11406 }
11407
11408 lck_rw_unlock_exclusive(&pmap->txm_lck);
11409 }
11410
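/**
 * Retype an XNU_DEFAULT page to TXM_DEFAULT through the SPTM, transferring
 * ownership of the frame to the TXM domain.
 *
 * @param addr Physical address of the page to transfer.
 */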
11411 static void
11412 _pmap_txm_transfer_page(const pmap_paddr_t addr)
11413 {
11414 sptm_retype_params_t retype_params = {
11415 .raw = SPTM_RETYPE_PARAMS_NULL
11416 };
11417
11418 /* Retype through the SPTM */
11419 sptm_retype(addr, XNU_DEFAULT, TXM_DEFAULT, retype_params);
11420 }
11421
11422 /**
11423 * Prepare a page for retyping to TXM_DEFAULT by clearing its
11424 * internal flags.
11425 *
11426 * @param pa Physical address of the page.
11427 */
11428 static inline void
11429 _pmap_txm_retype_prepare(const pmap_paddr_t pa)
11430 {
11431 const sptm_retype_params_t retype_params = {
11432 .raw = SPTM_RETYPE_PARAMS_NULL
11433 };
11434
	/**
	 * The SPTM allows XNU_DEFAULT pages to request deferral of TLB flushing
	 * when their PTEs are updated, which is an important performance
	 * optimization. However, this also allows an attacker-controlled XNU to
	 * exploit a read reference alongside a stale write-enabled PTE in the
	 * TLB. That is acceptable as long as the page is never retyped, because
	 * the damage remains contained within the XNU domain. When such a page
	 * does need to be retyped, the SPTM must ensure that there is no
	 * outstanding reference and no history of deferred TLBIs. Internally,
	 * the SPTM maintains a flag tracking past deferred TLBIs that is only
	 * cleared by a retype with no outstanding references. Therefore, we do a
	 * dummy retype to XNU_DEFAULT itself to clear that internal flag before
	 * we actually transfer this page to the TXM domain. To make sure the
	 * SPTM won't raise a violation, all mappings of the page must be removed
	 * before calling this function.
	 */
11451 sptm_retype(pa, XNU_DEFAULT, XNU_DEFAULT, retype_params);
11452 }
11453
11454 /**
11455 * Transfer an XNU owned page to TXM domain.
11456 *
11457 * @param addr Kernel virtual address of the page. It has to be page size
11458 * aligned.
11459 */
11460 void
11461 pmap_txm_transfer_page(const vm_address_t addr)
11462 {
11463 assert((addr & PAGE_MASK) == 0);
11464
11465 const pmap_paddr_t pa = kvtophys_nofail(addr);
11466 const unsigned int pai = pa_index(pa);
11467
11468 /* Lock the PVH lock to prevent concurrent updates to the mappings during the self retype below. */
11469 locked_pvh_t locked_pvh = pvh_lock(pai);
11470
11471 /* Disconnect the mapping to assure SPTM of no pending TLBI. */
11472 pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
11473 PMAP_OPTIONS_PPO_PENDING_RETYPE, &locked_pvh, NULL);
11474
11475 /* Self retype to clear the SPTM internal flags tracking delayed TLBIs for revoked writes. */
11476 _pmap_txm_retype_prepare(pa);
11477
11478 pvh_unlock(&locked_pvh);
11479
11480 /* XNU needs to hold an RO reference to the page despite the ownership being transferred to TXM. */
11481 pmap_enter_addr(kernel_pmap, addr, pa, VM_PROT_READ, VM_PROT_NONE, 0, true, PMAP_MAPPING_TYPE_INFER);
11482
11483 /* Finally, retype the page to TXM_DEFAULT. */
11484 _pmap_txm_transfer_page(pa);
11485 }
11486
11487 struct vm_object txm_vm_object_storage VM_PAGE_PACKED_ALIGNED;
11488 SECURITY_READ_ONLY_LATE(vm_object_t) txm_vm_object = &txm_vm_object_storage;
11489
11490 _Static_assert(sizeof(vm_map_address_t) == sizeof(pmap_paddr_t),
11491 "sizeof(vm_map_address_t) != sizeof(pmap_paddr_t)");
11492
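/**
 * Allocate a wired physical page for TXM's use. The page is grabbed from the
 * VM free list, wired, inserted into the TXM VM object, and then retyped to
 * TXM_DEFAULT.
 *
 * @return The physical address of the allocated page (the static assert
 *         above guarantees it fits in a vm_map_address_t).
 */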
11493 vm_map_address_t
11494 pmap_txm_allocate_page(void)
11495 {
11496 pmap_paddr_t phys_addr = 0;
11497 vm_page_t page = VM_PAGE_NULL;
11498 boolean_t thread_vm_privileged = false;
11499
11500 /* We are allowed to allocate privileged memory */
11501 thread_vm_privileged = set_vm_privilege(true);
11502
11503 /* Allocate a page from the VM free list */
11504 vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
11505 while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
11506 VM_PAGE_WAIT();
11507 }
11508
11509 /* Wire all of the pages allocated for TXM */
11510 vm_page_lock_queues();
11511 vm_page_wire(page, VM_KERN_MEMORY_SECURITY, TRUE);
11512 vm_page_unlock_queues();
11513
11514 phys_addr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page));
11515 if (phys_addr == 0) {
11516 panic("invalid VM page allocated for TXM: %llu", phys_addr);
11517 }
11518
11519 /* Add the physical page to the TXM VM object */
11520 vm_object_lock(txm_vm_object);
11521 vm_page_insert_wired(
11522 page,
11523 txm_vm_object,
11524 phys_addr - gPhysBase,
11525 VM_KERN_MEMORY_SECURITY);
11526 vm_object_unlock(txm_vm_object);
11527
11528 /* Reset thread privilege */
11529 set_vm_privilege(thread_vm_privileged);
11530
11531 /* Retype the page */
11532 _pmap_txm_transfer_page(phys_addr);
11533
11534 return phys_addr;
11535 }
11536
11537 int
11538 pmap_cs_configuration(void)
11539 {
11540 code_signing_config_t config = 0;
11541
11542 /* Compute the code signing configuration */
11543 code_signing_configuration(NULL, &config);
11544
11545 return (int)config;
11546 }
11547
11548 bool
11549 pmap_performs_stage2_translations(
11550 __unused pmap_t pmap)
11551 {
11552 return false;
11553 }
11554
11555 bool
11556 pmap_has_iofilter_protected_write(void)
11557 {
11558 #if HAS_GUARDED_IO_FILTER
11559 return true;
11560 #else
11561 return false;
11562 #endif
11563 }
11564
11565 #if HAS_GUARDED_IO_FILTER
11566
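/**
 * Perform a protected write on behalf of an I/O filter. If the target frame
 * is of type XNU_PROTECTED_IO, the write is forwarded to the SPTM; otherwise
 * the access is performed directly from kernel mode.
 *
 * @param addr Kernel virtual address to write to.
 * @param value Value to write.
 * @param width Width of the write in bytes (1, 2, 4 or 8).
 */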
11567 void
11568 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11569 {
	/**
	 * Even though this is done from EL1/2 for an address potentially owned by
	 * Guarded Mode, we should be fine because mmu_kvtop uses "at s1e1r",
	 * which checks for read access only.
	 */
11575 const pmap_paddr_t pa = mmu_kvtop(addr);
11576
11577 if (!pa) {
11578 panic("%s: addr 0x%016llx doesn't have a valid kernel mapping", __func__, (uint64_t) addr);
11579 }
11580
11581 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
11582 if (frame_type == XNU_PROTECTED_IO) {
11583 sptm_iofilter_protected_write(pa, value, width);
11584 } else {
		/*
		 * The mapping is valid but the address is not covered by the I/O
		 * filter. We still attempt the access from kernel mode, which allows
		 * addresses not owned by the SPTM to be written through this
		 * interface.
		 */
11589 switch (width) {
11590 case 1:
11591 *(volatile uint8_t *)addr = (uint8_t) value;
11592 break;
11593 case 2:
11594 *(volatile uint16_t *)addr = (uint16_t) value;
11595 break;
11596 case 4:
11597 *(volatile uint32_t *)addr = (uint32_t) value;
11598 break;
11599 case 8:
11600 *(volatile uint64_t *)addr = (uint64_t) value;
11601 break;
11602 default:
11603 panic("%s: width %llu not supported", __func__, width);
11604 }
11605 }
11606 }
11607
11608 #else /* HAS_GUARDED_IO_FILTER */
11609
11610 __attribute__((__noreturn__))
11611 void
11612 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11613 {
11614 panic("%s called on an unsupported platform.", __FUNCTION__);
11615 }
11616
11617 #endif /* HAS_GUARDED_IO_FILTER */
11618
11619 void * __attribute__((noreturn))
11620 pmap_claim_reserved_ppl_page(void)
11621 {
11622 panic("%s: function not supported in this environment", __FUNCTION__);
11623 }
11624
11625 void __attribute__((noreturn))
11626 pmap_free_reserved_ppl_page(void __unused *kva)
11627 {
11628 panic("%s: function not supported in this environment", __FUNCTION__);
11629 }
11630
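/**
 * Look up a CDHash in the loadable trust caches.
 *
 * @param cdhash The CDHash to look up.
 *
 * @return True if the CDHash is present in a loaded trust cache.
 */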
11631 bool
11632 pmap_lookup_in_loaded_trust_caches(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11633 {
11634 kern_return_t kr = query_trust_cache(
11635 kTCQueryTypeLoadable,
11636 cdhash,
11637 NULL);
11638
11639 if (kr == KERN_SUCCESS) {
11640 return true;
11641 }
11642 return false;
11643 }
11644
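/**
 * Look up a CDHash in the static trust cache.
 *
 * @param cdhash The CDHash to look up.
 *
 * @return 0 if the CDHash was not found; otherwise TC_LOOKUP_FOUND along
 *         with the entry's hash type and flags encoded in the respective
 *         TC_LOOKUP_* fields.
 */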
11645 uint32_t
11646 pmap_lookup_in_static_trust_cache(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11647 {
11648 TrustCacheQueryToken_t query_token = {0};
11649 kern_return_t kr = KERN_NOT_FOUND;
11650 uint64_t flags = 0;
11651 uint8_t hash_type = 0;
11652
11653 kr = query_trust_cache(
11654 kTCQueryTypeStatic,
11655 cdhash,
11656 &query_token);
11657
11658 if (kr == KERN_SUCCESS) {
11659 amfi->TrustCache.queryGetFlags(&query_token, &flags);
11660 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
11661
11662 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
11663 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
11664 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
11665 }
11666
11667 return 0;
11668 }
11669
11670 #if DEVELOPMENT || DEBUG
11671
11672 struct page_table_dump_header {
11673 uint64_t pa;
11674 uint64_t num_entries;
11675 uint64_t start_va;
11676 uint64_t end_va;
11677 };
11678
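/**
 * Recursively copy a pmap's page tables into a caller-supplied buffer for
 * debugger consumption. Each table whose level is selected by level_mask is
 * emitted as a page_table_dump_header followed by the raw table contents.
 *
 * @return KERN_SUCCESS on success, or KERN_INSUFFICIENT_BUFFER_SIZE if the
 *         remaining buffer space cannot hold the next table.
 */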
11679 static kern_return_t
11680 pmap_dump_page_tables_recurse(pmap_t pmap,
11681 const tt_entry_t *ttp,
11682 unsigned int cur_level,
11683 unsigned int level_mask,
11684 uint64_t start_va,
11685 void *buf_start,
11686 void *buf_end,
11687 size_t *bytes_copied)
11688 {
11689 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11690 uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
11691
11692 uint64_t size = pt_attr->pta_level_info[cur_level].size;
11693 uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
11694 uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
11695 uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
11696
11697 void *bufp = (uint8_t*)buf_start + *bytes_copied;
11698
11699 if (cur_level == pt_attr_root_level(pt_attr)) {
11700 start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
11701 num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
11702 }
11703
11704 uint64_t tt_size = num_entries * sizeof(tt_entry_t);
11705 const tt_entry_t *tt_end = &ttp[num_entries];
11706
11707 if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
11708 return KERN_INSUFFICIENT_BUFFER_SIZE;
11709 }
11710
11711 if (level_mask & (1U << cur_level)) {
11712 struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
11713 header->pa = kvtophys_nofail((vm_offset_t)ttp);
11714 header->num_entries = num_entries;
11715 header->start_va = start_va;
11716 header->end_va = start_va + (num_entries * size);
11717
11718 bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
11719 *bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
11720 }
11721 uint64_t current_va = start_va;
11722
11723 for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
11724 tt_entry_t tte = *ttep;
11725
11726 if (!(tte & valid_mask)) {
11727 continue;
11728 }
11729
11730 if ((tte & type_mask) == type_block) {
11731 continue;
11732 } else {
11733 if (cur_level >= pt_attr_leaf_level(pt_attr)) {
11734 panic("%s: corrupt entry %#llx at %p, "
11735 "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
11736 __FUNCTION__, tte, ttep,
11737 ttp, cur_level, bufp, buf_end);
11738 }
11739
11740 const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
11741
11742 kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
11743 level_mask, current_va, buf_start, buf_end, bytes_copied);
11744
11745 if (recurse_result != KERN_SUCCESS) {
11746 return recurse_result;
11747 }
11748 }
11749 }
11750
11751 return KERN_SUCCESS;
11752 }
11753
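/**
 * Kernel-debugger-only entry point for dumping a pmap's page tables into the
 * given buffer; panics if called outside of debugger context.
 */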
11754 kern_return_t
11755 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
11756 {
11757 if (not_in_kdp) {
11758 panic("pmap_dump_page_tables must only be called from kernel debugger context");
11759 }
11760 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
11761 level_mask, pmap->min, bufp, buf_end, bytes_copied);
11762 }
11763
11764 #else /* DEVELOPMENT || DEBUG */
11765
11766 kern_return_t
11767 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
11768 unsigned int level_mask __unused, size_t *bytes_copied __unused)
11769 {
11770 return KERN_NOT_SUPPORTED;
11771 }
11772 #endif /* !(DEVELOPMENT || DEBUG) */
11773
11774
11775 #ifdef CONFIG_XNUPOST
11776 static volatile bool pmap_test_took_fault = false;
11777
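/**
 * Expected-fault handler installed by the pmap POST tests. Records that a
 * level-3 permission or access-flag data abort occurred and advances the PC
 * past the faulting instruction so the test can continue.
 */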
11778 static bool
11779 pmap_test_fault_handler(arm_saved_state_t * state)
11780 {
11781 bool retval = false;
11782 uint64_t esr = get_saved_state_esr(state);
11783 esr_exception_class_t class = ESR_EC(esr);
11784 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
11785
11786 if ((class == ESR_EC_DABORT_EL1) &&
11787 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
11788 pmap_test_took_fault = true;
11789 /* return to the instruction immediately after the call to NX page */
11790 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
11791 retval = true;
11792 }
11793
11794 return retval;
11795 }
11796
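/**
 * Perform a read or write of the given VA, optionally after switching to the
 * given pmap, and return whether the access faulted exactly when expected.
 */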
11797 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
11798 static NOKASAN bool
11799 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
11800 {
11801 pmap_t old_pmap = NULL;
11802 thread_t thread = current_thread();
11803
11804 pmap_test_took_fault = false;
11805
11806 /*
11807 * We're potentially switching pmaps without using the normal thread
11808 * mechanism; disable interrupts and preemption to avoid any unexpected
11809 * memory accesses.
11810 */
11811 const boolean_t old_int_state = ml_set_interrupts_enabled(FALSE);
11812 mp_disable_preemption();
11813
11814 if (pmap != NULL) {
11815 old_pmap = current_pmap();
11816 pmap_switch(pmap, thread);
11817
11818 /* Disable PAN; pmap shouldn't be the kernel pmap. */
11819 #if __ARM_PAN_AVAILABLE__
11820 __builtin_arm_wsr("pan", 0);
11821 #endif /* __ARM_PAN_AVAILABLE__ */
11822 }
11823
11824 ml_expect_fault_begin(pmap_test_fault_handler, va);
11825
11826 if (is_write) {
11827 *((volatile uint64_t*)(va)) = 0xdec0de;
11828 } else {
11829 volatile uint64_t tmp = *((volatile uint64_t*)(va));
11830 (void)tmp;
11831 }
11832
11833 /* Save the fault bool, and undo the gross stuff we did. */
11834 bool took_fault = pmap_test_took_fault;
11835 ml_expect_fault_end();
11836
11837 if (pmap != NULL) {
11838 #if __ARM_PAN_AVAILABLE__
11839 __builtin_arm_wsr("pan", 1);
11840 #endif /* __ARM_PAN_AVAILABLE__ */
11841
11842 pmap_switch(old_pmap, thread);
11843 }
11844
11845 mp_enable_preemption();
11846 ml_set_interrupts_enabled(old_int_state);
11847 bool retval = (took_fault == should_fault);
11848 return retval;
11849 }
11850
11851 static bool
11852 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
11853 {
11854 bool retval = pmap_test_access(pmap, va, should_fault, false);
11855
11856 if (!retval) {
11857 T_FAIL("%s: %s, "
11858 "pmap=%p, va=%p, should_fault=%u",
11859 __func__, should_fault ? "did not fault" : "faulted",
11860 pmap, (void*)va, (unsigned)should_fault);
11861 }
11862
11863 return retval;
11864 }
11865
11866 static bool
11867 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
11868 {
11869 bool retval = pmap_test_access(pmap, va, should_fault, true);
11870
11871 if (!retval) {
11872 T_FAIL("%s: %s, "
11873 "pmap=%p, va=%p, should_fault=%u",
11874 __func__, should_fault ? "did not fault" : "faulted",
11875 pmap, (void*)va, (unsigned)should_fault);
11876 }
11877
11878 return retval;
11879 }
11880
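/**
 * Check that the referenced/modified bits of the given physical page match
 * should_be_set exactly; any ref/mod bit not in should_be_set must be clear.
 */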
11881 static bool
11882 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
11883 {
11884 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
11885 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
11886
11887 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
11888
11889 if (!retval) {
11890 T_FAIL("%s: bits=%u, "
11891 "pa=%p, should_be_set=%u",
11892 __func__, bits,
11893 (void*)pa, should_be_set);
11894 }
11895
11896 return retval;
11897 }
11898
11899 static __attribute__((noinline)) bool
11900 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
11901 {
11902 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
11903 return retval;
11904 }
11905
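/**
 * Core pmap POST routine: create a pmap with the given creation flags and
 * exercise mapping creation, protection changes, ref/mod accounting, and
 * teardown against pages grabbed from the VM.
 */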
11906 static int
11907 pmap_test_test_config(unsigned int flags)
11908 {
11909 T_LOG("running pmap_test_test_config flags=0x%X", flags);
11910 unsigned int map_count = 0;
11911 unsigned long page_ratio = 0;
11912 pmap_t pmap = pmap_create_options(NULL, 0, flags);
11913
11914 if (!pmap) {
11915 panic("Failed to allocate pmap");
11916 }
11917
11918 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11919 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
11920 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
11921 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
11922
11923 if (pmap_page_size <= native_page_size) {
11924 page_ratio = native_page_size / pmap_page_size;
11925 } else {
		/*
		 * This configuration implies a page_ratio of less than 1, which the
		 * pmap layer does not currently support; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu, "
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
11934 }
11935
11936 if (PAGE_RATIO > 1) {
11937 /*
11938 * The kernel is deliberately pretending to have 16KB pages.
11939 * The pmap layer has code that supports this, so pretend the
11940 * page size is larger than it is.
11941 */
11942 pmap_page_size = PAGE_SIZE;
11943 native_page_size = PAGE_SIZE;
11944 }
11945
11946 /*
11947 * Get two pages from the VM; one to be mapped wired, and one to be
11948 * mapped nonwired.
11949 */
11950 vm_page_t unwired_vm_page = vm_page_grab();
11951 vm_page_t wired_vm_page = vm_page_grab();
11952
11953 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
11954 panic("Failed to grab VM pages");
11955 }
11956
11957 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
11958 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
11959
11960 pmap_paddr_t pa = ptoa(pn);
11961 pmap_paddr_t wired_pa = ptoa(wired_pn);
11962
11963 /*
11964 * We'll start mappings at the second twig TT. This keeps us from only
11965 * using the first entry in each TT, which would trivially be address
11966 * 0; one of the things we will need to test is retrieving the VA for
11967 * a given PTE.
11968 */
11969 vm_map_address_t va_base = pmap_twig_size;
11970 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
11971
11972 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
11973 /*
11974 * Not exactly a functional failure, but this test relies on
11975 * there being a spare PTE slot we can use to pin the TT.
11976 */
11977 panic("Cannot pin translation table");
11978 }
11979
11980 /*
11981 * Create the wired mapping; this will prevent the pmap layer from
11982 * reclaiming our test TTs, which would interfere with this test
11983 * ("interfere" -> "make it panic").
11984 */
11985 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true, PMAP_MAPPING_TYPE_INFER);
11986
11987 T_LOG("Validate that kernel cannot write to SPTM memory.");
11988 pt_entry_t * ptep = pmap_pte(pmap, va_base);
11989 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
11990
11991 /*
11992 * Create read-only mappings of the nonwired page; if the pmap does
11993 * not use the same page size as the kernel, create multiple mappings
11994 * so that the kernel page is fully mapped.
11995 */
11996 for (map_count = 0; map_count < page_ratio; map_count++) {
11997 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)),
11998 VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
11999 }
12000
12001 /* Validate that all the PTEs have the expected PA and VA. */
12002 for (map_count = 0; map_count < page_ratio; map_count++) {
12003 ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
12004
12005 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
12006 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
12007 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
12008 }
12009
12010 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
12011 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
12012 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
12013 }
12014 }
12015
12016 T_LOG("Validate that reads to our mapping do not fault.");
12017 pmap_test_read(pmap, va_base, false);
12018
12019 T_LOG("Validate that writes to our mapping fault.");
12020 pmap_test_write(pmap, va_base, true);
12021
12022 T_LOG("Make the first mapping writable.");
12023 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12024
12025 T_LOG("Validate that writes to our mapping do not fault.");
12026 pmap_test_write(pmap, va_base, false);
12027
12028 /*
12029 * For page ratios of greater than 1: validate that writes to the other
12030 * mappings still fault. Remove the mappings afterwards (we're done
12031 * with page ratio testing).
12032 */
12033 for (map_count = 1; map_count < page_ratio; map_count++) {
12034 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
12035 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
12036 }
12037
12038 /* Remove remaining mapping */
12039 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12040
	T_LOG("Make the first mapping execute-only.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);
12050
12051 T_LOG("Validate that writes to our mapping fault.");
12052 pmap_test_write(pmap, va_base, true);
12053
12054 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12055
12056 T_LOG("Mark the page unreferenced and unmodified.");
12057 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12058 pmap_test_check_refmod(pa, 0);
12059
12060 /*
12061 * Begin testing the ref/mod state machine. Re-enter the mapping with
12062 * different protection/fault_type settings, and confirm that the
12063 * ref/mod state matches our expectations at each step.
12064 */
12065 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
12066 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12067 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12068
12069 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
12070 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12071 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12072 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12073
12074 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
12075 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12076 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12077 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12078
12079 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
12080 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12081 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12082
12083 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
12084 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12085 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12086 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12087
12088 /*
12089 * Shared memory testing; we'll have two mappings; one read-only,
12090 * one read-write.
12091 */
12092 vm_map_address_t rw_base = va_base;
12093 vm_map_address_t ro_base = va_base + pmap_page_size;
12094
12095 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12096 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12097
12098 /*
12099 * Test that we take faults as expected for unreferenced/unmodified
12100 * pages. Also test the arm_fast_fault interface, to ensure that
12101 * mapping permissions change as expected.
12102 */
12103 T_LOG("!ref/!mod: expect no access");
12104 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12105 pmap_test_read_write(pmap, ro_base, false, false);
12106 pmap_test_read_write(pmap, rw_base, false, false);
12107
12108 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
12109 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
12110 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12111 pmap_test_read_write(pmap, ro_base, true, false);
12112 pmap_test_read_write(pmap, rw_base, true, false);
12113
12114 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
12115 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12116 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12117 pmap_test_read_write(pmap, ro_base, true, false);
12118 pmap_test_read_write(pmap, rw_base, true, true);
12119
12120 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
12121 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12122 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12123 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12124 pmap_test_read_write(pmap, ro_base, true, false);
12125 pmap_test_read_write(pmap, rw_base, true, true);
12126
12127 T_LOG("RW protect both mappings; should not change protections.");
12128 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12129 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12130 pmap_test_read_write(pmap, ro_base, true, false);
12131 pmap_test_read_write(pmap, rw_base, true, true);
12132
12133 T_LOG("Read protect both mappings; RW mapping should become RO.");
12134 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
12135 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
12136 pmap_test_read_write(pmap, ro_base, true, false);
12137 pmap_test_read_write(pmap, rw_base, true, false);
12138
12139 T_LOG("RW protect the page; mappings should not change protections.");
12140 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12141 pmap_page_protect(pn, VM_PROT_ALL);
12142 pmap_test_read_write(pmap, ro_base, true, false);
12143 pmap_test_read_write(pmap, rw_base, true, true);
12144
12145 T_LOG("Read protect the page; RW mapping should become RO.");
12146 pmap_page_protect(pn, VM_PROT_READ);
12147 pmap_test_read_write(pmap, ro_base, true, false);
12148 pmap_test_read_write(pmap, rw_base, true, false);
12149
12150 T_LOG("Validate that disconnect removes all known mappings of the page.");
12151 pmap_disconnect(pn);
12152 if (!pmap_verify_free(pn)) {
12153 T_FAIL("Page still has mappings");
12154 }
12155
12156 #if defined(ARM_LARGE_MEMORY)
#define PMAP_TEST_LARGE_MEMORY_VA (64 * (1ULL << 40)) /* 64 TB */
12158
12159 T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY.");
12160 pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12161 pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true);
12162 pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size);
12163 #endif /* ARM_LARGE_MEMORY */
12164
12165 T_LOG("Remove the wired mapping, so we can tear down the test map.");
12166 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
12167 pmap_destroy(pmap);
12168
12169 T_LOG("Release the pages back to the VM.");
12170 vm_page_lock_queues();
12171 vm_page_free(unwired_vm_page);
12172 vm_page_free(wired_vm_page);
12173 vm_page_unlock_queues();
12174
12175 T_LOG("Testing successful!");
12176 return 0;
12177 }
12178
12179 kern_return_t
12180 pmap_test(void)
12181 {
12182 T_LOG("Starting pmap_tests");
12183 int flags = 0;
12184 flags |= PMAP_CREATE_64BIT;
12185
12186 #if __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM
12187 T_LOG("Testing VM_PAGE_SIZE_4KB");
12188 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
12189 T_LOG("Testing VM_PAGE_SIZE_16KB");
12190 pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM */
12194
12195 T_PASS("completed pmap_test successfully");
12196 return KERN_SUCCESS;
12197 }
12198 #endif /* CONFIG_XNUPOST */
12199
12200 /*
12201 * The following function should never make it to RELEASE code, since
12202 * it provides a way to get the PPL to modify text pages.
12203 */
12204 #if DEVELOPMENT || DEBUG
12205
12206 /**
12207 * Forcibly overwrite executable text with an illegal instruction.
12208 *
12209 * @note Only used for xnu unit testing.
12210 *
12211 * @param pa The physical address to corrupt.
12212 *
12213 * @return KERN_SUCCESS on success.
12214 */
12215 kern_return_t
12216 pmap_test_text_corruption(pmap_paddr_t pa __unused)
12217 {
12218 /*
12219 * SPTM TODO: implement an SPTM version of this.
	 * The physical aperture is owned by the SPTM and text
12221 * pages have RO physical aperture mappings.
12222 */
12223 return KERN_SUCCESS;
12224 }
12225
12226 #endif /* DEVELOPMENT || DEBUG */
12227
12228