1 /*
2 * Copyright (c) 2011-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/backtrace.h>
42 #include <kern/bits.h>
43 #include <kern/ecc.h>
44 #include <kern/thread.h>
45 #include <kern/sched.h>
46 #include <kern/zalloc.h>
47 #include <kern/zalloc_internal.h>
48 #include <kern/kalloc.h>
49 #include <kern/spl.h>
50 #include <kern/startup.h>
51 #include <kern/trap_telemetry.h>
52 #include <kern/trustcache.h>
53
54 #include <os/overflow.h>
55
56 #include <vm/pmap.h>
57 #include <vm/pmap_cs.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern.h>
60 #include <vm/vm_protos.h>
61 #include <vm/vm_object_internal.h>
62 #include <vm/vm_page_internal.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/cpm_internal.h>
65
66
67 #include <libkern/section_keywords.h>
68 #include <sys/errno.h>
69
70 #include <libkern/amfi/amfi.h>
71 #include <sys/trusted_execution_monitor.h>
72 #include <sys/trust_caches.h>
73 #include <sys/code_signing.h>
74
75 #include <machine/atomic.h>
76 #include <machine/thread.h>
77 #include <machine/lowglobals.h>
78
79 #include <arm/caches_internal.h>
80 #include <arm/cpu_data.h>
81 #include <arm/cpu_data_internal.h>
82 #include <arm/cpu_capabilities.h>
83 #include <arm/cpu_number.h>
84 #include <arm/machine_cpu.h>
85 #include <arm/misc_protos.h>
86 #include <arm/trap_internal.h>
87 #include <arm64/sptm/pmap/pmap_internal.h>
88 #include <arm64/sptm/sptm.h>
89
90 #include <arm64/proc_reg.h>
91 #include <pexpert/arm64/boot.h>
92 #include <arm64/ppl/uat.h>
93 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94 #include <arm64/amcc_rorgn.h>
95 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
96
97 #include <pexpert/device_tree.h>
98
99 #include <san/kasan.h>
100 #include <sys/cdefs.h>
101
102 #if defined(HAS_APPLE_PAC)
103 #include <ptrauth.h>
104 #endif
105
106 #ifdef CONFIG_XNUPOST
107 #include <tests/xnupost.h>
108 #endif
109
110
111 #if HIBERNATION
112 #include <IOKit/IOHibernatePrivate.h>
113 #endif /* HIBERNATION */
114
115 #ifdef __ARM64_PMAP_SUBPAGE_L1__
116 /**
117  * Unlike on PPL-based systems, PMAP_ROOT_ALLOC_SIZE for subpage-L1 devices is
118  * 128 bytes rather than 64 bytes, due to the extra metadata the SPTM needs to
119  * track the subpage L1 tables.
120 */
121 #define PMAP_ROOT_ALLOC_SIZE SUBPAGE_USER_ROOT_TABLE_SIZE
122 #else
123 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
124 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
125
126 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
127
128
129 /**
130 * Per-CPU data used to do setup and post-processing for SPTM calls.
131 * On the setup side, this structure is used to store parameters for batched SPTM operations.
132 * These parameters may be large (upwards of 1K), and given that SPTM calls are generally
133 * issued from preemption-disabled contexts anyway, it's better to store them in per-CPU
134 * data rather than the local stack.
135 * On the post-processing side, this structure exposes a pointer to the SPTM's per-CPU array
136 * of 'prev_ptes', that is the prior value encountered in each PTE at the time of the SPTM's
137 * atomic update of that PTE.
138 */
139 pmap_sptm_percpu_data_t PERCPU_DATA(pmap_sptm_percpu);
140
141 /**
142 * Reference group for global tracking of all outstanding pmap references.
143 */
144 os_refgrp_decl(static, pmap_refgrp, "pmap", NULL);
145
146 /* Boot-arg to enable/disable the use of XNU_KERNEL_RESTRICTED type in SPTM. */
147 TUNABLE(bool, use_xnu_restricted, "xnu_restricted", true);
148
149 extern u_int32_t random(void); /* from <libkern/libkern.h> */
150
151 static bool alloc_asid(pmap_t pmap);
152 static void free_asid(pmap_t pmap);
153 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
154 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
155
156 const struct page_table_ops native_pt_ops =
157 {
158 .alloc_id = alloc_asid,
159 .free_id = free_asid,
160 .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
161 .wimg_to_pte = wimg_to_pte,
162 };
163
164 const struct page_table_level_info pmap_table_level_info_16k[] =
165 {
166 [0] = {
167 .size = ARM_16K_TT_L0_SIZE,
168 .offmask = ARM_16K_TT_L0_OFFMASK,
169 .shift = ARM_16K_TT_L0_SHIFT,
170 .index_mask = ARM_16K_TT_L0_INDEX_MASK,
171 .valid_mask = ARM_TTE_VALID,
172 .type_mask = ARM_TTE_TYPE_MASK,
173 .type_block = ARM_TTE_TYPE_BLOCK
174 },
175 [1] = {
176 .size = ARM_16K_TT_L1_SIZE,
177 .offmask = ARM_16K_TT_L1_OFFMASK,
178 .shift = ARM_16K_TT_L1_SHIFT,
179 .index_mask = ARM_16K_TT_L1_INDEX_MASK,
180 .valid_mask = ARM_TTE_VALID,
181 .type_mask = ARM_TTE_TYPE_MASK,
182 .type_block = ARM_TTE_TYPE_BLOCK
183 },
184 [2] = {
185 .size = ARM_16K_TT_L2_SIZE,
186 .offmask = ARM_16K_TT_L2_OFFMASK,
187 .shift = ARM_16K_TT_L2_SHIFT,
188 .index_mask = ARM_16K_TT_L2_INDEX_MASK,
189 .valid_mask = ARM_TTE_VALID,
190 .type_mask = ARM_TTE_TYPE_MASK,
191 .type_block = ARM_TTE_TYPE_BLOCK
192 },
193 [3] = {
194 .size = ARM_16K_TT_L3_SIZE,
195 .offmask = ARM_16K_TT_L3_OFFMASK,
196 .shift = ARM_16K_TT_L3_SHIFT,
197 .index_mask = ARM_16K_TT_L3_INDEX_MASK,
198 .valid_mask = ARM_PTE_TYPE_VALID,
199 .type_mask = ARM_TTE_TYPE_MASK,
200 .type_block = ARM_TTE_TYPE_L3BLOCK
201 }
202 };
203
204 const struct page_table_level_info pmap_table_level_info_4k[] =
205 {
206 [0] = {
207 .size = ARM_4K_TT_L0_SIZE,
208 .offmask = ARM_4K_TT_L0_OFFMASK,
209 .shift = ARM_4K_TT_L0_SHIFT,
210 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
211 .valid_mask = ARM_TTE_VALID,
212 .type_mask = ARM_TTE_TYPE_MASK,
213 .type_block = ARM_TTE_TYPE_BLOCK
214 },
215 [1] = {
216 .size = ARM_4K_TT_L1_SIZE,
217 .offmask = ARM_4K_TT_L1_OFFMASK,
218 .shift = ARM_4K_TT_L1_SHIFT,
219 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
220 .valid_mask = ARM_TTE_VALID,
221 .type_mask = ARM_TTE_TYPE_MASK,
222 .type_block = ARM_TTE_TYPE_BLOCK
223 },
224 [2] = {
225 .size = ARM_4K_TT_L2_SIZE,
226 .offmask = ARM_4K_TT_L2_OFFMASK,
227 .shift = ARM_4K_TT_L2_SHIFT,
228 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
229 .valid_mask = ARM_TTE_VALID,
230 .type_mask = ARM_TTE_TYPE_MASK,
231 .type_block = ARM_TTE_TYPE_BLOCK
232 },
233 [3] = {
234 .size = ARM_4K_TT_L3_SIZE,
235 .offmask = ARM_4K_TT_L3_OFFMASK,
236 .shift = ARM_4K_TT_L3_SHIFT,
237 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
238 .valid_mask = ARM_PTE_TYPE_VALID,
239 .type_mask = ARM_TTE_TYPE_MASK,
240 .type_block = ARM_TTE_TYPE_L3BLOCK
241 }
242 };
243
244 const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
245 {
246 [0] = { /* Unused */
247 .size = ARM_4K_TT_L0_SIZE,
248 .offmask = ARM_4K_TT_L0_OFFMASK,
249 .shift = ARM_4K_TT_L0_SHIFT,
250 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
251 .valid_mask = ARM_TTE_VALID,
252 .type_mask = ARM_TTE_TYPE_MASK,
253 .type_block = ARM_TTE_TYPE_BLOCK
254 },
255 [1] = { /* Concatenated, so index mask is larger than normal */
256 .size = ARM_4K_TT_L1_SIZE,
257 .offmask = ARM_4K_TT_L1_OFFMASK,
258 .shift = ARM_4K_TT_L1_SHIFT,
259 #ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
260 .index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
261 #else
262 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
263 #endif
264 .valid_mask = ARM_TTE_VALID,
265 .type_mask = ARM_TTE_TYPE_MASK,
266 .type_block = ARM_TTE_TYPE_BLOCK
267 },
268 [2] = {
269 .size = ARM_4K_TT_L2_SIZE,
270 .offmask = ARM_4K_TT_L2_OFFMASK,
271 .shift = ARM_4K_TT_L2_SHIFT,
272 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
273 .valid_mask = ARM_TTE_VALID,
274 .type_mask = ARM_TTE_TYPE_MASK,
275 .type_block = ARM_TTE_TYPE_BLOCK
276 },
277 [3] = {
278 .size = ARM_4K_TT_L3_SIZE,
279 .offmask = ARM_4K_TT_L3_OFFMASK,
280 .shift = ARM_4K_TT_L3_SHIFT,
281 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
282 .valid_mask = ARM_PTE_TYPE_VALID,
283 .type_mask = ARM_TTE_TYPE_MASK,
284 .type_block = ARM_TTE_TYPE_L3BLOCK
285 }
286 };
287
288 const struct page_table_attr pmap_pt_attr_4k = {
289 .pta_level_info = pmap_table_level_info_4k,
290 .pta_root_level = (T0SZ_BOOT - 16) / 9,
291 #if __ARM_MIXED_PAGE_SIZE__
292 .pta_commpage_level = PMAP_TT_L2_LEVEL,
293 #else /* __ARM_MIXED_PAGE_SIZE__ */
294 #if __ARM_16K_PG__
295 .pta_commpage_level = PMAP_TT_L2_LEVEL,
296 #else /* __ARM_16K_PG__ */
297 .pta_commpage_level = PMAP_TT_L1_LEVEL,
298 #endif /* __ARM_16K_PG__ */
299 #endif /* __ARM_MIXED_PAGE_SIZE__ */
300 .pta_max_level = PMAP_TT_L3_LEVEL,
301 .pta_ops = &native_pt_ops,
302 .ap_ro = ARM_PTE_AP(AP_RORO),
303 .ap_rw = ARM_PTE_AP(AP_RWRW),
304 .ap_rona = ARM_PTE_AP(AP_RONA),
305 .ap_rwna = ARM_PTE_AP(AP_RWNA),
306 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
307 .ap_x = ARM_PTE_PNX,
308 #if __ARM_MIXED_PAGE_SIZE__
309 .pta_tcr_value = TCR_EL1_4KB,
310 #endif /* __ARM_MIXED_PAGE_SIZE__ */
311 .pta_page_size = 4096,
312 .pta_page_shift = 12,
313 .geometry_id = SPTM_PT_GEOMETRY_4K,
314 .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
315 };
316
317 const struct page_table_attr pmap_pt_attr_16k_kern = {
318 .pta_level_info = pmap_table_level_info_16k,
319 .pta_root_level = PMAP_TT_L1_LEVEL,
320 .pta_commpage_level = PMAP_TT_L2_LEVEL,
321 .pta_max_level = PMAP_TT_L3_LEVEL,
322 .pta_ops = &native_pt_ops,
323 .ap_ro = ARM_PTE_AP(AP_RORO),
324 .ap_rw = ARM_PTE_AP(AP_RWRW),
325 .ap_rona = ARM_PTE_AP(AP_RONA),
326 .ap_rwna = ARM_PTE_AP(AP_RWNA),
327 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
328 .ap_x = ARM_PTE_PNX,
329 #if __ARM_MIXED_PAGE_SIZE__
330 .pta_tcr_value = TCR_EL1_16KB,
331 #endif /* __ARM_MIXED_PAGE_SIZE__ */
332 .pta_page_size = 16384,
333 .pta_page_shift = 14,
334 .geometry_id = SPTM_PT_GEOMETRY_16K_KERN,
335 .pta_va_valid_mask = ARM_PTE_T1_REGION_MASK(TCR_EL1_16KB),
336 };
337
338 const struct page_table_attr pmap_pt_attr_16k = {
339 .pta_level_info = pmap_table_level_info_16k,
340 .pta_root_level = PMAP_TT_L1_LEVEL,
341 .pta_commpage_level = PMAP_TT_L2_LEVEL,
342 .pta_max_level = PMAP_TT_L3_LEVEL,
343 .pta_ops = &native_pt_ops,
344 .ap_ro = ARM_PTE_AP(AP_RORO),
345 .ap_rw = ARM_PTE_AP(AP_RWRW),
346 .ap_rona = ARM_PTE_AP(AP_RONA),
347 .ap_rwna = ARM_PTE_AP(AP_RWNA),
348 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
349 .ap_x = ARM_PTE_PNX,
350 #if __ARM_MIXED_PAGE_SIZE__
351 .pta_tcr_value = TCR_EL1_16KB,
352 #endif /* __ARM_MIXED_PAGE_SIZE__ */
353 .pta_page_size = 16384,
354 .pta_page_shift = 14,
355 .geometry_id = SPTM_PT_GEOMETRY_16K,
356 .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
357 };
358
359 #if __ARM_16K_PG__
360 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
361 #else /* !__ARM_16K_PG__ */
362 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
363 #endif /* !__ARM_16K_PG__ */
364
365
366 #if DEVELOPMENT || DEBUG
367 int vm_footprint_suspend_allowed = 1;
368
369 extern int pmap_ledgers_panic;
370 extern int pmap_ledgers_panic_leeway;
371
372 #endif /* DEVELOPMENT || DEBUG */
373
374 #if DEVELOPMENT || DEBUG
375 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
376 (current_thread()->pmap_footprint_suspended)
377 #else /* DEVELOPMENT || DEBUG */
378 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
379 #endif /* DEVELOPMENT || DEBUG */
380
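/* Option for page table allocations: fail immediately rather than wait for a free page. */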
381 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
382
383
384 /* Keeps track of whether the pmap has been bootstrapped */
385 SECURITY_READ_ONLY_LATE(bool) pmap_bootstrapped = false;
386
387 /*
388  * Represents a TLB range that will be flushed before the pmap operation returns.
389 * Used by phys_attribute_clear_range to defer flushing pages in this range until
390 * the end of the operation, and to accumulate batched operations for submission
391 * to the SPTM as a performance optimization.
392 */
393 typedef struct pmap_tlb_flush_range {
394 /* Address space in which the flush region resides */
395 pmap_t ptfr_pmap;
396
397 /* Page-aligned beginning of the flush region */
398 vm_map_address_t ptfr_start;
399
400 /* Page-aligned non-inclusive end of the flush region */
401 vm_map_address_t ptfr_end;
402
403 /**
404 * Address of current PTE position in ptfr_pmap's [ptfr_start, ptfr_end) region.
405 * This is meant to be set up by the caller of pmap_page_protect_options_with_flush_range()
406 * or arm_force_fast_fault_with_flush_range(), and used by those functions to determine
407 * when a given mapping can be added to the SPTM's per-CPU region templates array vs.
408 * the more complex task of adding it to the disjoint ops array.
409 */
410 pt_entry_t *current_ptep;
411
412 /**
413 * Starting VA for any not-yet-submitted per-CPU region templates. This is meant to be
414 * set up by the caller of pmap_page_protect_options_with_flush_range() or
415 * arm_force_fast_fault_with_flush_range() and used by pmap_multipage_op_submit_region()
416 * when issuing the SPTM call to purge any pending region ops.
417 */
418 vm_map_address_t pending_region_start;
419
420 /**
421 * Number of entries in the per-CPU SPTM region templates array which have not
422 * yet been submitted to the SPTM.
423 */
424 unsigned int pending_region_entries;
425
426 /**
427 * Indicates whether at least one region entry was added to the per-CPU region ops
428 * array since the last time this field was checked. Intended to be cleared by the
429 * caller.
430 */
431 bool region_entry_added;
432
433 /**
434 * Marker for the current paddr "header" entry in the per-CPU SPTM disjoint ops array.
435 * This field is intended to be modified only by pmap_multipage_op_submit_disjoint()
436 * and pmap_multipage_op_add_page(), and should be treated as opaque by callers
437 * of those functions.
438 */
439 sptm_update_disjoint_multipage_op_t *current_header;
440
441 /**
442 * Position in the per-CPU SPTM ops array of the first ordinary
443 * sptm_disjoint_op_t entry following [current_header]. This is the starting
444 * point at which mappings should be inserted for the page described by
445 * [current_header].
446 */
447 unsigned int current_header_first_mapping_index;
448
449 /**
450 * Number of entries in the per-CPU SPTM disjoint ops array, including paddr headers,
451 * which have not yet been submitted to the SPTM.
452 */
453 unsigned int pending_disjoint_entries;
454
455 /**
456 * This field is used by the preemption check interval logic on the
457 * phys_attribute_clear_range() path to determine when sufficient
458 * forward progress has been made to check for and (if necessary)
459 * handle pending preemption.
460 */
461 unsigned int processed_entries;
462
463 /**
464 * Indicates whether the top-level caller needs to flush the TLB for
465 * the region in [ptfr_pmap] described by [ptfr_start, ptfr_end).
466 * This will be set if the SPTM indicates that it needed to alter
467 * any valid mapping within this region and SPTM_UPDATE_DEFER_TLBI
468 * was passed to the relevant SPTM call(s).
469 */
470 bool ptfr_flush_needed;
471 } pmap_tlb_flush_range_t;
472
473
474
475 /* Virtual memory region for early allocation */
476 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
477 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
478 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
479
480 extern uint8_t bootstrap_pagetables[];
481
482 extern unsigned int not_in_kdp;
483
484 extern vm_offset_t first_avail;
485
486 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
487 extern vm_offset_t virtual_space_end; /* End of kernel address space */
488 extern vm_offset_t static_memory_end;
489
490 extern const vm_map_address_t physmap_base;
491 extern const vm_map_address_t physmap_end;
492
493 extern int maxproc, hard_maxproc;
494
495 extern bool sdsb_io_rgns_present;
496
497 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
498 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
499
500 /* The number of address bits one TTBR can cover. */
501 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
502
503 /*
504 * The bounds on our TTBRs. These are for sanity checking that
505 * an address is accessible by a TTBR before we attempt to map it.
506 */
507
508 /* The level of the root of a page table. */
509 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
510
511 /* The number of entries in the root TT of a page table. */
512 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
513
514 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
515 const pmap_t kernel_pmap = &kernel_pmap_store;
516
517 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
518
519 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
520 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
521
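/* Singly-linked free-list node overlaid on unused page table allocations. */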
522 typedef struct tt_free_entry {
523 struct tt_free_entry *next;
524 } tt_free_entry_t;
525
526 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
527 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
528 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
529 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
530 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
531 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
532 _Atomic unsigned int inuse_iommu_pages_count[SPTM_IOMMUS_N_IDS] = {0}; /* number of active pages for each IOMMU class */
533
534 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
535 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
536
537 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
539
540 /* Lock group used for all pmap object locks. */
541 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
542
543 #if DEVELOPMENT || DEBUG
544 int nx_enabled = 1; /* enable no-execute protection */
545 int allow_data_exec = 0; /* No apps may execute data */
546 int allow_stack_exec = 0; /* No apps may execute from the stack */
547 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
548 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
549 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
550 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
551 #else /* DEVELOPMENT || DEBUG */
552 const int nx_enabled = 1; /* enable no-execute protection */
553 const int allow_data_exec = 0; /* No apps may execute data */
554 const int allow_stack_exec = 0; /* No apps may execute from the stack */
555 #endif /* DEVELOPMENT || DEBUG */
556
557
558 #if MACH_ASSERT
559 static void pmap_check_ledgers(pmap_t pmap);
560 #else
561 static inline void
562 pmap_check_ledgers(__unused pmap_t pmap)
563 {
564 }
565 #endif /* MACH_ASSERT */
566
567 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
568
569 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
570 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
571
572 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
573
574 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
575
576 /* end of shared region + 512MB for various purposes */
577 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
578 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
579 "Minimum address space size outside allowable range");
580
581 // Max offset is 15.375GB for devices with "large" memory config
582 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
583 // Max offset is 11.375GB for devices with "small" memory config
584 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
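// (0x138000000 is 4.875GB and 0x38000000 is 0.875GB, which puts ARM64_MIN_MAX_ADDRESS at 10.5GB given the limits above.)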
585
586
587 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
588 "Large device address space size outside allowable range");
589 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
590 "Small device address space size outside allowable range");
591
592 # ifdef XNU_TARGET_OS_OSX
593 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
594 # else
595 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
596 # endif
597
598 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
599 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
600 #else
601 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
602 #endif
603
604 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
605 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
606 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
607 #if !HAS_16BIT_ASID
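/*
 * Pseudo-LRU state used to bias ASID allocation toward ASIDs that have gone the
 * longest without being used. Not needed on targets with 16-bit ASIDs, which
 * allocate directly from the ASID bitmap.
 */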
608 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
609 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
610 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
611 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
612 #else
613 static uint16_t last_allocated_asid = 0;
614 #endif /* !HAS_16BIT_ASID */
615
616
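/*
 * Commpage state shared by all user address spaces: physical addresses of the
 * pre-built commpage page tables and of the commpage data/text pages, plus the
 * user VA at which the commpage text page is mapped.
 */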
617 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table;
618 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table;
619 #if __ARM_MIXED_PAGE_SIZE__
620 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table;
621 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table;
622 #endif
623 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_data_pa = 0;
624 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_text_pa = 0;
625 SECURITY_READ_ONLY_LATE(static vm_map_address_t) commpage_text_user_va = 0;
626 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_ro_data_pa = 0;
627
628
629 #if (DEVELOPMENT || DEBUG)
630 /* Caches whether the SPTM sysreg API has been enabled by the SPTM */
631 SECURITY_READ_ONLY_LATE(static bool) sptm_sysreg_available = false;
632 #endif /* (DEVELOPMENT || DEBUG) */
633
634 /* PTE Define Macros */
635
636 #ifndef SPTM_PTE_IN_FLIGHT_MARKER
637 /* SPTM TODO: Get rid of this once we export SPTM_PTE_IN_FLIGHT_MARKER from the SPTM. */
638 #define SPTM_PTE_IN_FLIGHT_MARKER 0x80U
639 #endif /* SPTM_PTE_IN_FLIGHT_MARKER */
640
641 /**
642 * Determine whether a PTE has been marked as compressed. This function also panics if
643 * the PTE contains bits that shouldn't be present in a compressed PTE, which is most of them.
644 *
645 * @param pte the PTE contents to check
646 * @param ptep the address of the PTE contents, for diagnostic purposes only
647 *
648 * @return true if the PTE is compressed, false otherwise
649 */
650 static inline bool
651 pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep)
652 {
653 const bool compressed = (!pte_is_valid(pte) && (pte & ARM_PTE_COMPRESSED));
654 /**
655 * Check for bits that shouldn't be present in a compressed PTE. This is everything except the
656 * compressed/compressed-alt bits, as well as the SPTM's in-flight marker which may be set while
657 * the SPTM is in the process of flushing the TLBs after marking a previously-valid PTE as
658 * compressed.
659 */
660 if (__improbable(compressed && (pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER)))) {
661 panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?",
662 ptep, pte, pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER));
663 }
664 return compressed;
665 }
666
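/*
 * Helpers for the software-defined PTE bits: ARM_PTE_WIRED marks wired mappings,
 * while ARM_PTE_WRITEABLE remembers that a mapping was originally writeable
 * before it was made read-only (e.g. for fast-fault/modified-state tracking).
 */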
667 #define pte_is_wired(pte) \
668 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
669
670 #define pte_was_writeable(pte) \
671 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
672
673 #define pte_set_was_writeable(pte, was_writeable) \
674 do { \
675 if ((was_writeable)) { \
676 (pte) |= ARM_PTE_WRITEABLE; \
677 } else { \
678 (pte) &= ~ARM_PTE_WRITEABLE; \
679 } \
680 } while(0)
681
682 /**
683  * Update the wired-mapping accounting in the PTD and ledger.
684 *
685 * @param pmap The pmap against which to update accounting
686 * @param pte_p The PTE whose wired state is being changed
687 * @param wired Indicates whether the PTE is being wired or unwired.
688 */
689 static inline void
690 pte_update_wiredcnt(pmap_t pmap, pt_entry_t *pte_p, boolean_t wired)
691 {
692 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
693 unsigned short *ptd_wiredcnt_ptr = &(ptep_get_info(pte_p)->wiredcnt);
694 if (wired) {
695 if (__improbable(os_atomic_inc_orig(ptd_wiredcnt_ptr, relaxed) == UINT16_MAX)) {
696 panic("pmap %p (pte %p): wired count overflow", pmap, pte_p);
697 }
698 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
699 } else {
700 if (__improbable(os_atomic_dec_orig(ptd_wiredcnt_ptr, relaxed) == 0)) {
701 panic("pmap %p (pte %p): wired count underflow", pmap, pte_p);
702 }
703 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
704 }
705 }
706
707 /*
708 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
709 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
710 * will observe the updated PTE.
711 */
712 #define FLUSH_PTE() \
713 __builtin_arm_dmb(DMB_ISH);
714
715 /*
716 * Synchronize updates to PTEs that were previously valid and thus may be cached in
717 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
718 * TLBI. This should only require a store-store barrier, as subsequent accesses in
719 * program order will not issue until the DSB completes. Prior loads may be reordered
720 * after the barrier, but their behavior should not be materially affected by the
721 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
722 * matter for loads until the access is re-driven well after the TLB update is
723 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
724 * we should be in a position to handle access faults. For "voluntary" PTE access
725 * restriction due to unmapping or protection, the decision to restrict access should
726 * have a data dependency on prior loads in order to avoid a data race.
727 */
728 #define FLUSH_PTE_STRONG() \
729 __builtin_arm_dsb(DSB_ISHST);
730
731 /**
732 * Write enough page table entries to map a single VM page. On systems where the
733 * VM page size does not match the hardware page size, multiple page table
734 * entries will need to be written.
735 *
736  * @note This function does not emit a barrier to ensure these page table writes
737  *       have completed before continuing, even though one is commonly needed. If a
738  *       DMB or DSB barrier is required, use write_pte() or write_pte_strong()
739  *       respectively instead of this function.
740 *
741 * @param ptep Pointer to the first page table entry to update.
742 * @param pte The value to write into each page table entry. In the case that
743 * multiple PTEs are updated to a non-empty value, then the address
744 * in this value will automatically be incremented for each PTE
745 * write.
746 */
747 static void
748 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
749 {
750 /**
751 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
752 * systems, which is why it's checked at runtime instead of compile time.
753 * The "unreachable" warning needs to be suppressed because it still is a
754 * compile time constant on some systems.
755 */
756 __unreachable_ok_push
757 if (TEST_PAGE_RATIO_4) {
758 if (((uintptr_t)ptep) & 0x1f) {
759 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
760 __func__, ptep, (void*)pte);
761 }
762
763 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
764 /**
765 * If we're writing an empty/compressed PTE value, then don't
766 * auto-increment the address for each PTE write.
767 */
768 *ptep = pte;
769 *(ptep + 1) = pte;
770 *(ptep + 2) = pte;
771 *(ptep + 3) = pte;
772 } else {
773 *ptep = pte;
774 *(ptep + 1) = pte | 0x1000;
775 *(ptep + 2) = pte | 0x2000;
776 *(ptep + 3) = pte | 0x3000;
777 }
778 } else {
779 *ptep = pte;
780 }
781 __unreachable_ok_pop
782 }
783
784 /**
785 * Writes enough page table entries to map a single VM page and then ensures
786 * those writes complete by executing a Data Memory Barrier.
787 *
788 * @note The DMB issued by this function is not strong enough to protect against
789 * TLB invalidates from being reordered above the PTE writes. If a TLBI
790 * instruction is going to immediately be called after this write, it's
791 * recommended to call write_pte_strong() instead of this function.
792 *
793 * See the function header for write_pte_fast() for more details on the
794 * parameters.
795 */
796 void
797 write_pte(pt_entry_t *ptep, pt_entry_t pte)
798 {
799 write_pte_fast(ptep, pte);
800 FLUSH_PTE();
801 }
802
803 /**
804 * Retrieve the pmap structure for the thread running on the current CPU.
805 */
806 pmap_t
807 current_pmap()
808 {
809 const pmap_t current = vm_map_pmap(current_thread()->map);
810 assert(current != NULL);
811 return current;
812 }
813
814 #if DEVELOPMENT || DEBUG
815
816 /*
817 * Trace levels are controlled by a bitmask in which each
818 * level can be enabled/disabled by the (1<<level) position
819  * in the "pmap_trace" boot-arg.
820 * Level 0: PPL extension functionality
821 * Level 1: pmap lifecycle (create/destroy/switch)
822 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
823 * Level 3: internal state management (attributes/fast-fault)
824 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
825 */
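/*
 * For example, booting with "pmap_trace=0x6" would enable the pmap lifecycle
 * (level 1) and mapping lifecycle (level 2) traces.
 */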
826
827 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
828
829 #define PMAP_TRACE(level, ...) \
830 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
831 KDBG_RELEASE(__VA_ARGS__); \
832 }
833 #else /* DEVELOPMENT || DEBUG */
834
835 #define PMAP_TRACE(level, ...)
836
837 #endif /* DEVELOPMENT || DEBUG */
838
839
840 /*
841 * Internal function prototypes (forward declarations).
842 */
843
844 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
845
846 static void pmap_set_reference(ppnum_t pn);
847
848 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
849
850 static kern_return_t pmap_expand(
851 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
852
853 static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t);
854
855 static tt_entry_t *pmap_tt1_allocate(pmap_t, vm_size_t, uint8_t);
856
857 static void pmap_tt1_deallocate(pmap_t, tt_entry_t *, vm_size_t);
858
859 static kern_return_t pmap_tt_allocate(
860 pmap_t, tt_entry_t **, pt_desc_t **, unsigned int, unsigned int);
861
862 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
863 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
864 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
865
866 static void pmap_unmap_commpage(
867 pmap_t pmap);
868
869 static boolean_t
870 pmap_is_64bit(pmap_t);
871
872
873 static void pmap_flush_tlb_for_paddr_async(pmap_paddr_t);
874
875 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
876
877 static boolean_t arm_clear_fast_fault(
878 ppnum_t ppnum,
879 vm_prot_t fault_type,
880 uintptr_t pvh,
881 pt_entry_t *pte_p,
882 pp_attr_t attrs_to_clear);
883
884 static void pmap_tte_deallocate(
885 pmap_t pmap,
886 vm_offset_t va_start,
887 tt_entry_t *ttep,
888 unsigned int level,
889 bool pmap_locked);
890
891
892 /*
893 * Temporary prototypes, while we wait for pmap_enter to move to taking an
894 * address instead of a page number.
895 */
896 kern_return_t
897 pmap_enter(
898 pmap_t pmap,
899 vm_map_address_t v,
900 ppnum_t pn,
901 vm_prot_t prot,
902 vm_prot_t fault_type,
903 unsigned int flags,
904 boolean_t wired,
905 pmap_mapping_type_t mapping_type);
906
907 static kern_return_t
908 pmap_enter_addr(
909 pmap_t pmap,
910 vm_map_address_t v,
911 pmap_paddr_t pa,
912 vm_prot_t prot,
913 vm_prot_t fault_type,
914 unsigned int flags,
915 boolean_t wired,
916 pmap_mapping_type_t mapping_type);
917
918 kern_return_t
919 pmap_enter_options_addr(
920 pmap_t pmap,
921 vm_map_address_t v,
922 pmap_paddr_t pa,
923 vm_prot_t prot,
924 vm_prot_t fault_type,
925 unsigned int flags,
926 boolean_t wired,
927 unsigned int options,
928 __unused void *arg,
929 pmap_mapping_type_t mapping_type);
930
931 #ifdef CONFIG_XNUPOST
932 kern_return_t pmap_test(void);
933 #endif /* CONFIG_XNUPOST */
934
935 PMAP_SUPPORT_PROTOTYPES(
936 kern_return_t,
937 arm_fast_fault, (pmap_t pmap,
938 vm_map_address_t va,
939 vm_prot_t fault_type,
940 bool was_af_fault,
941 bool from_user), ARM_FAST_FAULT_INDEX);
942
943 PMAP_SUPPORT_PROTOTYPES(
944 boolean_t,
945 arm_force_fast_fault, (ppnum_t ppnum,
946 vm_prot_t allow_mode,
947 int options), ARM_FORCE_FAST_FAULT_INDEX);
948
949 MARK_AS_PMAP_TEXT static boolean_t
950 arm_force_fast_fault_with_flush_range(
951 ppnum_t ppnum,
952 vm_prot_t allow_mode,
953 int options,
954 locked_pvh_t *locked_pvh,
955 pp_attr_t bits_to_clear,
956 pmap_tlb_flush_range_t *flush_range);
957
958 PMAP_SUPPORT_PROTOTYPES(
959 void,
960 pmap_batch_set_cache_attributes, (
961 const unified_page_list_t * page_list,
962 unsigned int cacheattr,
963 bool update_attr_table), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
964
965 PMAP_SUPPORT_PROTOTYPES(
966 void,
967 pmap_change_wiring, (pmap_t pmap,
968 vm_map_address_t v,
969 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
970
971 PMAP_SUPPORT_PROTOTYPES(
972 pmap_t,
973 pmap_create_options, (ledger_t ledger,
974 vm_map_size_t size,
975 unsigned int flags,
976 kern_return_t * kr), PMAP_CREATE_INDEX);
977
978 PMAP_SUPPORT_PROTOTYPES(
979 void,
980 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
981
982 PMAP_SUPPORT_PROTOTYPES(
983 kern_return_t,
984 pmap_enter_options, (pmap_t pmap,
985 vm_map_address_t v,
986 pmap_paddr_t pa,
987 vm_prot_t prot,
988 vm_prot_t fault_type,
989 unsigned int flags,
990 boolean_t wired,
991 unsigned int options,
992 pmap_mapping_type_t mapping_type), PMAP_ENTER_OPTIONS_INDEX);
993
994 PMAP_SUPPORT_PROTOTYPES(
995 pmap_paddr_t,
996 pmap_find_pa, (pmap_t pmap,
997 addr64_t va), PMAP_FIND_PA_INDEX);
998
999 PMAP_SUPPORT_PROTOTYPES(
1000 kern_return_t,
1001 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
1002
1003
1004 PMAP_SUPPORT_PROTOTYPES(
1005 boolean_t,
1006 pmap_is_empty, (pmap_t pmap,
1007 vm_map_offset_t va_start,
1008 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1009
1010
1011 PMAP_SUPPORT_PROTOTYPES(
1012 unsigned int,
1013 pmap_map_cpu_windows_copy, (ppnum_t pn,
1014 vm_prot_t prot,
1015 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1016
1017 PMAP_SUPPORT_PROTOTYPES(
1018 void,
1019 pmap_ro_zone_memcpy, (zone_id_t zid,
1020 vm_offset_t va,
1021 vm_offset_t offset,
1022 const vm_offset_t new_data,
1023 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1024
1025 PMAP_SUPPORT_PROTOTYPES(
1026 uint64_t,
1027 pmap_ro_zone_atomic_op, (zone_id_t zid,
1028 vm_offset_t va,
1029 vm_offset_t offset,
1030 zro_atomic_op_t op,
1031 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1032
1033 PMAP_SUPPORT_PROTOTYPES(
1034 void,
1035 pmap_ro_zone_bzero, (zone_id_t zid,
1036 vm_offset_t va,
1037 vm_offset_t offset,
1038 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1039
1040 PMAP_SUPPORT_PROTOTYPES(
1041 kern_return_t,
1042 pmap_nest, (pmap_t grand,
1043 pmap_t subord,
1044 addr64_t vstart,
1045 uint64_t size), PMAP_NEST_INDEX);
1046
1047 PMAP_SUPPORT_PROTOTYPES(
1048 void,
1049 pmap_page_protect_options, (ppnum_t ppnum,
1050 vm_prot_t prot,
1051 unsigned int options,
1052 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1053
1054 PMAP_SUPPORT_PROTOTYPES(
1055 vm_map_address_t,
1056 pmap_protect_options, (pmap_t pmap,
1057 vm_map_address_t start,
1058 vm_map_address_t end,
1059 vm_prot_t prot,
1060 unsigned int options,
1061 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1062
1063 PMAP_SUPPORT_PROTOTYPES(
1064 kern_return_t,
1065 pmap_query_page_info, (pmap_t pmap,
1066 vm_map_offset_t va,
1067 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1068
1069 PMAP_SUPPORT_PROTOTYPES(
1070 mach_vm_size_t,
1071 pmap_query_resident, (pmap_t pmap,
1072 vm_map_address_t start,
1073 vm_map_address_t end,
1074 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1075
1076 PMAP_SUPPORT_PROTOTYPES(
1077 void,
1078 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 vm_map_address_t,
1082 pmap_remove_options, (pmap_t pmap,
1083 vm_map_address_t start,
1084 vm_map_address_t end,
1085 int options), PMAP_REMOVE_OPTIONS_INDEX);
1086
1087
1088 PMAP_SUPPORT_PROTOTYPES(
1089 void,
1090 pmap_set_cache_attributes, (ppnum_t pn,
1091 unsigned int cacheattr,
1092 bool update_attr_table), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1093
1094 PMAP_SUPPORT_PROTOTYPES(
1095 void,
1096 pmap_update_compressor_page, (ppnum_t pn,
1097 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1098
1099 PMAP_SUPPORT_PROTOTYPES(
1100 void,
1101 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1102
1103 #if MACH_ASSERT
1104 PMAP_SUPPORT_PROTOTYPES(
1105 void,
1106 pmap_set_process, (pmap_t pmap,
1107 int pid,
1108 char *procname), PMAP_SET_PROCESS_INDEX);
1109 #endif
1110
1111 PMAP_SUPPORT_PROTOTYPES(
1112 void,
1113 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1114
1115 PMAP_SUPPORT_PROTOTYPES(
1116 void,
1117 pmap_unnest_options, (pmap_t grand,
1118 addr64_t vaddr,
1119 uint64_t size,
1120 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1121
1122 PMAP_SUPPORT_PROTOTYPES(
1123 void,
1124 phys_attribute_set, (ppnum_t pn,
1125 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1126
1127 PMAP_SUPPORT_PROTOTYPES(
1128 void,
1129 phys_attribute_clear, (ppnum_t pn,
1130 unsigned int bits,
1131 int options,
1132 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1133
1134 #if __ARM_RANGE_TLBI__
1135 PMAP_SUPPORT_PROTOTYPES(
1136 vm_map_address_t,
1137 phys_attribute_clear_range, (pmap_t pmap,
1138 vm_map_address_t start,
1139 vm_map_address_t end,
1140 unsigned int bits,
1141 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1142 #endif /* __ARM_RANGE_TLBI__ */
1143
1144
1145 PMAP_SUPPORT_PROTOTYPES(
1146 void,
1147 pmap_switch, (pmap_t pmap, thread_t thread), PMAP_SWITCH_INDEX);
1148
1149 PMAP_SUPPORT_PROTOTYPES(
1150 void,
1151 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1152
1153 PMAP_SUPPORT_PROTOTYPES(
1154 void,
1155 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1156
1157 PMAP_SUPPORT_PROTOTYPES(
1158 void,
1159 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1160
1161 PMAP_SUPPORT_PROTOTYPES(
1162 void,
1163 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1164
1165 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1166 PMAP_SUPPORT_PROTOTYPES(
1167 void,
1168 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1169 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1170
1171 PMAP_SUPPORT_PROTOTYPES(
1172 void,
1173 pmap_trim, (pmap_t grand,
1174 pmap_t subord,
1175 addr64_t vstart,
1176 uint64_t size), PMAP_TRIM_INDEX);
1177
1178 #if HAS_APPLE_PAC
1179 PMAP_SUPPORT_PROTOTYPES(
1180 void *,
1181 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1182 PMAP_SUPPORT_PROTOTYPES(
1183 void *,
1184 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1185 #endif /* HAS_APPLE_PAC */
1186
1187
1188 void pmap_footprint_suspend(vm_map_t map,
1189 boolean_t suspend);
1190 PMAP_SUPPORT_PROTOTYPES(
1191 void,
1192 pmap_footprint_suspend, (vm_map_t map,
1193 boolean_t suspend),
1194 PMAP_FOOTPRINT_SUSPEND_INDEX);
1195
1196
1197
1198
1199
1200 /*
1201 * The low global vector page is mapped at a fixed alias.
1202 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1203 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1204 * to check both addresses anyway for backward compatibility. So for now
1205 * we leave H6 and H7 where they were.
1206 */
1207 #if (ARM_PGSHIFT == 14)
1208 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1209 #else
1210 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1211 #endif
1212
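/* Ledger helpers that charge/uncharge kernel-memory allocations against a pmap's task. */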
1213 static inline void
1214 PMAP_ZINFO_PALLOC(
1215 pmap_t pmap, int bytes)
1216 {
1217 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1218 }
1219
1220 static inline void
1221 PMAP_ZINFO_PFREE(
1222 pmap_t pmap,
1223 int bytes)
1224 {
1225 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1226 }
1227
1228 void
1229 pmap_tt_ledger_credit(
1230 pmap_t pmap,
1231 vm_size_t size)
1232 {
1233 if (pmap != kernel_pmap) {
1234 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1235 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1236 }
1237 }
1238
1239 void
1240 pmap_tt_ledger_debit(
1241 pmap_t pmap,
1242 vm_size_t size)
1243 {
1244 if (pmap != kernel_pmap) {
1245 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1246 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1247 }
1248 }
1249
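/*
 * Record that the given hardware ASID was just used by clearing its bit in the
 * pseudo-LRU bitmap. Once every ASID covered by a bitmap word has been used,
 * the word is refilled and its generation count advanced.
 */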
1250 static inline void
1251 pmap_update_plru(uint16_t asid_index __unused)
1252 {
1253 #if !HAS_16BIT_ASID
1254 if (__probable(pmap_asid_plru)) {
1255 unsigned plru_index = asid_index >> 6;
1256 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1257 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1258 asid_plru_bitmap[plru_index] = ((plru_index == 0) ? ~1ULL : UINT64_MAX);
1259 }
1260 }
1261 #endif /* !HAS_16BIT_ASID */
1262 }
1263
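/*
 * Allocate a virtual ASID for the given pmap from the global ASID bitmap, guided
 * by the pseudo-LRU state where it is in use. Returns true on success, or false
 * if no ASIDs are available.
 */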
1264 static bool
1265 alloc_asid(pmap_t pmap)
1266 {
1267 int vasid = -1;
1268
1269 pmap_simple_lock(&asid_lock);
1270
1271 #if !HAS_16BIT_ASID
1272 if (__probable(pmap_asid_plru)) {
1273 unsigned plru_index = 0;
1274 uint64_t lowest_gen = asid_plru_generation[0];
1275 uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1276 for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1277 if (asid_plru_generation[i] < lowest_gen) {
1278 plru_index = i;
1279 lowest_gen = asid_plru_generation[i];
1280 lowest_gen_bitmap = asid_plru_bitmap[i];
1281 }
1282 }
1283
1284 for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += (MAX_HW_ASIDS >> 6)) {
1285 uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1286 if (temp_plru) {
1287 vasid = (plru_index << 6) + lsb_first(temp_plru);
1288 #if DEVELOPMENT || DEBUG
1289 ++pmap_asid_hits;
1290 #endif
1291 break;
1292 }
1293 }
1294 }
1295 #else
1296 /**
1297 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
1298 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
1299 * However, we first try to allocate starting from the position of the most-recently allocated
1300 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
1301 * lower bit positions and then re-checking those same lower positions every time we allocate
1302 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
1303 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
1304 * logic, without requiring prohibitively expensive RCTX instructions.
1305 */
1306 vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
1307 #endif /* !HAS_16BIT_ASID */
1308 if (__improbable(vasid < 0)) {
1309 // bitmap_first() returns highest-order bits first, but a 0-based scheme works
1310 // slightly better with the collision detection scheme used by pmap_switch_internal().
1311 vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1312 #if DEVELOPMENT || DEBUG
1313 ++pmap_asid_misses;
1314 #endif
1315 }
1316 if (__improbable(vasid < 0)) {
1317 pmap_simple_unlock(&asid_lock);
1318 return false;
1319 }
1320 assert((uint32_t)vasid < pmap_max_asids);
1321 assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1322 bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1323 const uint16_t hw_asid = (uint16_t)(vasid & (MAX_HW_ASIDS - 1));
1324 #if HAS_16BIT_ASID
1325 last_allocated_asid = hw_asid;
1326 #endif /* HAS_16BIT_ASID */
1327 pmap_simple_unlock(&asid_lock);
1328 assert(hw_asid != 0); // Should never alias kernel ASID
1329 pmap->asid = (uint16_t)vasid;
1330 pmap_update_plru(hw_asid);
1331 return true;
1332 }
1333
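/*
 * Return the pmap's ASID to the global bitmap (and, where applicable, to the
 * pseudo-LRU bitmap). Safe to call on a pmap that never had an ASID assigned.
 */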
1334 static void
1335 free_asid(pmap_t pmap)
1336 {
1337 const uint16_t vasid = os_atomic_xchg(&pmap->asid, 0, relaxed);
1338 if (__improbable(vasid == 0)) {
1339 return;
1340 }
1341
1342 #if !HAS_16BIT_ASID
1343 if (pmap_asid_plru) {
1344 const uint16_t hw_asid = vasid & (MAX_HW_ASIDS - 1);
1345 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1346 }
1347 #endif /* !HAS_16BIT_ASID */
1348 pmap_simple_lock(&asid_lock);
1349 assert(!bitmap_test(&asid_bitmap[0], vasid));
1350 bitmap_set(&asid_bitmap[0], vasid);
1351 pmap_simple_unlock(&asid_lock);
1352 }
1353
1354
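/* Return TRUE if the given physical address is covered by the pmap layer's managed-memory range. */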
1355 boolean_t
1356 pmap_valid_address(
1357 pmap_paddr_t addr)
1358 {
1359 return pa_valid(addr);
1360 }
1361
1362
1363
1364
1365
1366
1367 /*
1368 * Map memory at initialization. The physical addresses being
1369 * mapped are not managed and are never unmapped.
1370 *
1371 * For now, VM is already on, we only need to map the
1372 * specified memory.
1373 */
1374 vm_map_address_t
1375 pmap_map(
1376 vm_map_address_t virt,
1377 vm_offset_t start,
1378 vm_offset_t end,
1379 vm_prot_t prot,
1380 unsigned int flags)
1381 {
1382 kern_return_t kr;
1383 vm_size_t ps;
1384
1385 ps = PAGE_SIZE;
1386 while (start < end) {
1387 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1388 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1389
1390 if (kr != KERN_SUCCESS) {
1391 panic("%s: failed pmap_enter, "
1392 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1393 __FUNCTION__,
1394 (void *) virt, (void *) start, (void *) end, prot, flags);
1395 }
1396
1397 virt += ps;
1398 start += ps;
1399 }
1400
1401
1402 return virt;
1403 }
1404
1405 #if HAS_SPTM_SYSCTL
1406 bool disarm_protected_io = false;
1407 #endif /* HAS_SPTM_SYSCTL */
1408
1409 /**
1410 * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type.
1411 *
1412 * @param paddr The physical address of the page.
1413 * @param tmplate The PTE value to be evaluated.
1414 *
1415 * @return A new PTE value with permission bits modified.
1416 */
1417 static inline
1418 pt_entry_t
1419 pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate)
1420 {
1421 #if HAS_SPTM_SYSCTL
1422 if (__improbable(disarm_protected_io)) {
1423 /* Make sure disarm_protected_io is read before its counterpart in SPTM */
1424 os_atomic_thread_fence(acquire);
1425 return tmplate;
1426 }
1427
1428 #endif /* HAS_SPTM_SYSCTL */
1429
1430 /**
1431 * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade
1432  * the mapping to RO. This is required because IOKit relies on this
1433  * behavior, which the PPL currently provides.
1434 */
1435 const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
1436 if (frame_type == XNU_PROTECTED_IO) {
1437 /* Downgrade KERN_RW mappings to KERN_RO for XNU_PROTECTED_IO frames. */
1438 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1439 switch (xprr_perm) {
1440 case XPRR_KERN_RO_PERM:
1441 break;
1442 case XPRR_KERN_RW_PERM:
1443 tmplate &= ~ARM_PTE_XPRR_MASK;
1444 tmplate |= xprr_perm_to_pte(XPRR_KERN_RO_PERM);
1445 break;
1446 default:
1447 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1448 }
1449 }
1450
1451 return tmplate;
1452 }
1453
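/*
 * Variant of pmap_map_bd() that takes PMAP_MAP_BD_* options selecting the memory
 * attributes (write-combined, posted, etc.) used for the mappings. Returns the
 * VA immediately following the last page mapped.
 */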
1454 vm_map_address_t
1455 pmap_map_bd_with_options(
1456 vm_map_address_t virt,
1457 vm_offset_t start,
1458 vm_offset_t end,
1459 vm_prot_t prot,
1460 int32_t options)
1461 {
1462 pt_entry_t tmplate;
1463 vm_map_address_t vaddr;
1464 vm_offset_t paddr;
1465 pt_entry_t mem_attr;
1466
1467 if (__improbable(start & PAGE_MASK)) {
1468 panic("%s: start 0x%lx is not page aligned", __func__, start);
1469 }
1470
1471 if (__improbable(end & PAGE_MASK)) {
1472 panic("%s: end 0x%lx is not page aligned", __func__, end);
1473 }
1474
1475 if (__improbable(!gDramBase || !gDramSize)) {
1476 panic("%s: gDramBase/gDramSize not initialized", __func__);
1477 }
1478
1479 bool first_page_is_dram = is_dram_addr(start);
1480 for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
1481 if (first_page_is_dram != is_dram_addr(pa)) {
1482 panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
1483 __func__, pa, first_page_is_dram ? "is not" : "is");
1484 }
1485 }
1486
1487 switch (options & PMAP_MAP_BD_MASK) {
1488 case PMAP_MAP_BD_WCOMB:
1489 if (is_dram_addr(start)) {
1490 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1491 } else {
1492 #if HAS_FEAT_XS
1493 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
1494 #else /* HAS_FEAT_XS */
1495 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1496 #endif /* HAS_FEAT_XS */
1497 #if DEBUG || DEVELOPMENT
1498 pmap_wcrt_on_non_dram_count_increment_atomic();
1499 #endif /* DEBUG || DEVELOPMENT */
1500 }
1501 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1502 break;
1503 case PMAP_MAP_BD_POSTED:
1504 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1505 break;
1506 case PMAP_MAP_BD_POSTED_REORDERED:
1507 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1508 break;
1509 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1510 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1511 break;
1512 default:
1513 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1514 break;
1515 }
1516
1517 tmplate = ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1518 mem_attr | ARM_PTE_TYPE_VALID | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1519
1520 #if __ARM_KERNEL_PROTECT__
1521 tmplate |= ARM_PTE_NG;
1522 #endif /* __ARM_KERNEL_PROTECT__ */
1523
1524 vaddr = virt;
1525 paddr = start;
1526 while (paddr < end) {
1527 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, vaddr, pmap_force_pte_kernel_ro_if_protected_io(paddr, tmplate) | pa_to_pte(paddr));
1528 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1529
1530 vaddr += PAGE_SIZE;
1531 paddr += PAGE_SIZE;
1532 }
1533
1534 return vaddr;
1535 }
1536
1537 /*
1538 * Back-door routine for mapping kernel VM at initialization.
1539 * Useful for mapping memory outside the range
1540 * [vm_first_phys, vm_last_phys] (i.e., devices).
1541 * Otherwise like pmap_map.
1542 */
1543 vm_map_address_t
1544 pmap_map_bd(
1545 vm_map_address_t virt,
1546 vm_offset_t start,
1547 vm_offset_t end,
1548 vm_prot_t prot)
1549 {
1550 return pmap_map_bd_with_options(virt, start, end, prot, 0);
1551 }
1552
1553 /*
1554 * Back-door routine for mapping kernel VM at initialization.
1555 * Useful for mapping memory specific physical addresses in early
1556 * boot (i.e., before kernel_map is initialized).
1557 *
1558 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1559 */
1560
1561 vm_map_address_t
1562 pmap_map_high_window_bd(
1563 vm_offset_t pa_start,
1564 vm_size_t len,
1565 vm_prot_t prot)
1566 {
1567 pt_entry_t *ptep, pte;
1568 vm_map_address_t va_start = VREGION1_START;
1569 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1570 vm_map_address_t va_end;
1571 vm_map_address_t va;
1572 vm_size_t offset;
1573
1574 offset = pa_start & PAGE_MASK;
1575 pa_start -= offset;
1576 len += offset;
1577
1578 if (len > (va_max - va_start)) {
1579 panic("%s: area too large, "
1580 "pa_start=%p, len=%p, prot=0x%x",
1581 __FUNCTION__,
1582 (void*)pa_start, (void*)len, prot);
1583 }
1584
1585 scan:
1586 for (; va_start < va_max; va_start += PAGE_SIZE) {
1587 ptep = pmap_pte(kernel_pmap, va_start);
1588 assert(!pte_is_compressed(*ptep, ptep));
1589 if (!pte_is_valid(*ptep)) {
1590 break;
1591 }
1592 }
1593 if (va_start > va_max) {
1594 panic("%s: insufficient pages, "
1595 "pa_start=%p, len=%p, prot=0x%x",
1596 __FUNCTION__,
1597 (void*)pa_start, (void*)len, prot);
1598 }
1599
1600 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1601 ptep = pmap_pte(kernel_pmap, va_end);
1602 assert(!pte_is_compressed(*ptep, ptep));
1603 if (pte_is_valid(*ptep)) {
1604 va_start = va_end + PAGE_SIZE;
1605 goto scan;
1606 }
1607 }
1608
1609 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1610 ptep = pmap_pte(kernel_pmap, va);
1611 pte = pa_to_pte(pa_start)
1612 | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1613 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1614 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT)
1615 | ARM_PTE_SH(SH_OUTER_MEMORY);
1616 #if __ARM_KERNEL_PROTECT__
1617 pte |= ARM_PTE_NG;
1618 #endif /* __ARM_KERNEL_PROTECT__ */
1619 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, va, pte);
1620 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1621 }
1622 #if KASAN
1623 kasan_notify_address(va_start, len);
1624 #endif
1625 return va_start;
1626 }
1627
1628 /*
1629 * pmap_get_arm64_prot
1630 *
1631 * return effective armv8 VMSA block protections including
1632 * table AP/PXN/XN overrides of a pmap entry
1633 *
1634 */
1635
1636 uint64_t
1637 pmap_get_arm64_prot(
1638 pmap_t pmap,
1639 vm_offset_t addr)
1640 {
1641 tt_entry_t tte = 0;
1642 unsigned int level = 0;
1643 uint64_t effective_prot_bits = 0;
1644 uint64_t aggregate_tte = 0;
1645 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1646 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1647
1648 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1649 tte = *pmap_ttne(pmap, level, addr);
1650
1651 if (!(tte & ARM_TTE_VALID)) {
1652 return 0;
1653 }
1654
1655 if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
1656 /* Block or page mapping; both have the same protection bit layout. */
1657 break;
1658 } else if (tte_is_table(tte)) {
1659 /* All of the table bits we care about are overrides, so just OR them together. */
1660 aggregate_tte |= tte;
1661 }
1662 }
1663
1664 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1665 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1666 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1667
1668 /* Start with the PTE bits. */
1669 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1670
1671 /* Table AP bits mask out block/page AP bits */
1672 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1673
1674 /* XN/PXN bits can be OR'd in. */
1675 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1676 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1677
1678 return effective_prot_bits;
1679 }
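/*
 * Illustrative sketch (not part of this file): a hypothetical debug check built on
 * pmap_get_arm64_prot(). The return value is a set of PTE-format bits, so a caller
 * can test individual permission bits directly, e.g. to verify that a mapped kernel
 * address is not executable at EL1:
 *
 *     uint64_t prot = pmap_get_arm64_prot(kernel_pmap, some_kva);
 *     assert(prot & ARM_PTE_PNX);
 *
 * `some_kva` is a placeholder for an address known to be mapped; an unmapped address
 * yields 0, which would also fail the assertion above.
 */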
1680
1681 /**
1682 * Helper macros for accessing the "unnested" and "in-progress" bits in
1683 * pmap->nested_region_unnested_table_bitmap.
1684 */
1685 #define UNNEST_BIT(index) ((index) * 2)
1686 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
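/*
 * Each nested-region index therefore owns an adjacent bit pair in the bitmap. For
 * example, index 3 uses bit 6 (UNNEST_BIT(3)) to record that the sub-region has been
 * unnested and bit 7 (UNNEST_IN_PROGRESS_BIT(3)) to record that an unnest is in progress.
 */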
1687
1688 /*
1689 * Bootstrap the system enough to run with virtual memory.
1690 *
1691 * The early VM initialization code has already allocated
1692 * the first CPU's translation table and made entries for
1693 * all the one-to-one mappings to be found there.
1694 *
1695 * We must set up the kernel pmap structures, the
1696 * physical-to-virtual translation lookup tables for the
1697 * physical memory to be managed (between avail_start and
1698 * avail_end).
1699 *
1700 * Map the kernel's code and data, and allocate the system page table.
1701 * Page_size must already be set.
1702 *
1703 * Parameters:
1704 * first_avail first available physical page -
1705 * after kernel page tables
1706 * avail_start PA of first managed physical page
1707 * avail_end PA of last managed physical page
1708 */
1709
1710 void
1711 pmap_bootstrap(
1712 vm_offset_t vstart)
1713 {
1714 vm_map_offset_t maxoffset;
1715
1716 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
1717
1718 #if DEVELOPMENT || DEBUG
1719 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
1720 kprintf("Kernel traces for pmap operations enabled\n");
1721 }
1722 #endif
1723
1724 /*
1725 * Initialize the kernel pmap.
1726 */
1727 #if ARM_PARAMETERIZED_PMAP
1728 kernel_pmap->pmap_pt_attr = &pmap_pt_attr_16k_kern;
1729 #endif /* ARM_PARAMETERIZED_PMAP */
1730 #if HAS_APPLE_PAC
1731 kernel_pmap->disable_jop = 0;
1732 #endif /* HAS_APPLE_PAC */
1733 kernel_pmap->tte = cpu_tte;
1734 kernel_pmap->ttep = cpu_ttep;
1735 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
1736 kernel_pmap->max = UINTPTR_MAX;
1737 os_ref_init_count_raw(&kernel_pmap->ref_count, &pmap_refgrp, 1);
1738 kernel_pmap->nx_enabled = TRUE;
1739 kernel_pmap->is_64bit = TRUE;
1740 #if CONFIG_ROSETTA
1741 kernel_pmap->is_rosetta = FALSE;
1742 #endif
1743
1744 kernel_pmap->nested_region_addr = 0x0ULL;
1745 kernel_pmap->nested_region_size = 0x0ULL;
1746 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
1747 kernel_pmap->type = PMAP_TYPE_KERNEL;
1748
1749 kernel_pmap->asid = 0;
1750
1751 /**
1752 * The kernel pmap lock is no longer needed; init it and then destroy it to
1753 * place it in a known-invalid state that will cause any attempt to use it
1754 * to fail.
1755 */
1756 pmap_lock_init(kernel_pmap);
1757 pmap_lock_destroy(kernel_pmap);
1758
1759 pmap_max_asids = SPTMArgs->num_asids;
1760
1761 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
1762
1763 /**
1764 * Bootstrap the core pmap data structures (e.g., pv_head_table,
1765 * pp_attr_table, etc). This function will use `avail_start` to allocate
1766 * space for these data structures.
1767 */
1768 pmap_data_bootstrap();
1769
1770 /**
1771 * Don't make any assumptions about the alignment of avail_start before this
1772 * point (i.e., pmap_data_bootstrap() performs allocations).
1773 */
1774 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
1775
1776 const pmap_paddr_t pmap_struct_start = avail_start;
1777
1778 asid_bitmap = (bitmap_t*)phystokv(avail_start);
1779 avail_start = round_page(avail_start + asid_table_size);
1780
1781 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
1782
1783 queue_init(&map_pmap_list);
1784 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
1785
1786 virtual_space_start = vstart;
1787 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
1788
1789 bitmap_full(&asid_bitmap[0], pmap_max_asids);
1790 /* Clear the ASIDs which will alias the reserved kernel ASID of 0. */
1791 for (unsigned int i = 0; i < pmap_max_asids; i += MAX_HW_ASIDS) {
1792 bitmap_clear(&asid_bitmap[0], i);
1793 }
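/*
 * For illustration (hypothetical numbers): if the SPTM reported 4096 ASIDs and
 * MAX_HW_ASIDS were 256, the loop above would clear vASIDs 0, 256, 512, ..., 3840,
 * since each of those values aliases hardware ASID 0, which is reserved for the kernel.
 */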
1794
1795
1796 #if !HAS_16BIT_ASID
1797 /**
1798 * Align the range of available hardware ASIDs to a multiple of 64 to enable the
1799 * masking used by the PLRU scheme. This means we must handle the case in which
1800 * the returned hardware ASID is 0, which we do by clearing all vASIDs that will
1801 * alias the kernel ASID.
1802 */
1803 pmap_max_asids = pmap_max_asids & ~63ul;
1804 if (__improbable(pmap_max_asids == 0)) {
1805 panic("%s: insufficient number of ASIDs (%u) supplied by SPTM", __func__, (unsigned int)pmap_max_asids);
1806 }
1807 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
1808 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
1809 _Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
1810 _Static_assert((MAX_HW_ASIDS % 64) == 0, "MAX_HW_ASIDS is not divisible by 64");
1811 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
1812 bitmap_clear(&asid_plru_bitmap[0], 0);
1813 #endif /* !HAS_16BIT_ASID */
1814
1815
1816 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
1817 maxoffset = trunc_page(maxoffset);
1818 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
1819 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
1820 arm_pmap_max_offset_default = maxoffset;
1821 }
1822 }
1823 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
1824 maxoffset = trunc_page(maxoffset);
1825 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
1826 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
1827 arm64_pmap_max_offset_default = maxoffset;
1828 }
1829 }
1830
1831 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
1832
1833
1834 #if DEVELOPMENT || DEBUG
1835 PE_parse_boot_argn("vm_footprint_suspend_allowed",
1836 &vm_footprint_suspend_allowed,
1837 sizeof(vm_footprint_suspend_allowed));
1838 #endif /* DEVELOPMENT || DEBUG */
1839
1840 #if KASAN
1841 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
1842 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
1843 #endif /* KASAN */
1844
1845 /**
1846 * Ensure that avail_start is always left on a page boundary. The calling
1847 * code might not perform any alignment before allocating page tables so
1848 * this is important.
1849 */
1850 avail_start = round_page(avail_start);
1851
1852
1853 #if (DEVELOPMENT || DEBUG)
1854 (void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available);
1855 #endif /* (DEVELOPMENT || DEBUG) */
1856
1857 #if __ARM64_PMAP_SUBPAGE_L1__
1858 /* Initialize the Subpage User Root Table subsystem. */
1859 surt_init();
1860 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
1861
1862 /* Signal that the pmap has been bootstrapped */
1863 pmap_bootstrapped = true;
1864 }
1865
1866 /**
1867 * Helper for creating a populated commpage table
1868 *
1869 * In order to avoid burning extra pages on mapping the commpage, we create a
1870 * dedicated table hierarchy for the commpage. We forcibly nest the translation tables from
1871 * this pmap into other pmaps. The level we will nest at depends on the MMU configuration (page
1872 * size, TTBR range, etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
1873 *
1874 * @note This is NOT "the nested pmap" (which is used to nest the shared cache).
1875 *
1876 * @param rw_va Virtual address at which to insert a mapping to the kernel R/W commpage
1877 * @param ro_va Virtual address at which to insert a mapping to the kernel R/O commpage
1878 * @param rw_pa Physical address of kernel R/W commpage
1879 * @param ro_pa Physical address of kernel R/O commpage, may be 0 if not supported in this
1880 * configuration
1881 * @param rx_pa Physical address of user executable (and kernel R/O) commpage, may be 0 if
1882 * not supported in this configuration
1883 * @param pmap_create_flags Control flags for the temporary pmap created by this function
1884 *
1885 * @return the physical address of the created commpage table, typed as
1886 * XNU_PAGE_TABLE_COMMPAGE and containing all relevant commpage mappings.
1887 */
1888 static pmap_paddr_t
1889 pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va,
1890 pmap_paddr_t rw_pa, pmap_paddr_t ro_pa, pmap_paddr_t rx_pa, unsigned int pmap_create_flags)
1891 {
1892 pmap_t temp_commpage_pmap = pmap_create_options(NULL, 0, pmap_create_flags);
1893 assert(temp_commpage_pmap != NULL);
1894 assert(rw_pa != 0);
1895 const pt_attr_t *pt_attr = pmap_get_pt_attr(temp_commpage_pmap);
1896
1897 /*
1898 * We only use pmap_expand to expand the pmap up to the commpage nesting level. At that level
1899 * and beyond, all the newly created tables will be nested directly into the userspace region
1900 * for each process, and as such they must be of the dedicated SPTM commpage table type so that
1901 * the SPTM can enforce the commpage security model which forbids random replacement of commpage
1902 * mappings.
1903 */
1904 kern_return_t kr = pmap_expand(temp_commpage_pmap, rw_va, 0, pt_attr_commpage_level(pt_attr));
1905 assert(kr == KERN_SUCCESS);
1906
1907 pmap_paddr_t commpage_table_pa = 0;
1908 for (unsigned int i = pt_attr_commpage_level(pt_attr); i < pt_attr_leaf_level(pt_attr); i++) {
1909 pmap_paddr_t new_table = 0;
1910 kr = pmap_page_alloc(&new_table, 0);
1911 assert((kr == KERN_SUCCESS) && (new_table != 0));
1912 if (commpage_table_pa == 0) {
1913 commpage_table_pa = new_table;
1914 }
1915
1916 pt_desc_t *ptdp = ptd_alloc(temp_commpage_pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
1917 assert(ptdp);
1918
1919 const unsigned int pai = pa_index(new_table);
1920 locked_pvh_t locked_pvh = pvh_lock(pai);
1921 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
1922
1923 ptd_info_init(ptdp, temp_commpage_pmap, pt_attr_align_va(pt_attr, i, rw_va), i + 1, NULL);
1924
1925 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1926 retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
1927 sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params);
1928
1929 const sptm_tte_t table_tte = (new_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1930
1931 sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va),
1932 (sptm_pt_level_t)i, table_tte);
1933
1934 ptd_info_finalize(ptdp);
1935
1936 /* The PTD's associated pmap temp_commpage_pmap is to be destroyed, so set it to NULL here. */
1937 ptdp->pmap = NULL;
1938
1939 pvh_unlock(&locked_pvh);
1940 }
1941
1942 /*
1943 * Note the lack of ARM_PTE_NG here: commpage mappings are at fixed addresses and
1944 * frequently accessed, so we map them global to avoid unnecessary TLB pressure.
1945 */
1946 static const sptm_pte_t commpage_pte_template = ARM_PTE_TYPE_VALID
1947 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK)
1948 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX
1949 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF;
1950
1951 sptm_return_t sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, rw_va,
1952 commpage_pte_template | ARM_PTE_NX | pa_to_pte(rw_pa));
1953 assert(sptm_ret == SPTM_SUCCESS);
1954
1955 if (ro_pa != 0) {
1956 assert((ro_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1957 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, ro_va,
1958 commpage_pte_template | ARM_PTE_NX | pa_to_pte(ro_pa));
1959 assert(sptm_ret == SPTM_SUCCESS);
1960 }
1961
1962 if (rx_pa != 0) {
1963 assert((commpage_text_user_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1964 assert((commpage_text_user_va != rw_va) && (commpage_text_user_va != ro_va));
1965 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, commpage_text_user_va, commpage_pte_template | pa_to_pte(rx_pa));
1966 assert(sptm_ret == SPTM_SUCCESS);
1967 }
1968
1969
1970 /* Unmap the commpage table here so that it won't be deallocated by pmap_destroy(). */
1971 sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va),
1972 (sptm_pt_level_t)pt_attr_commpage_level(pt_attr));
1973 pmap_destroy(temp_commpage_pmap);
1974
1975 return commpage_table_pa;
1976 }
1977
1978 /**
1979 * Helper for creating all commpage tables applicable to the current configuration.
1980 *
1981 * @note This function is intended to be called during bootstrap.
1982 * @note This function assumes that pmap_create_commpages has already executed, and therefore
1983 * the commpage_*_pa variables have been assigned to their final values. commpage_data_pa
1984 * is the kernel RW commpage and is assumed to be present on all configurations, so it
1985 * therefore must be non-zero at this point. The other variables are considered optional
1986 * depending upon configuration and may be zero.
1987 */
1988 void pmap_prepare_commpages(void);
1989 void
1990 pmap_prepare_commpages(void)
1991 {
1992 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1993 assert(commpage_data_pa != 0);
1994 sptm_retype(commpage_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RW, retype_params);
1995 if (commpage_ro_data_pa != 0) {
1996 sptm_retype(commpage_ro_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RO, retype_params);
1997 }
1998 if (commpage_text_pa != 0) {
1999 sptm_retype(commpage_text_pa, XNU_DEFAULT, XNU_COMMPAGE_RX, retype_params);
2000 }
2001
2002 /*
2003 * User mapping of comm page text section for 64 bit mapping only
2004 *
2005 * We don't insert the text commpage into the 32 bit mapping because we don't want
2006 * 32-bit user processes to get this page mapped in; they should never call into
2007 * this page.
2008 */
2009 commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2010 commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, PMAP_CREATE_64BIT);
2011
2012 /*
2013 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2014 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2015 */
2016 commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2017 commpage_data_pa, commpage_ro_data_pa, 0, 0);
2018
2019 #if __ARM_MIXED_PAGE_SIZE__
2020 commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2021 commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_64BIT | PMAP_CREATE_FORCE_4K_PAGES);
2022
2023 /*
2024 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2025 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2026 * commpage32_4k_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2027 * commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
2028 */
2029 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2030
2031 }
2032
2033 void
2034 pmap_virtual_space(
2035 vm_offset_t *startp,
2036 vm_offset_t *endp
2037 )
2038 {
2039 *startp = virtual_space_start;
2040 *endp = virtual_space_end;
2041 }
2042
2043
2044 boolean_t
2045 pmap_virtual_region(
2046 unsigned int region_select,
2047 vm_map_offset_t *startp,
2048 vm_map_size_t *size
2049 )
2050 {
2051 boolean_t ret = FALSE;
2052 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
2053 if (region_select == 0) {
2054 /*
2055 * In this config, the bootstrap mappings should occupy their own L2
2056 * TTs, as they should be immutable after boot. Having the associated
2057 * TTEs and PTEs in their own pages allows us to lock down those pages,
2058 * while allowing the rest of the kernel address range to be remapped.
2059 */
2060 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2061 #if defined(ARM_LARGE_MEMORY)
2062 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2063 #else
2064 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2065 #endif
2066 ret = TRUE;
2067 }
2068
2069 #if defined(ARM_LARGE_MEMORY)
2070 if (region_select == 1) {
2071 *startp = VREGION1_START;
2072 *size = VREGION1_SIZE;
2073 ret = TRUE;
2074 }
2075 #endif
2076 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
2077 #if defined(ARM_LARGE_MEMORY)
2078 /* For large memory systems with no KTRR/CTRR */
2079 if (region_select == 0) {
2080 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2081 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2082 ret = TRUE;
2083 }
2084
2085 if (region_select == 1) {
2086 *startp = VREGION1_START;
2087 *size = VREGION1_SIZE;
2088 ret = TRUE;
2089 }
2090 #else /* !defined(ARM_LARGE_MEMORY) */
2091 unsigned long low_global_vr_mask = 0;
2092 vm_map_size_t low_global_vr_size = 0;
2093
2094 if (region_select == 0) {
2095 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2096 if (!TEST_PAGE_SIZE_4K) {
2097 *startp = gVirtBase & 0xFFFFFFFFFE000000;
2098 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2099 } else {
2100 *startp = gVirtBase & 0xFFFFFFFFFF800000;
2101 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2102 }
2103 ret = TRUE;
2104 }
2105 if (region_select == 1) {
2106 *startp = VREGION1_START;
2107 *size = VREGION1_SIZE;
2108 ret = TRUE;
2109 }
2110 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2111 if (!TEST_PAGE_SIZE_4K) {
2112 low_global_vr_mask = 0xFFFFFFFFFE000000;
2113 low_global_vr_size = 0x2000000;
2114 } else {
2115 low_global_vr_mask = 0xFFFFFFFFFF800000;
2116 low_global_vr_size = 0x800000;
2117 }
2118
2119 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2120 *startp = LOW_GLOBAL_BASE_ADDRESS;
2121 *size = low_global_vr_size;
2122 ret = TRUE;
2123 }
2124
2125 if (region_select == 3) {
2126 /* In this config, we allow the bootstrap mappings to occupy the same
2127 * page table pages as the heap.
2128 */
2129 *startp = VM_MIN_KERNEL_ADDRESS;
2130 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2131 ret = TRUE;
2132 }
2133 #endif /* defined(ARM_LARGE_MEMORY) */
2134 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
2135 return ret;
2136 }
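/*
 * Illustrative sketch (not part of this file): a caller can enumerate the reserved
 * kernel VA ranges by probing successive region_select values. Because an index can
 * be skipped on some configurations (e.g., index 2 above when the low globals already
 * fall within the bootstrap region) while a later index is still valid, this sketch
 * probes a small fixed range rather than stopping at the first FALSE.
 *
 *     for (unsigned int i = 0; i < 4; i++) {
 *         vm_map_offset_t start;
 *         vm_map_size_t size;
 *         if (pmap_virtual_region(i, &start, &size)) {
 *             // reserve [start, start + size) as kernel virtual space
 *         }
 *     }
 */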
2137
2138 /*
2139 * Routines to track and allocate physical pages during early boot.
2140 * On most systems that memory runs from first_avail through to avail_end
2141 * with no gaps.
2142 *
2143 * If the system supports ECC and ecc_bad_pages_count > 0, we
2144 * need to skip those pages.
2145 */
2146
2147 static unsigned int avail_page_count = 0;
2148 static bool need_ram_ranges_init = true;
2149
2150
2151 /**
2152 * Checks to see if a given page is in
2153 * the array of known bad pages
2154 *
2155 * @param ppn page number to check
2156 */
2157 bool
2158 pmap_is_bad_ram(__unused ppnum_t ppn)
2159 {
2160 return false;
2161 }
2162
2163 /**
2164 * Prepare bad ram pages to be skipped.
2165 */
2166
2167 /*
2168 * Initialize the count of available pages. No lock needed here,
2169 * as this code is called while kernel boot up is single threaded.
2170 */
2171 static void
2172 initialize_ram_ranges(void)
2173 {
2174 __assert_only pmap_paddr_t first = first_avail;
2175 pmap_paddr_t end = avail_end;
2176
2177 assert(first <= end);
2178 assert(first == (first & ~PAGE_MASK));
2179 assert(end == (end & ~PAGE_MASK));
2180
2181 need_ram_ranges_init = false;
2182
2183 avail_page_count = atop(end - first_avail);
2184 }
2185
2186 unsigned int
2187 pmap_free_pages(
2188 void)
2189 {
2190 if (need_ram_ranges_init) {
2191 initialize_ram_ranges();
2192 }
2193 return avail_page_count;
2194 }
2195
2196 unsigned int
2197 pmap_free_pages_span(
2198 void)
2199 {
2200 if (need_ram_ranges_init) {
2201 initialize_ram_ranges();
2202 }
2203 return (unsigned int)atop(avail_end - first_avail);
2204 }
2205
2206
2207 boolean_t
2208 pmap_next_page_hi(
2209 ppnum_t * pnum,
2210 __unused boolean_t might_free)
2211 {
2212 return pmap_next_page(pnum);
2213 }
2214
2215
2216 boolean_t
2217 pmap_next_page(
2218 ppnum_t *pnum)
2219 {
2220 if (need_ram_ranges_init) {
2221 initialize_ram_ranges();
2222 }
2223
2224
2225 if (first_avail != avail_end) {
2226 *pnum = (ppnum_t)atop(first_avail);
2227 first_avail += PAGE_SIZE;
2228 assert(avail_page_count > 0);
2229 --avail_page_count;
2230 return TRUE;
2231 }
2232 assert(avail_page_count == 0);
2233 return FALSE;
2234 }
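/*
 * Illustrative sketch (not part of this file): during early VM bring-up, physical
 * pages are consumed one at a time through pmap_next_page(); `pages_needed` is a
 * hypothetical count.
 *
 *     ppnum_t pn;
 *     for (unsigned int i = 0; i < pages_needed && pmap_next_page(&pn); i++) {
 *         // hand physical page `pn` to the VM layer
 *     }
 *
 * pmap_free_pages() above reports how many such grabs can still succeed.
 */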
2235
2236
2237
2238
2239 /**
2240 * Helper function to check whether the given physical
2241 * page number is a restricted page.
2242 *
2243 * @param pn the physical page number to query.
2244 */
2245 bool
2246 pmap_is_page_restricted(ppnum_t pn)
2247 {
2248 sptm_frame_type_t frame_type = sptm_get_frame_type(ptoa(pn));
2249 return frame_type == XNU_KERNEL_RESTRICTED;
2250 }
2251
2252 /*
2253 * Initialize the pmap module.
2254 * Called by vm_init, to initialize any structures that the pmap
2255 * system needs to map virtual memory.
2256 */
2257 void
2258 pmap_init(
2259 void)
2260 {
2261 /*
2262 * Protect page zero in the kernel map.
2263 * (can be overruled by permanent translation
2264 * table entries at page zero - see arm_vm_init).
2265 */
2266 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2267
2268 pmap_initialized = TRUE;
2269
2270 /*
2271 * Create the zone of physical maps
2272 * and the physical-to-virtual entries.
2273 */
2274 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2275 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2276
2277
2278 /*
2279 * Initialize the pmap object (for tracking the vm_page_t
2280 * structures for pages we allocate to be page tables in
2281 * pmap_expand()).
2282 */
2283 _vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
2284 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2285
2286 /*
2287 * Initialize the TXM VM object in the same way as the
2288 * PMAP VM object.
2289 */
2290 _vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL);
2291 txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2292
2293 /*
2294 * The values of [hard_]maxproc may have been scaled; make sure
2295 * they are still less than the value of pmap_max_asids.
2296 */
2297 if ((uint32_t)maxproc > pmap_max_asids) {
2298 maxproc = pmap_max_asids;
2299 }
2300 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2301 hard_maxproc = pmap_max_asids;
2302 }
2303 }
2304
2305 /**
2306 * Verify that a given physical page contains no mappings (outside of the
2307 * default physical aperture mapping).
2308 *
2309 * @param ppnum Physical page number to check there are no mappings to.
2310 *
2311 * @return True if there are no mappings, false otherwise or if the page is not
2312 * kernel-managed.
2313 */
2314 bool
2315 pmap_verify_free(ppnum_t ppnum)
2316 {
2317 const pmap_paddr_t pa = ptoa(ppnum);
2318
2319 assert(pa != vm_page_fictitious_addr);
2320
2321 /* Only mappings to kernel-managed physical memory are tracked. */
2322 if (!pa_valid(pa)) {
2323 return false;
2324 }
2325
2326 const unsigned int pai = pa_index(pa);
2327
2328 return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL);
2329 }
2330
2331
2332 #if __ARM64_PMAP_SUBPAGE_L1__
2333 static inline bool
2334 pmap_user_root_size_matches_subpage_l1(vm_size_t root_size)
2335 {
2336 return root_size == 8 * sizeof(tt_entry_t);
2337 }
2338 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2339
2340 static vm_size_t
2341 pmap_root_alloc_size(pmap_t pmap)
2342 {
2343 #pragma unused(pmap)
2344 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2345 const unsigned int root_level = pt_attr_root_level(pt_attr);
2346 const uint64_t index = pt_attr_va_valid_mask(pt_attr);
2347 return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2348 }
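/*
 * Worked example for the computation above (values are illustrative, not tied to any
 * particular configuration): with a valid-VA mask covering 39 bits and a root-level
 * shift of 36, the root table needs (2^39 >> 36) = 8 entries, i.e. 64 bytes; that is
 * exactly the 8 * sizeof(tt_entry_t) size that qualifies for a subpage L1 root above.
 */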
2349
2350 /*
2351 * Create and return a physical map.
2352 *
2353 * If the size specified for the map
2354 * is zero, the map is an actual physical
2355 * map, and may be referenced by the
2356 * hardware.
2357 *
2358 * If the size specified is non-zero,
2359 * the map will be used in software only, and
2360 * is bounded by that size.
2361 */
2362 MARK_AS_PMAP_TEXT pmap_t
2363 pmap_create_options_internal(
2364 ledger_t ledger,
2365 vm_map_size_t size,
2366 unsigned int flags,
2367 kern_return_t *kr)
2368 {
2369 pmap_t p;
2370 bool is_64bit = flags & PMAP_CREATE_64BIT;
2371 #if defined(HAS_APPLE_PAC)
2372 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2373 #endif /* defined(HAS_APPLE_PAC) */
2374 kern_return_t local_kr = KERN_SUCCESS;
2375 __unused uint8_t sptm_root_flags = SPTM_ROOT_PT_FLAGS_DEFAULT;
2376 TXMAddressSpaceFlags_t txm_flags = kTXMAddressSpaceFlagInit;
2377 const bool is_stage2 = false;
2378
2379 if (size != 0) {
2380 {
2381 // Size parameter should only be set for stage 2.
2382 return PMAP_NULL;
2383 }
2384 }
2385
2386 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2387 return PMAP_NULL;
2388 }
2389
2390 /*
2391 * Allocate a pmap struct from the pmap_zone. Then allocate
2392 * the translation table of the right size for the pmap.
2393 */
2394 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2395 local_kr = KERN_RESOURCE_SHORTAGE;
2396 goto pmap_create_fail;
2397 }
2398
2399 p->ledger = ledger;
2400
2401
2402 p->pmap_vm_map_cs_enforced = false;
2403 p->min = 0;
2404
2405
2406 #if CONFIG_ROSETTA
2407 if (flags & PMAP_CREATE_ROSETTA) {
2408 p->is_rosetta = TRUE;
2409 } else {
2410 p->is_rosetta = FALSE;
2411 }
2412 #endif /* CONFIG_ROSETTA */
2413 #if defined(HAS_APPLE_PAC)
2414 p->disable_jop = disable_jop;
2415
2416 if (p->disable_jop) {
2417 sptm_root_flags &= ~SPTM_ROOT_PT_FLAG_JOP;
2418 }
2419 #endif /* defined(HAS_APPLE_PAC) */
2420
2421 p->nested_region_true_start = 0;
2422 p->nested_region_true_end = ~0;
2423
2424 p->nx_enabled = true;
2425 p->is_64bit = is_64bit;
2426
2427 if (!is_64bit) {
2428 sptm_root_flags |= SPTM_ROOT_PT_FLAG_ARM64_32;
2429 }
2430
2431 p->nested_pmap = PMAP_NULL;
2432 p->type = PMAP_TYPE_USER;
2433
2434 #if ARM_PARAMETERIZED_PMAP
2435 /* Default to the native pt_attr */
2436 p->pmap_pt_attr = native_pt_attr;
2437 #endif /* ARM_PARAMETERIZED_PMAP */
2438 #if __ARM_MIXED_PAGE_SIZE__
2439 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2440 p->pmap_pt_attr = &pmap_pt_attr_4k;
2441 }
2442 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2443 p->max = pmap_user_va_size(p);
2444
2445 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2446 local_kr = KERN_NO_SPACE;
2447 goto id_alloc_fail;
2448 }
2449
2450 /**
2451 * We expect top level translation tables to always fit into a single
2452 * physical page. This would also catch a misconfiguration if 4K
2453 * concatenated page tables needed more than one physical tt1 page.
2454 */
2455 vm_size_t pmap_root_size = pmap_root_alloc_size(p);
2456 if (__improbable(pmap_root_size > PAGE_SIZE)) {
2457 panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size);
2458 }
2459
2460 #if __ARM64_PMAP_SUBPAGE_L1__
2461 /**
2462 * Identify the case where the root qualifies for SURT, and update the
2463 * root size to the TTEs + the SPTM metadata, reflecting the actual
2464 * space taken by this subpage root table.
2465 */
2466 if (!(flags & PMAP_CREATE_NESTED) && pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
2467 pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
2468 }
2469 #endif
2470
2471 pmap_lock_init(p);
2472
2473 p->tte = pmap_tt1_allocate(p, pmap_root_size, sptm_root_flags);
2474 if (!(p->tte)) {
2475 local_kr = KERN_RESOURCE_SHORTAGE;
2476 goto tt1_alloc_fail;
2477 }
2478
2479 p->ttep = kvtophys_nofail((vm_offset_t)p->tte);
2480 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2481
2482 /*
2483 * initialize the rest of the structure
2484 */
2485 p->nested_region_addr = 0x0ULL;
2486 p->nested_region_size = 0x0ULL;
2487 p->nested_region_unnested_table_bitmap = NULL;
2488
2489 p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE;
2490
2491 #if MACH_ASSERT
2492 p->pmap_pid = 0;
2493 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
2494 #endif /* MACH_ASSERT */
2495 #if DEVELOPMENT || DEBUG
2496 p->footprint_was_suspended = FALSE;
2497 #endif /* DEVELOPMENT || DEBUG */
2498
2499 os_ref_init_count_raw(&p->ref_count, &pmap_refgrp, 1);
2500 pmap_simple_lock(&pmaps_lock);
2501 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
2502 pmap_simple_unlock(&pmaps_lock);
2503
2504 /**
2505 * The SPTM pmap's concurrency model can sometimes allow ledger balances to transiently
2506 * go negative. Note that we still check overall ledger balance on pmap destruction.
2507 */
2508 ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
2509 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
2510 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
2511 ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
2512 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
2513 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
2514 ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
2515 ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
2516 ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
2517
2518 if (!is_stage2) {
2519 /*
2520 * Complete initialization for the TXM address space. This needs to be done
2521 * after the SW ASID has been registered with the SPTM.
2522 * TXM enforcement does not apply to virtual machines.
2523 */
2524 if (flags & PMAP_CREATE_TEST) {
2525 txm_flags |= kTXMAddressSpaceFlagTest;
2526 }
2527
2528 pmap_txmlock_init(p);
2529 txm_register_address_space(p, p->asid, txm_flags);
2530 p->txm_trust_level = kCSTrustUntrusted;
2531 }
2532
2533 return p;
2534
2535 tt1_alloc_fail:
2536 pmap_get_pt_ops(p)->free_id(p);
2537 id_alloc_fail:
2538 zfree(pmap_zone, p);
2539 pmap_create_fail:
2540 *kr = local_kr;
2541 return PMAP_NULL;
2542 }
2543
2544 pmap_t
2545 pmap_create_options(
2546 ledger_t ledger,
2547 vm_map_size_t size,
2548 unsigned int flags)
2549 {
2550 pmap_t pmap;
2551 kern_return_t kr = KERN_SUCCESS;
2552
2553 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
2554
2555 ledger_reference(ledger);
2556
2557 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
2558
2559 if (pmap == PMAP_NULL) {
2560 ledger_dereference(ledger);
2561 }
2562
2563 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2564
2565 return pmap;
2566 }
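/*
 * Illustrative sketch (not part of this file): a caller in the task layer might create
 * a 64-bit user pmap against the task's ledger roughly as follows; error handling is
 * elided and `task_ledger` is a placeholder.
 *
 *     pmap_t p = pmap_create_options(task_ledger, 0, PMAP_CREATE_64BIT);
 *     if (p == PMAP_NULL) {
 *         // creation failed (e.g., resource shortage or no free ASIDs)
 *     }
 *
 * The size argument must be 0 here; a non-zero size is rejected at the top of
 * pmap_create_options_internal() for non-stage-2 pmaps.
 */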
2567
2568 #if MACH_ASSERT
2569 MARK_AS_PMAP_TEXT void
2570 pmap_set_process_internal(
2571 __unused pmap_t pmap,
2572 __unused int pid,
2573 __unused char *procname)
2574 {
2575 if (pmap == NULL || pmap->pmap_pid == -1) {
2576 return;
2577 }
2578
2579 validate_pmap_mutable(pmap);
2580
2581 pmap->pmap_pid = pid;
2582 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
2583 }
2584 #endif /* MACH_ASSERT */
2585
2586 #if MACH_ASSERT
2587 void
2588 pmap_set_process(
2589 pmap_t pmap,
2590 int pid,
2591 char *procname)
2592 {
2593 pmap_set_process_internal(pmap, pid, procname);
2594 }
2595 #endif /* MACH_ASSERT */
2596
2597 /*
2598 * pmap_deallocate_all_leaf_tts:
2599 *
2600 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
2601 * removing and deallocating all TTEs.
2602 */
2603 MARK_AS_PMAP_TEXT static void
2604 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_address_t start_va, unsigned level)
2605 {
2606 tt_entry_t tte = ARM_TTE_EMPTY;
2607 tt_entry_t * ttep = NULL;
2608 tt_entry_t * last_ttep = NULL;
2609
2610 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2611 const uint64_t size = pt_attr->pta_level_info[level].size;
2612
2613 assert(level < pt_attr_leaf_level(pt_attr));
2614
2615 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
2616
2617 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
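/*
 * For illustration: on a system with a 16K kernel PAGE_SIZE managing a 4K-page user
 * pmap, page_ratio is 4, so each iteration below steps over the 4 adjacent TTEs that
 * share one 16K physical table page and handles them together in the inner loop.
 */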
2618 vm_map_address_t va = start_va;
2619 for (ttep = first_ttep; ttep <= last_ttep; ttep += page_ratio, va += (size * page_ratio)) {
2620 if (!(*ttep & ARM_TTE_VALID)) {
2621 continue;
2622 }
2623
2624 for (unsigned i = 0; i < page_ratio; i++) {
2625 tte = ttep[i];
2626
2627 if (!(tte & ARM_TTE_VALID)) {
2628 panic("%s: found unexpectedly invalid tte, ttep=%p, tte=%p, "
2629 "pmap=%p, first_ttep=%p, level=%u",
2630 __FUNCTION__, ttep + i, (void *)tte,
2631 pmap, first_ttep, level);
2632 }
2633
2634 if (tte_is_block(tte)) {
2635 panic("%s: found block mapping, ttep=%p, tte=%p, "
2636 "pmap=%p, first_ttep=%p, level=%u",
2637 __FUNCTION__, ttep + i, (void *)tte,
2638 pmap, first_ttep, level);
2639 }
2640
2641 /* Must be valid, type table */
2642 if (level < pt_attr_twig_level(pt_attr)) {
2643 /* If we haven't reached the twig level, recurse to the next level. */
2644 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK),
2645 va + (size * i), level + 1);
2646 }
2647 }
2648
2649 /* Remove the TTE. */
2650 pmap_tte_deallocate(pmap, va, ttep, level, false);
2651 }
2652 }
2653
2654 /*
2655 * We maintain stats and ledgers so that a task's physical footprint is:
2656 * phys_footprint = ((internal - alternate_accounting)
2657 * + (internal_compressed - alternate_accounting_compressed)
2658 * + iokit_mapped
2659 * + purgeable_nonvolatile
2660 * + purgeable_nonvolatile_compressed
2661 * + page_table)
2662 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
2663 */
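/*
 * For illustration (hypothetical numbers): a task with internal=100 pages,
 * alternate_accounting=20, internal_compressed=30, alternate_accounting_compressed=0,
 * iokit_mapped=10, purgeable_nonvolatile=5, purgeable_nonvolatile_compressed=0 and
 * page_table=3 would report phys_footprint = (100-20) + (30-0) + 10 + 5 + 0 + 3 = 128 pages.
 */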
2664
2665 /*
2666 * Retire the given physical map from service.
2667 * Should only be called if the map contains
2668 * no valid mappings.
2669 */
2670 MARK_AS_PMAP_TEXT void
2671 pmap_destroy_internal(
2672 pmap_t pmap)
2673 {
2674 if (pmap == PMAP_NULL) {
2675 return;
2676 }
2677
2678 validate_pmap(pmap);
2679
2680 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2681 const bool is_stage2_pmap = false;
2682
2683 if (os_ref_release_raw(&pmap->ref_count, &pmap_refgrp) > 0) {
2684 return;
2685 }
2686
2687 if (!is_stage2_pmap) {
2688 /*
2689 * Complete all clean up required for TXM. This needs to happen before the
2690 * SW ASID has been unregistered with the SPTM.
2691 */
2692 txm_unregister_address_space(pmap);
2693 pmap_txmlock_destroy(pmap);
2694 }
2695
2696 /**
2697 * Drain any concurrent retype-sensitive SPTM operations. This is needed to
2698 * ensure that we don't unmap and retype the page tables while those operations
2699 * are still finishing on other CPUs, leading to an SPTM violation. In particular,
2700 * the multipage batched cacheability/attribute update code may issue SPTM calls
2701 * without holding the relevant PVH or pmap locks, so we can't guarantee those
2702 * calls have actually completed despite observing refcnt == 0.
2703 *
2704 * At this point, we CAN guarantee that:
2705 * 1) All prior PTE removals required to empty the pmap have completed and
2706 * been synchronized with DSB, *except* the commpage removal which doesn't
2707 * involve pages that can ever be retyped. Subsequent calls not already
2708 * in the pmap epoch will no longer observe these mappings.
2709 * 2) The pmap now has a zero refcount, so in a correctly functioning system
2710 * no further mappings will be requested for it.
2711 */
2712 pmap_epoch_prepare_drain();
2713
2714 if (!is_stage2_pmap) {
2715 pmap_unmap_commpage(pmap);
2716 }
2717
2718 pmap_simple_lock(&pmaps_lock);
2719 queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
2720 pmap_simple_unlock(&pmaps_lock);
2721
2722 pmap_epoch_drain();
2723
2724 /*
2725 * Free the memory maps, then the
2726 * pmap structure.
2727 */
2728 pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr));
2729
2730 if (pmap->tte) {
2731 vm_size_t pmap_root_size = pmap_root_alloc_size(pmap);
2732 #if __ARM64_PMAP_SUBPAGE_L1__
2733 /**
2734 * Like in the allocation path, identify the case where the root table
2735 * qualifies for SURT.
2736 */
2737 if (pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
2738 /**
2739 * Nested tables cannot use SURT, so the allocated size has to be
2740 * PAGE_SIZE.
2741 */
2742 if (pmap_is_nested(pmap)) {
2743 pmap_root_size = PAGE_SIZE;
2744 } else {
2745 /**
2746 * Note: with SPTM, the kernel pmap is never supposed to be
2747 * destroyed because the SPTM relies on the existence of the
2748 * kernel root table. Also, the commpage-typed pmap doesn't
2749 * exist. Not only is the pmap associated with a commpage
2750 * table transient and destroyed right after the commpage
2751 * table is setup, but also the pmap is just a plain
2752 * PMAP_TYPE_USER typed pmap.
2753 */
2754 assert(pmap->type == PMAP_TYPE_USER);
2755 pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
2756 }
2757 }
2758 #endif
2759 pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_size);
2760 pmap->tte = (tt_entry_t *) NULL;
2761 pmap->ttep = 0;
2762 }
2763
2764 if (pmap->type != PMAP_TYPE_NESTED) {
2765 /* return its asid to the pool */
2766 pmap_get_pt_ops(pmap)->free_id(pmap);
2767 if (pmap->nested_pmap != NULL) {
2768 /* release the reference we hold on the nested pmap */
2769 pmap_destroy_internal(pmap->nested_pmap);
2770 }
2771 }
2772
2773 pmap_check_ledgers(pmap);
2774
2775 if ((pmap->type == PMAP_TYPE_NESTED) && (pmap->nested_region_unnested_table_bitmap != NULL)) {
2776 bitmap_free(pmap->nested_region_unnested_table_bitmap,
2777 (pmap->nested_region_size >> (pt_attr_twig_shift(pt_attr) - 1)));
2778 }
2779
2780 pmap_lock_destroy(pmap);
2781 zfree(pmap_zone, pmap);
2782 }
2783
2784 void
2785 pmap_destroy(
2786 pmap_t pmap)
2787 {
2788 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2789
2790 ledger_t ledger = pmap->ledger;
2791
2792 pmap_destroy_internal(pmap);
2793
2794 ledger_dereference(ledger);
2795
2796 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
2797 }
2798
2799
2800 /*
2801 * Add a reference to the specified pmap.
2802 */
2803 MARK_AS_PMAP_TEXT void
2804 pmap_reference_internal(
2805 pmap_t pmap)
2806 {
2807 if (pmap != PMAP_NULL) {
2808 validate_pmap_mutable(pmap);
2809 os_ref_retain_raw(&pmap->ref_count, &pmap_refgrp);
2810 }
2811 }
2812
2813 void
2814 pmap_reference(
2815 pmap_t pmap)
2816 {
2817 pmap_reference_internal(pmap);
2818 }
2819
2820 static sptm_frame_type_t
2821 get_sptm_pt_type(pmap_t pmap)
2822 {
2823 const bool is_stage2_pmap = false;
2824 if (is_stage2_pmap) {
2825 assert(pmap->type != PMAP_TYPE_NESTED);
2826 return XNU_STAGE2_PAGE_TABLE;
2827 } else {
2828 return pmap->type == PMAP_TYPE_NESTED ? XNU_PAGE_TABLE_SHARED : XNU_PAGE_TABLE;
2829 }
2830 }
2831
2832 static tt_entry_t *
2833 pmap_tt1_allocate(pmap_t pmap, vm_size_t size, uint8_t sptm_root_flags)
2834 {
2835 pmap_paddr_t pa = 0;
2836 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2837 const bool is_stage2_pmap = false;
2838
2839 /**
2840 * Allocate the entire page for root-level page table unless it is subpage
2841 * L1 table, where size will be exactly PMAP_ROOT_ALLOC_SIZE.
2842 */
2843 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
2844 size = PAGE_SIZE;
2845 }
2846
2847 #if __ARM64_PMAP_SUBPAGE_L1__
2848 /**
2849 * At this moment, the allocation size is smaller than the page size only
2850 * when it is a subpage L1 table. We will try to allocate a root table
2851 * from the SURTs (SUbpage Root Tables).
2852 */
2853 const bool use_surt = (size < PAGE_SIZE);
2854 if (use_surt) {
2855 /* It has to be a user pmap. */
2856 assert(pmap->type == PMAP_TYPE_USER);
2857
2858 /**
2859 * Subpage stage 2 root table is not supported. This is guaranteed by
2860 * the stage 2 pmaps using a different pmap geometry than the stage
2861 * 1 pmaps.
2862 */
2863 assert(!is_stage2_pmap);
2864
2865 /* Try allocating a SURT from the SURT page queue. */
2866 pa = surt_try_alloc();
2867
2868 /* If there is one SURT available, call SPTM to claim the SURT. */
2869 if (pa) {
2870 sptm_surt_alloc(surt_page_pa_from_surt_pa(pa),
2871 surt_index_from_surt_pa(pa),
2872 pt_attr->geometry_id,
2873 sptm_root_flags,
2874 pmap->asid);
2875
2876 /* We don't need to allocate a new page, so skip to the end. */
2877 goto ptt1a_done;
2878 }
2879 }
2880 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2881
2882 /**
2883 * Either the root table size is not suitable for SURT or SURT is out of
2884 * tables. In either case, a page needs to be allocated.
2885 */
2886 const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL);
2887
2888 /* No page is allocated, so return 0 to signal failure. */
2889 if (ret != KERN_SUCCESS) {
2890 return (tt_entry_t *)0;
2891 }
2892
2893 /**
2894 * Drain the epochs to ensure any lingering batched operations that may have
2895 * taken an in-flight reference to this page are complete.
2896 */
2897 pmap_epoch_prepare_drain();
2898
2899 assert(pa);
2900
2901 #if __ARM64_PMAP_SUBPAGE_L1__
2902 if (use_surt) {
2903 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2904
2905 pmap_epoch_drain();
2906
2907 /**
2908 * The allocated page is retyped to XNU_SUBPAGE_USER_ROOT_TABLES as the
2909 * container of the SURTs.
2910 */
2911 sptm_retype(pa, XNU_DEFAULT, XNU_SUBPAGE_USER_ROOT_TABLES, retype_params);
2912
2913 /**
2914 * Before we add the page to the SURT page queue, claim the first SURT
2915 * for ourselves. This is safe since we are the only one accessing this
2916 * page at this moment.
2917 */
2918 sptm_surt_alloc(pa, 0, pt_attr->geometry_id, sptm_root_flags, pmap->asid);
2919
2920 /**
2921 * Add the newly allocated SURT page to the page queue.
2922 */
2923 surt_feed_page_with_first_table_allocated(pa);
2924 } else
2925 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2926 {
2927 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2928 retype_params.attr_idx = pt_attr->geometry_id;
2929 retype_params.flags = sptm_root_flags;
2930 if (is_stage2_pmap) {
2931 retype_params.vmid = pmap->vmid;
2932 } else {
2933 retype_params.asid = pmap->asid;
2934 }
2935
2936 pmap_epoch_drain();
2937
2938 sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE,
2939 retype_params);
2940 }
2941
2942 #if __ARM64_PMAP_SUBPAGE_L1__
2943 ptt1a_done:
2944 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2945 /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
2946 * Depending on the device, this can vary between 512b and 16K. */
2947 OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
2948 pmap_tt_ledger_credit(pmap, size);
2949
2950 return (tt_entry_t *) phystokv(pa);
2951 }
2952
2953 static void
2954 pmap_tt1_deallocate(
2955 pmap_t pmap,
2956 tt_entry_t *tt,
2957 vm_size_t size)
2958 {
2959 pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt);
2960 const bool is_stage2_pmap = false;
2961
2962 /**
2963 * Free the entire page unless it is subpage L1 table, where size will be
2964 * exactly PMAP_ROOT_ALLOC_SIZE.
2965 */
2966 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
2967 size = PAGE_SIZE;
2968 }
2969
2970 #if __ARM64_PMAP_SUBPAGE_L1__
2971 /**
2972 * At this moment, the free size is smaller than the page size only
2973 * when it is a subpage L1 table. We will try to free the root table
2974 * from the SURT page.
2975 */
2976 const bool use_surt = (size < PAGE_SIZE);
2977 if (use_surt) {
2978 /* It has to be a user pmap. */
2979 assert(pmap->type == PMAP_TYPE_USER);
2980
2981 /* Subpage stage 2 root table is not supported. */
2982 assert(!is_stage2_pmap);
2983
2984 /* Before we do anything in pmap, tell SPTM that the SURT is free. */
2985 sptm_surt_free(surt_page_pa_from_surt_pa(pa),
2986 surt_index_from_surt_pa(pa));
2987
2988 /**
2989 * Make sure the SURT bitmap update is not reordered before the SPTM
2990 * rw guard release.
2991 */
2992 os_atomic_thread_fence(release);
2993
2994 /**
2995 * Free the SURT in pmap scope, if surt_free() returns false, there
2996 * are still other SURTs on the page. In that case, do not retype
2997 * or free the page; just skip to the end to finish accounting.
2998 */
2999 if (!surt_free(pa)) {
3000 goto ptt1d_done;
3001 }
3002
3003 /**
3004 * Make sure the SURT bitmap read is not reordered after the SPTM
3005 * rw guard exclusive acquire in the retype case.
3006 */
3007 os_atomic_thread_fence(acquire);
3008 }
3009 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3010
3011 sptm_frame_type_t page_type;
3012 #if __ARM64_PMAP_SUBPAGE_L1__
3013 if (use_surt) {
3014 page_type = XNU_SUBPAGE_USER_ROOT_TABLES;
3015 } else
3016 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3017 if (is_stage2_pmap) {
3018 page_type = XNU_STAGE2_ROOT_TABLE;
3019 } else if (pmap->type == PMAP_TYPE_NESTED) {
3020 page_type = XNU_SHARED_ROOT_TABLE;
3021 } else {
3022 page_type = XNU_USER_ROOT_TABLE;
3023 }
3024
3025 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3026 sptm_retype(pa & ~PAGE_MASK, page_type, XNU_DEFAULT, retype_params);
3027 pmap_page_free(pa & ~PAGE_MASK);
3028
3029 #if __ARM64_PMAP_SUBPAGE_L1__
3030 ptt1d_done:
3031 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3032 OSAddAtomic(-(int32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3033 pmap_tt_ledger_debit(pmap, size);
3034 }
3035
3036 MARK_AS_PMAP_TEXT static kern_return_t
3037 pmap_tt_allocate(
3038 pmap_t pmap,
3039 tt_entry_t **ttp,
3040 pt_desc_t **ptdp_out,
3041 unsigned int level,
3042 unsigned int options)
3043 {
3044 pmap_paddr_t pa;
3045 const unsigned int alloc_flags =
3046 (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;
3047
3048 /* Allocate a VM page to be used as the page table. */
3049 if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
3050 return KERN_RESOURCE_SHORTAGE;
3051 }
3052
3053 pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags);
3054 if (ptdp == NULL) {
3055 pmap_page_free(pa);
3056 return KERN_RESOURCE_SHORTAGE;
3057 }
3058
3059 unsigned int pai = pa_index(pa);
3060 locked_pvh_t locked_pvh = pvh_lock(pai);
3061 assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p",
3062 __func__, (void*)locked_pvh.pvh);
3063
3064 /**
3065 * Drain the epochs to ensure any lingering batched operations that may have taken
3066 * an in-flight reference to this page are complete.
3067 */
3068 pmap_epoch_prepare_drain();
3069
3070 if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3071 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3072 } else {
3073 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3074 }
3075
3076 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3077
3078 PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3079
3080 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
3081 pvh_unlock(&locked_pvh);
3082
3083 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3084 retype_params.level = (sptm_pt_level_t)level;
3085
3086 /**
3087 * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages
3088 * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above.
3089 */
3090 pmap_epoch_drain();
3091
3092 sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params);
3093
3094 *ptdp_out = ptdp;
3095 *ttp = (tt_entry_t *)phystokv(pa);
3096
3097 return KERN_SUCCESS;
3098 }
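/*
 * Illustrative sketch (not part of this file): expansion code that needs a new table at
 * `level` for `pmap` could use the allocator above roughly as follows; `level` and
 * `options` are whatever the caller was given.
 *
 *     tt_entry_t *new_tt = NULL;
 *     pt_desc_t *new_ptd = NULL;
 *     if (pmap_tt_allocate(pmap, &new_tt, &new_ptd, level, options) != KERN_SUCCESS) {
 *         return KERN_RESOURCE_SHORTAGE;
 *     }
 *     // ... install the table, or return it via pmap_tt_deallocate() below if unused.
 */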
3099
3100 static void
3101 pmap_tt_deallocate(
3102 pmap_t pmap,
3103 tt_entry_t *ttp,
3104 unsigned int level)
3105 {
3106 pt_desc_t *ptdp;
3107 vm_offset_t free_page = 0;
3108 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3109
3110 ptdp = ptep_get_ptd(ttp);
3111 ptdp->va = (vm_offset_t)-1;
3112
3113 const uint16_t refcnt = sptm_get_page_table_refcnt(kvtophys_nofail((vm_offset_t)ttp));
3114
3115 if (__improbable(refcnt != 0)) {
3116 panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, refcnt);
3117 }
3118
3119 free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3120 if (free_page != 0) {
3121 pmap_paddr_t pa = kvtophys_nofail(free_page);
3122 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3123 sptm_retype(pa, get_sptm_pt_type(pmap), XNU_DEFAULT, retype_params);
3124 ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3125
3126 unsigned int pai = pa_index(pa);
3127 locked_pvh_t locked_pvh = pvh_lock(pai);
3128 assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTDP), "%s: non-PTD PVH %p",
3129 __func__, (void*)locked_pvh.pvh);
3130 pvh_update_head(&locked_pvh, NULL, PVH_TYPE_NULL);
3131 pvh_unlock(&locked_pvh);
3132 pmap_page_free(pa);
3133 if (level < pt_attr_leaf_level(pt_attr)) {
3134 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3135 } else {
3136 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3137 }
3138 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3139 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3140 }
3141 }
3142
3143 /**
3144 * Check table refcounts after clearing a translation table entry pointing to that table
3145 *
3146 * @note If the cleared TTE points to a leaf table, then that leaf table
3147 * must have a refcnt of zero before the TTE can be removed.
3148 *
3149 * @param pmap The pmap containing the page table whose TTE is being removed.
3150 * @param tte Value stored in the TTE prior to clearing it
3151 * @param level The level of the page table that contains the TTE being removed
3152 */
3153 static void
3154 pmap_tte_check_refcounts(
3155 pmap_t pmap,
3156 tt_entry_t tte,
3157 unsigned int level)
3158 {
3159 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3160
3161 /**
3162 * Remember, the passed in "level" parameter refers to the level above the
3163 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
3164 * page table).
3165 */
3166 const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
3167
3168 unsigned short refcnt = 0;
3169
3170 /**
3171 * It's possible that a concurrent pmap_disconnect() operation may need to reference
3172 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
3173 * one or more PTEs on this page but not yet dropped the refcount, which would cause
3174 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
3175 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
3176 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
3177 * drop the pagetable refcount accordingly, without taking any PVH locks that could
3178 * synchronize it against the disconnect operation. If that removal caused the
3179 * refcount to reach zero, the pagetable page could be freed before the disconnect
3180 * operation is finished using the relevant pagetable descriptor.
3181 * Address these cases by draining the epochs to ensure other cores are no longer
3182 * consuming the page table we're preparing to delete.
3183 */
3184 if (remove_leaf_table) {
3185 pmap_epoch_prepare_drain();
3186 pmap_epoch_drain();
3187 refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte));
3188 }
3189
3190 #if MACH_ASSERT
3191 /**
3192 * On internal devices, always do the page table consistency check
3193 * regardless of page table level or the actual refcnt value.
3194 */
3195 {
3196 #else /* MACH_ASSERT */
3197 /**
3198 * Only perform the page table consistency check when deleting leaf page
3199 * tables and it seems like there might be valid/compressed mappings
3200 * leftover.
3201 */
3202 if (__improbable(remove_leaf_table && refcnt != 0)) {
3203 #endif /* MACH_ASSERT */
3204
3205 /**
3206 * There are multiple problems that can arise as a non-zero refcnt:
3207 * 1. A bug in the refcnt management logic.
3208 * 2. A memory stomper or hardware failure.
3209 * 3. The VM forgetting to unmap all of the valid mappings in an address
3210 * space before destroying a pmap.
3211 *
3212 * By looping over the page table and determining how many valid or
3213 * compressed entries there actually are, we can narrow down which of
3214 * these three cases is causing this panic. If the expected refcnt
3215 * (valid + compressed) and the actual refcnt don't match then the
3216 * problem is probably either a memory corruption issue (if the
3217 * non-empty entries don't match valid+compressed, that could also be a
3218 * sign of corruption) or refcnt management bug. Otherwise, there
3219 * actually are leftover mappings and the higher layers of xnu are
3220 * probably at fault.
3221 *
3222 * Note that we use PAGE_SIZE to govern the range of the table check,
3223 * because even for 4K processes we still allocate a 16K page for each
3224 * page table; we simply map it using 4 adjacent TTEs for the 4K case.
3225 */
3226 pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(PAGE_SIZE - 1)));
3227
3228 pt_entry_t *ptep = bpte;
3229 unsigned short wiredcnt = ptep_get_info((pt_entry_t*)ttetokv(tte))->wiredcnt;
3230 unsigned short non_empty = 0, valid = 0, comp = 0;
3231 for (unsigned int i = 0; i < (PAGE_SIZE / sizeof(*ptep)); i++, ptep++) {
3232 /* Keep track of all non-empty entries to detect memory corruption. */
3233 if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3234 non_empty++;
3235 }
3236
3237 if (__improbable(pte_is_compressed(*ptep, ptep))) {
3238 comp++;
3239 } else if (__improbable(pte_is_valid(*ptep))) {
3240 valid++;
3241 }
3242 }
3243
3244 #if MACH_ASSERT
3245 /**
3246 * On internal machines, panic whenever a page table getting deleted has
3247 * leftover mappings (valid or otherwise) or a leaf page table has a
3248 * non-zero refcnt.
3249 */
3250 if (__improbable((non_empty != 0) || (remove_leaf_table && ((refcnt != 0) || (wiredcnt != 0))))) {
3251 #else /* MACH_ASSERT */
3252 /* We already know the leaf page-table has a non-zero refcnt, so panic. */
3253 {
3254 #endif /* MACH_ASSERT */
3255 panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3256 "%d compressed, %d non-empty, refcnt=%d, wiredcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3257 level + 1, valid, comp, non_empty, refcnt, wiredcnt, level, (uint64_t)tte, pmap, bpte);
3258 }
3259 }
3260 }
3261
3262 /**
3263 * Remove translation table entry pointing to a nested shared region table
3264 *
3265 * @note The TTE to clear out is expected to point to a leaf table with a refcnt
3266 * of zero.
3267 *
3268 * @param pmap The user pmap containing the nested page table whose TTE is being removed.
3269 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3270 * @param ttep Pointer to the TTE that should be cleared out.
3271 */
3272 static void
3273 pmap_tte_trim(
3274 pmap_t pmap,
3275 vm_offset_t va_start,
3276 tt_entry_t *ttep)
3277 {
3278 assert(ttep != NULL);
3279 const tt_entry_t tte = *ttep;
3280 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3281
3282 if (__improbable(tte == ARM_TTE_EMPTY)) {
3283 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3284 "stomper? pmap=%p ttep=%p", __func__, pt_attr_twig_level(pt_attr), pmap, ttep);
3285 }
3286
3287 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3288 sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift);
3289
3290 pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr));
3291 }
3292
3293 /**
3294 * Remove a translation table entry.
3295 *
3296 * @note If the TTE to clear out points to a leaf table, then that leaf table
3297 * must have a mapping refcount of zero before the TTE can be removed.
3298 * @note If pmap_locked is true, this function expects to be called with
3299 * the pmap locked exclusive, and will drop that lock after removing
3300 * the table entry, returning with the pmap unlocked.
3301 *
3302 * @param pmap The pmap containing the page table whose TTE is being removed.
3303 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3304 * @param ttep Pointer to the TTE that should be cleared out.
3305 * @param level The level of the page table that contains the TTE to be removed.
3306 * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3307 * be dropped after removing the table entry.
3308 */
3309 static void
3310 pmap_tte_remove(
3311 pmap_t pmap,
3312 vm_offset_t va_start,
3313 tt_entry_t *ttep,
3314 unsigned int level,
3315 bool pmap_locked)
3316 {
3317 assert(ttep != NULL);
3318 const tt_entry_t tte = *ttep;
3319
3320 if (__improbable(tte == ARM_TTE_EMPTY)) {
3321 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3322 "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3323 }
3324
3325 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level);
3326
3327 if (pmap_locked) {
3328 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3329 }
3330
3331 pmap_tte_check_refcounts(pmap, tte, level);
3332 }
3333
3334 /**
3335 * Given a pointer to an entry within a `level` page table, delete the
3336 * page table at `level` + 1 that is represented by that entry. For instance,
3337 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3338 * contains the PA of the L3 table, and `level` would be "2".
3339 *
3340 * @note If the table getting deallocated is a leaf table, then that leaf table
3341 * must have a mapping refcount of zero before getting deallocated.
3342 * @note If pmap_locked is true, this function expects to be called with
3343 * the pmap locked exclusive, and will drop that lock after removing
3344 * the table entry, returning with the pmap unlocked.
3345 *
3346 * @param pmap The pmap that owns the page table to be deallocated.
3347 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3348 * @param ttep Pointer to the `level` TTE to remove.
3349 * @param level The level of the table that contains an entry pointing to the
3350 * table to be removed. The deallocated page table will be a
3351 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3352 * deleted).
3353 * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3354 * be dropped after removing the table entry.
3355 */
3356 static void
3357 pmap_tte_deallocate(
3358 pmap_t pmap,
3359 vm_offset_t va_start,
3360 tt_entry_t *ttep,
3361 unsigned int level,
3362 bool pmap_locked)
3363 {
3364 tt_entry_t tte = *ttep;
3365
3366 if (tte_get_ptd(tte)->pmap != pmap) {
3367 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3368 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3369 }
3370
3371 assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
3372 (unsigned long long)tte);
3373
3374 /* pmap_tte_remove() will drop the pmap lock if necessary. */
3375 pmap_tte_remove(pmap, va_start, ttep, level, pmap_locked);
3376
3377 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3378 }
3379
3380 /*
3381 * Remove a range of hardware page-table entries.
3382 * The range is given as the first (inclusive)
3383 * and last (exclusive) virtual addresses mapped by
3384 * the PTE region to be removed.
3385 *
3386 * The pmap must be locked shared.
3387 * If the pmap is not the kernel pmap, the range must lie
3388 * entirely within one pte-page. Assumes that the pte-page exists.
3391 */
3392 MARK_AS_PMAP_TEXT static void
3393 pmap_remove_range(
3394 pmap_t pmap,
3395 vm_map_address_t va,
3396 vm_map_address_t end)
3397 {
3398 pmap_remove_range_options(pmap, va, end, PMAP_OPTIONS_REMOVE);
3399 }
3400
3401 MARK_AS_PMAP_TEXT void
3402 pmap_remove_range_options(
3403 pmap_t pmap,
3404 vm_map_address_t start,
3405 vm_map_address_t end,
3406 int options)
3407 {
3408 const unsigned int sptm_flags = ((options & PMAP_OPTIONS_REMOVE) ? SPTM_REMOVE_COMPRESSED : 0);
3409 unsigned int num_removed = 0;
3410 unsigned int num_external = 0, num_internal = 0, num_reusable = 0;
3411 unsigned int num_alt_internal = 0;
3412 unsigned int num_compressed = 0, num_alt_compressed = 0;
3413 unsigned short num_unwired = 0;
3414 bool need_strong_sync = false;
3415
3416 /*
3417 * The pmap lock must be held here. In most (if not all) cases it will be held shared.
3418 */
3419 pmap_assert_locked(pmap, PMAP_LOCK_HELD);
3420
3421 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3422 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
3423 const uint64_t pmap_page_shift = pt_attr_leaf_shift(pt_attr);
3424 vm_map_address_t va = start;
3425 pt_entry_t *cpte = pmap_pte(pmap, va);
3426 assert(cpte != NULL);
3427
3428 while (va < end) {
3429 /**
3430 * We may need to sleep when taking the PVH lock below, and our pmap_remove_pv()
3431 * call below may also place the lock in sleep mode if processing a large PV list.
3432 * We therefore can't leave preemption disabled across that code, which means we
3433 * can't directly use the per-CPU prev_ptes array in that code. Since that code
3434 * only cares about the physical address stored in each prev_ptes entry, we'll
3435 * use a local array to stash off only the 4-byte physical address index in order
3436 * to reduce stack usage.
3437 */
3438 unsigned int pai_list[SPTM_MAPPING_LIMIT];
3439 _Static_assert(SPTM_MAPPING_LIMIT <= 64,
3440 "SPTM_MAPPING_LIMIT value causes excessive stack usage for pai_list");
3441
3442 unsigned int num_mappings = (end - va) >> pmap_page_shift;
3443 if (num_mappings > SPTM_MAPPING_LIMIT) {
3444 num_mappings = SPTM_MAPPING_LIMIT;
3445 }
3446
3447 /**
3448 * Disable preemption to ensure that we can safely access per-CPU mapping data after
3449 * issuing the SPTM call.
3450 */
3451 disable_preemption();
3452 /**
3453 * Enter the pmap epoch for the batched unmap operation. This is necessary because we
3454 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
3455 * call, so a concurrent pmap_page_protect() operation against one of those pages may
3456 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
3457 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
3458 * if it does not first drain our epoch.
3459 */
3460 pmap_epoch_enter();
3461 sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags);
3462 pmap_epoch_exit();
3463
3464 sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes;
3465 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3466 const pt_entry_t prev_pte = prev_ptes[i];
3467
3468 if (pte_is_compressed(prev_pte, cpte)) {
3469 if (options & PMAP_OPTIONS_REMOVE) {
3470 ++num_compressed;
3471 if (prev_pte & ARM_PTE_COMPRESSED_ALT) {
3472 ++num_alt_compressed;
3473 }
3474 }
3475 pai_list[i] = INVALID_PAI;
3476 continue;
3477 } else if (!pte_is_valid(prev_pte)) {
3478 pai_list[i] = INVALID_PAI;
3479 continue;
3480 }
3481
3482 if (pte_is_wired(prev_pte)) {
3483 num_unwired++;
3484 }
3485
3486 const pmap_paddr_t pa = pte_to_pa(prev_pte);
3487
3488 if (__improbable(!pa_valid(pa))) {
3489 pai_list[i] = INVALID_PAI;
3490 continue;
3491 }
3492 pai_list[i] = pa_index(pa);
3493 }
3494
3495 enable_preemption();
3496 cpte -= num_mappings;
3497
3498 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3499 if (pai_list[i] == INVALID_PAI) {
3500 continue;
3501 }
3502 locked_pvh_t locked_pvh;
3503 if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
3504 locked_pvh = pvh_lock_nopreempt(pai_list[i]);
3505 } else {
3506 locked_pvh = pvh_lock(pai_list[i]);
3507 }
3508
3509 bool is_internal, is_altacct;
3510 pv_remove_return_t remove_status = pmap_remove_pv(pmap, cpte, &locked_pvh, &is_internal, &is_altacct);
3511
3512 switch (remove_status) {
3513 case PV_REMOVE_SUCCESS:
3514 ++num_removed;
3515 if (is_altacct) {
3516 assert(is_internal);
3517 num_internal++;
3518 num_alt_internal++;
3519 } else if (is_internal) {
3520 if (ppattr_test_reusable(pai_list[i])) {
3521 num_reusable++;
3522 } else {
3523 num_internal++;
3524 }
3525 } else {
3526 num_external++;
3527 }
3528 break;
3529 default:
3530 /*
3531 * PVE already removed; this can happen due to a concurrent pmap_disconnect()
3532 * executing before we grabbed the PVH lock.
3533 */
3534 break;
3535 }
3536
3537 pvh_unlock(&locked_pvh);
3538 }
3539
3540 va += (num_mappings << pmap_page_shift);
3541 }
3542
3543 if (__improbable(need_strong_sync)) {
3544 arm64_sync_tlb(true);
3545 }
3546
3547 /*
3548 * Update the counts
3549 */
3550 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
3551
3552 if (pmap != kernel_pmap) {
3553 if (num_unwired != 0) {
3554 ptd_info_t * const ptd_info = ptep_get_info(cpte - 1);
3555 if (__improbable(os_atomic_sub_orig(&ptd_info->wiredcnt, num_unwired, relaxed) < num_unwired)) {
3556 panic("%s: pmap %p VA [0x%llx, 0x%llx) (ptd info %p) wired count underflow", __func__, pmap,
3557 (unsigned long long)start, (unsigned long long)end, ptd_info);
3558 }
3559 }
3560
3561 /* update ledgers */
3562 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
3563 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
3564 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
3565 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
3566 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
3567 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
3568 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
3569 /* make needed adjustments to phys_footprint */
3570 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
3571 ((num_internal -
3572 num_alt_internal) +
3573 (num_compressed -
3574 num_alt_compressed)) * pmap_page_size);
3575 }
3576 }
3577
3578
3579 /*
3580 * Remove the given range of addresses
3581 * from the specified map.
3582 *
3583 * It is assumed that the start and end are properly
3584 * rounded to the hardware page size.
3585 */
3586 void
3587 pmap_remove(
3588 pmap_t pmap,
3589 vm_map_address_t start,
3590 vm_map_address_t end)
3591 {
3592 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
3593 }
3594
3595 MARK_AS_PMAP_TEXT vm_map_address_t
3596 pmap_remove_options_internal(
3597 pmap_t pmap,
3598 vm_map_address_t start,
3599 vm_map_address_t end,
3600 int options)
3601 {
3602 vm_map_address_t eva = end;
3603 tt_entry_t *tte_p;
3604 bool unlock = true;
3605
3606 if (__improbable(end < start)) {
3607 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
3608 }
3609 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3610 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
3611 }
3612
3613 validate_pmap_mutable(pmap);
3614
3615 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3616
3617 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
3618 pmap_lock(pmap, lock_mode);
3619
3620 tte_p = pmap_tte(pmap, start);
3621
3622 if ((tte_p == NULL) || ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_FAULT)) {
3623 goto done;
3624 }
3625
3626 assertf(tte_is_table(*tte_p), "%s: invalid TTE %p (0x%llx) for pmap %p va 0x%llx",
3627 __func__, tte_p, (unsigned long long)*tte_p, pmap, (unsigned long long)start);
3628
3629 pmap_remove_range_options(pmap, start, end, options);
3630
3631 if (pmap->type != PMAP_TYPE_USER) {
3632 goto done;
3633 }
3634
3635 uint16_t refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3636 if (__improbable(refcnt == 0)) {
3637 ptd_info_t *ptd_info = ptep_get_info((pt_entry_t*)ttetokv(*tte_p));
3638 os_atomic_inc(&ptd_info->wiredcnt, relaxed); // Prevent someone else from freeing the table if we need to drop the lock
3639 if (!pmap_lock_shared_to_exclusive(pmap)) {
3640 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3641 }
3642 lock_mode = PMAP_LOCK_EXCLUSIVE;
3643 refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3644 if ((os_atomic_dec(&ptd_info->wiredcnt, relaxed) == 0) && (refcnt == 0)) {
3645 /**
3646 * Drain any concurrent retype-sensitive SPTM operations. This is needed to
3647 * ensure that we don't unmap the page table and retype it while those operations
3648 * are still finishing on other CPUs, leading to an SPTM violation. In particular,
3649 * the multipage batched cacheability/attribute update code may issue SPTM calls
3650 * without holding the relevant PVH or pmap locks, so we can't guarantee those
3651 * calls have actually completed despite observing refcnt == 0.
3652 *
3653 * At this point, we CAN guarantee that:
3654 * 1) All prior PTE removals required to produce refcnt == 0 have
3655 * completed and been synchronized for all observers by DSB, and the
3656 * relevant PV list entries removed. Subsequent calls not already in the
3657 * pmap epoch will no longer observe these mappings.
3658 * 2) We now hold the pmap lock exclusive, so there will be no further attempt
3659 * to enter mappings in this page table before it is unmapped.
3660 */
3661 pmap_epoch_prepare_drain();
3662 pmap_epoch_drain();
3663 pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr), true);
3664 unlock = false; // pmap_tte_deallocate() has dropped the lock
3665 }
3666 }
3667 done:
3668 if (unlock) {
3669 pmap_unlock(pmap, lock_mode);
3670 }
3671
3672 return eva;
3673 }
3674
3675 void
3676 pmap_remove_options(
3677 pmap_t pmap,
3678 vm_map_address_t start,
3679 vm_map_address_t end,
3680 int options)
3681 {
3682 vm_map_address_t va;
3683
3684 if (pmap == PMAP_NULL) {
3685 return;
3686 }
3687
3688 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3689
3690 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
3691 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
3692 VM_KERNEL_ADDRHIDE(end));
3693
3694 #if MACH_ASSERT
3695 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
3696 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
3697 pmap, (uint64_t)start, (uint64_t)end);
3698 }
3699 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
3700 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
3701 pmap, (uint64_t)start, (uint64_t)end);
3702 }
3703 #endif
3704
3705 /*
3706 * We allow single-page requests to execute non-preemptibly,
3707 * as it doesn't make sense to sample AST_URGENT for a single-page
3708 * operation, and there are a couple of special use cases that
3709 * require a non-preemptible single-page operation.
3710 */
3711 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
3712 pmap_verify_preemptible();
3713 }
3714
3715 /*
3716 * Invalidate the translation buffer first
3717 */
3718 va = start;
3719 while (va < end) {
3720 vm_map_address_t l;
3721
3722 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
3723 if (l > end) {
3724 l = end;
3725 }
3726
3727 va = pmap_remove_options_internal(pmap, va, l, options);
3728 }
3729
3730 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
3731 }
3732
3733
3734 /*
3735 * Remove phys addr if mapped in specified map
3736 */
3737 void
3738 pmap_remove_some_phys(
3739 __unused pmap_t map,
3740 __unused ppnum_t pn)
3741 {
3742 /* Implement to support working set code */
3743 }
3744
3745 /*
3746 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
3747 * switch a thread onto a new vm_map.
3748 */
3749 void
3750 pmap_switch_user(thread_t thread, vm_map_t new_map)
3751 {
3752 pmap_t new_pmap = new_map->pmap;
3753
3754
3755 thread->map = new_map;
3756 pmap_set_pmap(new_pmap, thread);
3757
3758 }
3759 void
3760 pmap_set_pmap(
3761 pmap_t pmap,
3762 thread_t thread)
3763 {
3764 pmap_switch(pmap, thread);
3765 }
3766
3767 MARK_AS_PMAP_TEXT void
3768 pmap_switch_internal(
3769 pmap_t pmap,
3770 thread_t thread)
3771 {
3772 validate_pmap_mutable(pmap);
3773 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3774 const uint16_t asid_index = PMAP_HWASID(pmap);
3775 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
3776 panic("%s: attempt to activate pmap %p with invalid ASID", __func__, pmap);
3777 }
3778
3779 #if __ARM_KERNEL_PROTECT__
3780 asid_index >>= 1;
3781 #endif
3782
3783 if (asid_index > 0) {
3784 pmap_update_plru(asid_index);
3785 }
3786
3787 __unused sptm_return_t sptm_return;
3788 #pragma unused(thread)
3789 if (0) {
3790 } else {
3791 sptm_return = sptm_switch_root(pmap->ttep, 0, 0);
3792 }
3793
3794 #if DEVELOPMENT || DEBUG
3795 if (__improbable(sptm_return & SPTM_SWITCH_ASID_TLBI_FLUSH)) {
3796 os_atomic_inc(&pmap_asid_flushes, relaxed);
3797 }
3798
3799 if (__improbable(sptm_return & SPTM_SWITCH_RCTX_FLUSH)) {
3800 os_atomic_inc(&pmap_speculation_restrictions, relaxed);
3801 }
3802 #endif /* DEVELOPMENT || DEBUG */
3803 }
3804
3805 void
3806 pmap_switch(
3807 pmap_t pmap,
3808 thread_t thread)
3809 {
3810 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
3811 pmap_switch_internal(pmap, thread);
3812 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
3813 }
3814
3815 void
3816 pmap_page_protect(
3817 ppnum_t ppnum,
3818 vm_prot_t prot)
3819 {
3820 pmap_page_protect_options(ppnum, prot, 0, NULL);
3821 }
3822
3823 /**
3824 * Helper function for performing per-mapping accounting following an SPTM disjoint unmap request.
3825 *
3826 * @note [pmap] cannot be the kernel pmap. This is because we do not maintain a ledger in the
3827 * kernel pmap.
3828 *
3829 * @param pmap The pmap that contained the mapping
3830 * @param pai The physical page index mapped by the mapping
3831 * @param is_compressed Indicates whether the operation was an unmap-to-compress vs. a full unmap
3832 * @param is_internal Indicates whether the mapping was for an internal (aka anonymous) VM page
3833 * @param is_altacct Indicates whether the mapping was subject to alternate accounting.
3834 */
3835 static void
3836 pmap_disjoint_unmap_accounting(pmap_t pmap, unsigned int pai, bool is_compressed, bool is_internal, bool is_altacct)
3837 {
3838 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3839 pvh_assert_locked(pai);
3840
3841 assert(pmap != kernel_pmap);
3842
3843 if (is_internal &&
3844 !is_altacct &&
3845 ppattr_test_reusable(pai)) {
3846 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3847 } else if (!is_internal) {
3848 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3849 }
3850
3851 if (is_altacct) {
3852 assert(is_internal);
3853 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3854 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3855 if (is_compressed) {
3856 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3857 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3858 }
3859 } else if (ppattr_test_reusable(pai)) {
3860 assert(is_internal);
3861 if (is_compressed) {
3862 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3863 /* was not in footprint, but is now */
3864 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3865 }
3866 } else if (is_internal) {
3867 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3868
3869 /*
3870 * Update all stats related to physical footprint, which only
3871 * deals with internal pages.
3872 */
3873 if (is_compressed) {
3874 /*
3875 * This removal is only being done so we can send this page to
3876 * the compressor; therefore it mustn't affect total task footprint.
3877 */
3878 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3879 } else {
3880 /*
3881 * This internal page isn't going to the compressor, so adjust stats to keep
3882 * phys_footprint up to date.
3883 */
3884 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3885 }
3886 } else {
3887 /* external page: no impact on ledgers */
3888 }
3889 }
3890
3891 /**
3892 * Helper function for issuing a disjoint unmap request to the SPTM and performing
3893 * related accounting. This function uses the 'prev_ptes' list generated by
3894 * the sptm_unmap_disjoint() call to determine whether said call altered the
3895 * relevant PTEs in a manner that would require accounting updates.
3896 *
3897 * @param pa The physical address against which the disjoint unmap will be issued.
3898 * @param num_mappings The number of disjoint mappings for the SPTM to update.
3899 * The per-CPU sptm_ops array should contain the same number
3900 * of individual disjoint requests.
3901 */
3902 static void
3903 pmap_disjoint_unmap(pmap_paddr_t pa, unsigned int num_mappings)
3904 {
3905 const unsigned int pai = pa_index(pa);
3906
3907 pvh_assert_locked(pai);
3908
3909 assert(num_mappings <= SPTM_MAPPING_LIMIT);
3910
3911 assert(get_preemption_level() > 0);
3912 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
3913
3914 sptm_unmap_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings);
3915
3916 for (unsigned int cur_mapping = 0; cur_mapping < num_mappings; ++cur_mapping) {
3917 pt_entry_t prev_pte = sptm_pcpu->sptm_prev_ptes[cur_mapping];
3918
3919 pt_desc_t * const ptdp = sptm_pcpu->sptm_ptds[cur_mapping];
3920 const pmap_t pmap = ptdp->pmap;
3921
3922 assertf(!pte_is_valid(prev_pte) ||
3923 ((pte_to_pa(prev_pte) & ~PAGE_MASK) == pa), "%s: prev_pte 0x%llx does not map pa 0x%llx",
3924 __func__, (unsigned long long)prev_pte, (unsigned long long)pa);
3925
3926 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3927 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3928
3929 if (pmap != kernel_pmap) {
3930 /*
3931 * If the prior PTE is invalid (which may happen due to a concurrent remove operation),
3932 * the compressed marker won't be written so we shouldn't account the mapping as compressed.
3933 */
3934 const bool is_compressed = (pte_is_valid(prev_pte) &&
3935 ((sptm_pcpu->sptm_ops[cur_mapping].pte_template & ARM_PTE_COMPRESSED_MASK) != 0));
3936 const bool is_internal = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_INTERNAL) != 0;
3937 const bool is_altacct = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_ALTACCT) != 0;
3938
3939 /*
3940 * The rule is that accounting related to PTE contents (wired, PTD refcount)
3941 * must be updated by whoever clears the PTE, while accounting related to physical page
3942 * attributes must be updated by whoever clears the PVE. We therefore always call
3943 * pmap_disjoint_unmap_accounting() here since we're removing the PVE, but only update
3944 * wired/PTD accounting if the prior PTE was valid.
3945 */
3946 pmap_disjoint_unmap_accounting(pmap, pai, is_compressed, is_internal, is_altacct);
3947
3948 if (!pte_is_valid(prev_pte)) {
3949 continue;
3950 }
3951
3952 if (pte_is_wired(prev_pte)) {
3953 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3954 if (__improbable(os_atomic_dec_orig(&sptm_pcpu->sptm_ptd_info[cur_mapping]->wiredcnt, relaxed) == 0)) {
3955 panic("%s: over-unwire of ptdp %p, ptd info %p", __func__,
3956 ptdp, sptm_pcpu->sptm_ptd_info[cur_mapping]);
3957 }
3958 }
3959 }
3960 }
3961 }
3962
3963 /**
3964 * The following two functions, pmap_multipage_op_submit_disjoint() and
3965 * pmap_multipage_op_add_page(), are intended to allow callers to manage batched SPTM
3966 * operations that may span multiple physical pages. They are intended to operate in
3967 * a way that allows callers such as pmap_page_protect_options_with_flush_range() to
3968 * insert mappings into the per-CPU SPTM disjoint ops array in the same manner that
3969 * they would for an ordinary single-page operation.
3970 * Functions such as pmap_page_protect_options_with_flush_range() operate on a single
3971 * physical page but may be passed a non-NULL flush_range object to indicate that the
3972 * call is part of a larger batched operation which may span multiple physical pages.
3973 * In that scenario, these functions are intended to be used as follows:
3974 * 1) Call pmap_multipage_op_add_page() to insert a "header" for the page into the per-
3975 * CPU SPTM ops array. Use the mapping count returned through its in/out parameter
3976 * as the starting index at which to add ordinary mapping entries into the same array.
3977 * 2) Insert sptm_disjoint_op_t entries into the ops array in the normal manner until
3978 * the array is full, the SPTM options required for the upcoming sequence of pages
3979 * need to change, or the current mapping matches flush_range->current_ptep.
3980 * In the latter case, pmap_insert_flush_range_template() may instead be used
3981 * to insert the mapping into the per-CPU SPTM region templates array. See the
3982 * documentation for pmap_insert_flush_range_template() below.
3983 * 3) If the array is full, call pmap_multipage_op_submit_disjoint() and return to step 1).
3984 * 4) If the SPTM options need to change, call pmap_multipage_op_add_page() to insert
3985 * a new header with the updated options and, using the updated in/out mapping count
3986 * as the new insertion point for the ops array, resume step 2).
3987 * 5) Upon completion, if there are any pending not-yet-submitted mappings, do not
3988 * submit those mappings to the SPTM as would ordinarily be done for a single-page
3989 * call. These trailing mappings will be submitted as part of the next batch,
3990 * or by the next-higher caller if the range operation is complete.
3991 *
3992 * Note that, as a performance optimization, the caller may track the insertion
3993 * point in the disjoint ops array locally (i.e. without incrementing
3994 * flush_range->pending_disjoint_entries on every iteration), as long as it takes care to do the
3995 * following:
3996 * 1) Initialize and update that insertion point as described in steps 1) and 4) above.
3997 * 2) Pass the updated insertion point as the 'pending_disjoint_entries' parameter into the calls
3998 * in steps 3) and 4) above.
3999 * 3) Update flush_range->pending_disjoint_entries with the locally-maintained value along with
4000 * step 5) above.
4001 */
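
/**
 * Illustrative sketch (guarded out of compilation): one way a caller might drive
 * the disjoint batching protocol described above. This is a hedged sketch, not a
 * definitive implementation: the 'total_mappings' parameter and the
 * options_for_mapping()/disjoint_op_for_mapping() helpers are hypothetical
 * placeholders, and the PVH-lock, epoch, and sleep-mode handling performed by
 * real callers such as pmap_page_protect_options_with_flush_range() is omitted.
 * Only pmap_multipage_op_add_page(), pmap_multipage_op_submit_disjoint(),
 * SPTM_MAPPING_LIMIT, and the per-CPU sptm_ops array are the real interfaces
 * documented above.
 */
#if 0
static void
pmap_multipage_op_usage_sketch(pmap_paddr_t phys, unsigned int total_mappings,
    pmap_tlb_flush_range_t *flush_range)
{
	/* Locally-tracked insertion point; counts paddr "header" entries too. */
	unsigned int num_mappings = 0;
	uint32_t cur_options = 0;
	unsigned int m = 0;

	while (m < total_mappings) {
		/* Hypothetical helper: SPTM options wanted for this mapping. */
		const uint32_t want_options = options_for_mapping(m);

		if ((num_mappings == 0) || (want_options != cur_options)) {
			/* Steps 1) and 4): add a paddr header; this may flush a full array. */
			if (pmap_multipage_op_add_page(phys, &num_mappings,
			    want_options, flush_range)) {
				/*
				 * The ops array was full: it has been submitted and preemption
				 * re-enabled. Retry this mapping; the real callers also re-take
				 * their PVH lock in sleep mode at this point.
				 */
				continue;
			}
			cur_options = want_options;
		}

		/* Step 2): append an ordinary disjoint op at the current index. */
		PERCPU_GET(pmap_sptm_percpu)->sptm_ops[num_mappings++] =
		    disjoint_op_for_mapping(m); /* hypothetical helper */

		/* Step 3): when the array fills up, push the batch to the SPTM. */
		if (num_mappings == SPTM_MAPPING_LIMIT) {
			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
			num_mappings = 0;
		}
		m++;
	}

	/* Step 5): leave trailing entries pending for the next batch or the caller. */
	flush_range->pending_disjoint_entries = num_mappings;
}
#endif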
4002
4003 /**
4004 * Submit any pending disjoint multi-page mapping updates to the SPTM.
4005 *
4006 * @note This function must be called with preemption disabled, and will drop
4007 * the preemption-disable count upon submitting to the SPTM.
4008 * @note [pending_disjoint_entries] must include *all* pending entries in the SPTM ops array,
4009 * including physical address "header" entries.
4010 * @note This function automatically updates the per_paddr_header.num_mappings field
4011 * for the most recent physical address header in the SPTM ops array to its final
4012 * value.
4013 *
4014 * @param pending_disjoint_entries The number of not-yet-submitted mappings according to the caller.
4015 * This value may be greater than [flush_range]->pending_disjoint_entries if
4016 * the caller has inserted mappings into the ops array without
4017 * updating [flush_range]->pending_disjoint_entries, in which case this
4018 * function will update [flush_range]->pending_disjoint_entries with the
4019 * caller's value.
4020 * @param flush_range The object tracking the current state of the multipage disjoint
4021 * operation.
4022 */
4023 static inline void
4024 pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tlb_flush_range_t *flush_range)
4025 {
4026 /**
4027 * Reconcile the number of pending entries as tracked by the caller with the
4028 * number of pending entries tracked by flush_range. If the caller's value is
4029 * greater, we assume the caller has inserted locally-tracked mappings into the
4030 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
4031 * assume the caller has no locally-tracked mappings and is simply trying to
4032 * purge any pending mappings from a prior call sequence.
4033 */
4034 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4035 flush_range->pending_disjoint_entries = pending_disjoint_entries;
4036 } else {
4037 assert(pending_disjoint_entries == 0);
4038 }
4039 if (flush_range->pending_disjoint_entries != 0) {
4040 assert(get_preemption_level() > 0);
4041 /**
4042 * Compute the correct number of mappings for the most recent paddr
4043 * header based on the current position in the SPTM ops array.
4044 */
4045 flush_range->current_header->per_paddr_header.num_mappings =
4046 flush_range->pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4047 const sptm_return_t sptm_return = sptm_update_disjoint_multipage(
4048 PERCPU_GET(pmap_sptm_percpu)->sptm_ops_pa, flush_range->pending_disjoint_entries);
4049
4050 /**
4051 * We may be submitting the batch and exiting the epoch partway through
4052 * processing the PV list for a page. That's fine, because in that case we'll
4053 * hold the PV lock for that page, which will prevent mappings of that page from
4054 * being disconnected and will prevent the completion of pmap_remove() against
4055 * any of those mappings, thus also guaranteeing the relevant page table pages
4056 * can't be freed. The epoch still protects mappings for any prior page in
4057 * the batch, whose PV locks are no longer held.
4058 */
4059 pmap_epoch_exit();
4060 enable_preemption();
4061 if (flush_range->pending_region_entries != 0) {
4062 flush_range->processed_entries += flush_range->pending_disjoint_entries;
4063 } else {
4064 flush_range->processed_entries = 0;
4065 }
4066 flush_range->pending_disjoint_entries = 0;
4067 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4068 flush_range->ptfr_flush_needed = true;
4069 }
4070 }
4071 }
4072
4073 /**
4074 * Insert a new physical address "header" entry into the per-CPU SPTM ops array for a
4075 * multi-page SPTM operation. It is expected that the caller will subsequently add
4076 * mapping entries for this physical address into the array.
4077 *
4078 * @note This function will disable preemption upon creation of the first paddr header
4079 * (index 0 in the per-CPU SPTM ops array) and it is expected that
4080 * pmap_multipage_op_submit() will subsequently be called on the same CPU.
4081 * @note Before inserting the new header, this function automatically updates the
4082 * per_paddr_header.num_mappings field for the previous physical address header
4083 * (if present) in the SPTM ops array to its final value.
4084 *
4085 * @param phys The physical address for which to insert a header entry.
4086 * @param inout_pending_disjoint_entries
4087 * [input] The number of not-yet-submitted mappings according to the caller.
4088 * This value may be greater than [flush_range]->pending_disjoint_entries if
4089 * the caller has inserted mappings into the ops array without
4090 * updating [flush_range]->pending_disjoint_entries, in which case this
4091 * function will update [flush_range]->pending_disjoint_entries with the
4092 * caller's value.
4093 * [output] Returns the starting index at which the caller should insert mapping
4094 * entries into the per-CPU SPTM ops array.
4095 * @param sptm_update_options SPTM_UPDATE_* flags to pass to the SPTM call.
4096 * SPTM_UPDATE_SKIP_PAPT is automatically inserted by this
4097 * function.
4098 * @param flush_range The object tracking the current state of the multipage operation.
4099 *
4100 * @return True if the region operation was submitted to the SPTM due to the ops array already
4101 * being full, false otherwise. In the former case, the new header will not be added
4102 * to the array; the caller will need to re-invoke this function after taking any
4103 * necessary post-submission action (such as enabling preemption).
4104 */
4105 static inline bool
4106 pmap_multipage_op_add_page(
4107 pmap_paddr_t phys,
4108 unsigned int *inout_pending_disjoint_entries,
4109 uint32_t sptm_update_options,
4110 pmap_tlb_flush_range_t *flush_range)
4111 {
4112 unsigned int pending_disjoint_entries = *inout_pending_disjoint_entries;
4113
4114 /**
4115 * Reconcile the number of pending entries as tracked by the caller with the
4116 * number of pending entries tracked by flush_range. If the caller's value is
4117 * greater, we assume the caller has inserted locally-tracked mappings into the
4118 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
4119 * assume the caller has no locally-tracked mappings and is adding its paddr
4120 * header for the first time.
4121 */
4122 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4123 flush_range->pending_disjoint_entries = pending_disjoint_entries;
4124 } else {
4125 assert(pending_disjoint_entries == 0);
4126 }
4127 if (flush_range->pending_disjoint_entries >= (SPTM_MAPPING_LIMIT - 1)) {
4128 /**
4129 * If the SPTM ops array is either full or only has space for the paddr
4130 * header, there won't be room for mapping entries, so submit the pending
4131 * mappings to the SPTM now, and return to allow the caller to take
4132 * any necessary post-submission action.
4133 */
4134 pmap_multipage_op_submit_disjoint(pending_disjoint_entries, flush_range);
4135 *inout_pending_disjoint_entries = 0;
4136 return true;
4137 }
4138 pending_disjoint_entries = flush_range->pending_disjoint_entries;
4139
4140 sptm_update_options |= SPTM_UPDATE_SKIP_PAPT;
4141 if (pending_disjoint_entries == 0) {
4142 disable_preemption();
4143 /**
4144 * Enter the pmap epoch while we gather the disjoint update arguments
4145 * and issue the SPTM call. Since this operation may cover multiple physical
4146 * pages, we may construct the argument array and invoke the SPTM without holding
4147 * all relevant PVH locks or pmap locks. We therefore need to record that we are
4148 * collecting and modifying mapping state so that e.g. pmap_page_protect() does
4149 * not attempt to retype the underlying pages and pmap_remove() does not attempt
4150 * to free the page tables used for these mappings without first draining our epoch.
4151 */
4152 pmap_epoch_enter();
4153 flush_range->pending_disjoint_entries = 1;
4154 } else {
4155 /**
4156 * Before inserting the new header, update the prior header's number
4157 * of paddr-specific mappings to its final value.
4158 */
4159 assert(flush_range->current_header != NULL);
4160 flush_range->current_header->per_paddr_header.num_mappings =
4161 pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4162 }
4163 sptm_disjoint_op_t *sptm_ops = PERCPU_GET(pmap_sptm_percpu)->sptm_ops;
4164 flush_range->current_header = (sptm_update_disjoint_multipage_op_t*)&sptm_ops[pending_disjoint_entries];
4165 flush_range->current_header_first_mapping_index = ++pending_disjoint_entries;
4166 flush_range->current_header->per_paddr_header.paddr = phys;
4167 flush_range->current_header->per_paddr_header.num_mappings = 0;
4168 flush_range->current_header->per_paddr_header.options = sptm_update_options;
4169
4170 *inout_pending_disjoint_entries = pending_disjoint_entries;
4171 return false;
4172 }
4173
4174 /**
4175 * The following two functions, pmap_multipage_op_submit_region() and
4176 * pmap_insert_flush_range_template(), are meant to be used in a similar fashion
4177 * to pmap_multipage_op_submit_disjoint() and pmap_multipage_op_add_page(),
4178 * but for the specific case in which a given mapping within a PV list happens
4179 * to map the current VA within a VA region being operated on by
4180 * phys_attribute_clear_range(). This allows the pmap to further optimize
4181 * the SPTM calls by using sptm_update_region() to modify all mappings within
4182 * the VA region, which requires far fewer table walks than a disjoint operation.
4183 * Since the starting VA of the region, the owning pmap, and the insertion point
4184 * within the per-CPU region templates array are already known, these functions
4185 * don't require the special "header" entry or the complex array position tracking
4186 * of their disjoint equivalents above.
4187 * Note that these functions may be used together with the disjoint functions above;
4188 * these functions can be used for the "primary" mappings corresponding to the VA
4189 * region being manipulated by the VM layer, while the disjoint functions can be
4190 * used for any alias mappings of the underlying pages which fall outside that
4191 * VA region.
4192 */
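
/**
 * Illustrative sketch (guarded out of compilation): how the region-based path
 * above is expected to combine with the disjoint path while a caller such as
 * pmap_page_protect_options_with_flush_range() walks a PV list. This is a
 * hedged sketch under assumptions: the 'pte_p', 'pte_template', and
 * 'num_mappings' parameters stand in for the caller's local state, and all
 * locking/epoch handling done by real callers is omitted. Only
 * pmap_insert_flush_range_template(), flush_range->current_ptep, and the
 * per-CPU sptm_ops array are the real interfaces documented here.
 */
#if 0
static void
pmap_flush_range_path_sketch(pt_entry_t *pte_p, pt_entry_t pte_template,
    unsigned int *num_mappings, pmap_tlb_flush_range_t *flush_range)
{
	if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
		/*
		 * The mapping sits at the flush range's current iteration position,
		 * so it can be expressed as a region template (one table walk for the
		 * whole VA region). This may submit the pending templates and
		 * re-enable preemption when the per-CPU templates array fills up.
		 */
		(void) pmap_insert_flush_range_template(pte_template, flush_range);
	} else {
		/*
		 * Alias mapping outside the flush range's VA region (or belonging to
		 * another pmap): fall back to a per-mapping disjoint op as described
		 * for the disjoint batching functions above.
		 */
		PERCPU_GET(pmap_sptm_percpu)->sptm_ops[(*num_mappings)++].pte_template =
		    pte_template;
	}
}
#endif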
4193
4194 /**
4195 * Submit any pending region-based templates for the specified flush_range.
4196 *
4197 * @note This function must be called with preemption disabled, and will drop
4198 * the preemption-disable count upon submitting to the SPTM.
4199 *
4200 * @param flush_range The object tracking the current state of the region operation.
4201 */
4202 static inline void
4203 pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range)
4204 {
4205 if (flush_range->pending_region_entries != 0) {
4206 assert(get_preemption_level() > 0);
4207 pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED);
4208 /**
4209 * If there are any pending disjoint entries, we're already in a pmap epoch.
4210 * For disjoint entries, we need to hold the epoch during the entire time we
4211 * construct the disjoint ops array because those ops may point to some arbitrary
4212 * pmap and we need to ensure the relevant page tables and even the pmap itself
4213 * aren't concurrently reclaimed while our ops array points to them.
4214 * But for a region op like this, we know we already hold the relevant pmap lock
4215 * so none of the above can happen concurrently. We therefore only need to hold
4216 * the epoch across the SPTM call itself to prevent a concurrent unmap operation
4217 * from attempting to retype the mapped pages while our SPTM call has them in-
4218 * flight.
4219 */
4220 if (flush_range->pending_disjoint_entries == 0) {
4221 pmap_epoch_enter();
4222 }
4223 const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep,
4224 flush_range->pending_region_start, flush_range->pending_region_entries,
4225 PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa,
4226 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI);
4227 if (flush_range->pending_disjoint_entries == 0) {
4228 pmap_epoch_exit();
4229 }
4230 enable_preemption();
4231 if (flush_range->pending_disjoint_entries != 0) {
4232 flush_range->processed_entries += flush_range->pending_region_entries;
4233 } else {
4234 flush_range->processed_entries = 0;
4235 }
4236 flush_range->pending_region_start += (flush_range->pending_region_entries <<
4237 pmap_get_pt_attr(flush_range->ptfr_pmap)->pta_page_shift);
4238 flush_range->pending_region_entries = 0;
4239 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4240 flush_range->ptfr_flush_needed = true;
4241 }
4242 }
4243 }
4244
4245 /**
4246 * Insert a PTE template into the per-CPU SPTM region ops array.
4247 * This is meant to be used as a performance optimization for the case in which a given
4248 * mapping being processed by a function such as pmap_page_protect_options_with_flush_range()
4249 * happens to map the current iteration position within [flush_range]'s VA region.
4250 * In this case the mapping can be inserted as a region-based template rather than a disjoint
4251 * operation as would be done in the general case. The idea is that region-based SPTM
4252 * operations are significantly less expensive than disjoint operations, because each region
4253 * operation only requires a single page table walk at the beginning vs. a table walk for
4254 * each mapping in the disjoint case. Since the majority of mappings processed by a flush
4255 * range operation belong to the main flush range VA region (i.e. alias mappings outside
4256 * the region are less common), the performance improvement can be significant.
4257 *
4258 * @note This function will disable preemption upon inserting the first entry into the
4259 * per-CPU templates array, and will re-enable preemption upon submitting the region
4260 * operation to the SPTM.
4261 *
4262 * @param template The PTE template to insert into the per-CPU templates array.
4263 * @param flush_range The object tracking the current state of the region operation.
4264 *
4265 * @return True if the region operation was submitted to the SPTM, false otherwise.
4266 */
4267 static inline bool
4268 pmap_insert_flush_range_template(pt_entry_t template, pmap_tlb_flush_range_t *flush_range)
4269 {
4270 if (flush_range->pending_region_entries == 0) {
4271 disable_preemption();
4272 }
4273 flush_range->region_entry_added = true;
4274 PERCPU_GET(pmap_sptm_percpu)->sptm_templates[flush_range->pending_region_entries++] = template;
4275 if (flush_range->pending_region_entries == SPTM_MAPPING_LIMIT) {
4276 pmap_multipage_op_submit_region(flush_range);
4277 return true;
4278 }
4279 return false;
4280 }
4281
4282 /**
4283 * Wrapper function for submitting any pending operations, region-based or disjoint,
4284 * tracked by a flush range object. This is meant to be used by the top-level caller that
4285 * iterates over the flush range's VA region and calls functions such as
4286 * pmap_page_protect_options_with_flush_range() or arm_force_fast_fault_with_flush_range()
4287 * to construct the relevant SPTM operations arrays.
4288 *
4289 * @param flush_range The object tracking the current state of region and/or disjoint operations.
4290 */
4291 static inline void
4292 pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range)
4293 {
4294 pmap_multipage_op_submit_disjoint(0, flush_range);
4295 pmap_multipage_op_submit_region(flush_range);
4296 }
4297
4298 /**
4299 * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range()
4300 * is removing/updating all mappings in preparation for a retype operation. In this case
4301 * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held
4302 * by the caller, and will perform the necessary pmap epoch drain and retype the page back to XNU_DEFAULT
4303 * prior to returning.
4304 */
4305 #define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000
4306 _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK,
4307 "PMAP_OPTIONS_PPO_PENDING_RETYPE outside reserved encoding space");
4308
4309 /**
4310 * Lower the permission for all mappings to a given page. If VM_PROT_NONE is specified,
4311 * the mappings will be removed.
4312 *
4313 * @param ppnum Page number to lower the permission of.
4314 * @param prot The permission to lower to.
4315 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
4316 * PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is
4317 * already locked and a pmap epoch drain should be performed, along with
4318 * retyping [ppnum] back to XNU_DEFAULT.
4319 * PMAP_OPTIONS_COMPRESSOR indicates the function is called by the
4320 * VM compressor.
4321 * PMAP_OPTIONS_RETYPE requests the [ppnum] be retyped back to XNU_DEFAULT,
4322 * along with an epoch drain; like PMAP_OPTIONS_PPO_PENDING_RETYPE but without
4323 * the PVH lock being held by the caller.
4324 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
4325 * by the caller. This is an input/output parameter which may be updated
4326 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
4327 * @param flush_range When present, this function will skip the TLB flush for the
4328 * mappings that are covered by the range, leaving that to be
4329 * done later by the caller. It may also avoid submitting mapping
4330 * updates directly to the SPTM, instead accumulating them in a
4331 * per-CPU array to be submitted later by the caller.
4332 *
4333 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4334 */
4335 MARK_AS_PMAP_TEXT static void
4336 pmap_page_protect_options_with_flush_range(
4337 ppnum_t ppnum,
4338 vm_prot_t prot,
4339 unsigned int options,
4340 locked_pvh_t *locked_pvh,
4341 pmap_tlb_flush_range_t *flush_range)
4342 {
4343 pmap_paddr_t phys = ptoa(ppnum);
4344 locked_pvh_t local_locked_pvh = {.pvh = 0};
4345 pv_entry_t *pve_p = NULL;
4346 pv_entry_t *pveh_p = NULL;
4347 pv_entry_t *pvet_p = NULL;
4348 pt_entry_t *pte_p = NULL;
4349 pv_entry_t *new_pve_p = NULL;
4350 pt_entry_t *new_pte_p = NULL;
4351
4352 bool remove = false;
4353 unsigned int pvh_cnt = 0;
4354 unsigned int num_mappings = 0, num_skipped_mappings = 0;
4355
4356 assert(ppnum != vm_page_fictitious_addr);
4357
4358 /**
4359 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4360 *
4361 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
4362 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
4363 * semantics conflict with each other, so assert they are not both true.
4364 */
4365 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
4366
4367 /* Only work with managed pages. */
4368 if (!pa_valid(phys)) {
4369 return;
4370 }
4371
4372 /*
4373 * Determine the new protection.
4374 */
4375 switch (prot) {
4376 case VM_PROT_ALL:
4377 return; /* nothing to do */
4378 case VM_PROT_READ:
4379 case VM_PROT_READ | VM_PROT_EXECUTE:
4380 break;
4381 default:
4382 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4383 options = options & ~PMAP_OPTIONS_NOFLUSH;
4384 remove = true;
4385 break;
4386 }
4387
4388 /**
4389 * We don't support cross-page batching (indicated by flush_range being non-NULL) for removals,
4390 * as removals must use the SPTM prev_ptes array for accounting, which isn't supported for cross-
4391 * page batches.
4392 */
4393 assert((flush_range == NULL) || !remove);
4394
4395 unsigned int pai = pa_index(phys);
4396 if (__probable(locked_pvh == NULL)) {
4397 if (flush_range != NULL) {
4398 /**
4399 * If we're partway through processing a multi-page batched call,
4400 * preemption will already be disabled so we can't simply call
4401 * pvh_lock() which may block. Instead, we first try to acquire
4402 * the lock without waiting, which in most cases should succeed.
4403 * If it fails, we submit the pending batched operations to re-
4404 * enable preemption and then acquire the lock normally.
4405 */
4406 local_locked_pvh = pvh_try_lock(pai);
4407 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
4408 pmap_multipage_op_submit(flush_range);
4409 local_locked_pvh = pvh_lock(pai);
4410 }
4411 } else {
4412 local_locked_pvh = pvh_lock(pai);
4413 }
4414 } else {
4415 local_locked_pvh = *locked_pvh;
4416 assert(pai == local_locked_pvh.pai);
4417 }
4418 assert(local_locked_pvh.pvh != 0);
4419 pvh_assert_locked(pai);
4420
4421 bool pvh_lock_sleep_mode_needed = false;
4422 bool clear_epoch = false;
4423
4424 /*
4425 * PVH should be locked before accessing per-CPU data, as we're relying on the lock
4426 * to disable preemption.
4427 */
4428 pmap_cpu_data_t *pmap_cpu_data = NULL;
4429 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4430 sptm_disjoint_op_t *sptm_ops = NULL;
4431 pt_desc_t **sptm_ptds = NULL;
4432 ptd_info_t **sptm_ptd_info = NULL;
4433
4434 /* BEGIN IGNORE CODESTYLE */
4435
4436 /**
4437 * This would also work as a block, with the above variables declared using the
4438 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
4439 * dereferencing __block variables through stack forwarding pointers) isn't needed
4440 * here, as we never need to use this code sequence as a closure.
4441 */
4442 #define PPO_PERCPU_INIT() do { \
4443 disable_preemption(); \
4444 pmap_cpu_data = pmap_get_cpu_data(); \
4445 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
4446 sptm_ops = sptm_pcpu->sptm_ops; \
4447 sptm_ptds = sptm_pcpu->sptm_ptds; \
4448 sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \
4449 if (remove) { \
4450 clear_epoch = true; \
4451 pmap_epoch_enter(); \
4452 } \
4453 } while (0)
4454
4455 /* END IGNORE CODESTYLE */
4456
4457
4458 PPO_PERCPU_INIT();
4459
4460 pv_entry_t **pve_pp = NULL;
4461
4462 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
4463 pte_p = pvh_ptep(local_locked_pvh.pvh);
4464 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4465 pve_p = pvh_pve_list(local_locked_pvh.pvh);
4466 pveh_p = pve_p;
4467 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
4468 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
4469 }
4470
4471 int pve_ptep_idx = 0;
4472 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4473
4474 /*
4475 * We need to keep track of whether a particular PVE list contains IOMMU
4476 * mappings when removing entries, because we should only remove CPU
4477 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4478 * it around.
4479 */
4480 bool iommu_mapping_in_pve = false;
4481
4482 /**
4483 * With regard to TLBI, there are three cases:
4484 *
4485 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
4486 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
4487 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
4488 * mapping is out of the range.
4489 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
4490 * let SPTM handle TLBI flushing.
4491 */
4492 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
4493 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
4494
4495 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4496 if (__improbable(pvh_lock_sleep_mode_needed)) {
4497 assert((num_mappings == 0) && (num_skipped_mappings == 0));
4498 if (clear_epoch) {
4499 pmap_epoch_exit();
4500 clear_epoch = false;
4501 }
4502 /**
4503 * Undo the explicit preemption disable done in the last call to PPO_PERCPU_INIT().
4504 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
4505 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
4506 * core while processing SPTM per-CPU data. At the same time, we also want preemption
4507 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
4508 * urgent ASTs can be handled.
4509 */
4510 enable_preemption();
4511 pvh_lock_enter_sleep_mode(&local_locked_pvh);
4512 pvh_lock_sleep_mode_needed = false;
4513 PPO_PERCPU_INIT();
4514 }
4515
4516 if (pve_p != PV_ENTRY_NULL) {
4517 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4518 if (pte_p == PT_ENTRY_NULL) {
4519 goto protect_skip_pve;
4520 }
4521 }
4522
4523 #ifdef PVH_FLAG_IOMMU
4524 if (pvh_ptep_is_iommu(pte_p)) {
4525 iommu_mapping_in_pve = true;
4526 if (__improbable(remove && (options & PMAP_OPTIONS_COMPRESSOR))) {
4527 const iommu_instance_t iommu = ptep_get_iommu(pte_p);
4528 panic("%s: attempt to compress ppnum 0x%x owned by iommu driver "
4529 "%u (token: %#x), pve_p=%p", __func__, ppnum, GET_IOMMU_ID(iommu),
4530 GET_IOMMU_TOKEN(iommu), pve_p);
4531 }
4532 if (remove && (pve_p == PV_ENTRY_NULL)) {
4533 /*
4534 * We've found an IOMMU entry and it's the only entry in the PV list.
4535 * We don't discard IOMMU entries, so simply set up the new PV list to
4536 * contain the single IOMMU PTE and exit the loop.
4537 */
4538 new_pte_p = pte_p;
4539 break;
4540 }
4541 ++num_skipped_mappings;
4542 goto protect_skip_pve;
4543 }
4544 #endif
4545
4546 const pt_entry_t spte = os_atomic_load(pte_p, relaxed);
4547
4548 if (__improbable(!remove && !pte_is_valid(spte))) {
4549 ++num_skipped_mappings;
4550 goto protect_skip_pve;
4551 }
4552
4553 pt_desc_t *ptdp = NULL;
4554 pmap_t pmap = NULL;
4555 vm_map_address_t va = 0;
4556
4557 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
4558 /**
4559 * If the current mapping matches the flush range's current iteration position,
4560 * there's no need to do the work of getting the PTD. We already know the pmap,
4561 * and the VA is implied by flush_range->pending_region_start.
4562 */
4563 pmap = flush_range->ptfr_pmap;
4564 } else {
4565 ptdp = ptep_get_ptd(pte_p);
4566 pmap = ptdp->pmap;
4567 va = ptd_get_va(ptdp, pte_p);
4568 }
4569
4570 /**
4571 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
4572 * pending disjoint ops, so we don't need to do flush range disjoint op management.
4573 */
4574 if ((flush_range != NULL) && (ptdp != NULL)) {
4575 /**
4576 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
4577 * We do this in three cases:
4578 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
4579 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
4580 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
4581 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
4582 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
4583 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
4584 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
4585 */
4586 uint32_t per_mapping_sptm_update_options = sptm_update_options;
4587 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4588 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
4589 }
4590 if ((num_mappings == 0) ||
4591 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
4592 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
4593 /**
4594 * If we needed to submit the pending disjoint ops to make room for the new page,
4595 * flush any pending region ops to reenable preemption and restart the loop with
4596 * the lock in sleep mode. This prevents preemption from being held disabled
4597 * for an arbitrary amount of time in the pathological case in which we have
4598 * both pending region ops and an excessively long PV list that repeatedly
4599 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
4600 */
4601 pmap_multipage_op_submit_region(flush_range);
4602 assert(num_mappings == 0);
4603 num_skipped_mappings = 0;
4604 pvh_lock_sleep_mode_needed = true;
4605 continue;
4606 }
4607 }
4608 }
4609
4610 if (__improbable((pmap == NULL) ||
4611 (pte_is_valid(spte) && (atop(pte_to_pa(spte)) != ppnum)))) {
4612 #if MACH_ASSERT
4613 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4614 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as a duplicate. */
4615 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4616 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4617
4618 pv_entry_t *check_pvep = pve_p;
4619
4620 do {
4621 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4622 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4623 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, (void*)local_locked_pvh.pvh, pve_p, pai);
4624 }
4625 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4626
4627 /* Restore previous PTEP value. */
4628 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4629 }
4630 #endif
4631 panic("%s: bad PVE pte_p=%p pmap=%p prot=%d options=%u, pvh=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4632 __func__, pte_p, pmap, prot, options, (void*)local_locked_pvh.pvh, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4633 }
4634
4635 pt_entry_t pte_template = ARM_PTE_EMPTY;
4636
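/*
 * When we have a page table descriptor for this mapping, stage its root table
 * and VA in the per-CPU disjoint-ops array; mappings without a PTD are instead
 * folded into the flush_range's pending region templates further below.
 */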
4637 if (ptdp != NULL) {
4638 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
4639 sptm_ops[num_mappings].vaddr = va;
4640 }
4641
4642 /* Remove the mapping if new protection is NONE */
4643 if (remove) {
4644 sptm_ptds[num_mappings] = ptdp;
4645 sptm_ptd_info[num_mappings] = ptd_get_info(ptdp);
4646 sptm_pcpu->sptm_acct_flags[num_mappings] = 0;
4647 if (pmap != kernel_pmap) {
4648 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4649 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4650
4651 if (is_internal) {
4652 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_INTERNAL;
4653 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4654 }
4655 if (is_altacct) {
4656 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_ALTACCT;
4657 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4658 }
4659 if (compress && is_internal) {
4660 pte_template = ARM_PTE_COMPRESSED;
4661 if (is_altacct) {
4662 pte_template |= ARM_PTE_COMPRESSED_ALT;
4663 }
4664 }
4665 }
4666 /* Remove this CPU mapping from PVE list. */
4667 if (pve_p != PV_ENTRY_NULL) {
4668 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4669 }
4670 } else {
4671 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4672
4673 if (pmap == kernel_pmap) {
4674 pte_template = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4675 } else {
4676 pte_template = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4677 }
4678
4679 /*
4680 * We must at least clear the 'was writeable' flag, as we're at least revoking write access,
4681 * meaning that the VM is effectively requesting that subsequent write accesses to these mappings
4682 * go through vm_fault() instead of being handled by arm_fast_fault().
4683 */
4684 pte_set_was_writeable(pte_template, false);
4685
4686 /*
4687 * While the naive implementation of this would serve to add execute
4688 * permission, this is not how the VM uses this interface, or how
4689 * x86_64 implements it. So ignore requests to add execute permissions.
4690 */
4691 #if DEVELOPMENT || DEBUG
4692 if ((!(prot & VM_PROT_EXECUTE) && nx_enabled && pmap->nx_enabled) ||
4693 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4694 #else
4695 if (!(prot & VM_PROT_EXECUTE) ||
4696 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4697 #endif
4698 {
4699 pte_template |= pt_attr_leaf_xn(pt_attr);
4700 }
4701 }
4702
4703 if (ptdp != NULL) {
4704 sptm_ops[num_mappings].pte_template = pte_template;
4705 ++num_mappings;
4706 } else if (pmap_insert_flush_range_template(pte_template, flush_range)) {
4707 /**
4708 * We submit both the pending disjoint and pending region ops whenever
4709 * either category reaches the mapping limit. Having pending operations
4710 * in either category will keep preemption disabled, and we want to ensure
4711 * that we can at least temporarily re-enable preemption roughly every
4712 * SPTM_MAPPING_LIMIT mappings.
4713 */
4714 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4715 pvh_lock_sleep_mode_needed = true;
4716 num_mappings = num_skipped_mappings = 0;
4717 }
4718
4719 protect_skip_pve:
4720 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
4721 if (flush_range != NULL) {
4722 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
4723 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4724 pmap_multipage_op_submit_region(flush_range);
4725 } else if (num_mappings > 0) {
4726 if (remove) {
4727 pmap_disjoint_unmap(phys, num_mappings);
4728 } else {
4729 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4730 }
4731 }
4732 pvh_lock_sleep_mode_needed = true;
4733 num_mappings = num_skipped_mappings = 0;
4734 }
4735 pte_p = PT_ENTRY_NULL;
4736 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4737 pve_ptep_idx = 0;
4738
4739 if (remove) {
4740 /**
4741 * If there are any IOMMU mappings in the PVE list, preserve
4742 * those mappings in a new PVE list (new_pve_p) which will later
4743 * become the new PVH entry. Keep track of the CPU mappings in
4744 * pveh_p/pvet_p so they can be deallocated later.
4745 */
4746 if (iommu_mapping_in_pve) {
4747 iommu_mapping_in_pve = false;
4748 pv_entry_t *temp_pve_p = pve_next(pve_p);
4749 pve_remove(&local_locked_pvh, pve_pp, pve_p);
4750 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4751 pveh_p = pvh_pve_list(local_locked_pvh.pvh);
4752 } else {
4753 assert(pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL));
4754 pveh_p = PV_ENTRY_NULL;
4755 }
4756 pve_p->pve_next = new_pve_p;
4757 new_pve_p = pve_p;
4758 pve_p = temp_pve_p;
4759 continue;
4760 } else {
4761 pvet_p = pve_p;
4762 pvh_cnt++;
4763 }
4764 }
4765
4766 pve_pp = pve_next_ptr(pve_p);
4767 pve_p = pve_next(pve_p);
4768 iommu_mapping_in_pve = false;
4769 }
4770 }
4771
4772 if (num_mappings != 0) {
4773 if (remove) {
4774 pmap_disjoint_unmap(phys, num_mappings);
4775 } else if (flush_range == NULL) {
4776 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4777 } else {
4778 /* Resync the pending mapping state in flush_range with our local state. */
4779 assert(num_mappings >= flush_range->pending_disjoint_entries);
4780 flush_range->pending_disjoint_entries = num_mappings;
4781 }
4782 }
4783
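/*
 * Leave the pmap epoch if it was entered earlier in this call (clear_epoch is
 * presumably set wherever the epoch was entered, e.g. during the per-CPU setup
 * referenced below).
 */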
4784 if (clear_epoch) {
4785 pmap_epoch_exit();
4786 }
4787
4788 /**
4789 * Undo the explicit disable_preemption() done in PPO_PERCPU_INIT().
4790 * Note that enable_preemption() decrements a per-thread counter, so if
4791 * we happen to still hold the PVH lock in spin mode then preemption won't
4792 * actually be re-enabled until we drop the lock (which also decrements
4793 * the per-thread counter).
4794 */
4795 enable_preemption();
4796
4797 /* if we removed a bunch of entries, take care of them now */
4798 if (remove) {
4799 /**
4800 * If a retype is going to be needed here and/or by our caller, drain
4801 * the epochs to ensure that concurrent calls to batched operations such as
4802 * pmap_remove() and the various multipage attribute update functions have
4803 * finished consuming mappings of this page.
4804 */
4805 bool retype_needed = false;
4806 sptm_frame_type_t frame_type = XNU_DEFAULT;
4807 if (options & (PMAP_OPTIONS_PPO_PENDING_RETYPE | PMAP_OPTIONS_RETYPE)) {
4808 /**
4809 * If the frame type isn't currently XNU_DEFAULT, retype it back either
4810 * to satisfy the caller's request (PMAP_OPTIONS_RETYPE) or to ensure
4811 * the caller's subsequent retype will work, as not all non-default types
4812 * can be directly retyped to one another without going through XNU_DEFAULT.
4813 */
4814 frame_type = sptm_get_frame_type(phys);
4815 retype_needed = (frame_type != XNU_DEFAULT);
4816 }
4817 /**
4818 * If the caller is indicating that it will subsequently retype the page
4819 * by passing PMAP_OPTIONS_PPO_PENDING_RETYPE, then we'll need to drain the epochs
4820 * regardless of current frame type to prepare for the caller's retype.
4821 */
4822 const bool drain_needed = retype_needed || !!(options & PMAP_OPTIONS_PPO_PENDING_RETYPE);
4823 if (__improbable(drain_needed)) {
4824 pmap_epoch_prepare_drain();
4825 }
4826 if (new_pve_p != PV_ENTRY_NULL) {
4827 pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP);
4828 } else if (new_pte_p != PT_ENTRY_NULL) {
4829 pvh_update_head(&local_locked_pvh, new_pte_p, PVH_TYPE_PTEP);
4830 } else {
4831 pvh_set_flags(&local_locked_pvh, 0);
4832 pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
4833 }
4834
4835 if (__improbable(drain_needed)) {
4836 pmap_epoch_drain();
4837 }
4838 if (__improbable(retype_needed)) {
4839 const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
4840 sptm_retype(phys, frame_type, XNU_DEFAULT, retype_params);
4841 }
4842 }
4843
4844 if (__probable(locked_pvh == NULL)) {
4845 pvh_unlock(&local_locked_pvh);
4846 } else {
4847 *locked_pvh = local_locked_pvh;
4848 }
4849
4850 if (remove && (pvet_p != PV_ENTRY_NULL)) {
4851 assert(pveh_p != PV_ENTRY_NULL);
4852 pv_list_free(pveh_p, pvet_p, pvh_cnt);
4853 }
4854
4855 if ((flush_range != NULL) && !preemption_enabled()) {
4856 flush_range->processed_entries += num_skipped_mappings;
4857 }
4858 }
4859
4860 MARK_AS_PMAP_TEXT void
4861 pmap_page_protect_options_internal(
4862 ppnum_t ppnum,
4863 vm_prot_t prot,
4864 unsigned int options,
4865 void *arg)
4866 {
4867 if (arg != NULL) {
4868 /*
4869 * This is a legacy argument from the pre-ARM era that the VM layer passes in to hint that it will call
4870 * pmap_flush() later to flush the TLB. On ARM platforms, however, pmap_flush() is not implemented,
4871 * as it's typically more efficient to perform the TLB flushing inline with the page table updates
4872 * themselves. Therefore, if the argument is non-NULL, pmap will take care of TLB flushing itself
4873 * by clearing PMAP_OPTIONS_NOFLUSH.
4874 */
4875 options &= ~PMAP_OPTIONS_NOFLUSH;
4876 }
4877 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL, NULL);
4878 }
4879
4880 void
4881 pmap_page_protect_options(
4882 ppnum_t ppnum,
4883 vm_prot_t prot,
4884 unsigned int options,
4885 void *arg)
4886 {
4887 pmap_paddr_t phys = ptoa(ppnum);
4888
4889 assert(ppnum != vm_page_fictitious_addr);
4890
4891 /* Only work with managed pages. */
4892 if (!pa_valid(phys)) {
4893 return;
4894 }
4895
4896 /*
4897 * Determine the new protection.
4898 */
4899 if (prot == VM_PROT_ALL) {
4900 return; /* nothing to do */
4901 }
4902
4903 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
4904
4905 pmap_page_protect_options_internal(ppnum, prot, options, arg);
4906
4907 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
4908 }
4909
4910
4911 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
4912 MARK_AS_PMAP_TEXT void
4913 pmap_disable_user_jop_internal(pmap_t pmap)
4914 {
4915 if (pmap == kernel_pmap) {
4916 panic("%s: called with kernel_pmap", __func__);
4917 }
4918 validate_pmap_mutable(pmap);
4919 sptm_configure_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_JOP);
4920 pmap->disable_jop = true;
4921 }
4922
4923 void
4924 pmap_disable_user_jop(pmap_t pmap)
4925 {
4926 pmap_disable_user_jop_internal(pmap);
4927 }
4928 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
4929
4930 /*
4931 * Indicates if the pmap layer enforces some additional restrictions on the
4932 * given set of protections.
4933 */
4934 bool
4935 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
4936 {
4937 return false;
4938 }
4939
4940 /*
4941 * Set the physical protection on the
4942 * specified range of this map as requested.
4943 * VERY IMPORTANT: Will not increase permissions.
4944 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
4945 */
4946 void
4947 pmap_protect(
4948 pmap_t pmap,
4949 vm_map_address_t b,
4950 vm_map_address_t e,
4951 vm_prot_t prot)
4952 {
4953 pmap_protect_options(pmap, b, e, prot, 0, NULL);
4954 }
4955
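/*
 * Hook indicating whether a batched permission update requires an extra-strong
 * TLB synchronization; in this configuration it always returns false. Callers
 * accumulate the result and issue arm64_sync_tlb(true) after the batch if any
 * update requested it.
 */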
4956 static bool
4957 pmap_protect_strong_sync(unsigned int num_mappings __unused)
4958 {
4959 return false;
4960 }
4961
4962 MARK_AS_PMAP_TEXT vm_map_address_t
4963 pmap_protect_options_internal(
4964 pmap_t pmap,
4965 vm_map_address_t start,
4966 vm_map_address_t end,
4967 vm_prot_t prot,
4968 unsigned int options,
4969 __unused void *args)
4970 {
4971 pt_entry_t *pte_p;
4972 bool set_NX = true;
4973 bool set_XO = false;
4974 bool should_have_removed = false;
4975 bool need_strong_sync = false;
4976
4977 /* Validate the pmap input before accessing its data. */
4978 validate_pmap_mutable(pmap);
4979
4980 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4981
4982 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
4983 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4984 }
4985
4986 #if DEVELOPMENT || DEBUG
4987 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
4988 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
4989 should_have_removed = true;
4990 }
4991 } else
4992 #endif
4993 {
4994 /* Determine the new protection. */
4995 switch (prot) {
4996 case VM_PROT_EXECUTE:
4997 set_XO = true;
4998 OS_FALLTHROUGH;
4999 case VM_PROT_READ:
5000 case VM_PROT_READ | VM_PROT_EXECUTE:
5001 break;
5002 case VM_PROT_READ | VM_PROT_WRITE:
5003 case VM_PROT_ALL:
5004 return end; /* nothing to do */
5005 default:
5006 should_have_removed = true;
5007 }
5008 }
5009
5010 if (__improbable(should_have_removed)) {
5011 panic("%s: should have been a remove operation, "
5012 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5013 __FUNCTION__,
5014 pmap, (void *)start, (void *)end, prot, options, args);
5015 }
5016
5017 #if DEVELOPMENT || DEBUG
5018 bool force_write = false;
5019 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5020 force_write = true;
5021 }
5022 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5023 #else
5024 if ((prot & VM_PROT_EXECUTE))
5025 #endif
5026 {
5027 set_NX = false;
5028 } else {
5029 set_NX = true;
5030 }
5031
5032 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5033 vm_map_address_t va = start;
5034 vm_map_address_t sptm_start_va = start;
5035 unsigned int num_mappings = 0;
5036
5037 pmap_lock(pmap, PMAP_LOCK_SHARED);
5038
5039 pte_p = pmap_pte(pmap, start);
5040
5041 if (pte_p == NULL) {
5042 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5043 return end;
5044 }
5045
5046 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
5047 #if DEVELOPMENT || DEBUG
5048 if (!force_write)
5049 #endif
5050 {
5051 disable_preemption();
5052 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5053 }
5054
5055 pt_entry_t tmplate = ARM_PTE_EMPTY;
5056
5057 if (pmap == kernel_pmap) {
5058 #if DEVELOPMENT || DEBUG
5059 if (force_write) {
5060 tmplate = ARM_PTE_AP(AP_RWNA);
5061 } else
5062 #endif
5063 {
5064 tmplate = ARM_PTE_AP(AP_RONA);
5065 }
5066 } else {
5067 #if DEVELOPMENT || DEBUG
5068 if (force_write) {
5069 assert(pmap->type != PMAP_TYPE_NESTED);
5070 tmplate = pt_attr_leaf_rw(pt_attr);
5071 } else
5072 #endif
5073 if (set_XO) {
5074 tmplate = pt_attr_leaf_rona(pt_attr);
5075 } else {
5076 tmplate = pt_attr_leaf_ro(pt_attr);
5077 }
5078 }
5079
5080 if (set_NX) {
5081 tmplate |= pt_attr_leaf_xn(pt_attr);
5082 }
5083
5084 while (va < end) {
5085 pt_entry_t spte = ARM_PTE_EMPTY;
5086
5087 /**
5088 * Removing "NX" would grant "execute" access immediately, bypassing any
5089 * checks VM might want to do in its soft fault path.
5090 * pmap_protect() and co. are not allowed to increase access permissions,
5091 * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case.
5092 * Therefore, if we are not explicitly clearing execute permissions, inherit
5093 * the existing permissions.
5094 */
5095 if (!set_NX) {
5096 spte = os_atomic_load(pte_p, relaxed);
5097 if (__improbable(!pte_is_valid(spte))) {
5098 tmplate |= pt_attr_leaf_xn(pt_attr);
5099 } else {
5100 tmplate |= (spte & ARM_PTE_XMASK);
5101 }
5102 }
5103
5104 #if DEVELOPMENT || DEBUG
5105 /*
5106 * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to
5107 * provide a "backdoor" to allow normally write-protected compressor pages to be
5108 * temporarily written without triggering expensive write faults.
5109 */
5110 while (force_write) {
5111 if (spte == ARM_PTE_EMPTY) {
5112 spte = os_atomic_load(pte_p, relaxed);
5113 }
5114 const pt_entry_t prev_pte = spte;
5115
5116 /* A concurrent disconnect may have cleared the PTE. */
5117 if (__improbable(!pte_is_valid(spte))) {
5118 break;
5119 }
5120
5121 /* Inherit permissions and "was_writeable" from the template. */
5122 spte = (spte & ~(ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE)) |
5123 (tmplate & (ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE));
5124
5125 /* Access flag should be set for any immediate change in protections */
5126 spte |= ARM_PTE_AF;
5127 const pmap_paddr_t pa = pte_to_pa(spte);
5128 const unsigned int pai = pa_index(pa);
5129 locked_pvh_t locked_pvh;
5130 if (pa_valid(pa)) {
5131 locked_pvh = pvh_lock(pai);
5132
5133 /**
5134 * The VM may concurrently call pmap_disconnect() on the compressor
5135 * page in question, e.g. if relocating the page to satisfy a precious
5136 * allocation. Now that we hold the PVH lock, re-check the PTE and
5137 * restart the loop if it's different from the value we read before
5138 * we held the lock.
5139 */
5140 if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) {
5141 pvh_unlock(&locked_pvh);
5142 spte = ARM_PTE_EMPTY;
5143 continue;
5144 }
5145 ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT,
5146 PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5147 }
5148
5149 __assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte);
5150
5151 /**
5152 * We don't expect the VM to be concurrently calling pmap_remove() against these
5153 * compressor mappings. If it does for some reason, that could cause the above
5154 * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING.
5155 */
5156 assert3u(sptm_status, ==, SPTM_MAP_VALID);
5157
5158 if (pa_valid(pa)) {
5159 pvh_unlock(&locked_pvh);
5160 }
5161 break;
5162 }
5163
5164 #endif /* DEVELOPMENT || DEBUG */
5165
5166 va += pmap_page_size;
5167 ++pte_p;
5168
5169 #if DEVELOPMENT || DEBUG
5170 if (!force_write)
5171 #endif
5172 {
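/*
 * Stage this PTE template in the per-CPU buffer; once SPTM_MAPPING_LIMIT
 * templates accumulate, the run is handed to the SPTM in a single
 * sptm_update_region() call and preemption is briefly re-enabled between
 * batches.
 */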
5173 sptm_pcpu->sptm_templates[num_mappings] = tmplate;
5174 ++num_mappings;
5175 if (num_mappings == SPTM_MAPPING_LIMIT) {
5176 /**
5177 * Enter the pmap epoch for the batched update operation. This is necessary because we
5178 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
5179 * call, so a concurrent pmap_page_protect() operation against one of those pages may
5180 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
5181 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
5182 * if it does not first drain our epoch.
5183 */
5184 pmap_epoch_enter();
5185 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5186 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5187 pmap_epoch_exit();
5188 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5189
5190 /* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */
5191 enable_preemption();
5192 num_mappings = 0;
5193 sptm_start_va = va;
5194 disable_preemption();
5195 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5196 }
5197 }
5198 }
5199
5200 /* This won't happen in the force_write case as we should never increment num_mappings. */
5201 if (num_mappings != 0) {
5202 pmap_epoch_enter();
5203 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5204 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5205 pmap_epoch_exit();
5206 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5207 }
5208
5209 #if DEVELOPMENT || DEBUG
5210 if (!force_write)
5211 #endif
5212 {
5213 enable_preemption();
5214 }
5215 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5216 if (__improbable(need_strong_sync)) {
5217 arm64_sync_tlb(true);
5218 }
5219 return va;
5220 }
5221
5222 void
5223 pmap_protect_options(
5224 pmap_t pmap,
5225 vm_map_address_t b,
5226 vm_map_address_t e,
5227 vm_prot_t prot,
5228 unsigned int options,
5229 __unused void *args)
5230 {
5231 vm_map_address_t l, beg;
5232
5233 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5234
5235 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5236 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5237 pmap, (uint64_t)b, (uint64_t)e);
5238 }
5239
5240 /*
5241 * We allow single-page requests to execute non-preemptibly,
5242 * as it doesn't make sense to sample AST_URGENT for a single-page
5243 * operation, and there are a couple of special use cases that
5244 * require a non-preemptible single-page operation.
5245 */
5246 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5247 pmap_verify_preemptible();
5248 }
5249
5250 #if DEVELOPMENT || DEBUG
5251 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5252 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5253 pmap_remove_options(pmap, b, e, options);
5254 return;
5255 }
5256 } else
5257 #endif
5258 {
5259 /* Determine the new protection. */
5260 switch (prot) {
5261 case VM_PROT_EXECUTE:
5262 case VM_PROT_READ:
5263 case VM_PROT_READ | VM_PROT_EXECUTE:
5264 break;
5265 case VM_PROT_READ | VM_PROT_WRITE:
5266 case VM_PROT_ALL:
5267 return; /* nothing to do */
5268 default:
5269 pmap_remove_options(pmap, b, e, options);
5270 return;
5271 }
5272 }
5273
5274 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5275 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5276 VM_KERNEL_ADDRHIDE(e));
5277
5278 beg = b;
5279
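/*
 * Process the range one twig (leaf page table) worth of VA at a time;
 * pmap_protect_options_internal() rejects ranges that cross a twig boundary
 * and returns the address at which to resume.
 */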
5280 while (beg < e) {
5281 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5282
5283 if (l > e) {
5284 l = e;
5285 }
5286
5287 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5288 }
5289
5290
5291 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5292 }
5293
5294 /**
5295 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5296 *
5297 * @param pmap pmap to insert the pages into.
5298 * @param va virtual address to map the pages into.
5299 * @param pa page number of the first physical page to map.
5300 * @param size block size, in number of pages.
5301 * @param prot mapping protection attributes.
5302 * @param attr flags to pass to pmap_enter().
5303 *
5304 * @return KERN_SUCCESS.
5305 */
5306 kern_return_t
5307 pmap_map_block(
5308 pmap_t pmap,
5309 addr64_t va,
5310 ppnum_t pa,
5311 uint32_t size,
5312 vm_prot_t prot,
5313 int attr,
5314 unsigned int flags)
5315 {
5316 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5317 }
5318
5319 /**
5320 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5321 * As opposed to pmap_map_block(), this function takes
5322 * a physical address as an input and operates using the
5323 * page size associated with the input pmap.
5324 *
5325 * @param pmap pmap to insert the pages into.
5326 * @param va virtual address to map the pages into.
5327 * @param pa physical address of the first physical page to map.
5328 * @param size block size, in number of pages.
5329 * @param prot mapping protection attributes.
5330 * @param attr flags to pass to pmap_enter().
5331 *
5332 * @return KERN_SUCCESS.
5333 */
5334 kern_return_t
5335 pmap_map_block_addr(
5336 pmap_t pmap,
5337 addr64_t va,
5338 pmap_paddr_t pa,
5339 uint32_t size,
5340 vm_prot_t prot,
5341 int attr,
5342 unsigned int flags)
5343 {
5344 #if __ARM_MIXED_PAGE_SIZE__
5345 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5346 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5347 #else
5348 const uint64_t pmap_page_size = PAGE_SIZE;
5349 #endif
5350
5351 for (ppnum_t page = 0; page < size; page++) {
5352 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER) != KERN_SUCCESS) {
5353 panic("%s: failed pmap_enter_addr, "
5354 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5355 __FUNCTION__,
5356 pmap, va, (uint64_t)pa, size, prot, flags);
5357 }
5358
5359 va += pmap_page_size;
5360 pa += pmap_page_size;
5361 }
5362
5363
5364 return KERN_SUCCESS;
5365 }
5366
5367 kern_return_t
5368 pmap_enter_addr(
5369 pmap_t pmap,
5370 vm_map_address_t v,
5371 pmap_paddr_t pa,
5372 vm_prot_t prot,
5373 vm_prot_t fault_type,
5374 unsigned int flags,
5375 boolean_t wired,
5376 pmap_mapping_type_t mapping_type)
5377 {
5378 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, mapping_type);
5379 }
5380
5381 /*
5382 * Insert the given physical page (p) at
5383 * the specified virtual address (v) in the
5384 * target physical map with the protection requested.
5385 *
5386 * If specified, the page will be wired down, meaning
5387 * that the related pte can not be reclaimed.
5388 *
5389 * NB: This is the only routine which MAY NOT lazy-evaluate
5390 * or lose information. That is, this routine must actually
5391 * insert this page into the given map (must make forward
5392 * progress eventually).
5393 */
5394 kern_return_t
5395 pmap_enter(
5396 pmap_t pmap,
5397 vm_map_address_t v,
5398 ppnum_t pn,
5399 vm_prot_t prot,
5400 vm_prot_t fault_type,
5401 unsigned int flags,
5402 boolean_t wired,
5403 pmap_mapping_type_t mapping_type)
5404 {
5405 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type);
5406 }
5407
5408 /**
5409 * Helper function for determining the frame type that will be required for a physical page given
5410 * a set of mapping constraints.
5411 *
5412 * @param pmap The address space in which the page will be mapped.
5413 * @param pte The fully-configured page table entry, including permissions and output address, that
5414 * will be used for the mapping.
5415 * @param vaddr The virtual address that will be mapped using [pte]
5416 * @param options Extra mapping options that would be passed to pmap_enter() when performing the mapping
5417 * @param mapping_type The mapping type enum that would be passed to pmap_enter() when performing the mapping
5418 * @param prev_frame_type Output param that will store the existing frame type for the physical page
5419 * mapped by [pte]. As an optimization, this will only be queried if [*new_frame_type]
5420 * is determined to be something other than XNU_DEFAULT; otherwise it will be assumed
5421 * to be XNU_DEFAULT
5422 * @param new_frame_type Output param that will store the new frame type that will be required for the
5423 * physical page mapped by [pte]
5424 */
5425 static inline void
5426 pmap_frame_type_for_pte(
5427 pmap_t pmap __assert_only,
5428 pt_entry_t pte,
5429 vm_map_address_t vaddr __assert_only,
5430 unsigned int options,
5431 pmap_mapping_type_t mapping_type,
5432 sptm_frame_type_t *prev_frame_type,
5433 sptm_frame_type_t *new_frame_type)
5434 {
5435 const pmap_paddr_t paddr = pte_to_pa(pte) & ~PAGE_MASK;
5436 assert(prev_frame_type != NULL);
5437 assert(new_frame_type != NULL);
5438 *prev_frame_type = *new_frame_type = XNU_DEFAULT;
5439
5440 /*
5441 * If the caller specified a mapping type of PMAP_MAPPING_TYPE_INFER, then we
5442 * keep the existing logic of deriving the SPTM frame type from the XPRR permissions.
5443 *
5444 * If the caller specified another mapping type, we simply follow that. This refactor was
5445 * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise about
5446 * what we want. It's better to let the caller specify the mapping type rather than use the
5447 * permissions for that.
5448 *
5449 * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323.
5450 */
5451 if (mapping_type != PMAP_MAPPING_TYPE_INFER) {
5452 switch (mapping_type) {
5453 case PMAP_MAPPING_TYPE_DEFAULT:
5454 *new_frame_type = (sptm_frame_type_t)mapping_type;
5455 break;
5456 case PMAP_MAPPING_TYPE_ROZONE:
5457 assert(((pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + pt_attr_page_size(pmap_get_pt_attr(pmap)))));
5458 *new_frame_type = (sptm_frame_type_t)mapping_type;
5459 break;
5460 case PMAP_MAPPING_TYPE_RESTRICTED:
5461 if (use_xnu_restricted) {
5462 *new_frame_type = (sptm_frame_type_t)mapping_type;
5463 } else {
5464 *new_frame_type = XNU_DEFAULT;
5465 }
5466 break;
5467 default:
5468 panic("invalid mapping type: %d", mapping_type);
5469 }
5470 } else if (__improbable(pte_to_xprr_perm(pte) == XPRR_USER_JIT_PERM)) {
5471 /*
5472 * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using
5473 * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other
5474 * flags which the VM may have provided.
5475 *
5476 * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering
5477 * this case. We can't do this for now because this might trigger on some macOS
5478 * systems where applications use MAP_JIT with RW/RX permissions, and then later
5479 * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG
5480 * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can
5481 * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application
5482 * switches to RWX, then we can start asserting this requirement.
5483 */
5484 *new_frame_type = XNU_USER_JIT;
5485 } else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) {
5486 /*
5487 * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must
5488 * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the
5489 * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type.
5490 */
5491 *new_frame_type = XNU_USER_DEBUG;
5492 } else if (pte_to_xprr_perm(pte) == XPRR_USER_RX_PERM) {
5493 *new_frame_type = XNU_USER_EXEC;
5494 } else if ((pte_to_xprr_perm(pte) == XPRR_USER_RW_PERM) ||
5495 (pte_was_writeable(pte) && (pte_to_xprr_perm(pte) == XPRR_USER_RO_PERM))) {
5496 /**
5497 * Allow retyping from user executable types (except XNU_USER_DEBUG, which already
5498 * allows user RW mappings) back to XNU_DEFAULT if a writable mapping is requested.
5499 * Our retype logic will disconnect all existing mappings, so future attempts to
5500 * execute these pages will fault, retype back to exec, and go back through any
5501 * needed CS validation. For all other current frame types, just leave the previous
5502 * and new frame types unchanged; for most other types attempting to add a user RW
5503 * mapping is a bug and we should just let the SPTM throw a violation.
5504 */
5505 const sptm_frame_type_t cur_frame_type = sptm_get_frame_type(paddr);
5506 if (__improbable(sptm_type_is_user_executable(cur_frame_type) &&
5507 (cur_frame_type != XNU_USER_DEBUG))) {
5508 *prev_frame_type = cur_frame_type;
5509 }
5510 }
5511
5512 if (__improbable(*new_frame_type != XNU_DEFAULT)) {
5513 *prev_frame_type = sptm_get_frame_type(paddr);
5514 }
5515 }
5516
5517 /*
5518 * Construct a PTE (and the physical page attributes) for the given virtual to
5519 * physical mapping.
5520 *
5521 * @param pmap The pmap representing the address space for which to construct
5522 * the mapping.
5523 * @param pa The physical address to be mapped by the new PTE.
5524 * @param prot Access permissions to apply to the new PTE.
5525 * @param fault_type The type of access fault that is triggering the request
5526 * to construct the new PTE.
5527 * @param wired Whether the new PTE should have the wired bit set.
5528 * @param pp_attr_bits Output parameter that will return the physical page attributes
5529 * to apply to pp_attr_table for the new mapping.
5530 *
5531 * This function has no side effects and is safe to call while attempting a
5532 * pmap_enter transaction.
5533 */
5534 MARK_AS_PMAP_TEXT static pt_entry_t
5535 pmap_construct_pte(
5536 const pmap_t pmap,
5537 pmap_paddr_t pa,
5538 vm_prot_t prot,
5539 vm_prot_t fault_type,
5540 boolean_t wired,
5541 uint16_t *pp_attr_bits /* OUTPUT */
5542 )
5543 {
5544 const pt_attr_t* const pt_attr = pmap_get_pt_attr(pmap);
5545 bool set_NX = false, set_XO = false;
5546 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
5547 assert(pp_attr_bits != NULL);
5548 *pp_attr_bits = 0;
5549
5550 if (wired) {
5551 pte |= ARM_PTE_WIRED;
5552 }
5553
5554 #if DEVELOPMENT || DEBUG
5555 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5556 #else
5557 if ((prot & VM_PROT_EXECUTE))
5558 #endif
5559 {
5560 set_NX = false;
5561 } else {
5562 set_NX = true;
5563 }
5564
5565 if (prot == VM_PROT_EXECUTE) {
5566 set_XO = true;
5567
5568 }
5569
5570 if (set_NX) {
5571 pte |= pt_attr_leaf_xn(pt_attr);
5572 } else {
5573 if (pmap == kernel_pmap) {
5574 pte |= ARM_PTE_NX;
5575 } else {
5576 pte |= pt_attr_leaf_x(pt_attr);
5577 }
5578 }
5579
5580 if (pmap == kernel_pmap) {
5581 #if __ARM_KERNEL_PROTECT__
5582 pte |= ARM_PTE_NG;
5583 #endif /* __ARM_KERNEL_PROTECT__ */
5584 if (prot & VM_PROT_WRITE) {
5585 pte |= ARM_PTE_AP(AP_RWNA);
5586 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5587 } else {
5588 pte |= ARM_PTE_AP(AP_RONA);
5589 *pp_attr_bits |= PP_ATTR_REFERENCED;
5590 }
5591 } else {
5592 if (pmap->type != PMAP_TYPE_NESTED) {
5593 pte |= ARM_PTE_NG;
5594 }
5595 if (prot & VM_PROT_WRITE) {
5596 assert(pmap->type != PMAP_TYPE_NESTED);
5597 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5598 if (fault_type & VM_PROT_WRITE) {
5599 pte |= pt_attr_leaf_rw(pt_attr);
5600 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5601 } else {
5602 pte |= pt_attr_leaf_ro(pt_attr);
5603 /*
5604 * Mark the page as MODFAULT so that a subsequent write
5605 * may be handled through arm_fast_fault().
5606 */
5607 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5608 pte_set_was_writeable(pte, true);
5609 }
5610 } else {
5611 pte |= pt_attr_leaf_rw(pt_attr);
5612 *pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5613 }
5614 } else {
5615 if (set_XO) {
5616 pte |= pt_attr_leaf_rona(pt_attr);
5617 } else {
5618 pte |= pt_attr_leaf_ro(pt_attr);
5619 }
5620 *pp_attr_bits |= PP_ATTR_REFERENCED;
5621 }
5622 }
5623
5624 pte |= ARM_PTE_AF;
5625 return pte;
5626 }
5627
5628 /**
5629 * This function allows the VM to query whether a mapping operation will result in a page being
5630 * retyped, without actually performing the mapping operation. It's useful for the VM to know
5631 * this when performing up-front page validation under the VM object lock.
5632 *
5633 * @param pmap The address space in which the mapping will occur
5634 * @param vaddr The virtual address that will be mapped
5635 * @param pn The physical page number to be mapped by [vaddr]
5636 * @param prot The permissions to be used for the mapping
5637 * @param options The extra mapping options that would be passed to pmap_enter() if the
5638 * mapping operation were performed
5639 * @param mapping_type The mapping type enum that would be passed to pmap_enter() if the
5640 * mapping operation were performed
5641 *
5642 * @return True if the mapping operation would produce a retype of the page at [pn],
5643 * False otherwise
5644 */
5645 bool
5646 pmap_will_retype(
5647 pmap_t pmap,
5648 vm_map_address_t vaddr,
5649 ppnum_t pn,
5650 vm_prot_t prot,
5651 unsigned int options,
5652 pmap_mapping_type_t mapping_type)
5653 {
5654 const pmap_paddr_t paddr = ptoa(pn);
5655 uint16_t pp_attr_bits;
5656 pt_entry_t pte = pmap_construct_pte(pmap, paddr, prot, prot, false, &pp_attr_bits);
5657 sptm_frame_type_t prev_frame_type, new_frame_type;
5658 pmap_frame_type_for_pte(pmap, pte, vaddr, options, mapping_type, &prev_frame_type, &new_frame_type);
5659
5660 return new_frame_type != prev_frame_type;
5661 }
5662
5663 /*
5664 * Attempt to update a PTE constructed by pmap_enter_options().
5665 *
5666 * @note performs no page table or accounting modifications, nor any lasting SPTM page type modification, on failure.
5667 * @note expects to be called with preemption disabled to guarantee safe access to SPTM per-CPU data.
5668 *
5669 * @param pmap The pmap representing the address space in which to store the new PTE
5670 * @param pte_p The physical aperture KVA of the PTE to store
5671 * @param new_pte The new value to store in *pte_p
5672 * @param v The virtual address mapped by pte_p
5673 * @param locked_pvh Input/Output parameter pointing to a wrapped pv_head_table entry returned by
5674 * a previous call to pvh_lock(). *locked_pvh will be updated if existing mappings
5675 * need to be disconnected prior to retyping.
5676 * @param old_pte Returns the prior PTE contents, iff the PTE is successfully updated
5677 * @param options bitmask of PMAP_OPTIONS_* flags passed to pmap_enter_options().
5678 * @param mapping_type The type of the new mapping, this defines which SPTM frame type to use.
5679 *
5680 * @return SPTM_SUCCESS iff able to successfully update *pte_p to new_pte via sptm_map_page(),
5681 * SPTM_MAP_VALID if an existing mapping was successfully upgraded via sptm_map_page(),
5682 * SPTM_MAP_FLUSH_PENDING if the TLB flush of a previous mapping is still in-flight and
5683 * the mapping operation should be retried, or if the mapping operation should be retried
5684 * because we had to temporarily re-enable preemption which would invalidate caller-held
5685 * per-CPU data.
5686 * Otherwise an appropriate SPTM or TXM error code; in these cases the mapping should not be
5687 * retried and the caller should return an error.
5688 */
5689 static inline sptm_return_t
5690 pmap_enter_pte(
5691 pmap_t pmap,
5692 pt_entry_t *pte_p,
5693 pt_entry_t new_pte,
5694 locked_pvh_t *locked_pvh,
5695 pt_entry_t *old_pte,
5696 vm_map_address_t v,
5697 unsigned int options,
5698 pmap_mapping_type_t mapping_type)
5699 {
5700 sptm_pte_t prev_pte;
5701 bool changed_wiring = false;
5702
5703 assert(pte_p != NULL);
5704 assert(old_pte != NULL);
5705
5706 /* SPTM TODO: handle PAGE_RATIO_4 configurations if those devices remain supported. */
5707
5708 assert(get_preemption_level() > 0);
5709 const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK;
5710 sptm_frame_type_t prev_frame_type;
5711 sptm_frame_type_t new_frame_type;
5712
5713 pmap_frame_type_for_pte(pmap, new_pte, v, options, mapping_type, &prev_frame_type, &new_frame_type);
5714
5715 if (__improbable(new_frame_type != prev_frame_type)) {
5716 /**
5717 * Remove all existing mappings prior to retyping, so that we can safely retype without having to worry
5718 * about a concurrent operation on one of those mappings triggering an SPTM violation. In particular,
5719 * pmap_remove() may clear a mapping to this page without holding its PVH lock. This approach works
5720 * because we hold the PVH lock during this call, and any attempt to enter a new mapping for the page
5721 * will also need to grab the PVH lock and call this function.
5722 */
5723 pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
5724 PMAP_OPTIONS_PPO_PENDING_RETYPE, locked_pvh, NULL);
5725 /**
5726 * In the unlikely event that pmap_page_protect_options_with_flush_range() had to process
5727 * an excessively long PV list, it will have enabled preemption by placing the PVH lock
5728 * in sleep mode. In this case, we may have been migrated to a different CPU, and caller
5729 * assumptions about the state of per-CPU data (such as per-CPU PVE availability) will no
5730 * longer hold true. Ask the caller to retry by pretending we encountered a pending flush.
5731 */
5732 if (__improbable(preemption_enabled())) {
5733 return SPTM_MAP_FLUSH_PENDING;
5734 }
5735 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5736 /* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */
5737 prev_frame_type = sptm_get_frame_type(pa);
5738 if (new_frame_type != prev_frame_type) {
5739 sptm_retype(pa, prev_frame_type, new_frame_type, retype_params);
5740 }
5741 }
5742
5743 if (pmap->type == PMAP_TYPE_NESTED) {
5744 /**
5745 * Enter the epoch before we check the unnesting state of the leaf page table, so that a
5746 * concurrent pmap_unnest() operation can guarantee that we either observe the unnested
5747 * table state and install a non-global mapping, or have finished installing a global mapping
5748 * before it marks all existing mappings as non-global.
5749 */
5750 pmap_epoch_enter();
5751 vm_map_offset_t nested_region_size = os_atomic_load(&pmap->nested_region_size, acquire);
5752 if (nested_region_size && (v >= pmap->nested_region_addr) && (v < (pmap->nested_region_addr + nested_region_size))) {
5753 assert(pmap->nested_region_addr != 0);
5754 assert(pmap->nested_region_unnested_table_bitmap != NULL);
5755 unsigned int index = (unsigned int)((v - pmap->nested_region_addr) >>
5756 pt_attr_twig_shift(pmap_get_pt_attr(pmap)));
5757
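/*
 * If an unnest of this leaf table is in progress, install the mapping as
 * non-global up front, consistent with pmap_unnest() marking all existing
 * mappings in the region non-global (see the epoch comment above).
 */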
5758 if ((bitmap_test(pmap->nested_region_unnested_table_bitmap, UNNEST_IN_PROGRESS_BIT(index)))) {
5759 new_pte |= ARM_PTE_NG;
5760 }
5761 }
5762 }
5763 const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte);
5764 if (pmap->type == PMAP_TYPE_NESTED) {
5765 pmap_epoch_exit();
5766 }
5767 if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) {
5768 /*
5769 * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as
5770 * opposed to a TXM error. In the case of SPTM_MAP_FLUSH_PENDING, pmap_enter() will drop the PVH
5771 * lock before turning around to retry the mapping operation. It may then be possible for the
5772 * mapping state of the page to change such that our next attempt to map it will fail with a TXM
5773 * error, so if we were to leave the new type in place here we would then have lost our record
5774 * of the previous type and would effectively leave the page in an inconsistent state.
5775 */
5776 if (__improbable(new_frame_type != prev_frame_type)) {
5777 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5778 sptm_retype(pa, new_frame_type, prev_frame_type, retype_params);
5779 }
5780 return sptm_status;
5781 }
5782
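/*
 * sptm_map_page() appears to report the previous contents of the targeted PTE
 * through the per-CPU sptm_prev_ptes buffer (hence the requirement that
 * preemption be disabled across this call); entry 0 corresponds to the single
 * page mapped above.
 */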
5783 *old_pte = prev_pte = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes[0];
5784
5785 if (prev_pte != new_pte) {
5786 changed_wiring = pte_is_compressed(prev_pte, pte_p) ?
5787 (new_pte & ARM_PTE_WIRED) != 0 :
5788 (new_pte & ARM_PTE_WIRED) != (prev_pte & ARM_PTE_WIRED);
5789
5790 if ((pmap != kernel_pmap) && changed_wiring) {
5791 pte_update_wiredcnt(pmap, pte_p, (new_pte & ARM_PTE_WIRED) != 0);
5792 }
5793
5794 PMAP_TRACE(4 + pt_attr_leaf_level(pmap_get_pt_attr(pmap)), PMAP_CODE(PMAP__TTE),
5795 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v),
5796 VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)), new_pte);
5797 }
5798
5799 return sptm_status;
5800 }
5801
5802 MARK_AS_PMAP_TEXT static pt_entry_t
5803 wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
5804 {
5805 pt_entry_t pte;
5806
5807 switch (wimg & (VM_WIMG_MASK)) {
5808 case VM_WIMG_IO:
5809 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5810 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5811 // AP, while preserving the security benefits of using device
5812 // mapping against side-channel attacks. On pre-H14 platforms,
5813 // the accesses will still be strongly ordered.
5814 if (is_dram_addr(pa)) {
5815 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5816 } else {
5817 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5818 #if HAS_FEAT_XS
5819 pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
5820 if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
5821 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
5822 }
5823 #endif /* HAS_FEAT_XS */
5824 }
5825 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5826 break;
5827 case VM_WIMG_RT:
5828 if (is_dram_addr(pa)) {
5829 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
5830 } else {
5831 #if HAS_FEAT_XS
5832 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5833 #else /* HAS_FEAT_XS */
5834 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5835 #endif /* HAS_FEAT_XS */
5836 #if DEBUG || DEVELOPMENT
5837 pmap_wcrt_on_non_dram_count_increment_atomic();
5838 #endif /* DEBUG || DEVELOPMENT */
5839 }
5840 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5841 break;
5842 case VM_WIMG_POSTED:
5843 if (is_dram_addr(pa)) {
5844 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5845 } else {
5846 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5847 }
5848 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5849 break;
5850 case VM_WIMG_POSTED_REORDERED:
5851 if (is_dram_addr(pa)) {
5852 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5853 } else {
5854 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5855 }
5856 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5857 break;
5858 case VM_WIMG_POSTED_COMBINED_REORDERED:
5859 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5860 #if HAS_FEAT_XS
5861 if (!is_dram_addr(pa)) {
5862 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5863 }
5864 #endif /* HAS_FEAT_XS */
5865 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5866 break;
5867 case VM_WIMG_WCOMB:
5868 if (is_dram_addr(pa)) {
5869 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5870 } else {
5871 #if HAS_FEAT_XS
5872 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5873 #else /* HAS_FEAT_XS */
5874 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5875 #endif /* HAS_FEAT_XS */
5876 #if DEBUG || DEVELOPMENT
5877 pmap_wcrt_on_non_dram_count_increment_atomic();
5878 #endif /* DEBUG || DEVELOPMENT */
5879 }
5880 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5881 break;
5882 case VM_WIMG_WTHRU:
5883 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5884 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5885 break;
5886 case VM_WIMG_COPYBACK:
5887 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5888 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5889 break;
5890 case VM_WIMG_INNERWBACK:
5891 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5892 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5893 break;
5894 default:
5895 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5896 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5897 }
5898
5899 return pte;
5900 }
5901
5902 MARK_AS_PMAP_TEXT kern_return_t
5903 pmap_enter_options_internal(
5904 pmap_t pmap,
5905 vm_map_address_t v,
5906 pmap_paddr_t pa,
5907 vm_prot_t prot,
5908 vm_prot_t fault_type,
5909 unsigned int flags,
5910 boolean_t wired,
5911 unsigned int options,
5912 pmap_mapping_type_t mapping_type)
5913 {
5914 ppnum_t pn = (ppnum_t)atop(pa);
5915 pt_entry_t *pte_p;
5916 unsigned int wimg_bits;
5917 bool committed = false;
5918 kern_return_t kr = KERN_SUCCESS;
5919 uint16_t pp_attr_bits;
5920 pv_free_list_t *local_pv_free;
5921
5922 validate_pmap_mutable(pmap);
5923
5924 /**
5925 * Prepare for the SPTM call early by prefetching the relevant FTEs. Cache misses
5926 * in SPTM accessing these turn out to contribute to a large portion of delay on
5927 * the critical path. Technically, sptm_prefetch_fte may not find an FTE associated
5928 * with pa and return LIBSPTM_FAILURE. However, we are okay with that as it's only
5929 * a best-effort performance optimization.
5930 */
5931 sptm_prefetch_fte(pmap->ttep);
5932 sptm_prefetch_fte(pa);
5933
5934 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5935
5936 if ((v) & pt_attr_leaf_offmask(pt_attr)) {
5937 panic("pmap_enter_options() pmap %p v 0x%llx",
5938 pmap, (uint64_t)v);
5939 }
5940
5941 if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
5942 panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
5943 pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
5944 }
5945
5946 if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
5947 panic("pmap_enter_options() pmap %p pa 0x%llx",
5948 pmap, (uint64_t)pa);
5949 }
5950
5951 /* The PA should not extend beyond the architected physical address space */
5952 pa &= ARM_PTE_PAGE_MASK;
5953
5954 if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
5955 #if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
5956 extern vm_offset_t ctrr_test_page;
5957 if (__probable(v != ctrr_test_page))
5958 #endif
5959 panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
5960 }
5961 assert(pn != vm_page_fictitious_addr);
5962
5963 pmap_lock(pmap, PMAP_LOCK_SHARED);
5964
5965 /*
5966 * Expand pmap to include this pte. Assume that
5967 * pmap is always expanded to include enough hardware
5968 * pages to map one VM page.
5969 */
5970 while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
5971 /* Must unlock to expand the pmap. */
5972 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5973
5974 kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
5975
5976 if (kr != KERN_SUCCESS) {
5977 return kr;
5978 }
5979
5980 pmap_lock(pmap, PMAP_LOCK_SHARED);
5981 }
5982
5983 if (options & PMAP_OPTIONS_NOENTER) {
5984 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5985 return KERN_SUCCESS;
5986 }
5987
5988 /*
5989 * Since we may not hold the pmap lock exclusive, updating the pte is
5990 * done via a cmpxchg loop.
5991 * We need to be careful about modifying non-local data structures before committing
5992 * the new pte since we may need to re-do the transaction.
5993 */
5994 const pt_entry_t prev_pte = os_atomic_load(pte_p, relaxed);
5995
5996 if (pte_is_valid(prev_pte) && (pte_to_pa(prev_pte) != pa)) {
5997 /*
5998 * There is already a mapping here & it's for a different physical page.
5999 * First remove that mapping.
6000 * We assume that we can leave the pmap lock held for shared access rather
6001 * than exclusive access here, because we assume that the VM won't try to
6002 * simultaneously map the same VA to multiple different physical pages.
6003 * If that assumption is violated, sptm_map_page() will panic as the architecture
6004 * does not allow the output address of a mapping to be changed without a break-
6005 * before-make sequence.
6006 */
6007 pmap_remove_range(pmap, v, v + PAGE_SIZE);
6008 }
6009
6010 while (!committed) {
6011 pt_entry_t spte = ARM_PTE_EMPTY;
6012 pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6013 bool skip_footprint_debit = false;
6014
6015 /*
6016 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6017 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6018 * read-write protection. The PMAP layer, though, still needs to use the right
6019 * index, which is the older XO-now-TPRO one; it is specially selected
6020 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6021 *
6022 * Note that pmap_construct_pte() may check the nested region ASID bitmap,
6023 * which needs to happen at every iteration of the commit loop in case we
6024 * previously dropped the pmap lock.
6025 */
6026 pt_entry_t pte = pmap_construct_pte(pmap, pa,
6027 ((options & PMAP_OPTIONS_MAP_TPRO) ? VM_PROT_RORW_TP : prot), fault_type, wired, &pp_attr_bits);
6028
6029 if (pa_valid(pa)) {
6030 unsigned int pai;
6031 boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6032
6033 is_internal = FALSE;
6034 is_altacct = FALSE;
6035
6036 pai = pa_index(pa);
6037 locked_pvh_t locked_pvh;
6038
6039 if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
6040 locked_pvh = pvh_lock_nopreempt(pai);
6041 } else {
6042 locked_pvh = pvh_lock(pai);
6043 }
6044
6045 /*
6046 * Make sure that the current per-cpu PV free list has
6047 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6048 * if the transaction succeeds. At this point, preemption has either
6049 * been disabled by the caller or by pvh_lock() above.
6050 * Note that we can still be interrupted, but a primary
6051 * interrupt handler can never enter the pmap.
6052 */
6053 assert(get_preemption_level() > 0);
6054 local_pv_free = &pmap_get_cpu_data()->pv_free;
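/*
 * A PV allocation is only needed if the PV head already tracks something other
 * than this exact PTE: either an existing PTE pointer that must be converted
 * into a PV list, or an existing PV list that we will extend.
 */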
6055 const bool allocation_required = !pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL) &&
6056 !(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP) && pvh_ptep(locked_pvh.pvh) == pte_p);
6057
6058 if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6059 pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6060 int new_allocated_pves = 0;
6061 volatile uint16_t *wiredcnt = NULL;
6062 if (pmap != kernel_pmap) {
6063 ptd_info_t *ptd_info = ptep_get_info(pte_p);
6064 wiredcnt = &ptd_info->wiredcnt;
6065 }
6066
6067 while (new_allocated_pves < 2) {
6068 local_pv_free = &pmap_get_cpu_data()->pv_free;
6069 pv_status = pv_alloc(pmap, PMAP_LOCK_SHARED, options, &new_pve_p[new_allocated_pves], &locked_pvh, wiredcnt);
6070 if (pv_status == PV_ALLOC_FAIL) {
6071 break;
6072 } else if (pv_status == PV_ALLOC_RETRY) {
6073 /*
6074 * In the case that pv_alloc() had to grab a new page of PVEs,
6075 * it will have dropped the pmap lock while doing so.
6076 * On non-PPL devices, dropping the lock re-enables preemption so we may
6077 * be on a different CPU now.
6078 */
6079 local_pv_free = &pmap_get_cpu_data()->pv_free;
6080 } else {
6081 /* If we've gotten this far then a node should've been allocated. */
6082 assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6083
6084 new_allocated_pves++;
6085 }
6086 }
6087
6088 for (int i = 0; i < new_allocated_pves; i++) {
6089 pv_free(new_pve_p[i]);
6090 }
6091 }
6092
6093 if (pv_status == PV_ALLOC_FAIL) {
6094 pvh_unlock(&locked_pvh);
6095 kr = KERN_RESOURCE_SHORTAGE;
6096 break;
6097 } else if (pv_status == PV_ALLOC_RETRY) {
6098 pvh_unlock(&locked_pvh);
6099 /* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6100 continue;
6101 }
6102
6103 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6104 wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6105 } else {
6106 wimg_bits = pmap_cache_attributes(pn);
6107 }
6108
6109 /**
6110 * We may be retrying this operation after dropping the PVH lock.
6111 * Cache attributes for the physical page may have changed while the lock
6112 * was dropped, so update PTE cache attributes on each loop iteration.
6113 */
6114 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6115
6116
6117 const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, &locked_pvh, &spte, v, options, mapping_type);
6118 assert(committed == false);
6119 if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
6120 committed = true;
6121 } else if (sptm_status == SPTM_MAP_FLUSH_PENDING) {
6122 pvh_unlock(&locked_pvh);
6123 continue;
6124 } else if (sptm_status == SPTM_MAP_CODESIGN_ERROR) {
6125 pvh_unlock(&locked_pvh);
6126 kr = KERN_CODESIGN_ERROR;
6127 break;
6128 } else {
6129 pvh_unlock(&locked_pvh);
6130 kr = KERN_FAILURE;
6131 break;
6132 }
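/*
 * SPTM_MAP_VALID means we replaced an existing valid mapping of this same
 * physical page, so the PV entry and per-page accounting established when that
 * mapping was created still apply and must not be repeated below.
 */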
6133 const bool had_valid_mapping = (sptm_status == SPTM_MAP_VALID);
6134 /* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6135 if (!had_valid_mapping) {
6136 pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6137 int pve_ptep_idx = 0;
6138 pv_status = pmap_enter_pv(pmap, pte_p, options, PMAP_LOCK_SHARED, &locked_pvh, &new_pve_p, &pve_ptep_idx);
6139 /* We did all the allocations up top. So this shouldn't be able to fail. */
6140 if (pv_status != PV_ALLOC_SUCCESS) {
6141 panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6142 __func__, pv_status, new_pve_p, pmap);
6143 }
6144
6145 if (pmap != kernel_pmap) {
6146 if (options & PMAP_OPTIONS_INTERNAL) {
6147 ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6148 if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6149 PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6150 /*
6151 * Make a note to ourselves that this
6152 * mapping is using alternative
6153 * accounting. We'll need this in order
6154 * to know which ledger to debit when
6155 * the mapping is removed.
6156 *
6157 * The altacct bit must be set while
6158 * the pv head is locked. Defer the
6159 * ledger accounting until after we've
6160 * dropped the lock.
6161 */
6162 ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6163 is_altacct = TRUE;
6164 }
6165 }
6166 if (ppattr_test_reusable(pai) &&
6167 !is_altacct) {
6168 is_reusable = TRUE;
6169 } else if (options & PMAP_OPTIONS_INTERNAL) {
6170 is_internal = TRUE;
6171 } else {
6172 is_external = TRUE;
6173 }
6174 }
6175 }
6176
6177 pvh_unlock(&locked_pvh);
6178
6179 if (pp_attr_bits != 0) {
6180 ppattr_pa_set_bits(pa, pp_attr_bits);
6181 }
6182
6183 if (!had_valid_mapping && (pmap != kernel_pmap)) {
6184 pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6185
6186 if (is_internal) {
6187 /*
6188 * Make corresponding adjustments to
6189 * phys_footprint statistics.
6190 */
6191 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6192 if (is_altacct) {
6193 /*
6194 * If this page is internal and
6195 * in an IOKit region, credit
6196 * the task's total count of
6197 * dirty, internal IOKit pages.
6198 * It should *not* count towards
6199 * the task's total physical
6200 * memory footprint, because
6201 * this entire region was
6202 * already billed to the task
6203 * at the time the mapping was
6204 * created.
6205 *
6206 * Put another way, this is
6207 * internal++ and
6208 * alternate_accounting++, so
6209 * net effect on phys_footprint
6210 * is 0. That means: don't
6211 * touch phys_footprint here.
6212 */
6213 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6214 } else {
6215 if (pte_is_compressed(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6216 /* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6217 skip_footprint_debit = true;
6218 } else {
6219 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6220 }
6221 }
6222 }
6223 if (is_reusable) {
6224 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6225 } else if (is_external) {
6226 pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6227 }
6228 }
6229 } else {
6230 if (prot & VM_PROT_EXECUTE) {
6231 kr = KERN_FAILURE;
6232 break;
6233 }
6234
6235 wimg_bits = pmap_cache_attributes(pn);
6236 if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6237 wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6238 }
6239
6240 pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6241
6242
6243 /**
6244 * pmap_enter_pte() expects to be called with preemption disabled so it can access
6245 * the per-CPU prev_ptes array.
6246 */
6247 disable_preemption();
6248 const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, NULL, &spte, v, options, mapping_type);
6249 enable_preemption();
6250 assert(committed == false);
6251 if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
6252 committed = true;
6253
6254 /**
6255 * If there was already a valid pte here then we reuse its
6256 * reference on the ptd and drop the one that we took above.
6257 */
6258 } else if (__improbable(sptm_status != SPTM_MAP_FLUSH_PENDING)) {
6259 panic("%s: Unexpected SPTM return code %u for non-managed PA 0x%llx", __func__, (unsigned int)sptm_status, (unsigned long long)pa);
6260 }
6261 }
6262 if (committed) {
6263 if (pte_is_compressed(spte, pte_p)) {
6264 assert(pmap != kernel_pmap);
6265
6266 /* One less "compressed" */
6267 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6268 pt_attr_page_size(pt_attr) * PAGE_RATIO);
6269
6270 if (spte & ARM_PTE_COMPRESSED_ALT) {
6271 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6272 } else if (!skip_footprint_debit) {
6273 /* Was part of the footprint */
6274 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6275 }
6276 }
6277 }
6278 }
6279
6280 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6281
6282 if (kr == KERN_CODESIGN_ERROR) {
6283 /* Print any logs from TXM */
6284 txm_print_logs();
6285 }
6286 return kr;
6287 }
6288
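/*
 * Trace-wrapped entry point that maps the physical address [pa] into [pmap] at
 * virtual address [v] by calling pmap_enter_options_internal().
 */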
6289 kern_return_t
6290 pmap_enter_options_addr(
6291 pmap_t pmap,
6292 vm_map_address_t v,
6293 pmap_paddr_t pa,
6294 vm_prot_t prot,
6295 vm_prot_t fault_type,
6296 unsigned int flags,
6297 boolean_t wired,
6298 unsigned int options,
6299 __unused void *arg,
6300 pmap_mapping_type_t mapping_type)
6301 {
6302 kern_return_t kr = KERN_FAILURE;
6303
6304
6305 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6306 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6307
6308 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options, mapping_type);
6309
6310 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6311
6312 return kr;
6313 }
6314
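/*
 * Page-number-based variant of pmap_enter_options_addr(): converts [pn] to a
 * physical address and forwards the request.
 */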
6315 kern_return_t
6316 pmap_enter_options(
6317 pmap_t pmap,
6318 vm_map_address_t v,
6319 ppnum_t pn,
6320 vm_prot_t prot,
6321 vm_prot_t fault_type,
6322 unsigned int flags,
6323 boolean_t wired,
6324 unsigned int options,
6325 __unused void *arg,
6326 pmap_mapping_type_t mapping_type)
6327 {
6328 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot,
6329 fault_type, flags, wired, options, arg, mapping_type);
6330 }
6331
6332 /*
6333 * Routine: pmap_change_wiring
6334 * Function: Change the wiring attribute for a map/virtual-address
6335 * pair.
6336 * In/out conditions:
6337 * The mapping must already exist in the pmap.
6338 */
6339 MARK_AS_PMAP_TEXT void
6340 pmap_change_wiring_internal(
6341 pmap_t pmap,
6342 vm_map_address_t v,
6343 boolean_t wired)
6344 {
6345 pt_entry_t *pte_p, prev_pte;
6346
6347 validate_pmap_mutable(pmap);
6348
6349 pmap_lock(pmap, PMAP_LOCK_SHARED);
6350
6351 const pt_entry_t new_wiring = (wired ? ARM_PTE_WIRED : 0);
6352
6353 pte_p = pmap_pte(pmap, v);
6354 if (pte_p == PT_ENTRY_NULL) {
6355 if (!wired) {
6356 /*
6357 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6358 * may have been freed by a remove operation.
6359 */
6360 goto pmap_change_wiring_return;
6361 } else {
6362 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6363 }
6364 }
6365
6366 disable_preemption();
6367 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
6368 sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring;
6369
6370 pmap_epoch_enter();
6371 sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED);
6372 pmap_epoch_exit();
6373
6374 prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed);
6375 enable_preemption();
6376
6377 if (!pte_is_valid(prev_pte)) {
6378 goto pmap_change_wiring_return;
6379 }
6380
6381 if ((pmap != kernel_pmap) && (wired != pte_is_wired(prev_pte))) {
6382 pte_update_wiredcnt(pmap, pte_p, wired);
6383 }
6384
6385 pmap_change_wiring_return:
6386 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6387 }
6388
6389 void
6390 pmap_change_wiring(
6391 pmap_t pmap,
6392 vm_map_address_t v,
6393 boolean_t wired)
6394 {
6395 pmap_change_wiring_internal(pmap, v, wired);
6396 }
6397
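/*
 * Translate [va] to a physical address via a software table walk, taking the
 * pmap lock in shared mode for user pmaps so page tables can't be reclaimed
 * during the walk.
 */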
6398 MARK_AS_PMAP_TEXT pmap_paddr_t
6399 pmap_find_pa_internal(
6400 pmap_t pmap,
6401 addr64_t va)
6402 {
6403 pmap_paddr_t pa = 0;
6404
6405 validate_pmap(pmap);
6406
6407 if (pmap != kernel_pmap) {
6408 pmap_lock(pmap, PMAP_LOCK_SHARED);
6409 }
6410
6411 pa = pmap_vtophys(pmap, va);
6412
6413 if (pmap != kernel_pmap) {
6414 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6415 }
6416
6417 return pa;
6418 }
6419
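/*
 * Attempt to translate [va] to a physical address without taking any locks,
 * using the MMU translation helpers (mmu_kvtop()/mmu_uvtop()). Only the kernel
 * pmap and the current thread's user pmap can be translated this way; returns
 * 0 for any other pmap or if no valid mapping is found.
 */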
6420 pmap_paddr_t
6421 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6422 {
6423 pmap_paddr_t pa = 0;
6424
6425 if (pmap == kernel_pmap) {
6426 pa = mmu_kvtop(va);
6427 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6428 /*
6429 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6430 * translation even if PAN would prevent kernel access through the translation.
6431 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6432 */
6433 pa = mmu_uvtop(va);
6434 }
6435 return pa;
6436 }
6437
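/*
 * Translate [va] to a physical address, trying the lock-free no-fault path
 * first and falling back to a software table walk (locked unless running in
 * the kernel debugger).
 */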
6438 pmap_paddr_t
6439 pmap_find_pa(
6440 pmap_t pmap,
6441 addr64_t va)
6442 {
6443 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6444
6445 if (pa != 0) {
6446 return pa;
6447 }
6448
6449 if (not_in_kdp) {
6450 return pmap_find_pa_internal(pmap, va);
6451 } else {
6452 return pmap_vtophys(pmap, va);
6453 }
6454 }
6455
6456 ppnum_t
6457 pmap_find_phys_nofault(
6458 pmap_t pmap,
6459 addr64_t va)
6460 {
6461 ppnum_t ppn;
6462 ppn = atop(pmap_find_pa_nofault(pmap, va));
6463 return ppn;
6464 }
6465
6466 ppnum_t
6467 pmap_find_phys(
6468 pmap_t pmap,
6469 addr64_t va)
6470 {
6471 ppnum_t ppn;
6472 ppn = atop(pmap_find_pa(pmap, va));
6473 return ppn;
6474 }
6475
6476 /**
6477 * Translate a kernel virtual address into a physical address.
6478 *
6479 * @param va The kernel virtual address to translate. Does not work on user
6480 * virtual addresses.
6481 *
6482 * @return The physical address if the translation was successful, or zero if
6483 * no valid mappings were found for the given virtual address.
6484 */
6485 pmap_paddr_t
6486 kvtophys(vm_offset_t va)
6487 {
6488 sptm_paddr_t pa;
6489
6490 if (sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS) {
6491 return 0;
6492 }
6493
6494 return pa;
6495 }
6496
6497 /**
6498 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6499 * points to a non-kernel-managed physical page, then this call will panic().
6500 *
6501 * @note The output of this function is guaranteed to be a kernel-managed
6502 * physical page, which means it's safe to pass the output directly to
6503 * pa_index() to create a physical address index for various pmap data
6504 * structures.
6505 *
6506 * @param va The kernel virtual address to translate. Does not work on user
6507 * virtual addresses.
6508 *
6509 * @return The translated physical address for the given virtual address.
6510 */
6511 pmap_paddr_t
6512 kvtophys_nofail(vm_offset_t va)
6513 {
6514 pmap_paddr_t pa;
6515
6516 if (__improbable(sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS)) {
6517 panic("%s: VA->PA translation failed for va %p", __func__, (void *)va);
6518 }
6519
6520 return pa;
6521 }
6522
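/*
 * Translate [va] to a physical address by walking the page-table hierarchy of
 * [pmap] in software, descending from the root level until a valid block or
 * page entry is found. Returns 0 if [va] is outside the pmap's range or no
 * valid mapping exists.
 */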
6523 pmap_paddr_t
6524 pmap_vtophys(
6525 pmap_t pmap,
6526 addr64_t va)
6527 {
6528 if ((va < pmap->min) || (va >= pmap->max)) {
6529 return 0;
6530 }
6531
6532 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6533
6534 tt_entry_t * ttp = NULL;
6535 tt_entry_t * ttep = NULL;
6536 tt_entry_t tte = ARM_TTE_EMPTY;
6537 pmap_paddr_t pa = 0;
6538 unsigned int cur_level;
6539
6540 ttp = pmap->tte;
6541
6542 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6543 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6544
6545 tte = *ttep;
6546
6547 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6548 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6549 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6550 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6551
6552 if ((tte & valid_mask) != valid_mask) {
6553 return (pmap_paddr_t) 0;
6554 }
6555
6556 /* This detects both leaf entries and intermediate block mappings. */
6557 if ((tte & type_mask) == type_block) {
6558 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6559 break;
6560 }
6561
6562 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6563 }
6564
6565 return pa;
6566 }
6567
6568 /*
6569 * pmap_init_pte_page - Initialize a page table page.
6570 */
6571 MARK_AS_PMAP_TEXT void
6572 pmap_init_pte_page(
6573 pmap_t pmap,
6574 pt_entry_t *pte_p,
6575 vm_offset_t va,
6576 unsigned int ttlevel,
6577 boolean_t alloc_ptd)
6578 {
6579 pt_desc_t *ptdp = NULL;
6580 unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
6581 const uintptr_t pvh = pai_to_pvh(pai);
6582
6583 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6584 if (alloc_ptd) {
6585 /*
6586 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6587 * on 4KB hardware, we may already have allocated a page table descriptor for a
6588 * bootstrap request, so we check for an existing PTD here.
6589 */
6590 ptdp = ptd_alloc(pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
6591 if (ptdp == NULL) {
6592 panic("%s: unable to allocate PTD", __func__);
6593 }
6594 locked_pvh_t locked_pvh = pvh_lock(pai);
6595 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
6596 pvh_unlock(&locked_pvh);
6597 } else {
6598 panic("pmap_init_pte_page(): no PTD for pte_p %p", pte_p);
6599 }
6600 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6601 ptdp = pvh_ptd(pvh);
6602 } else {
6603 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6604 }
6605
6606 // pagetable zero-fill and barrier should be guaranteed by the SPTM
6607 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6608 }
6609
6610 /*
6611 * This function guarantees that a pmap has the necessary page tables in place
6612 * to map the specified VA. If necessary, it will allocate new tables at any
6613 * non-root level in the hierarchy (the root table is always already allocated
6614 * and stored in the pmap).
6615 *
6616 * @note This function is expected to be called without any pmap or PVH lock
6617 * held.
6618 *
6619 * @note It is possible for an L3 table newly allocated by this function to be
6620 * deleted by another thread before control returns to the caller, iff that
6621 * table is an ordinary userspace table. Callers that use this function
6622 * to allocate new user L3 tables are therefore expected to keep calling
6623 * this function until they observe a successful L3 PTE lookup with the pmap
6624 * lock held. As long as it does not drop the pmap lock, the caller may
6625 * then safely use the looked-up L3 table. See the use of this function in
6626 * pmap_enter_options_internal() for an example.
6627 *
6628 * @param pmap The pmap for which to ensure mapping space is present.
6629 * @param vaddr The virtual address for which to ensure mapping space is present
6630 * in [pmap].
6631 * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be
6632 * allocated. The only valid option is PMAP_OPTIONS_NOWAIT, which
6633 * specifies that the allocation must not block.
6634 * @param level The maximum paging level for which to ensure a table is present.
6635 *
6636 * @return KERN_INVALID_ADDRESS if [vaddr] is outside the pmap's mappable range,
6637 * KERN_RESOURCE_SHORTAGE if a new table can't be allocated,
6638 * KERN_SUCCESS otherwise.
6639 */
6640 MARK_AS_PMAP_TEXT static kern_return_t
6641 pmap_expand(
6642 pmap_t pmap,
6643 vm_map_address_t vaddr,
6644 unsigned int options,
6645 unsigned int level)
6646 {
6647 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6648
6649 if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) {
6650 return KERN_INVALID_ADDRESS;
6651 }
6652 pmap_paddr_t table_pa = pmap->ttep;
6653 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6654 const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1;
6655 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6656 tt_entry_t *table_ttep = pmap->tte;
6657 tt_entry_t *ttep;
6658 tt_entry_t old_tte = ARM_TTE_EMPTY;
6659
6660 for (; ttlevel < level; ttlevel++) {
6661 /**
6662 * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE.
6663 * Doing this step at the beginning of the loop instead of the end (which would make it part of
6664 * the prior iteration) avoids the possibility of executing this step to extract an L3 table KVA
6665 * from an L2 TTE, which would be useless because there would be no next iteration to make use
6666 * of the table KVA.
6667 */
6668 if (table_ttep == NULL) {
6669 assert(tte_is_valid_table(old_tte));
6670 table_pa = old_tte & ARM_TTE_TABLE_MASK;
6671 table_ttep = (tt_entry_t*)phystokv(table_pa);
6672 }
6673
6674 vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr);
6675
6676 /**
6677 * We don't need to hold the pmap lock while walking the paging hierarchy. Only L3 tables are
6678 * allowed to be dynamically removed, and only for regular user pmaps at that. We may allocate
6679 * a new L3 table below, but we will only access L0-L2 tables, so there's no risk of a table
6680 * being deleted while we are using it for the next level(s) of lookup.
6681 */
6682 ttep = &table_ttep[ttn_index(pt_attr, vaddr, ttlevel)];
6683 old_tte = os_atomic_load(ttep, relaxed);
6684 table_ttep = NULL;
6685 if (!tte_is_valid_table(old_tte)) {
6686 tt_entry_t new_tte, *new_ttep;
6687 pt_desc_t *new_ptdp;
6688 while (pmap_tt_allocate(pmap, &new_ttep, &new_ptdp, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) {
6689 if (options & PMAP_OPTIONS_NOWAIT) {
6690 return KERN_RESOURCE_SHORTAGE;
6691 }
6692 VM_PAGE_WAIT();
6693 }
6694 assert(pa_valid(table_pa));
6695 /**
6696 * Grab the lower-level table's PVH lock to ensure we don't try to concurrently map different
6697 * tables at the same TTE.
6698 */
6699 locked_pvh_t locked_pvh = pvh_lock(pa_index(table_pa));
6700 old_tte = os_atomic_load(ttep, relaxed);
6701 if (!tte_is_valid_table(old_tte)) {
6702 /**
6703 * This call must be issued prior to sptm_map_table() so that the page table's
6704 * PTD info is valid by the time the new table becomes visible in the paging
6705 * hierarchy. sptm_map_table() is expected to issue a barrier that effectively
6706 * guarantees the PTD update will be visible to concurrent observers as soon as
6707 * the new table becomes visible in the paging hierarchy.
6708 */
6709 pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE);
6710 pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)new_ttep);
6711 /*
6712 * If the table is going to map a kernel RO zone VA region, then we must
6713 * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE. The SPTM's type system
6714 * requires the table to be transitioned through XNU_DEFAULT for refcount
6715 * enforcement, which is fine since this path is expected to execute only
6716 * once during boot.
6717 */
6718 if (__improbable(ttlevel == pt_attr_twig_level(pt_attr)) &&
6719 (pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE)) {
6720 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
6721 sptm_retype(pa, XNU_PAGE_TABLE, XNU_DEFAULT, retype_params);
6722 retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
6723 sptm_retype(pa, XNU_DEFAULT, XNU_PAGE_TABLE_ROZONE, retype_params);
6724 }
6725 new_tte = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6726 sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte);
6727 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6728 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte);
6729
6730 /**
6731 * Now that we've fully mapped the table, do final initialization of PTD
6732 * state, which includes dropping the wired count to allow future reclamation
6733 * of the page table page.
6734 */
6735 ptd_info_finalize(new_ptdp);
6736
6737 table_pa = pa;
6738 /**
6739 * If we need to set up multiple TTEs mapping different parts of the same page
6740 * (e.g. because we're carving multiple 4K page tables out of a 16K native page),
6741 * determine which of the grouped TTEs is the one that we need to follow for the
6742 * next level of the table walk.
6743 */
6744 table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) *
6745 (pmap_page_size / sizeof(tt_entry_t)));
6746 new_ttep = (tt_entry_t *)NULL;
6747 }
6748 pvh_unlock(&locked_pvh);
6749
6750 if (new_ttep != (tt_entry_t *)NULL) {
6751 pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1);
6752 new_ttep = (tt_entry_t *)NULL;
6753 }
6754 }
6755 }
6756
6757 return KERN_SUCCESS;
6758 }
6759
6760 /*
6761 * Routine: pmap_gc
6762 * Function:
6763 * Pmap garbage collection
6764 * Called by the pageout daemon when pages are scarce.
6765 *
6766 */
6767 void
6768 pmap_gc(void)
6769 {
6770 /*
6771 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
6772 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6773 * or may contain wired mappings. However, it may make sense to scan the pmap VM
6774 * object here, and for each page consult the SPTM frame table and if necessary
6775 * the PTD in the PV head table. If the frame table indicates the page is a leaf
6776 * page table page and the PTD indicates it has no wired mappings, we can call
6777 * pmap_remove() on the VA region mapped by the page and therein return the page
6778 * to the VM.
6779 */
6780 }
6781
6782 /*
6783 * By default, don't attempt pmap GC more frequently
6784 * than once per minute.
6785 */
6786
6787 void
6788 compute_pmap_gc_throttle(
6789 void *arg __unused)
6790 {
6791 }
6792
6793 /*
6794 * pmap_attribute_cache_sync(vm_offset_t pa)
6795 *
6796 * Invalidates all of the instruction cache on a physical page and
6797 * pushes any dirty data from the data cache for the same physical page
6798 */
6799
6800 kern_return_t
6801 pmap_attribute_cache_sync(
6802 ppnum_t pp,
6803 vm_size_t size,
6804 __unused vm_machine_attribute_t attribute,
6805 __unused vm_machine_attribute_val_t * value)
6806 {
6807 if (size > PAGE_SIZE) {
6808 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6809 } else {
6810 cache_sync_page(pp);
6811 }
6812
6813 return KERN_SUCCESS;
6814 }
6815
6816 /*
6817 * pmap_sync_page_data_phys(ppnum_t pp)
6818 *
6819 * Invalidates all of the instruction cache on a physical page and
6820 * pushes any dirty data from the data cache for the same physical page.
6821 * Not required on SPTM systems, because the SPTM automatically performs
6822 * the invalidate operation when retyping to one of the types that allow
6823 * for executable permissions.
6824 */
6825 void
6826 pmap_sync_page_data_phys(
6827 __unused ppnum_t pp)
6828 {
6829 return;
6830 }
6831
6832 /*
6833 * pmap_sync_page_attributes_phys(ppnum_t pp)
6834 *
6835 * Write back and invalidate all cachelines on a physical page.
6836 */
6837 void
6838 pmap_sync_page_attributes_phys(
6839 ppnum_t pp)
6840 {
6841 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6842 }
6843
6844 #if CONFIG_COREDUMP
6845 /* temporary workaround */
6846 boolean_t
6847 coredumpok(
6848 vm_map_t map,
6849 mach_vm_offset_t va)
6850 {
6851 pt_entry_t *pte_p;
6852 pt_entry_t spte;
6853
6854 pte_p = pmap_pte(map->pmap, va);
6855 if (0 == pte_p) {
6856 return FALSE;
6857 }
6858 if (vm_map_entry_has_device_pager(map, va)) {
6859 return FALSE;
6860 }
6861 spte = *pte_p;
6862 return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
6863 }
6864 #endif
6865
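/*
 * Fill the physical page [pn] with the 32-bit pattern [fill], accessed through
 * its physical aperture mapping.
 */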
6866 void
6867 fillPage(
6868 ppnum_t pn,
6869 unsigned int fill)
6870 {
6871 unsigned int *addr;
6872 int count;
6873
6874 addr = (unsigned int *) phystokv(ptoa(pn));
6875 count = PAGE_SIZE / sizeof(unsigned int);
6876 while (count--) {
6877 *addr++ = fill;
6878 }
6879 }
6880
6881 extern void mapping_set_mod(ppnum_t pn);
6882
6883 void
6884 mapping_set_mod(
6885 ppnum_t pn)
6886 {
6887 pmap_set_modify(pn);
6888 }
6889
6890 extern void mapping_set_ref(ppnum_t pn);
6891
6892 void
6893 mapping_set_ref(
6894 ppnum_t pn)
6895 {
6896 pmap_set_reference(pn);
6897 }
6898
6899 /*
6900 * Clear specified attribute bits.
6901 *
6902 * Try to force an arm_fast_fault() for all mappings of
6903 * the page - to force attributes to be set again at fault time.
6904 * If the forcing succeeds, clear the cached bits at the head.
6905 * Otherwise, something must have been wired, so leave the cached
6906 * attributes alone.
6907 */
6908 MARK_AS_PMAP_TEXT static void
6909 phys_attribute_clear_with_flush_range(
6910 ppnum_t pn,
6911 unsigned int bits,
6912 int options,
6913 void *arg,
6914 pmap_tlb_flush_range_t *flush_range)
6915 {
6916 pmap_paddr_t pa = ptoa(pn);
6917 vm_prot_t allow_mode = VM_PROT_ALL;
6918
6919 if ((arg != NULL) || (flush_range != NULL)) {
6920 options = options & ~PMAP_OPTIONS_NOFLUSH;
6921 }
6922
6923 if (__improbable((options & PMAP_OPTIONS_FF_WIRED) != 0)) {
6924 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6925 "invalid options",
6926 pn, bits, options, arg, flush_range);
6927 }
6928
6929 if (__improbable((bits & PP_ATTR_MODIFIED) &&
6930 (options & PMAP_OPTIONS_NOFLUSH))) {
6931 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6932 "should not clear 'modified' without flushing TLBs",
6933 pn, bits, options, arg, flush_range);
6934 }
6935
6936 assert(pn != vm_page_fictitious_addr);
6937
6938 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
6939 assert(bits == PP_ATTR_MODIFIED);
6940
6941 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, NULL, flush_range);
6942 /*
6943 * We short circuit this case; it should not need to
6944 * invoke arm_force_fast_fault, so just clear the modified bit.
6945 * pmap_page_protect has taken care of resetting
6946 * the state so that we'll see the next write as a fault to
6947 * the VM (i.e. we don't want a fast fault).
6948 */
6949 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6950 return;
6951 }
6952 if (bits & PP_ATTR_REFERENCED) {
6953 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
6954 }
6955 if (bits & PP_ATTR_MODIFIED) {
6956 allow_mode &= ~VM_PROT_WRITE;
6957 }
6958
6959 if (bits == PP_ATTR_NOENCRYPT) {
6960 /*
6961 * We short circuit this case; it should not need to
6962 * invoke arm_force_fast_fault, so just clear and
6963 * return. On ARM, this bit is just a debugging aid.
6964 */
6965 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6966 return;
6967 }
6968
6969 arm_force_fast_fault_with_flush_range(pn, allow_mode, options, NULL, (pp_attr_t)bits, flush_range);
6970 }
6971
6972 MARK_AS_PMAP_TEXT void
6973 phys_attribute_clear_internal(
6974 ppnum_t pn,
6975 unsigned int bits,
6976 int options,
6977 void *arg)
6978 {
6979 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
6980 }
6981
6982 #if __ARM_RANGE_TLBI__
6983
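/*
 * Clear the specified attribute bits for the managed pages mapped by a single
 * leaf (twig-level) table of [pmap] within [start, end), accumulating the PTE
 * updates into the batched SPTM region and disjoint ops tracked by [flush_range].
 * Returns [end] so the caller can advance to the next twig-sized chunk.
 */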
6984 MARK_AS_PMAP_TEXT static vm_map_address_t
6985 phys_attribute_clear_twig_internal(
6986 pmap_t pmap,
6987 vm_map_address_t start,
6988 vm_map_address_t end,
6989 unsigned int bits,
6990 unsigned int options,
6991 pmap_tlb_flush_range_t *flush_range)
6992 {
6993 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
6994 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6995 assert(end >= start);
6996 assert((end - start) <= pt_attr_twig_size(pt_attr));
6997 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6998 vm_map_address_t va = start;
6999 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7000 tt_entry_t *tte_p;
7001 tte_p = pmap_tte(pmap, start);
7002
7003 /**
7004 * It's possible that this portion of our VA region has never been paged in, in which case
7005 * there may not be a valid twig or leaf table here.
7006 */
7007 if ((tte_p == (tt_entry_t *) NULL) || !tte_is_valid_table(*tte_p)) {
7008 assert(flush_range->pending_region_entries == 0);
7009 return end;
7010 }
7011
7012 pte_p = (pt_entry_t *) ttetokv(*tte_p);
7013
7014 start_pte_p = &pte_p[pte_index(pt_attr, start)];
7015 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7016 assert(end_pte_p >= start_pte_p);
7017 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7018 if (flush_range->pending_region_entries == 0) {
7019 flush_range->pending_region_start = va;
7020 } else {
7021 assertf((flush_range->pending_region_start +
7022 (flush_range->pending_region_entries * pmap_page_size)) == va,
7023 "pending_region_start 0x%llx + 0x%lx pages != va 0%llx",
7024 (unsigned long long)flush_range->pending_region_start,
7025 (unsigned long)flush_range->pending_region_entries,
7026 (unsigned long long)va);
7027 }
7028 flush_range->current_ptep = curr_pte_p;
7029 const pt_entry_t spte = os_atomic_load(curr_pte_p, relaxed);
7030 const pmap_paddr_t pa = pte_to_pa(spte);
7031 if (pte_is_valid(spte) && pa_valid(pa)) {
7032 /* The PTE maps a managed page, so do the appropriate PV list-based permission changes. */
7033 const ppnum_t pn = (ppnum_t) atop(pa);
7034 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7035 if (__probable(flush_range->region_entry_added)) {
7036 flush_range->region_entry_added = false;
7037 } else {
7038 /**
7039 * It's possible that some other thread removed the mapping between our check
7040 * of the PTE above and taking the PVH lock in the
7041 * phys_attribute_clear_with_flush_range() path. In that case we have a
7042 * discontinuity in the region to update, so just submit any pending region
7043 * templates and start a new region op on the next iteration.
7044 */
7045 pmap_multipage_op_submit_region(flush_range);
7046 }
7047 } else if (__improbable(!pte_is_valid(spte))) {
7048 /**
7049 * We've found an invalid mapping, so we have a discontinuity in the region to
7050 * update. Handle this by submitting any pending region templates and starting a new
7051 * region on the next iteration. In theory we could instead handle this by installing
7052 * a "safe" (AF bit cleared, minimal permissions) PTE template; the SPTM would just
7053 * ignore the update on finding an invalid mapping in the PTE. But we don't know
7054 * what a "safe" template will be in all cases: for example, JIT regions require all
7055 * mappings to either be invalid or to have full RWX permissions.
7056 */
7057 pmap_multipage_op_submit_region(flush_range);
7058 } else if (pmap_insert_flush_range_template(spte, flush_range)) {
7059 /**
7060 * We've found a mapping to a non-managed page, so just insert the existing
7061 * PTE into the pending region ops since we don't manage attributes for non-managed
7062 * pages.
7063 * If pmap_insert_flush_range_template() returns true, indicating that it reached
7064 * the mapping limit and submitted the SPTM call, then we also submit any pending
7065 * disjoint ops. Having pending operations in either category will keep preemption
7066 * disabled, and we want to ensure that we can at least temporarily
7067 * re-enable preemption every SPTM_MAPPING_LIMIT mappings.
7068 */
7069 pmap_multipage_op_submit_disjoint(0, flush_range);
7070 }
7071
7072 /**
7073 * If the total number of pending + processed entries exceeds the mapping threshold,
7074 * we may need to submit all pending operations to avoid excessive preemption latency.
7075 * Otherwise, a small number of pending disjoint or region ops can hold preemption
7076 * disabled across an arbitrary number of total processed entries.
7077 * As an optimization, we may be able to avoid submitting if no urgent AST is
7078 * pending on the local CPU, but only if we aren't currently in an epoch. If we are
7079 * in an epoch, failure to submit in a timely manner can cause another CPU to wait
7080 * too long for our epoch to drain.
7081 */
7082 if (((flush_range->processed_entries + flush_range->pending_disjoint_entries +
7083 flush_range->pending_region_entries) >= SPTM_MAPPING_LIMIT) &&
7084 (pmap_in_epoch() || pmap_pending_preemption())) {
7085 pmap_multipage_op_submit(flush_range);
7086 assert(preemption_enabled());
7087 }
7088 }
7089
7090 /* SPTM region ops can't span L3 table boundaries, so submit any pending region templates now. */
7091 pmap_multipage_op_submit_region(flush_range);
7092 return end;
7093 }
7094
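/*
 * Clear the specified attribute bits for every managed page mapped by [pmap]
 * in [start, end), processing the range one twig-sized chunk at a time and
 * coalescing TLB invalidation into a single ranged flush once all chunks have
 * been submitted.
 */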
7095 MARK_AS_PMAP_TEXT vm_map_address_t
7096 phys_attribute_clear_range_internal(
7097 pmap_t pmap,
7098 vm_map_address_t start,
7099 vm_map_address_t end,
7100 unsigned int bits,
7101 unsigned int options)
7102 {
7103 if (__improbable(end < start)) {
7104 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7105 }
7106 validate_pmap_mutable(pmap);
7107
7108 vm_map_address_t va = start;
7109 pmap_tlb_flush_range_t flush_range = {
7110 .ptfr_pmap = pmap,
7111 .ptfr_start = start,
7112 .ptfr_end = end,
7113 .current_ptep = NULL,
7114 .pending_region_start = 0,
7115 .pending_region_entries = 0,
7116 .region_entry_added = false,
7117 .current_header = NULL,
7118 .current_header_first_mapping_index = 0,
7119 .processed_entries = 0,
7120 .pending_disjoint_entries = 0,
7121 .ptfr_flush_needed = false
7122 };
7123
7124 pmap_lock(pmap, PMAP_LOCK_SHARED);
7125 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7126
7127 while (va < end) {
7128 vm_map_address_t curr_end;
7129
7130 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7131 if (curr_end > end) {
7132 curr_end = end;
7133 }
7134
7135 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7136 }
7137 pmap_multipage_op_submit(&flush_range);
7138 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7139 assert((flush_range.pending_disjoint_entries == 0) && (flush_range.pending_region_entries == 0));
7140 if (flush_range.ptfr_flush_needed) {
7141 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7142 flush_range.ptfr_start,
7143 flush_range.ptfr_end - flush_range.ptfr_start,
7144 flush_range.ptfr_pmap,
7145 true);
7146 sync_tlb_flush();
7147 }
7148 return va;
7149 }
7150
7151 static void
7152 phys_attribute_clear_range(
7153 pmap_t pmap,
7154 vm_map_address_t start,
7155 vm_map_address_t end,
7156 unsigned int bits,
7157 unsigned int options)
7158 {
7159 /*
7160 * We allow single-page requests to execute non-preemptibly,
7161 * as it doesn't make sense to sample AST_URGENT for a single-page
7162 * operation, and there are a couple of special use cases that
7163 * require a non-preemptible single-page operation.
7164 */
7165 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7166 pmap_verify_preemptible();
7167 }
7168 __assert_only const int preemption_level = get_preemption_level();
7169
7170 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7171
7172 phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7173
7174 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7175
7176 assert(preemption_level == get_preemption_level());
7177 }
7178 #endif /* __ARM_RANGE_TLBI__ */
7179
7180 static void
7181 phys_attribute_clear(
7182 ppnum_t pn,
7183 unsigned int bits,
7184 int options,
7185 void *arg)
7186 {
7187 /*
7188 * Do we really want this tracepoint? It will be extremely chatty.
7189 * Also, should we have a corresponding trace point for the set path?
7190 */
7191 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7192
7193 phys_attribute_clear_internal(pn, bits, options, arg);
7194
7195 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7196 }
7197
7198 /*
7199 * Set specified attribute bits.
7200 *
7201 * Set cached value in the pv head because we have
7202 * no per-mapping hardware support for referenced and
7203 * modify bits.
7204 */
7205 MARK_AS_PMAP_TEXT void
7206 phys_attribute_set_internal(
7207 ppnum_t pn,
7208 unsigned int bits)
7209 {
7210 pmap_paddr_t pa = ptoa(pn);
7211 assert(pn != vm_page_fictitious_addr);
7212
7213 ppattr_pa_set_bits(pa, (uint16_t)bits);
7214
7215 return;
7216 }
7217
7218 static void
7219 phys_attribute_set(
7220 ppnum_t pn,
7221 unsigned int bits)
7222 {
7223 phys_attribute_set_internal(pn, bits);
7224 }
7225
7226
7227 /*
7228 * Check specified attribute bits.
7229 *
7230 * use the software cached bits (since no hw support).
7231 */
7232 static boolean_t
7233 phys_attribute_test(
7234 ppnum_t pn,
7235 unsigned int bits)
7236 {
7237 pmap_paddr_t pa = ptoa(pn);
7238 assert(pn != vm_page_fictitious_addr);
7239 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7240 }
7241
7242
7243 /*
7244 * Set the modify/reference bits on the specified physical page.
7245 */
7246 void
7247 pmap_set_modify(ppnum_t pn)
7248 {
7249 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7250 }
7251
7252
7253 /*
7254 * Clear the modify bits on the specified physical page.
7255 */
7256 void
7257 pmap_clear_modify(
7258 ppnum_t pn)
7259 {
7260 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7261 }
7262
7263
7264 /*
7265 * pmap_is_modified:
7266 *
7267 * Return whether or not the specified physical page is modified
7268 * by any physical maps.
7269 */
7270 boolean_t
7271 pmap_is_modified(
7272 ppnum_t pn)
7273 {
7274 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7275 }
7276
7277
7278 /*
7279 * Set the reference bit on the specified physical page.
7280 */
7281 static void
7282 pmap_set_reference(
7283 ppnum_t pn)
7284 {
7285 phys_attribute_set(pn, PP_ATTR_REFERENCED);
7286 }
7287
7288 /*
7289 * Clear the reference bits on the specified physical page.
7290 */
7291 void
7292 pmap_clear_reference(
7293 ppnum_t pn)
7294 {
7295 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7296 }
7297
7298
7299 /*
7300 * pmap_is_referenced:
7301 *
7302 * Return whether or not the specified physical page is referenced
7303 * by any physical maps.
7304 */
7305 boolean_t
7306 pmap_is_referenced(
7307 ppnum_t pn)
7308 {
7309 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7310 }
7311
7312 /*
7313 * pmap_get_refmod(phys)
7314 * returns the referenced and modified bits of the specified
7315 * physical page.
7316 */
7317 unsigned int
7318 pmap_get_refmod(
7319 ppnum_t pn)
7320 {
7321 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7322 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7323 }
7324
7325 static inline unsigned int
7326 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7327 {
7328 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7329 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7330 }
7331
7332 /*
7333 * pmap_clear_refmod(phys, mask)
7334 * clears the referenced and modified bits as specified by the mask
7335 * of the specified physical page.
7336 */
7337 void
7338 pmap_clear_refmod_options(
7339 ppnum_t pn,
7340 unsigned int mask,
7341 unsigned int options,
7342 void *arg)
7343 {
7344 unsigned int bits;
7345
7346 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7347 phys_attribute_clear(pn, bits, options, arg);
7348 }
7349
7350 /*
7351 * Perform pmap_clear_refmod_options on a virtual address range.
7352 * The operation will be performed in bulk & tlb flushes will be coalesced
7353 * if possible.
7354 *
7355 * Returns true if the operation is supported on this platform.
7356 * If this function returns false, the operation is not supported and
7357 * nothing has been modified in the pmap.
7358 */
7359 bool
7360 pmap_clear_refmod_range_options(
7361 pmap_t pmap __unused,
7362 vm_map_address_t start __unused,
7363 vm_map_address_t end __unused,
7364 unsigned int mask __unused,
7365 unsigned int options __unused)
7366 {
7367 #if __ARM_RANGE_TLBI__
7368 unsigned int bits;
7369 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7370 phys_attribute_clear_range(pmap, start, end, bits, options);
7371 return true;
7372 #else /* __ARM_RANGE_TLBI__ */
7373 #pragma unused(pmap, start, end, mask, options)
7374 /*
7375 * This operation allows the VM to bulk modify refmod bits on a virtually
7376 * contiguous range of addresses. This is a large performance improvement on
7377 * platforms that support ranged TLBI instructions. But on older platforms,
7378 * we can only flush per-page or the entire ASID, so we currently only
7379 * support this operation on platforms that support ranged TLBI
7380 * instructions. On other platforms, we require that
7381 * the VM modify the bits on a per-page basis.
7382 */
7383 return false;
7384 #endif /* __ARM_RANGE_TLBI__ */
7385 }
7386
7387 void
7388 pmap_clear_refmod(
7389 ppnum_t pn,
7390 unsigned int mask)
7391 {
7392 pmap_clear_refmod_options(pn, mask, 0, NULL);
7393 }
7394
7395 unsigned int
7396 pmap_disconnect_options(
7397 ppnum_t pn,
7398 unsigned int options,
7399 void *arg)
7400 {
7401 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7402 /*
7403 * On ARM, the "modified" bit is managed by software, so
7404 * we know up-front if the physical page is "modified",
7405 * without having to scan all the PTEs pointing to it.
7406 * The caller should have made the VM page "busy" so no one
7407 * should be able to establish any new mapping and "modify"
7408 * the page behind us.
7409 */
7410 if (pmap_is_modified(pn)) {
7411 /*
7412 * The page has been modified and will be sent to
7413 * the VM compressor.
7414 */
7415 options |= PMAP_OPTIONS_COMPRESSOR;
7416 } else {
7417 /*
7418 * The page hasn't been modified and will be freed
7419 * instead of compressed.
7420 */
7421 }
7422 }
7423
7424 /* disconnect the page */
7425 pmap_page_protect_options(pn, 0, options, arg);
7426
7427 /* return ref/chg status */
7428 return pmap_get_refmod(pn);
7429 }
7430
7431 /*
7432 * Routine:
7433 * pmap_disconnect
7434 *
7435 * Function:
7436 * Disconnect all mappings for this page and return reference and change status
7437 * in generic format.
7438 *
7439 */
7440 unsigned int
7441 pmap_disconnect(
7442 ppnum_t pn)
7443 {
7444 pmap_page_protect(pn, 0); /* disconnect the page */
7445 return pmap_get_refmod(pn); /* return ref/chg status */
7446 }
7447
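/*
 * Return whether the physical page range [first, last] overlaps the
 * kernel-managed physical memory region.
 */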
7448 boolean_t
7449 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7450 {
7451 if (ptoa(first) >= vm_last_phys) {
7452 return FALSE;
7453 }
7454 if (ptoa(last) < vm_first_phys) {
7455 return FALSE;
7456 }
7457
7458 return TRUE;
7459 }
7460
7461 /*
7462 * The state maintained by the noencrypt functions is used as a
7463 * debugging aid on ARM. This incurs some overhead on the part
7464 * of the caller. A special case check in phys_attribute_clear
7465 * (the most expensive path) currently minimizes this overhead,
7466 * but stubbing these functions out on RELEASE kernels yields
7467 * further wins.
7468 */
7469 boolean_t
7470 pmap_is_noencrypt(
7471 ppnum_t pn)
7472 {
7473 #if DEVELOPMENT || DEBUG
7474 boolean_t result = FALSE;
7475
7476 if (!pa_valid(ptoa(pn))) {
7477 return FALSE;
7478 }
7479
7480 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7481
7482 return result;
7483 #else
7484 #pragma unused(pn)
7485 return FALSE;
7486 #endif
7487 }
7488
7489 void
7490 pmap_set_noencrypt(
7491 ppnum_t pn)
7492 {
7493 #if DEVELOPMENT || DEBUG
7494 if (!pa_valid(ptoa(pn))) {
7495 return;
7496 }
7497
7498 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7499 #else
7500 #pragma unused(pn)
7501 #endif
7502 }
7503
7504 void
7505 pmap_clear_noencrypt(
7506 ppnum_t pn)
7507 {
7508 #if DEVELOPMENT || DEBUG
7509 if (!pa_valid(ptoa(pn))) {
7510 return;
7511 }
7512
7513 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7514 #else
7515 #pragma unused(pn)
7516 #endif
7517 }
7518
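/*
 * Take the PV head lock for physical page [pn] if it is a managed page;
 * otherwise take the global phys_backup_lock.
 */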
7519 void
7520 pmap_lock_phys_page(ppnum_t pn)
7521 {
7522 unsigned int pai;
7523 pmap_paddr_t phys = ptoa(pn);
7524
7525 if (pa_valid(phys)) {
7526 pai = pa_index(phys);
7527 __unused const locked_pvh_t locked_pvh = pvh_lock(pai);
7528 } else {
7529 simple_lock(&phys_backup_lock, LCK_GRP_NULL);
7530 }
7531 }
7532
7533
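/*
 * Release the lock taken by pmap_lock_phys_page() for physical page [pn].
 */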
7534 void
7535 pmap_unlock_phys_page(ppnum_t pn)
7536 {
7537 unsigned int pai;
7538 pmap_paddr_t phys = ptoa(pn);
7539
7540 if (pa_valid(phys)) {
7541 pai = pa_index(phys);
7542 locked_pvh_t locked_pvh = {.pvh = pai_to_pvh(pai), .pai = pai};
7543 pvh_unlock(&locked_pvh);
7544 } else {
7545 simple_unlock(&phys_backup_lock);
7546 }
7547 }
7548
7549 MARK_AS_PMAP_TEXT void
7550 pmap_clear_user_ttb_internal(void)
7551 {
7552 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7553 }
7554
7555 void
7556 pmap_clear_user_ttb(void)
7557 {
7558 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7559 pmap_clear_user_ttb_internal();
7560 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7561 }
7562
7563 /**
7564 * Set up a "fast fault", or a page fault that won't go through the VM layer on
7565 * a page. This is primarily used to manage ref/mod bits in software. Depending
7566 * on the value of allow_mode, the next read and/or write of the page will fault
7567 * and the ref/mod bits will be updated.
7568 *
7569 * @param ppnum Page number to set up a fast fault on.
7570 * @param allow_mode VM_PROT_NONE will cause the next read and write access to
7571 * fault.
7572 * VM_PROT_READ will only cause the next write access to fault.
7573 * Other values are undefined.
7574 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
7575 * PMAP_OPTIONS_FF_WIRED forces a fast fault even on wired pages.
7576 * PMAP_OPTIONS_SET_REUSABLE/PMAP_OPTIONS_CLEAR_REUSABLE updates
7577 * the global reusable bit of the page.
7578 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
7579 * by the caller. This is an input/output parameter which may be updated
7580 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
7581 * @param bits_to_clear Mask of additional pp_attr_t bits to clear for the physical
7582 * page, iff this function completes successfully and returns
7583 * TRUE. This is typically some combination of
7584 * the referenced, modified, and noencrypt bits.
7585 * @param flush_range When present, this function will skip the TLB flush for the
7586 * mappings that are covered by the range, leaving that to be
7587 * done later by the caller. It may also avoid submitting mapping
7588 * updates directly to the SPTM, instead accumulating them in a
7589 * per-CPU array to be submitted later by the caller.
7590 *
7591 * @return TRUE if the fast fault was successfully configured for all mappings
7592 * of the page, FALSE otherwise (e.g. if wired mappings are present and
7593 * PMAP_OPTIONS_FF_WIRED was not passed).
7594 *
7595 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7596 *
7597 * @warning PMAP_OPTIONS_FF_WIRED should only be used with pages accessible from
7598 * EL0. The kernel may assume that accesses to wired, kernel-owned pages
7599 * won't fault.
7600 */
7601 MARK_AS_PMAP_TEXT static boolean_t
7602 arm_force_fast_fault_with_flush_range(
7603 ppnum_t ppnum,
7604 vm_prot_t allow_mode,
7605 int options,
7606 locked_pvh_t *locked_pvh,
7607 pp_attr_t bits_to_clear,
7608 pmap_tlb_flush_range_t *flush_range)
7609 {
7610 pmap_paddr_t phys = ptoa(ppnum);
7611 pv_entry_t *pve_p;
7612 pt_entry_t *pte_p;
7613 unsigned int pai;
7614 boolean_t result;
7615 unsigned int num_mappings = 0, num_skipped_mappings = 0;
7616 bool ref_fault;
7617 bool mod_fault;
7618 bool clear_write_fault = false;
7619 bool ref_aliases_mod = false;
7620
7621 assert(ppnum != vm_page_fictitious_addr);
7622
7623 /**
7624 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7625 *
7626 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
7627 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
7628 * semantics conflict with each other, so assert they are not both true.
7629 */
7630 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
7631
7632 if (!pa_valid(phys)) {
7633 return FALSE; /* Not a managed page. */
7634 }
7635
7636 result = TRUE;
7637 ref_fault = false;
7638 mod_fault = false;
7639 pai = pa_index(phys);
7640 locked_pvh_t local_locked_pvh = {.pvh = 0};
7641 if (__probable(locked_pvh == NULL)) {
7642 if (flush_range != NULL) {
7643 /**
7644 * If we're partway through processing a multi-page batched call,
7645 * preemption will already be disabled so we can't simply call
7646 * pvh_lock() which may block. Instead, we first try to acquire
7647 * the lock without waiting, which in most cases should succeed.
7648 * If it fails, we submit the pending batched operations to re-
7649 * enable preemption and then acquire the lock normally.
7650 */
7651 local_locked_pvh = pvh_try_lock(pai);
7652 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
7653 pmap_multipage_op_submit(flush_range);
7654 local_locked_pvh = pvh_lock(pai);
7655 }
7656 } else {
7657 local_locked_pvh = pvh_lock(pai);
7658 }
7659 } else {
7660 local_locked_pvh = *locked_pvh;
7661 assert(pai == local_locked_pvh.pai);
7662 }
7663 assert(local_locked_pvh.pvh != 0);
7664 pvh_assert_locked(pai);
7665
7666 pte_p = PT_ENTRY_NULL;
7667 pve_p = PV_ENTRY_NULL;
7668 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
7669 pte_p = pvh_ptep(local_locked_pvh.pvh);
7670 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
7671 pve_p = pvh_pve_list(local_locked_pvh.pvh);
7672 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
7673 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
7674 }
7675
7676 const bool is_reusable = ppattr_test_reusable(pai);
7677
7678 bool pvh_lock_sleep_mode_needed = false;
7679 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
7680 sptm_disjoint_op_t *sptm_ops = NULL;
7681
7682 /**
7683 * This would also work as a block, with the above variables declared using the
7684 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
7685 * dereferencing __block variables through stack forwarding pointers) isn't needed
7686 * here, as we never need to use this code sequence as a closure.
7687 */
7688 #define FFF_PERCPU_INIT() do { \
7689 disable_preemption(); \
7690 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
7691 sptm_ops = sptm_pcpu->sptm_ops; \
7692 } while (0)
7693
7694 FFF_PERCPU_INIT();
7695
7696 int pve_ptep_idx = 0;
7697
7698 /**
7699 * With regard to TLBI, there are three cases:
7700 *
7701 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
7702 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
7703 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
7704 * mapping is out of the range.
7705 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
7706 * let SPTM handle TLBI flushing.
7707 */
7708 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
7709 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
7710
7711 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7712 pt_entry_t spte;
7713 pt_entry_t tmplate;
7714
7715 if (__improbable(pvh_lock_sleep_mode_needed)) {
7716 assert((num_mappings == 0) && (num_skipped_mappings == 0));
7717 /**
7718 * Undo the explicit preemption disable done in the last call to FFF_PERCPU_INIT().
7719 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
7720 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
7721 * core while processing SPTM per-CPU data. At the same time, we also want preemption
7722 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
7723 * urgent ASTs can be handled.
7724 */
7725 enable_preemption();
7726 pvh_lock_enter_sleep_mode(&local_locked_pvh);
7727 pvh_lock_sleep_mode_needed = false;
7728 FFF_PERCPU_INIT();
7729 }
7730
7731 if (pve_p != PV_ENTRY_NULL) {
7732 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7733 if (pte_p == PT_ENTRY_NULL) {
7734 goto fff_skip_pve;
7735 }
7736 }
7737
7738 #ifdef PVH_FLAG_IOMMU
7739 if (pvh_ptep_is_iommu(pte_p)) {
7740 ++num_skipped_mappings;
7741 goto fff_skip_pve;
7742 }
7743 #endif
7744 spte = os_atomic_load(pte_p, relaxed);
7745 if (pte_is_compressed(spte, pte_p)) {
7746 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
7747 }
7748
7749 pt_desc_t *ptdp = NULL;
7750 pmap_t pmap = NULL;
7751 vm_map_address_t va = 0;
7752
7753 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
7754 /**
7755 * If the current mapping matches the flush range's current iteration position,
7756 * there's no need to do the work of getting the PTD. We already know the pmap,
7757 * and the VA is implied by flush_range->pending_region_start.
7758 */
7759 pmap = flush_range->ptfr_pmap;
7760 } else {
7761 ptdp = ptep_get_ptd(pte_p);
7762 pmap = ptdp->pmap;
7763 va = ptd_get_va(ptdp, pte_p);
7764 assert(va >= pmap->min && va < pmap->max);
7765 }
7766
7767 bool skip_pte = pte_is_wired(spte) &&
7768 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
7769
7770 if (skip_pte) {
7771 result = FALSE;
7772 }
7773
7774 // A concurrent pmap_remove() may have cleared the PTE
7775 if (__improbable(!pte_is_valid(spte))) {
7776 skip_pte = true;
7777 }
7778
7779 /**
7780 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
7781 * pending disjoint ops, so we don't need to do flush range disjoint op management.
7782 */
7783 if ((flush_range != NULL) && (ptdp != NULL) && !skip_pte) {
7784 /**
7785 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
7786 * We do this in three cases:
7787 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
7788 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
7789 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
7790 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
7791 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
7792 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
7793 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
7794 */
7795 uint32_t per_mapping_sptm_update_options = sptm_update_options;
7796 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
7797 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
7798 }
7799 if ((num_mappings == 0) ||
7800 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
7801 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
7802 /**
7803 * If we needed to submit the pending disjoint ops to make room for the new page,
7804 * flush any pending region ops to reenable preemption and restart the loop with
7805 * the lock in sleep mode. This prevents preemption from being held disabled
7806 * for an arbitrary amount of time in the pathological case in which we have
7807 * both pending region ops and an excessively long PV list that repeatedly
7808 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
7809 */
7810 pmap_multipage_op_submit_region(flush_range);
7811 assert(num_mappings == 0);
7812 num_skipped_mappings = 0;
7813 pvh_lock_sleep_mode_needed = true;
7814 continue;
7815 }
7816 }
7817 }
7818
7819 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7820
7821 /* update pmap stats and ledgers */
7822 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
7823 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
7824 if (is_altacct) {
7825 /*
7826 * We do not track "reusable" status for
7827 * "alternate accounting" mappings.
7828 */
7829 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
7830 is_reusable &&
7831 is_internal &&
7832 pmap != kernel_pmap) {
7833 /* one less "reusable" */
7834 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7835 /* one more "internal" */
7836 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7837 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7838
7839 /*
7840 * Since the page is being marked non-reusable, we assume that it will be
7841 * modified soon. Avoid the cost of another trap to handle the fast
7842 * fault when we next write to this page.
7843 */
7844 clear_write_fault = true;
7845 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
7846 !is_reusable &&
7847 is_internal &&
7848 pmap != kernel_pmap) {
7849 /* one more "reusable" */
7850 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7851 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7852 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7853 }
7854
7855 if (skip_pte) {
7856 ++num_skipped_mappings;
7857 goto fff_skip_pve;
7858 }
7859
7860 tmplate = spte;
7861
7862 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
7863 /* read protection sets the pte to fault */
7864 tmplate = tmplate & ~ARM_PTE_AF;
7865 ref_fault = true;
7866 }
7867 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
7868 /* take away write permission if set */
7869 if (pmap == kernel_pmap) {
7870 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
7871 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
7872 pte_set_was_writeable(tmplate, true);
7873 mod_fault = true;
7874 }
7875 } else {
7876 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
7877 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
7878 pte_set_was_writeable(tmplate, true);
7879 mod_fault = true;
7880 }
7881 }
7882 }
7883
7884 if (ptdp != NULL) {
7885 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
7886 sptm_ops[num_mappings].vaddr = va;
7887 sptm_ops[num_mappings].pte_template = tmplate;
7888 ++num_mappings;
7889 } else if (pmap_insert_flush_range_template(tmplate, flush_range)) {
7890 /**
7891 * We submit both the pending disjoint and pending region ops whenever
7892 * either category reaches the mapping limit. Having pending operations
7893 * in either category will keep preemption disabled, and we want to ensure
7894 * that we can at least temporarily re-enable preemption roughly every
7895 * SPTM_MAPPING_LIMIT mappings.
7896 */
7897 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7898 pvh_lock_sleep_mode_needed = true;
7899 num_mappings = num_skipped_mappings = 0;
7900 }
7901 fff_skip_pve:
7902 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
7903 if (flush_range != NULL) {
7904 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
7905 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7906 pmap_multipage_op_submit_region(flush_range);
7907 } else if (num_mappings > 0) {
7908 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7909 }
7910 pvh_lock_sleep_mode_needed = true;
7911 num_mappings = num_skipped_mappings = 0;
7912 }
7913 pte_p = PT_ENTRY_NULL;
7914 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7915 pve_ptep_idx = 0;
7916 pve_p = pve_next(pve_p);
7917 }
7918 }
7919
7920 if (num_mappings != 0) {
7921 sptm_return_t sptm_ret;
7922
7923 if (flush_range == NULL) {
7924 sptm_ret = sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7925 } else {
7926 /* Resync the pending mapping state in flush_range with our local state. */
7927 assert(num_mappings >= flush_range->pending_disjoint_entries);
7928 flush_range->pending_disjoint_entries = num_mappings;
7929 }
7930 }
7931
7932 /**
7933 * Undo the explicit disable_preemption() done in FFF_PERCPU_INIT().
7934 * Note that enable_preemption() decrements a per-thread counter, so if
7935 * we happen to still hold the PVH lock in spin mode then preemption won't
7936 * actually be re-enabled until we drop the lock (which also decrements
7937 * the per-thread counter).
7938 */
7939 enable_preemption();
7940
7941 /*
7942 * If we are using the same approach for ref and mod
7943 * faults on this PTE, do not clear the write fault;
7944 * this would cause both ref and mod to be set on the
7945 * page again, and prevent us from taking ANY read/write
7946 * fault on the mapping.
7947 */
7948 if (clear_write_fault && !ref_aliases_mod) {
7949 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, local_locked_pvh.pvh, PT_ENTRY_NULL, 0);
7950 }
7951
7952 pp_attr_t attrs_to_clear = (result ? bits_to_clear : 0);
7953 pp_attr_t attrs_to_set = 0;
7954 /* update global "reusable" status for this page */
7955 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
7956 attrs_to_clear |= PP_ATTR_REUSABLE;
7957 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
7958 attrs_to_set |= PP_ATTR_REUSABLE;
7959 }
7960
7961 if (mod_fault) {
7962 attrs_to_set |= PP_ATTR_MODFAULT;
7963 }
7964 if (ref_fault) {
7965 attrs_to_set |= PP_ATTR_REFFAULT;
7966 }
7967
7968 if (attrs_to_set | attrs_to_clear) {
7969 ppattr_modify_bits(pai, attrs_to_clear, attrs_to_set);
7970 }
7971
7972 if (__probable(locked_pvh == NULL)) {
7973 pvh_unlock(&local_locked_pvh);
7974 } else {
7975 *locked_pvh = local_locked_pvh;
7976 }
7977 if ((flush_range != NULL) && !preemption_enabled()) {
7978 flush_range->processed_entries += num_skipped_mappings;
7979 }
7980 return result;
7981 }
7982
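/*
 * Validate that no unsupported options were passed, then force-fault all
 * mappings of [ppnum] by calling arm_force_fast_fault_with_flush_range()
 * with no pre-held PVH lock and no flush range.
 */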
7983 MARK_AS_PMAP_TEXT boolean_t
7984 arm_force_fast_fault_internal(
7985 ppnum_t ppnum,
7986 vm_prot_t allow_mode,
7987 int options)
7988 {
7989 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7990 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7991 }
7992 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL, 0, NULL);
7993 }
7994
7995 /*
7996 * Routine: arm_force_fast_fault
7997 *
7998 * Function:
7999 * Force all mappings for this page to fault according
8000 * to the access modes allowed, so we can gather ref/modify
8001 * bits again.
8002 */
8003
8004 boolean_t
8005 arm_force_fast_fault(
8006 ppnum_t ppnum,
8007 vm_prot_t allow_mode,
8008 int options,
8009 __unused void *arg)
8010 {
8011 pmap_paddr_t phys = ptoa(ppnum);
8012
8013 assert(ppnum != vm_page_fictitious_addr);
8014
8015 if (!pa_valid(phys)) {
8016 return FALSE; /* Not a managed page. */
8017 }
8018
8019 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8020 }
8021
8022 /**
8023 * Clear pending force fault for at most SPTM_MAPPING_LIMIT mappings for this
8024 * page based on the observed fault type, and update the appropriate ref/modify
8025 * bits for the physical page. This typically involves adding write permissions
8026 * back for write faults and setting the Access Flag for both read/write faults
8027 * (since the lack of those things is what caused the fault in the first place).
8028 *
8029 * @note At most SPTM_MAPPING_LIMIT mappings can be modified in a single
8030 * arm_clear_fast_fault() call, to prevent excessive PVH lock contention;
8031 * the PVH lock is expected to already be held for `ppnum`. If a fault is
8032 * subsequently taken on a mapping we haven't processed, arm_fast_fault()
8033 * will call this function with a non-NULL pte_p to perform a targeted
8034 * fixup.
8035 *
8036 * @param ppnum Page number of the page to clear a pending force fault on.
8037 * @param fault_type The type of access/fault that triggered us wanting to clear
8038 * the pending force fault status. This determines how we
8039 * modify the PTE to not cause a fault in the future and also
8040 * whether we mark the PTE as referenced or modified.
8041 * Typically a write fault would cause the page to be marked
8042 * as referenced and modified, and a read fault would only
8043 * cause the page to be marked as referenced.
8044 * @param pvh pv_head_table entry value for [ppnum] returned by a previous call
8045 * to pvh_lock().
8046 * @param pte_p If this value is non-PT_ENTRY_NULL then only this specified PTE
8047 * will be modified. If it is PT_ENTRY_NULL, then every mapping to
8048 * `ppnum` will be modified.
8049 * @param attrs_to_clear Mask of additional pp_attr_t bits to clear for the physical
8050 * page upon completion of this function. This is typically
8051 * some combination of the REFFAULT and MODFAULT bits.
8052 *
8053 * @return TRUE if any PTEs were modified, FALSE otherwise.
8054 */
8055 MARK_AS_PMAP_TEXT static boolean_t
8056 arm_clear_fast_fault(
8057 ppnum_t ppnum,
8058 vm_prot_t fault_type,
8059 uintptr_t pvh,
8060 pt_entry_t *pte_p,
8061 pp_attr_t attrs_to_clear)
8062 {
8063 const pmap_paddr_t pa = ptoa(ppnum);
8064 pv_entry_t *pve_p;
8065 boolean_t result;
8066 unsigned int num_mappings = 0, num_skipped_mappings = 0;
8067 pp_attr_t attrs_to_set = 0;
8068
8069 assert(ppnum != vm_page_fictitious_addr);
8070
8071 if (!pa_valid(pa)) {
8072 return FALSE; /* Not a managed page. */
8073 }
8074
8075 result = FALSE;
8076 pve_p = PV_ENTRY_NULL;
8077 if (pte_p == PT_ENTRY_NULL) {
8078 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
8079 pte_p = pvh_ptep(pvh);
8080 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
8081 pve_p = pvh_pve_list(pvh);
8082 } else if (__improbable(!pvh_test_type(pvh, PVH_TYPE_NULL))) {
8083 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)pvh, (uint64_t)pa);
8084 }
8085 }
8086
8087 disable_preemption();
8088 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8089 sptm_disjoint_op_t *sptm_ops = sptm_pcpu->sptm_ops;
8090
8091 int pve_ptep_idx = 0;
8092
8093 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8094 pt_entry_t spte;
8095 pt_entry_t tmplate;
8096
8097 if (pve_p != PV_ENTRY_NULL) {
8098 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8099 if (pte_p == PT_ENTRY_NULL) {
8100 goto cff_skip_pve;
8101 }
8102 }
8103
8104 #ifdef PVH_FLAG_IOMMU
8105 if (pvh_ptep_is_iommu(pte_p)) {
8106 ++num_skipped_mappings;
8107 goto cff_skip_pve;
8108 }
8109 #endif
8110 spte = os_atomic_load(pte_p, relaxed);
8111 // A concurrent pmap_remove() may have cleared the PTE
8112 if (__improbable(!pte_is_valid(spte))) {
8113 ++num_skipped_mappings;
8114 goto cff_skip_pve;
8115 }
8116
8117 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8118 const pmap_t pmap = ptdp->pmap;
8119
8120 tmplate = spte;
8121
8122 if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8123 assert(pmap);
8124 {
8125 if (pmap == kernel_pmap) {
8126 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8127 } else {
8128 assert(pmap->type != PMAP_TYPE_NESTED);
8129 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8130 }
8131 }
8132
8133 tmplate |= ARM_PTE_AF;
8134
8135 pte_set_was_writeable(tmplate, false);
8136 attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8137 } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8138 assert(pmap);
8139 tmplate = spte | ARM_PTE_AF;
8140
8141 {
8142 attrs_to_set |= PP_ATTR_REFERENCED;
8143 }
8144 }
8145
8146 assert(spte != ARM_PTE_EMPTY);
8147
8148 if (spte != tmplate) {
8149 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8150 assert(va >= pmap->min && va < pmap->max);
8151
8152 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
8153 sptm_ops[num_mappings].vaddr = va;
8154 sptm_ops[num_mappings].pte_template = tmplate;
8155 ++num_mappings;
8156 result = TRUE;
8157 }
8158
8159 cff_skip_pve:
8160 if ((num_mappings + num_skipped_mappings) == SPTM_MAPPING_LIMIT) {
8161 if (num_mappings != 0) {
8162 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8163 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8164 num_mappings = 0;
8165 }
8166 /*
8167 * We've reached the limit of mappings that can be processed in a single arm_clear_fast_fault()
8168 * call. Bail out here to avoid excessive PVH lock duration on the fault path. If a fault is
8169 * subsequently taken on a mapping we haven't processed, arm_fast_fault() will call this
8170 * function with a non-NULL pte_p to perform a targeted fixup.
8171 */
8172 break;
8173 }
8174
8175 pte_p = PT_ENTRY_NULL;
8176 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8177 pve_ptep_idx = 0;
8178 pve_p = pve_next(pve_p);
8179 }
8180 }
8181
8182 if (num_mappings != 0) {
8183 assert(result == TRUE);
8184 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8185 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8186 }
8187
8188 if (attrs_to_set | attrs_to_clear) {
8189 ppattr_modify_bits(pa_index(pa), attrs_to_clear, attrs_to_set);
8190 }
8191 enable_preemption();
8192
8193 return result;
8194 }
8195
8196 /*
8197 * Determine if the fault was induced by software tracking of
8198 * modify/reference bits. If so, re-enable the mapping (and set
8199 * the appropriate bits).
8200 *
8201 * Returns KERN_SUCCESS if the fault was induced and was
8202 * successfully handled.
8203 *
8204 * Returns KERN_FAILURE if the fault was not induced and
8205 * the function was unable to deal with it.
8206 *
8207 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8208 * disallows this type of access.
8209 */
8210 MARK_AS_PMAP_TEXT kern_return_t
8211 arm_fast_fault_internal(
8212 pmap_t pmap,
8213 vm_map_address_t va,
8214 vm_prot_t fault_type,
8215 __unused bool was_af_fault,
8216 __unused bool from_user)
8217 {
8218 kern_return_t result = KERN_FAILURE;
8219 pt_entry_t *ptep;
8220 pt_entry_t spte = ARM_PTE_EMPTY;
8221 locked_pvh_t locked_pvh = {.pvh = 0};
8222 unsigned int pai;
8223 pmap_paddr_t pa;
8224 validate_pmap_mutable(pmap);
8225
8226 if (__probable(preemption_enabled())) {
8227 pmap_lock(pmap, PMAP_LOCK_SHARED);
8228 } else if (__improbable(!pmap_try_lock(pmap, PMAP_LOCK_SHARED))) {
8229 /**
8230 * In certain cases, arm_fast_fault() may be invoked with preemption disabled
8231 * on the copyio path. In these cases the (in-kernel) caller expects that any
8232 * faults taken against the user address may not be handled successfully
8233 * (vm_fault() allows non-preemptible callers with the possibility that the
8234 * fault may not be successfully handled) and will result in the copyio operation
8235 * returning EFAULT. It is then the caller's responsibility to retry the copyio
8236 * operation in a preemptible context.
8237 *
8238 * For these cases attempting to acquire the sleepable lock will panic, so
8239 * we simply make a best effort and return failure just as the VM does if we
8240 * can't acquire the lock without sleeping.
8241 */
8242 return result;
8243 }
8244
8245 /*
8246 * If the entry doesn't exist, is completely invalid, or is already
8247 * valid, we can't fix it here.
8248 */
8249
8250 const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8251 ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8252 if (ptep != PT_ENTRY_NULL) {
8253 while (true) {
8254 spte = os_atomic_load(ptep, relaxed);
8255
8256 pa = pte_to_pa(spte);
8257
8258 if ((spte == ARM_PTE_EMPTY) || pte_is_compressed(spte, ptep)) {
8259 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8260 return result;
8261 }
8262
8263 if (!pa_valid(pa)) {
8264 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
8265 if (frame_type == XNU_PROTECTED_IO) {
8266 result = KERN_PROTECTION_FAILURE;
8267 }
8268 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8269 return result;
8270 }
8271 pai = pa_index(pa);
8272 /**
8273 * If preemption is disabled, use pvh_try_lock() rather than pvh_lock(),
8274 * for the same reason we use pmap_try_lock() above.
8275 */
8276 if (__probable(preemption_enabled())) {
8277 locked_pvh = pvh_lock(pai);
8278 } else {
8279 locked_pvh = pvh_try_lock(pai);
8280 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
8281 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8282 return result;
8283 }
8284 }
8285 assert(locked_pvh.pvh != 0);
8286 if (os_atomic_load(ptep, relaxed) == spte) {
8287 /*
8288 * Double-check the spte value, as we care about the AF bit.
8289 * It's also possible that pmap_page_protect() transitioned the
8290 * PTE to compressed/empty before we grabbed the PVH lock.
8291 */
8292 break;
8293 }
8294 pvh_unlock(&locked_pvh);
8295 }
8296 } else {
8297 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8298 return result;
8299 }
8300
8301
8302 if (result == KERN_SUCCESS) {
8303 goto ff_cleanup;
8304 }
8305
8306 pp_attr_t attrs = os_atomic_load(&pp_attr_table[pai], relaxed);
8307 if ((attrs & PP_ATTR_REFFAULT) || ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT))) {
8308 /*
8309 * An attempted access will always clear ref/mod fault state, as
8310 * appropriate for the fault type. arm_clear_fast_fault will
8311 * update the associated PTEs for the page as appropriate; if
8312 * any PTEs are updated, we redrive the access. If the mapping
8313 * does not actually allow for the attempted access, the
8314 * following fault will (hopefully) fail to update any PTEs, and
8315 * thus cause arm_fast_fault to decide that it failed to handle
8316 * the fault.
8317 */
8318 pp_attr_t attrs_to_clear = 0;
8319 if (attrs & PP_ATTR_REFFAULT) {
8320 attrs_to_clear |= PP_ATTR_REFFAULT;
8321 }
8322 if ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT)) {
8323 attrs_to_clear |= PP_ATTR_MODFAULT;
8324 }
8325
8326 if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, PT_ENTRY_NULL, attrs_to_clear)) {
8327 /*
8328 * Should this preserve KERN_PROTECTION_FAILURE? The
8329 * cost of not doing so is another fault in a case
8330 * that should already result in an exception.
8331 */
8332 result = KERN_SUCCESS;
8333 }
8334 }
8335
8336 /*
8337 * If the PTE already has sufficient permissions, we can report the fault as handled.
8338 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8339 * on mappings of the same page.
8340 */
8341 if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8342 uintptr_t ap_ro, ap_rw, ap_x;
8343 if (pmap == kernel_pmap) {
8344 ap_ro = ARM_PTE_AP(AP_RONA);
8345 ap_rw = ARM_PTE_AP(AP_RWNA);
8346 ap_x = ARM_PTE_NX;
8347 } else {
8348 ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8349 ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8350 ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8351 }
8352 /*
8353 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8354 * hardware they may be xPRR-protected, in which case they'll be handled
8355 * by the is_pte_xprr_protected() case above. Additionally, the exception
8356 * handling path currently does not call arm_fast_fault() without at least
8357 * VM_PROT_READ in fault_type.
8358 */
8359 if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8360 (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8361 if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8362 result = KERN_SUCCESS;
8363 }
8364 }
8365 }
8366
8367 if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, ptep, 0)) {
8368 /*
8369 * A prior arm_clear_fast_fault() operation may have returned early due to
8370 * another pending PV list operation or an excessively large PV list.
8371 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8372 * taking a fault on the same mapping.
8373 */
8374 result = KERN_SUCCESS;
8375 }
8376
8377 ff_cleanup:
8378
8379 pvh_unlock(&locked_pvh);
8380 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8381 return result;
8382 }
8383
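/*
 * Trace-wrapped entry point for arm_fast_fault_internal(). Fails early if the
 * faulting VA lies outside the pmap's valid address range.
 */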
8384 kern_return_t
8385 arm_fast_fault(
8386 pmap_t pmap,
8387 vm_map_address_t va,
8388 vm_prot_t fault_type,
8389 bool was_af_fault,
8390 __unused bool from_user)
8391 {
8392 kern_return_t result = KERN_FAILURE;
8393
8394 if (va < pmap->min || va >= pmap->max) {
8395 return result;
8396 }
8397
8398 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8399 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8400 from_user);
8401
8402
8403 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8404
8405 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8406
8407 return result;
8408 }
8409
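/*
 * pmap_copy_page copies the specified (machine independent) page.
 */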
8410 void
8411 pmap_copy_page(
8412 ppnum_t psrc,
8413 ppnum_t pdst,
8414 int options)
8415 {
8416 bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8417 (addr64_t) (ptoa(pdst)),
8418 PAGE_SIZE,
8419 options);
8420 }
8421
8422
8423 /*
8424 * pmap_copy_part_page copies the specified portions of the given (machine independent) pages.
8425 */
8426 void
8427 pmap_copy_part_page(
8428 ppnum_t psrc,
8429 vm_offset_t src_offset,
8430 ppnum_t pdst,
8431 vm_offset_t dst_offset,
8432 vm_size_t len)
8433 {
8434 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8435 (addr64_t) (ptoa(pdst) + dst_offset),
8436 len);
8437 }
8438
8439
8440 /*
8441 * pmap_zero_page zeros the specified (machine independent) page.
8442 */
8443 void
8444 pmap_zero_page(
8445 ppnum_t pn)
8446 {
8447 assert(pn != vm_page_fictitious_addr);
8448 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8449 }
8450
8451 /*
8452 * pmap_zero_page_with_options allows the caller to specify additional
8453 * operations to perform along with the zeroing.
8454 */
8455 void
8456 pmap_zero_page_with_options(
8457 ppnum_t pn,
8458 int options)
8459 {
8460 assert(pn != vm_page_fictitious_addr);
8461 bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8462 }
8463
8464 /*
8465 * pmap_zero_part_page
8466 * zeros the specified (machine independent) part of a page.
8467 */
8468 void
8469 pmap_zero_part_page(
8470 ppnum_t pn,
8471 vm_offset_t offset,
8472 vm_size_t len)
8473 {
8474 assert(pn != vm_page_fictitious_addr);
8475 assert(offset + len <= PAGE_SIZE);
8476 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8477 }
8478
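/*
 * pmap_map_globals
 * Map the lowGlo page at its fixed LOWGLOBAL_ALIAS address as a read-only,
 * non-executable kernel mapping via the SPTM.
 */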
8479 void
8480 pmap_map_globals(
8481 void)
8482 {
8483 pt_entry_t pte;
8484
8485 pte = pa_to_pte(kvtophys_nofail((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX |
8486 ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
8487 #if __ARM_KERNEL_PROTECT__
8488 pte |= ARM_PTE_NG;
8489 #endif /* __ARM_KERNEL_PROTECT__ */
8490 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8491 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8492 sptm_map_page(kernel_pmap->ttep, LOWGLOBAL_ALIAS, pte);
8493
8494
8495 #if KASAN
8496 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8497 #endif
8498 }
8499
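/*
 * Return the kernel virtual address of per-CPU copy window [index] for CPU
 * [cpu_num], panicking if the index is out of range.
 */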
8500 vm_offset_t
8501 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8502 {
8503 if (__improbable(index >= CPUWINDOWS_MAX)) {
8504 panic("%s: invalid index %u", __func__, index);
8505 }
8506 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8507 }
8508
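/*
 * Find an unused copy window on the current CPU and map physical page [pn]
 * into it with the requested protections and WIMG attributes, returning the
 * index of the window used. The caller must have preemption disabled so the
 * mapping remains associated with this CPU.
 */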
8509 MARK_AS_PMAP_TEXT unsigned int
8510 pmap_map_cpu_windows_copy_internal(
8511 ppnum_t pn,
8512 vm_prot_t prot,
8513 unsigned int wimg_bits)
8514 {
8515 pt_entry_t *ptep = NULL, pte;
8516 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8517 unsigned int cpu_num;
8518 unsigned int cpu_window_index;
8519 vm_offset_t cpu_copywindow_vaddr = 0;
8520 bool need_strong_sync = false;
8521
8522 assert(get_preemption_level() > 0);
8523 cpu_num = pmap_cpu_data->cpu_number;
8524
8525 for (cpu_window_index = 0; cpu_window_index < CPUWINDOWS_MAX; cpu_window_index++) {
8526 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, cpu_window_index);
8527 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8528 assert(!pte_is_compressed(*ptep, ptep));
8529 if (!pte_is_valid(*ptep)) {
8530 break;
8531 }
8532 }
8533 if (__improbable(cpu_window_index == CPUWINDOWS_MAX)) {
8534 panic("%s: out of windows", __func__);
8535 }
8536
8537 const pmap_paddr_t paddr = ptoa(pn);
8538 pte = pa_to_pte(paddr) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8539 #if __ARM_KERNEL_PROTECT__
8540 pte |= ARM_PTE_NG;
8541 #endif /* __ARM_KERNEL_PROTECT__ */
8542 pte |= wimg_to_pte(wimg_bits, paddr);
8543
8544 if (prot & VM_PROT_WRITE) {
8545 pte |= ARM_PTE_AP(AP_RWNA);
8546 } else {
8547 pte |= ARM_PTE_AP(AP_RONA);
8548 }
8549
8550 /*
8551 * It's expected to be safe for an interrupt handler to nest copy-window usage with the
8552 * active thread on a CPU, as long as a sufficient number of copy windows are available.
8553 * --If the interrupt handler executes before the active thread creates the per-CPU mapping,
8554 * or after the active thread completely removes the mapping, it may use the same mapping
8555 * but will finish execution and tear down the mapping without the thread needing to know.
8556 * --If the interrupt handler executes after the active thread creates the per-CPU mapping,
8557 * it will observe the valid mapping and use a different copy window.
8558 * --If the interrupt handler executes after the active thread clears the PTE in
8559 * pmap_unmap_cpu_windows_copy() but before the active thread flushes the TLB, the code
8560 * for computing cpu_window_index above will observe the PTE_INVALID_IN_FLIGHT token set
8561 * by the SPTM, and will select a different index.
8562 */
8563 const sptm_return_t sptm_status = sptm_map_page(kernel_pmap->ttep, cpu_copywindow_vaddr, pte);
8564 if (__improbable(sptm_status != SPTM_SUCCESS)) {
8565 panic("%s: failed to map CPU copy-window VA 0x%llx with SPTM status %d",
8566 __func__, (unsigned long long)cpu_copywindow_vaddr, sptm_status);
8567 }
8568
8569
8570 /*
8571 * Clean up any pending strong TLB flush for the same window in a thread we may have
8572 * interrupted.
8573 */
8574 if (__improbable(pmap_cpu_data->copywindow_strong_sync[cpu_window_index])) {
8575 arm64_sync_tlb(true);
8576 }
8577 pmap_cpu_data->copywindow_strong_sync[cpu_window_index] = need_strong_sync;
8578
8579 return cpu_window_index;
8580 }
8581
8582 unsigned int
8583 pmap_map_cpu_windows_copy(
8584 ppnum_t pn,
8585 vm_prot_t prot,
8586 unsigned int wimg_bits)
8587 {
8588 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8589 }
8590
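/*
 * Tear down the per-CPU copy window mapping previously established by
 * pmap_map_cpu_windows_copy(), issuing any strong TLB synchronization the
 * mapping required.
 */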
8591 MARK_AS_PMAP_TEXT void
8592 pmap_unmap_cpu_windows_copy_internal(
8593 unsigned int index)
8594 {
8595 unsigned int cpu_num;
8596 vm_offset_t cpu_copywindow_vaddr = 0;
8597 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8598
8599 assert(index < CPUWINDOWS_MAX);
8600 assert(get_preemption_level() > 0);
8601
8602 cpu_num = pmap_cpu_data->cpu_number;
8603
8604 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8605 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8606 * (which are likely to have been on I/O memory) are complete before
8607 * tearing down the mapping. */
8608 __builtin_arm_dsb(DSB_SY);
8609 sptm_unmap_region(kernel_pmap->ttep, cpu_copywindow_vaddr, 1, 0);
8610 if (__improbable(pmap_cpu_data->copywindow_strong_sync[index])) {
8611 arm64_sync_tlb(true);
8612 pmap_cpu_data->copywindow_strong_sync[index] = false;
8613 }
8614 }
8615
8616 void
8617 pmap_unmap_cpu_windows_copy(
8618 unsigned int index)
8619 {
8620 return pmap_unmap_cpu_windows_copy_internal(index);
8621 }
8622
8623 /*
8624 * Indicate that a pmap is intended to be used as a nested pmap
8625 * within one or more larger address spaces. This must be set
8626 * before pmap_nest() is called with this pmap as the 'subordinate'.
8627 */
8628 MARK_AS_PMAP_TEXT void
8629 pmap_set_nested_internal(
8630 pmap_t pmap)
8631 {
8632 validate_pmap_mutable(pmap);
8633 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8634 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8635 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8636 __func__, pmap, pmap->type);
8637 }
8638 pmap->type = PMAP_TYPE_NESTED;
8639 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
8640 retype_params.attr_idx = (pt_attr_page_size(pt_attr) == 4096) ? SPTM_PT_GEOMETRY_4K : SPTM_PT_GEOMETRY_16K;
8641 pmap_txm_acquire_exclusive_lock(pmap);
8642 sptm_retype(pmap->ttep, XNU_USER_ROOT_TABLE, XNU_SHARED_ROOT_TABLE, retype_params);
8643 pmap_txm_release_exclusive_lock(pmap);
8644 pmap_get_pt_ops(pmap)->free_id(pmap);
8645 }
8646
8647 void
8648 pmap_set_nested(
8649 pmap_t pmap)
8650 {
8651 pmap_set_nested_internal(pmap);
8652 }
8653
8654 bool
8655 pmap_is_nested(
8656 pmap_t pmap)
8657 {
8658 return pmap->type == PMAP_TYPE_NESTED;
8659 }
8660
8661 /*
8662 * pmap_trim_range(pmap, start, end)
8663 *
8664 * pmap = pmap to operate on
8665 * start = start of the range
8666 * end = end of the range
8667 *
8668 * Attempts to deallocate TTEs for the given range in the nested range.
8669 */
8670 MARK_AS_PMAP_TEXT static void
8671 pmap_trim_range(
8672 pmap_t pmap,
8673 addr64_t start,
8674 addr64_t end)
8675 {
8676 addr64_t cur;
8677 addr64_t nested_region_start;
8678 addr64_t nested_region_end;
8679 addr64_t adjusted_start;
8680 addr64_t adjusted_end;
8681 addr64_t adjust_offmask;
8682 tt_entry_t * tte_p;
8683 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8684
8685 if (__improbable(end < start)) {
8686 panic("%s: invalid address range, "
8687 "pmap=%p, start=%p, end=%p",
8688 __func__,
8689 pmap, (void*)start, (void*)end);
8690 }
8691
8692 nested_region_start = pmap->nested_region_addr;
8693 nested_region_end = nested_region_start + pmap->nested_region_size;
8694
8695 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8696 panic("%s: range outside nested region %p-%p, "
8697 "pmap=%p, start=%p, end=%p",
8698 __func__, (void *)nested_region_start, (void *)nested_region_end,
8699 pmap, (void*)start, (void*)end);
8700 }
8701
8702 /* Contract the range to TT page boundaries. */
8703 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8704
8705 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8706 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8707 adjusted_end = end & ~adjust_offmask;
8708
8709 /* Iterate over the range, trying to remove TTEs. */
8710 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) {
8711 tte_p = pmap_tte(pmap, cur);
8712
8713 if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
8714 if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) {
8715 /* Deallocate for the nested map. */
8716 pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr), false);
8717 } else if (pmap->type == PMAP_TYPE_USER) {
8718 /**
8719 * Just remove for the parent map. If the leaf table pointed
8720 * to by the TTE being removed (owned by the nested pmap)
8721 * has any mappings, then this call will panic. This
8722 * enforces the policy that tables being trimmed must be
8723 * empty to prevent possible use-after-free attacks.
8724 */
8725 pmap_tte_trim(pmap, cur, tte_p);
8726 } else {
8727 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8728 }
8729 }
8730 }
8731 }
8732
8733 /*
8734 * pmap_trim_internal(grand, subord, vstart, size)
8735 *
8736 * grand = pmap subord is nested in
8737 * subord = nested pmap
8738 * vstart = start of the used range in grand
8739 * size = size of the used range
8740 *
8741 * Attempts to trim the shared region page tables down to only cover the given
8742 * range in subord and grand.
8743 *
8744 * This function assumes that trimming of [subord] happens exactly once, against
8745 * a temporary [grand] pmap, and that it happens before [subord] is ever actually
8746 * nested in a real task pmap. Unlike its PPL predecessor (which can't trust its
8747 * callers), the SPTM implementation therefore does not do any refcounting to
8748 * track top-level pmaps that may have nested tables outside the trimmed range.
8749 */
8750 MARK_AS_PMAP_TEXT void
8751 pmap_trim_internal(
8752 pmap_t grand,
8753 pmap_t subord,
8754 addr64_t vstart,
8755 uint64_t size)
8756 {
8757 addr64_t vend;
8758 addr64_t adjust_offmask;
8759
8760 if (__improbable(os_add_overflow(vstart, size, &vend))) {
8761 panic("%s: grand addr wraps around, "
8762 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8763 __func__, grand, subord, (void*)vstart, size);
8764 }
8765
8766 validate_pmap_mutable(grand);
8767 validate_pmap(subord);
8768
8769 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8770
8771 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8772 panic("%s: subord is of non-nestable type 0x%hhx, "
8773 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8774 __func__, subord->type, grand, subord, (void*)vstart, size);
8775 }
8776
8777 if (__improbable(grand->type != PMAP_TYPE_USER)) {
8778 panic("%s: grand is of unsupported type 0x%hhx for nesting, "
8779 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8780 __func__, grand->type, grand, subord, (void*)vstart, size);
8781 }
8782
8783 if (__improbable(grand->nested_pmap != subord)) {
8784 panic("%s: grand->nested != subord, "
8785 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8786 __func__, grand, subord, (void*)vstart, size);
8787 }
8788
8789 if (__improbable((vstart < grand->nested_region_addr) ||
8790 (vend > (grand->nested_region_addr + grand->nested_region_size)))) {
8791 panic("%s: grand range not in nested region, "
8792 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8793 __func__, grand, subord, (void*)vstart, size);
8794 }
8795
8796 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8797 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8798 vm_map_offset_t true_end = vend;
8799
8800 os_atomic_store(&subord->nested_region_true_start, vstart & ~adjust_offmask, relaxed);
8801
8802 if (__improbable(os_add_overflow(true_end, adjust_offmask, &true_end))) {
8803 panic("%s: padded true end wraps around, "
8804 "grand=%p, subord=%p, vstart=%p, size=%#llx",
8805 __func__, grand, subord, (void*)vstart, size);
8806 }
8807
8808 os_atomic_store(&subord->nested_region_true_end, true_end & ~adjust_offmask, relaxed);
8809
8810 os_atomic_store(&grand->nested_region_true_start, subord->nested_region_true_start, relaxed);
8811 os_atomic_store(&grand->nested_region_true_end, subord->nested_region_true_end, relaxed);
8812 /* Trim grand to only cover the given range. */
8813 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
8814 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
8815 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
8816 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
8817 }
8818
8819 void
8820 pmap_trim(
8821 pmap_t grand,
8822 pmap_t subord,
8823 addr64_t vstart,
8824 uint64_t size)
8825 {
8826 pmap_trim_internal(grand, subord, vstart, size);
8827 }
8828
8829 #if HAS_APPLE_PAC
8830
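/*
 * Sign a single user pointer with the given key and discriminator via the
 * SPTM, switching to the supplied user JOP key (with interrupts disabled)
 * for the duration of the signing operation.
 */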
8831 void *
8832 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8833 {
8834 void *res = NULL;
8835 const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8836
8837 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8838 __compiler_materialize_and_prevent_reordering_on(value);
8839 res = sptm_sign_user_pointer(value, key, discriminator, jop_key);
8840 __compiler_materialize_and_prevent_reordering_on(res);
8841 ml_disable_user_jop_key(jop_key, saved_jop_state);
8842
8843 ml_set_interrupts_enabled(current_intr_state);
8844
8845 return res;
8846 }
8847
8848 typedef struct {
8849 void *locations[SPTM_BATCHED_OPS_LIMIT];
8850 unsigned int index;
8851 uint64_t jop_key;
8852 } pmap_batch_sign_user_ptr_state_t;
8853
8854 static pmap_batch_sign_user_ptr_state_t PERCPU_DATA(percpu_pmap_batch_sign_user_ptr_state);
8855
8856 /**
8857 * Accumulates user pointer signing requests, and calls into the SPTM to sign
8858 * them when the batch is full or when explicitly requested. If an SPTM call is made,
8859 * this function copies the signed pointers to their respective locations.
8860 *
8861 * @note This function will disable preemption when called for the first
8862 * time or for the first time after a submission to SPTM. It enables
8863 * preemption after a submission is made.
8864 *
8865 * @note The caller can force the submission of accumulated ops so far by
8866 * passing a NULL location pointer.
8867 *
8868 * @note The jop_key argument is expected to be consistent throughout a
8869 * batch. This function will panic if it detects the jop_key passed
8870 * in is inconsistent with the other ops in the batch.
8871 *
8872 * @param location The destination where the signed pointer will be copied
8873 * to. The caller can pass a NULL pointer to force an SPTM
8874 * submission of the accumulated signing ops so far. In
8875 * such a case, the rest of the argument list is ignored.
8876 * @param value The pointer to be signed.
8877 * @param key The key used to sign the pointer.
8878 * @param discriminator The discriminator used to sign the pointer.
8879 * @param jop_key The JOP key used to sign the pointer.
8880 *
8881 * @return true if an SPTM call was made. Otherwise false.
8882 */
8883 bool
8884 pmap_batch_sign_user_ptr(void *location, void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8885 {
8886 bool submitted_to_sptm = false;
8887
8888 /* Disable preemption to access percpu data. */
8889 disable_preemption();
8890
8891 pmap_batch_sign_user_ptr_state_t *state = PERCPU_GET(percpu_pmap_batch_sign_user_ptr_state);
8892 void **locations = state->locations;
8893 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8894 sptm_user_pointer_op_t *sptm_user_pointer_ops = (sptm_user_pointer_op_t *) sptm_pcpu->sptm_user_pointer_ops;
8895 uintptr_t *sptm_values = (uintptr_t *) sptm_pcpu->sptm_prev_ptes;
8896
8897 if (state->index != 0) {
8898 /* Avoid leaking preemption counts by offsetting the disable at the beginning of this function. */
8899 enable_preemption();
8900
8901 /* Disabled preemption is still expected. */
8902 assert(!preemption_enabled());
8903 }
8904
8905 assert(state->index < SPTM_BATCHED_OPS_LIMIT);
8906
8907 /* Stash a pointer signing op if a copy location is supplied. */
8908 if (location != NULL) {
8909 locations[state->index] = location;
8910 sptm_user_pointer_ops[state->index].value = (uintptr_t)value;
8911 sptm_user_pointer_ops[state->index].key = key;
8912 sptm_user_pointer_ops[state->index].discriminator = discriminator;
8913
8914 if (state->index == 0) {
8915 state->jop_key = jop_key;
8916 } else {
8917 assert(state->jop_key == jop_key);
8918 }
8919
8920 state->index = state->index + 1;
8921 }
8922
8923 /**
8924 * Submit the stashed ops on this cpu to SPTM when:
8925 * 1. there are SPTM_BATCHED_OPS_LIMIT ops accumulated on the cpu, or
8926 * 2. the caller asks us to submit whatever we have accumulated by
8927 * passing in a NULL location argument.
8928 */
8929 if (state->index == SPTM_BATCHED_OPS_LIMIT || location == NULL) {
8930 if (__probable(state->index > 0)) {
8931 const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8932
8933 uint64_t saved_jop_state = ml_enable_user_jop_key(state->jop_key);
8934 sptm_batch_sign_user_pointer(sptm_pcpu->sptm_user_pointer_ops_pa, state->index, state->jop_key);
8935 ml_disable_user_jop_key(state->jop_key, saved_jop_state);
8936
8937 ml_set_interrupts_enabled(current_intr_state);
8938
8939 for (unsigned int i = 0; i < state->index; i++) {
8940 memcpy(locations[i], &(sptm_values[i]), sizeof(sptm_values[i]));
8941 }
8942
8943 state->index = 0;
8944 state->jop_key = 0;
8945 submitted_to_sptm = true;
8946 }
8947 }
8948
8949 /**
8950 * There is a slight difference between using submitted_to_sptm and
8951 * state->index here. We need to handle the case in which no ops have been
8952 * accumulated but a NULL location is passed in; submitted_to_sptm will be
8953 * false in that case, and we would otherwise leak a preemption count.
8954 */
8955 if (state->index == 0) {
8956 assert(submitted_to_sptm || (location == NULL));
8957 enable_preemption();
8958 }
8959
8960 return submitted_to_sptm;
8961 }
8962
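/*
 * Authenticate a user pointer via the SPTM under the supplied user JOP key.
 * On authentication failure, return a poisoned pointer rather than the SPTM
 * failure sentinel.
 */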
8963 void *
8964 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8965 {
8966 void *res = NULL;
8967 const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8968
8969 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8970 __compiler_materialize_and_prevent_reordering_on(value);
8971 res = sptm_auth_user_pointer(value, key, discriminator, jop_key);
8972 __compiler_materialize_and_prevent_reordering_on(res);
8973 ml_disable_user_jop_key(jop_key, saved_jop_state);
8974
8975 if (res == SPTM_AUTH_FAILURE) {
8976 res = ml_poison_ptr(value, key);
8977 }
8978
8979 ml_set_interrupts_enabled(current_intr_state);
8980
8981 return res;
8982 }
8983 #endif /* HAS_APPLE_PAC */
8984
8985 /**
8986 * Establishes the pmap associated with a shared region as the nested pmap
8987 * for a top-level user pmap.
8988 *
8989 * @param grand The top-level user pmap
8990 * @param subord The pmap to be set as [grand]'s nested pmap
8991 * @param vstart The base VA of the region to be nested.
8992 * @param size The size (in bytes) of the region to be nested.
8993 */
8994 void
8995 pmap_set_shared_region(
8996 pmap_t grand,
8997 pmap_t subord,
8998 addr64_t vstart,
8999 uint64_t size)
9000 {
9001 addr64_t vend;
9002
9003 PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
9004 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);
9005
9006 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9007 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9008 }
9009
9010 validate_pmap_mutable(grand);
9011 validate_pmap(subord);
9012 os_ref_retain_raw(&subord->ref_count, &pmap_refgrp);
9013
9014 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9015 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9016 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9017 }
9018
9019 if (__improbable(((size | vstart) &
9020 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9021 panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
9022 __func__, grand, vstart, size);
9023 }
9024
9025 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9026 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9027 }
9028
9029 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9030 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9031 }
9032
9033 if (subord->nested_region_size == 0) {
9034 /**
9035 * Since subord->nested_region_size is 0, this is the first time subord is being
9036 * associated with a top-level pmap. We therefore need to take a few extra steps to
9037 * ensure the shared region is properly configured. This initial setup step is expected
9038 * to be issued by the VM layer against a temporary grand pmap before any other pmap
9039 * is allowed to associate with subord, so synchronization is not needed here to prevent
9040 * concurrent initialization.
9041 */
9042 sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift);
9043
9044 /**
9045 * Since this is the first time subord is being associated with a top-level pmap, ensure
9046 * its nested region is fully expanded to L3 so that all relevant L3 tables can later be
9047 * inserted into top-level pmaps via pmap_nest(). Note that pmap_remove() will never
9048 * dynamically free L3 tables from nested pmaps. However, some of these tables may be
9049 * freed by a later call to pmap_trim().
9050 */
9051 vm_map_offset_t vaddr = vstart;
9052 while (vaddr < vend) {
9053 const tt_entry_t *const stte_p = pmap_tte(subord, vaddr);
9054 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9055 __assert_only kern_return_t kr;
9056 kr = pmap_expand(subord, vaddr, 0, pt_attr_leaf_level(pt_attr));
9057 assert3u(kr, ==, KERN_SUCCESS);
9058 }
9059 vaddr += pt_attr_twig_size(pt_attr);
9060 }
9061
9062 const uint64_t nested_region_unnested_table_bits = (size >> (pt_attr_twig_shift(pt_attr) - 1));
9063 if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) {
9064 panic("%s: bitmap allocation size %llu will truncate, "
9065 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9066 __func__, nested_region_unnested_table_bits,
9067 grand, subord, vstart, size);
9068 }
9069
9070 subord->nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits);
9071 subord->nested_region_addr = vstart;
9072 subord->nested_region_size = (mach_vm_offset_t)size;
9073 }
9074
9075 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9076 grand->nested_region_addr = vstart;
9077 grand->nested_region_size = (mach_vm_offset_t)size;
9078 assert3u(grand->nested_region_addr, ==, subord->nested_region_addr);
9079 assert3u(grand->nested_region_size, ==, subord->nested_region_size);
9080 pmap_txm_acquire_exclusive_lock(grand);
9081 pmap_txm_acquire_shared_lock(subord);
9082 sptm_set_shared_region(grand->ttep, subord->ttep);
9083 pmap_txm_release_shared_lock(subord);
9084 pmap_txm_release_exclusive_lock(grand);
9085 } else {
9086 panic("%s: pmap %p already has a nested pmap %p", __func__, grand, grand->nested_pmap);
9087 }
9088
9089 PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
9090 }
9091
9092 /**
9093 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9094 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9095 * This function operates in 2 main phases:
9096 * 1. Expands grand to ensure the required twig-level page table pages for
9097 * the mapping range are present in grand.
9098 * 2. Invokes sptm_nest_region() to copy the relevant TTEs from subord to grand.
9099 *
9100 * @note This function requires that pmap_set_shared_region() has already been
9101 * called for the [grand, subord] pair.
9102 *
9103 * @note The VA region defined by vstart and vsize must lie entirely within the
9104 * VA region established by the previous call to pmap_set_shared_region().
9105 *
9106 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9107 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9108 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9109 * @param size twig-aligned size of the nesting range
9110 *
9111 * @return KERN_RESOURCE_SHORTAGE on allocation failure, KERN_SUCCESS otherwise
9112 */
9113 MARK_AS_PMAP_TEXT kern_return_t
9114 pmap_nest_internal(
9115 pmap_t grand,
9116 pmap_t subord,
9117 addr64_t vstart,
9118 uint64_t size)
9119 {
9120 kern_return_t kr = KERN_SUCCESS;
9121 vm_map_offset_t vaddr;
9122 tt_entry_t *gtte_p;
9123
9124 addr64_t vend;
9125 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9126 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9127 }
9128
9129 validate_pmap_mutable(grand);
9130 validate_pmap(subord);
9131
9132 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9133
9134 if (__improbable(((size | vstart) &
9135 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9136 panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
9137 __func__, grand, vstart, size);
9138 }
9139
9140 if (__improbable(subord != grand->nested_pmap)) {
9141 panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
9142 __func__, subord, grand, grand->nested_pmap);
9143 }
9144
9145 addr64_t true_start = vstart;
9146 if (true_start < subord->nested_region_true_start) {
9147 true_start = subord->nested_region_true_start;
9148 }
9149
9150 addr64_t true_end = vend;
9151 if (true_end > subord->nested_region_true_end) {
9152 true_end = subord->nested_region_true_end;
9153 }
9154
9155 /* Ensure grand is expanded to L2 so that sptm_nest_region() can copy L3 entries from subord. */
9156 vaddr = (vm_map_offset_t) true_start;
9157
9158 while (vaddr < true_end) {
9159 gtte_p = pmap_tte(grand, vaddr);
9160 if (gtte_p == PT_ENTRY_NULL) {
9161 kr = pmap_expand(grand, vaddr, 0, pt_attr_twig_level(pt_attr));
9162
9163 if (kr != KERN_SUCCESS) {
9164 goto done;
9165 }
9166 }
9167
9168 vaddr += pt_attr_twig_size(pt_attr);
9169 }
9170
9171 vaddr = (vm_map_offset_t) true_start;
9172
9173 while (vaddr < true_end) {
9174 /*
9175 * The SPTM requires the run of TTE updates to all reside within the same L2 page, so the region
9176 * we supply to the SPTM can't span multiple L1 TTEs.
9177 */
9178 vm_map_offset_t vlim = ((vaddr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9179 if (vlim > true_end) {
9180 vlim = true_end;
9181 }
9182 sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift);
9183 vaddr = vlim;
9184 }
9185
9186 done:
9187 return kr;
9188 }
9189
9190 kern_return_t
9191 pmap_nest(
9192 pmap_t grand,
9193 pmap_t subord,
9194 addr64_t vstart,
9195 uint64_t size)
9196 {
9197 kern_return_t kr = KERN_SUCCESS;
9198
9199 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9200 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9201 VM_KERNEL_ADDRHIDE(vstart));
9202
9203 pmap_verify_preemptible();
9204 kr = pmap_nest_internal(grand, subord, vstart, size);
9205
9206 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9207
9208 return kr;
9209 }
9210
9211 /*
9212 * kern_return_t pmap_unnest(grand, vaddr)
9213 *
9214 * grand = the pmap that will have the virtual range unnested
9215 * vaddr = start of range in pmap to be unnested
9216 * size = size of range in pmap to be unnested
9217 *
9218 */
9219
9220 kern_return_t
9221 pmap_unnest(
9222 pmap_t grand,
9223 addr64_t vaddr,
9224 uint64_t size)
9225 {
9226 return pmap_unnest_options(grand, vaddr, size, 0);
9227 }
9228
9229 /**
9230 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9231 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9232 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9233 * still have the region nested. The mappings in 'grand' will be left empty
9234 * with the assumption that they will be demand-filled by subsequent access faults.
9235 *
9236 * This function operates in 2 main phases:
9237 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9238 * them non-global.
9239 * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand.
9240 *
9241 * @param grand pmap from which to unnest mappings
9242 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9243 * @param size twig-aligned size of the nested range
9244 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9245 * grand is being torn down and step 1) above is not needed.
9246 */
9247 MARK_AS_PMAP_TEXT void
9248 pmap_unnest_options_internal(
9249 pmap_t grand,
9250 addr64_t vaddr,
9251 uint64_t size,
9252 unsigned int option)
9253 {
9254 vm_map_offset_t start;
9255 vm_map_offset_t addr;
9256 unsigned int current_index;
9257 unsigned int start_index;
9258 unsigned int max_index;
9259
9260 addr64_t vend;
9261 addr64_t true_end;
9262 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9263 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9264 }
9265
9266 validate_pmap_mutable(grand);
9267
9268 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9269
9270 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9271 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9272 (unsigned long long)vaddr, (unsigned long long)size);
9273 }
9274
9275 struct pmap * const subord = grand->nested_pmap;
9276 if (__improbable(subord == NULL)) {
9277 panic("%s: %p has no nested pmap", __func__, grand);
9278 }
9279
9280 true_end = vend;
9281 if (true_end > subord->nested_region_true_end) {
9282 true_end = subord->nested_region_true_end;
9283 }
9284
9285 if ((option & PMAP_UNNEST_CLEAN) == 0) {
9286 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9287 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9288 }
9289
9290 start = vaddr;
9291 if (start < subord->nested_region_true_start) {
9292 start = subord->nested_region_true_start;
9293 }
9294 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9295 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9296
9297 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9298 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9299
9300 bool unnested = bitmap_test(subord->nested_region_unnested_table_bitmap, UNNEST_BIT(current_index));
9301 os_atomic_thread_fence(acquire);
9302 if (!unnested) {
9303 atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9304 UNNEST_IN_PROGRESS_BIT(current_index), memory_order_relaxed);
9305 /*
9306 * Issue a store-load barrier to ensure the UNNEST_IN_PROGRESS bit is visible to any pmap_enter()
9307 * operation that enters the epoch after this point.
9308 */
9309 os_atomic_thread_fence(seq_cst);
9310 pmap_epoch_prepare_drain();
9311 pmap_epoch_drain();
9312
9313 unsigned int num_mappings = 0;
9314 disable_preemption();
9315 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9316 /*
9317 * We've marked the 'twig' region as being unnested. Every mapping entered within
9318 * the nested pmap in this region will now be marked non-global.
9319 */
9320 while (addr < vlim) {
9321 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9322
9323 sptm_pcpu->sptm_templates[num_mappings] = ARM_PTE_NG;
9324 ++num_mappings;
9325
9326 if (num_mappings == SPTM_MAPPING_LIMIT) {
9327 pmap_epoch_enter();
9328 /**
9329 * It's technically possible (though highly unlikely) for subord to
9330 * be concurrently trimmed, so re-check the bounds within the epoch to
9331 * avoid potentially issuing an SPTM operation against a deleted leaf
9332 * page table. This assumes the following:
9333 * 1) The pmap_trim() code path always issues a barrier and an epoch
9334 * drain in between updating subord's true bounds and actually
9335 * trimming subord, effectively purging any operation here which
9336 * may be using stale bounds.
9337 * 2) The true bounds, if set, will always be twig-aligned, thus
9338 * the region we operate on here can never span the starting or
9339 * ending bounds.
9340 */
9341 if ((start >= subord->nested_region_true_start) &&
9342 (start < subord->nested_region_true_end)) {
9343 sptm_update_region(subord->ttep, start, num_mappings,
9344 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9345 }
9346 pmap_epoch_exit();
9347 enable_preemption();
9348 num_mappings = 0;
9349 start = addr;
9350 disable_preemption();
9351 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9352 }
9353 }
9354 /**
9355 * The SPTM does not allow region updates to span multiple leaf page tables, so request
9356 * any remaining updates up to vlim before moving to the next page table page.
9357 */
9358 if (num_mappings != 0) {
9359 pmap_epoch_enter();
9360 if ((start >= subord->nested_region_true_start) &&
9361 (start < subord->nested_region_true_end)) {
9362 sptm_update_region(subord->ttep, start, num_mappings,
9363 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9364 }
9365 pmap_epoch_exit();
9366 }
9367 enable_preemption();
9368 atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9369 UNNEST_BIT(current_index), memory_order_release);
9370 }
9371 addr = start = vlim;
9372 }
9373 }
9374
9375 /*
9376 * invalidate all pdes for segment at vaddr in pmap grand
9377 */
9378 addr = vaddr;
9379
9380 if (addr < subord->nested_region_true_start) {
9381 addr = subord->nested_region_true_start;
9382 }
9383
9384 if (true_end > subord->nested_region_true_end) {
9385 true_end = subord->nested_region_true_end;
9386 }
9387
9388 while (addr < true_end) {
9389 vm_map_offset_t vlim = ((addr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9390 if (vlim > true_end) {
9391 vlim = true_end;
9392 }
9393 sptm_unnest_region(grand->ttep, subord->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift);
9394 addr = vlim;
9395 }
9396 }
9397
9398 kern_return_t
9399 pmap_unnest_options(
9400 pmap_t grand,
9401 addr64_t vaddr,
9402 uint64_t size,
9403 unsigned int option)
9404 {
9405 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9406 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9407
9408 pmap_verify_preemptible();
9409 pmap_unnest_options_internal(grand, vaddr, size, option);
9410
9411 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9412
9413 return KERN_SUCCESS;
9414 }
9415
9416 boolean_t
9417 pmap_adjust_unnest_parameters(
9418 __unused pmap_t p,
9419 __unused vm_map_offset_t *s,
9420 __unused vm_map_offset_t *e)
9421 {
9422 return TRUE; /* to get to log_unnest_badness()... */
9423 }
9424
9425 /**
9426 * Perform any necessary pre-nesting of the parent's shared region at fork()
9427 * time.
9428 *
9429 * @note This should only be called from vm_map_fork().
9430 *
9431 * @param old_pmap The pmap of the parent task.
9432 * @param new_pmap The pmap of the child task.
9433 *
9434 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
9435 * KERN_INVALID_ARGUMENT if the arguments were not valid.
9436 */
9437 kern_return_t
9438 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
9439 {
9440 if (old_pmap == NULL || new_pmap == NULL) {
9441 return KERN_INVALID_ARGUMENT;
9442 }
9443 if (old_pmap->nested_pmap == NULL) {
9444 return KERN_SUCCESS;
9445 }
9446 pmap_set_shared_region(new_pmap,
9447 old_pmap->nested_pmap,
9448 old_pmap->nested_region_addr,
9449 old_pmap->nested_region_size);
9450 return KERN_SUCCESS;
9451 }
9452
9453 /*
9454 * disable no-execute capability on
9455 * the specified pmap
9456 */
9457 #if DEVELOPMENT || DEBUG
9458 void
9459 pmap_disable_NX(
9460 pmap_t pmap)
9461 {
9462 pmap->nx_enabled = FALSE;
9463 }
9464 #else
9465 void
9466 pmap_disable_NX(
9467 __unused pmap_t pmap)
9468 {
9469 }
9470 #endif
9471
9472 /*
9473 * flush a range of hardware TLB entries.
9474 * NOTE: assumes the smallest TLB entry in use will be for
9475 * an ARM small page (4K).
9476 */
9477
9478 #if __ARM_RANGE_TLBI__
9479 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9480 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
9481 #else
9482 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
9483 #endif // __ARM_RANGE_TLBI__
9484
9485 static void
9486 flush_mmu_tlb_region_asid_async(
9487 vm_offset_t va,
9488 size_t length,
9489 pmap_t pmap,
9490 bool last_level_only __unused)
9491 {
9492 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
9493 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
9494 ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
9495 const uint16_t asid = PMAP_HWASID(pmap);
9496
9497 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
9498 boolean_t flush_all = FALSE;
9499
9500 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
9501 flush_all = TRUE;
9502 }
9503 if (flush_all) {
9504 flush_mmu_tlb_async();
9505 } else {
9506 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, false);
9507 }
9508 return;
9509 }
9510 #if __ARM_RANGE_TLBI__
9511 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
9512 va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
9513 if (pmap->type == PMAP_TYPE_NESTED) {
9514 flush_mmu_tlb_allrange_async(va, last_level_only, false);
9515 } else {
9516 flush_mmu_tlb_range_async(va, last_level_only, false);
9517 }
9518 return;
9519 }
9520 #endif
9521 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
9522 va = tlbi_asid(asid) | tlbi_addr(va);
9523
9524 if (pmap->type == PMAP_TYPE_NESTED) {
9525 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, false);
9526 } else {
9527 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, false);
9528 }
9529 }
9530
9531 void
9532 flush_mmu_tlb_region(
9533 vm_offset_t va,
9534 unsigned length)
9535 {
9536 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
9537 sync_tlb_flush();
9538 }
9539
9540 unsigned int
9541 pmap_cache_attributes(
9542 ppnum_t pn)
9543 {
9544 pmap_paddr_t paddr;
9545 unsigned int pai;
9546 unsigned int result;
9547 pp_attr_t pp_attr_current;
9548
9549 paddr = ptoa(pn);
9550
9551 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
9552
9553 if (!pa_valid(paddr)) {
9554 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
9555 return (io_rgn == NULL || io_rgn->signature == 'SKIO') ? VM_WIMG_IO : io_rgn->wimg;
9556 }
9557
9558 result = VM_WIMG_DEFAULT;
9559
9560 pai = pa_index(paddr);
9561
9562 pp_attr_current = pp_attr_table[pai];
9563 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9564 result = pp_attr_current & PP_ATTR_WIMG_MASK;
9565 }
9566 return result;
9567 }
9568
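/**
 * Perform the cache maintenance implied by changing a page's WIMG attribute:
 * the page contents are synced when the page leaves a cacheable attribute
 * (copyback, inner-writeback, or write-through), and the data cache is
 * force-cleaned when the page is switched to the real-time (VM_WIMG_RT)
 * attribute.
 */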
9569 MARK_AS_PMAP_TEXT static void
9570 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
9571 {
9572 if ((wimg_bits_prev != wimg_bits_new)
9573 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
9574 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
9575 && (wimg_bits_new != VM_WIMG_COPYBACK))
9576 || ((wimg_bits_prev == VM_WIMG_WTHRU)
9577 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
9578 pmap_sync_page_attributes_phys(pn);
9579 }
9580
9581 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
9582 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
9583 }
9584 }
9585
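/**
 * Update a compressor page's cache attribute in its mappings without touching
 * the pp_attr_table (so the page's recorded WIMG bits are preserved), then
 * perform any cache maintenance implied by the attribute change. Panics if the
 * page is not managed.
 */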
9586 MARK_AS_PMAP_TEXT __unused void
9587 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
9588 {
9589 pmap_paddr_t paddr = ptoa(pn);
9590
9591 if (__improbable(!pa_valid(paddr))) {
9592 panic("%s called on non-managed page 0x%08x", __func__, pn);
9593 }
9594
9595 pmap_set_cache_attributes_internal(pn, new_cacheattr, false);
9596
9597 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
9598 }
9599
9600 static inline bool
9601 cacheattr_supports_compressor(unsigned int cacheattr)
9602 {
9603 switch (cacheattr) {
9604 case VM_WIMG_DEFAULT:
9605 return true;
9606 default:
9607 return false;
9608 }
9609 }
9610
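/**
 * Return a kernel virtual address through which the VM compressor can access
 * the given page. If the page's current cache attribute does not support the
 * compressor, its mappings are temporarily switched to VM_WIMG_DEFAULT; the
 * pp_attr_table is left untouched so that pmap_unmap_compressor_page() can
 * later restore the page's recorded attribute.
 */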
9611 void *
9612 pmap_map_compressor_page(ppnum_t pn)
9613 {
9614 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9615 if (!cacheattr_supports_compressor(cacheattr)) {
9616 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
9617 }
9618
9619 return (void*)phystokv(ptoa(pn));
9620 }
9621
9622 void
9623 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
9624 {
9625 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9626 if (!cacheattr_supports_compressor(cacheattr)) {
9627 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
9628 }
9629 }
9630
9631 /**
 * Flushes TLB entries associated with the page specified by paddr, but does not
9633 * issue barriers yet.
9634 *
9635 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
9636 */
9637 static void
9638 pmap_flush_tlb_for_paddr_async(pmap_paddr_t paddr)
9639 {
9640 /* Flush the physical aperture mappings. */
9641 const vm_offset_t kva = phystokv(paddr);
9642 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
9643
9644 /* Flush the mappings tracked in the ptes. */
9645 const unsigned int pai = pa_index(paddr);
9646 locked_pvh_t locked_pvh = pvh_lock(pai);
9647
9648 pt_entry_t *pte_p = PT_ENTRY_NULL;
9649 pv_entry_t *pve_p = PV_ENTRY_NULL;
9650
9651 if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
9652 pte_p = pvh_ptep(locked_pvh.pvh);
9653 } else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
9654 pve_p = pvh_pve_list(locked_pvh.pvh);
9655 pte_p = PT_ENTRY_NULL;
9656 }
9657
9658 unsigned int nptes = 0;
9659 int pve_ptep_idx = 0;
9660 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
9661 if (pve_p != PV_ENTRY_NULL) {
9662 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
9663 if (pte_p == PT_ENTRY_NULL) {
9664 goto flush_tlb_skip_pte;
9665 }
9666 }
9667
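	/*
	 * After walking a large number of mappings, switch the PVH lock to sleep
	 * mode so that preemption is no longer held off for the remainder of a
	 * long PV list.
	 */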
9668 if (__improbable(nptes == SPTM_MAPPING_LIMIT)) {
9669 pvh_lock_enter_sleep_mode(&locked_pvh);
9670 }
9671 ++nptes;
9672 #ifdef PVH_FLAG_IOMMU
9673 if (pvh_ptep_is_iommu(pte_p)) {
9674 goto flush_tlb_skip_pte;
9675 }
9676 #endif /* PVH_FLAG_IOMMU */
9677 const pmap_t pmap = ptep_get_pmap(pte_p);
9678 const vm_map_address_t va = ptep_get_va(pte_p);
9679
9680 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
9681
9682 flush_tlb_skip_pte:
9683 pte_p = PT_ENTRY_NULL;
9684 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9685 pve_ptep_idx = 0;
9686 pve_p = pve_next(pve_p);
9687 }
9688 }
9689 pvh_unlock(&locked_pvh);
9690 }
9691
9692 /**
9693 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
9694 *
9695 * @param pai The Physical Address Index of the entry.
9696 * @param cacheattr The new cache attribute.
9697 */
9698 MARK_AS_PMAP_TEXT static void
9699 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
9700 {
9701 pvh_assert_locked(pai);
9702
9703 pp_attr_t pp_attr_current, pp_attr_template;
9704 do {
9705 pp_attr_current = pp_attr_table[pai];
9706 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
9707
9708 /**
	 * WIMG bits are only updated while holding the PVH lock, but other bits
	 * (e.g. refmod) may be updated concurrently, so use a CAS loop to avoid
	 * losing those simultaneous updates.
9711 */
9712 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
9713 }
9714
9715 /**
9716 * Structure for tracking where we are during the collection of mappings for batch
9717 * cache attribute updates.
9718 *
9719 * @note We need to track where in the per-cpu ops table we are filling the next mappings into,
 *       because the collection routine can return with a partially filled ops table when
 *       it exhausts the PV list for a page. In that case, the remaining slots in the ops table
9722 * will be used for mappings of the next page.
9723 *
9724 * @note We also need to record where we are in the PV list, because the collection routine can
 *       also return when the ops table is full while it is still in the middle of the PV list.
 *       The remaining items in the PV list must then be handled by the next batch operation
 *       with a new ops table.
9728 */
9729 typedef struct {
9730 /* Where we are in the sptm ops table. */
9731 unsigned int sptm_ops_index;
9732
9733 /**
9734 * The last collected physical address from the previous full ops array (and in turn, SPTM
9735 * call). This is used to know whether the SPTM call for the latest full ops table should
9736 * skip updating the PAPT mapping (seeing as the last call would have handled updating it).
9737 */
9738 pmap_paddr_t last_table_last_papt_pa;
9739
9740 /**
9741 * Where we are in the pv list.
9742 *
9743 * When ptep is non-null, there's only one mapping to the page and the ptep is the address
9744 * of it.
9745 *
9746 * When pvep is non-null, there's more than one mapping and the mappings are tracked by the
9747 * PV list.
9748 *
9749 * When they are both null, it indicates we are collecting for a new page and the collection
9750 * function will initialize them to be one of the two states above.
9751 *
 * The state is invalid if both are non-null.
9753 */
9754 pt_entry_t *ptep;
9755 pv_entry_t *pvep;
9756 unsigned int pve_ptep_idx;
9757 } pmap_sptm_update_cache_attr_ops_collect_state_t;
9758
9759 /**
 * Reports whether any ops are pending in an SPTM cache attr ops table.
9761 *
9762 * @param state A pmap_sptm_update_cache_attr_ops_collect_state_t structure.
9763 *
9764 * @return True if there's any outstanding cache attr op.
9765 * False otherwise.
9766 */
9767 static inline bool
9768 pmap_is_sptm_update_cache_attr_ops_pending(pmap_sptm_update_cache_attr_ops_collect_state_t state)
9769 {
9770 return state.sptm_ops_index > 0;
9771 }
9772
9773 /**
 * Flag bits encoding the collection status into pmap_sptm_update_cache_attr_ops_collect()'s
 * return value, indicating what kind of attention the caller must give before calling again.
9776 */
9777 typedef enum {
9778 OPS_COLLECT_NOTHING = 0x0,
9779
9780 /* The ops table is full, and the caller should commit the table to SPTM. */
9781 OPS_COLLECT_RETURN_FULL_TABLE = 0x1,
9782
9783 /**
9784 * The page has its mappings completely collected, and the caller should
9785 * pass in a new page next time.
9786 */
9787 OPS_COLLECT_RETURN_COMPLETED_PAGE = 0x2,
9788 } pmap_sptm_update_cache_attr_ops_collect_return_t;
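/*
 * Illustrative caller pattern for the collection routine below (a sketch only;
 * the real driver loop lives in pmap_batch_set_cache_attributes_internal()):
 *
 *     retval = pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, pa, attributes);
 *     if (retval & OPS_COLLECT_RETURN_FULL_TABLE) {
 *             // Ops table is full: commit it to the SPTM, then reuse it.
 *             sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
 *     }
 *     if (retval & OPS_COLLECT_RETURN_COMPLETED_PAGE) {
 *             // All mappings of pa have been collected: advance to the next page.
 *     }
 */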
9789
9790 /**
9791 * Collects mappings of a physical page into an SPTM ops table for cache attribute updates.
9792 *
9793 * @note This routine returns either when the ops table is full or the page represented by
 *       pa has no more mappings to collect. The caller should call this routine again with
9795 * a fresh ops table, or a new page, or both, depending on the return code.
9796 *
9797 * @note The PVH lock needs to be held for pa.
9798 *
9799 * @param state Tracks the state of PV list traversal and SPTM ops table filling. It is used
9800 * by this routine to save the progress of the collection.
9801 * @param sptm_ops Pointer to the SPTM ops table.
9802 * @param pa The physical address whose mappings are to be collected.
9803 * @param attributes The new cache attributes.
9804 *
9805 * @return A pmap_sptm_update_cache_attr_ops_collect_return_t that encodes what the caller
9806 * should do before calling this routine again. See the inline comments around
9807 * pmap_sptm_update_cache_attr_ops_collect_return_t for details.
9808 */
9809 static pmap_sptm_update_cache_attr_ops_collect_return_t
9810 pmap_sptm_update_cache_attr_ops_collect(
9811 pmap_sptm_update_cache_attr_ops_collect_state_t *state,
9812 sptm_update_disjoint_multipage_op_t *sptm_ops,
9813 pmap_paddr_t pa,
9814 unsigned int attributes)
9815 {
9816 if (state == NULL || sptm_ops == NULL) {
9817 panic("%s: unexpected null arguments - state: %p, sptm_ops: %p", __func__, state, sptm_ops);
9818 }
9819
9820 PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_START, pa, attributes, state->sptm_ops_index);
9821
9822 /* Copy the states into local variables. */
9823 unsigned int sptm_ops_index = state->sptm_ops_index;
9824 pmap_paddr_t last_table_last_papt_pa = state->last_table_last_papt_pa;
9825 pv_entry_t *pvep = state->pvep;
9826 pt_entry_t *ptep = state->ptep;
9827 unsigned int pve_ptep_idx = state->pve_ptep_idx;
9828
9829 unsigned int pai = pa_index(pa);
9830
/* We should have at least one free slot in the ops table at this point. */
9832 assert(sptm_ops_index < SPTM_MAPPING_LIMIT);
9833
/* The PVH lock for pa must be held. */
9835 pvh_assert_locked(pai);
9836
9837 /* If pvep and ptep are both null in the state, it's a new page. Initialize the states. */
9838 if (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL) {
9839 const uintptr_t pvh = pai_to_pvh(pai);
9840 if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
9841 ptep = PT_ENTRY_NULL;
9842 pvep = pvh_pve_list(pvh);
9843 pve_ptep_idx = 0;
9844 } else if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
9845 ptep = pvh_ptep(pvh);
9846 pvep = PV_ENTRY_NULL;
9847 pve_ptep_idx = 0;
9848 }
9849 }
9850
9851 /**
9852 * The first entry filled in is always the PAPT header entry:
9853 *
9854 * 1) In the case of a fresh ops table, the first entry has to be a PAPT header.
9855 * 2) In the case of a fresh page, we need to insert a new PAPT header to request
9856 * SPTM to operate on a new page.
9857 *
9858 * Remember the index of the PAPT header here so that we can update the number
9859 * of mappings field later when we finish collecting.
9860 */
9861 const unsigned int papt_sptm_ops_index = sptm_ops_index;
9862 unsigned int num_mappings = 0;
9863
9864 /* Assemble the PTE template for the PAPT mapping. */
9865 const vm_address_t kva = phystokv(pa);
9866 const pt_entry_t *papt_ptep = pmap_pte(kernel_pmap, kva);
9867
9868 pt_entry_t template = os_atomic_load(papt_ptep, relaxed);
9869 template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9870 template |= wimg_to_pte(attributes, pa);
9871
9872 /* Fill in the PAPT header entry. */
9873 sptm_ops[papt_sptm_ops_index].per_paddr_header.paddr = pa;
9874 sptm_ops[papt_sptm_ops_index].per_paddr_header.papt_pte_template = template;
9875 sptm_ops[papt_sptm_ops_index].per_paddr_header.options = SPTM_UPDATE_SH | SPTM_UPDATE_MAIR | SPTM_UPDATE_DEFER_TLBI;
9876
9877 if ((papt_sptm_ops_index == 0) && (pa == last_table_last_papt_pa)) {
9878 /**
9879 * If the previous SPTM call was made with an ops table that already included
9880 * updating the PA of the page that this table starts with, then we can assume
9881 * that call already updated the PAPT and we can safely skip it in this
9882 * upcoming one.
9883 */
9884 sptm_ops[0].per_paddr_header.options |= SPTM_UPDATE_SKIP_PAPT;
9885 }
9886
9887 sptm_ops_index++;
9888
9889 /**
9890 * Main loop for collecting the mappings into the ops table. It terminates either
9891 * when the ops table is full or the PV list is exhausted.
9892 */
9893 while ((sptm_ops_index < SPTM_MAPPING_LIMIT) && (pvep != PV_ENTRY_NULL || ptep != PT_ENTRY_NULL)) {
9894 /**
9895 * Update ptep. There are really two cases here:
9896 *
9897 * 1) pvep is PV_ENTRY_NULL. In this case, ptep holds the pointer to
9898 * the only mapping to the page.
	 * 2) pvep is not PV_ENTRY_NULL. In that case, ptep is updated according to
9900 * pvep and pve_ptep_idx.
9901 */
9902 if (pvep != PV_ENTRY_NULL) {
9903 ptep = pve_get_ptep(pvep, pve_ptep_idx);
9904
		/* This pve is empty, so skip to the next one. */
9906 if (ptep == PT_ENTRY_NULL) {
9907 goto sucaoc_skip_pte;
9908 }
9909 }
9910
9911 #ifdef PVH_FLAG_IOMMU
9912 /* Skip IOMMU pteps. */
9913 if (pvh_ptep_is_iommu(ptep)) {
9914 goto sucaoc_skip_pte;
9915 }
9916 #endif
9917 /* Assemble the PTE template for the mapping. */
9918 const vm_address_t va = ptep_get_va(ptep);
9919 const pmap_t pmap = ptep_get_pmap(ptep);
9920
9921 template = os_atomic_load(ptep, relaxed);
9922 template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9923 template |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, pa);
9924
9925 /* Fill into the ops table. */
9926 sptm_ops[sptm_ops_index].disjoint_op.root_pt_paddr = pmap->ttep;
9927 sptm_ops[sptm_ops_index].disjoint_op.vaddr = va;
9928 sptm_ops[sptm_ops_index].disjoint_op.pte_template = template;
9929
9930 /* Move the sptm ops table cursor. */
9931 sptm_ops_index++;
9932
9933 /* Increment the mappings counter. */
9934 num_mappings++;
9935
9936 sucaoc_skip_pte:
9937 /**
		 * Reset ptep to PT_ENTRY_NULL to maintain the loop invariant that at
		 * most one of ptep and pvep is non-null.
9940 */
9941 ptep = PT_ENTRY_NULL;
9942
9943 /* Advance to next pvep if we have exhausted the pteps in it. */
9944 if ((pvep != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9945 pve_ptep_idx = 0;
9946 pvep = pve_next(pvep);
9947 }
9948 }
9949
9950 /* Update the PAPT header for the number of mappings. */
9951 sptm_ops[papt_sptm_ops_index].per_paddr_header.num_mappings = num_mappings;
9952
9953 const bool full_table = (sptm_ops_index >= SPTM_MAPPING_LIMIT);
9954 const bool collection_done_for_page = (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL);
9955
9956 /**
9957 * The ops table is full, so the caller should now invoke the SPTM before calling
9958 * into this function again.
9959 */
9960 if (full_table) {
9961 /* Update last_table_last_papt_pa to be the pa collected in this call. */
9962 last_table_last_papt_pa = pa;
9963
9964 /* Reset sptm_ops_index. */
9965 sptm_ops_index = 0;
9966 }
9967
9968 /* Copy the updated collection states back to the parameter structure. */
9969 state->sptm_ops_index = sptm_ops_index;
9970 state->last_table_last_papt_pa = last_table_last_papt_pa;
9971 state->pvep = pvep;
9972 state->ptep = ptep;
9973 state->pve_ptep_idx = pve_ptep_idx;
9974
9975 /* Assemble the return value. */
9976 pmap_sptm_update_cache_attr_ops_collect_return_t retval = OPS_COLLECT_NOTHING;
9977
9978 if (full_table) {
9979 retval |= OPS_COLLECT_RETURN_FULL_TABLE;
9980 }
9981
9982 if (collection_done_for_page) {
9983 retval |= OPS_COLLECT_RETURN_COMPLETED_PAGE;
9984 }
9985
9986 PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_END, pa, attributes, sptm_ops_index);
9987
9988 return retval;
9989 }
9990
9991 /* At least one PAPT header plus one mapping. */
9992 static_assert(SPTM_MAPPING_LIMIT >= 2);
9993
9994 /**
 * Returns whether a cache attribute is allowed on managed pages.
9996 *
9997 * @param attributes A 32-bit value whose VM_WIMG_MASK bits represent the
9998 * cache attribute.
9999 *
10000 * @return True if the cache attribute is allowed on managed pages.
10001 * False otherwise.
10002 */
10003 static bool
10004 pmap_is_cache_attribute_allowed(unsigned int attributes)
10005 {
10006 if (pmap_panic_dev_wimg_on_managed) {
10007 switch (attributes & VM_WIMG_MASK) {
10008 /* supported on DRAM, but slow, so we disallow */
10009 case VM_WIMG_IO: // nGnRnE
10010 case VM_WIMG_POSTED: // nGnRE
10011
10012 /* unsupported on DRAM */
10013 case VM_WIMG_POSTED_REORDERED: // nGRE
10014 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10015 return false;
10016
10017 default:
10018 return true;
10019 }
10020 }
10021
10022 return true;
10023 }
10024
10025 /**
10026 * Batch updates the cache attributes of a list of pages in three passes.
10027 *
10028 * In pass one, the pp_attr_table and the pte are updated (by SPTM) for the pages in the list.
10029 * In pass two, TLB entries are flushed for each page in the list if necessary.
10030 * In pass three, caches are cleaned for each page in the list if necessary.
10031 *
10032 * @param page_list List of pages to be updated.
10033 * @param cacheattr The new cache attributes.
 * @param update_attr_table Whether the pp_attr_table should be updated. Passing false is useful for
 *                          compressor pages, where the old WIMG bits should be kept.
10036 */
10037 void
10038 pmap_batch_set_cache_attributes_internal(
10039 const unified_page_list_t *page_list,
10040 unsigned int cacheattr,
10041 bool update_attr_table)
10042 {
10043 bool tlb_flush_pass_needed = false;
10044 bool rt_cache_flush_pass_needed = false;
10045 bool preemption_disabled = false;
10046
10047 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE1);
10048
10049 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
10050 sptm_update_disjoint_multipage_op_t *sptm_ops = NULL;
10051
10052 pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
10053
10054 unified_page_list_iterator_t iter;
10055
10056 for (unified_page_list_iterator_init(page_list, &iter);
10057 !unified_page_list_iterator_end(&iter);
10058 unified_page_list_iterator_next(&iter)) {
10059 bool is_fictitious = false;
10060 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10061 const pmap_paddr_t paddr = ptoa(pn);
10062
10063 /**
10064 * Skip if the page is not managed.
10065 *
		 * We don't panic here because callers sometimes blindly pass in pages
		 * that are not managed, and we need to handle that gracefully.
10068 */
10069 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10070 continue;
10071 }
10072
10073 const unsigned int pai = pa_index(paddr);
10074 locked_pvh_t locked_pvh = {.pvh = 0};
10075
10076 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10077 /**
10078 * If we're partway through processing a multi-page batched call,
10079 * preemption will already be disabled so we can't simply call
10080 * pvh_lock() which may block. Instead, we first try to acquire
10081 * the lock without waiting, which in most cases should succeed.
10082 * If it fails, we submit the pending batched operations to re-
10083 * enable preemption and then acquire the lock normally.
10084 */
10085 locked_pvh = pvh_try_lock(pai);
10086 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
10087 assert(preemption_disabled);
10088 const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10089 pmap_epoch_exit();
10090 enable_preemption();
10091 preemption_disabled = false;
10092 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10093 tlb_flush_pass_needed = true;
10094 }
10095 state.sptm_ops_index = 0;
10096 locked_pvh = pvh_lock(pai);
10097 }
10098 } else {
10099 locked_pvh = pvh_lock(pai);
10100 }
10101 assert(locked_pvh.pvh != 0);
10102
10103 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10104
10105 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10106 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10107 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10108 }
10109
10110 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10111
10112 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10113 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10114 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10115 }
10116
10117 /**
		 * When update_attr_table is false, the wimg_bits_prev value read from the pp_attr_table
		 * cannot be trusted, so we must force an update of the cache attribute.
10120 */
10121 const bool force_update = !update_attr_table;
10122 /* Update the cache attributes in PTE and PP_ATTR table. */
10123 if ((wimg_bits_new != wimg_bits_prev) || force_update) {
10124 if (!pmap_is_cache_attribute_allowed(cacheattr)) {
10125 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, pn=%#x",
10126 __func__, cacheattr & VM_WIMG_MASK, pn);
10127 }
10128
10129 /* Update PP_ATTR_TABLE */
10130 if (update_attr_table) {
10131 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10132 }
10133
10134 bool mapping_collection_done = false;
10135 bool pvh_lock_sleep_mode_needed = false;
10136 do {
10137 if (__improbable(pvh_lock_sleep_mode_needed)) {
10138 assert(!preemption_disabled);
10139 pvh_lock_enter_sleep_mode(&locked_pvh);
10140 pvh_lock_sleep_mode_needed = false;
10141 }
10142
10143 /* Disable preemption to use the per-CPU structure safely. */
10144 if (!preemption_disabled) {
10145 preemption_disabled = true;
10146 disable_preemption();
10147 /**
10148 * Enter the pmap epoch while we gather the disjoint update arguments
10149 * and issue the SPTM call. Since this operation may cover multiple physical
10150 * pages, we may construct the argument array and invoke the SPTM without holding
					 * all relevant PVH locks, so we need to record that we are collecting and modifying
10152 * mapping state so that e.g. pmap_page_protect() does not attempt to retype the
10153 * underlying pages and pmap_remove() does not attempt to free the page tables
10154 * used for these mappings without first draining our epoch.
10155 */
10156 pmap_epoch_enter();
10157
10158 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
10159 sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops;
10160 }
10161
10162 /* The return value indicates if we should call into SPTM in this iteration. */
10163 pmap_sptm_update_cache_attr_ops_collect_return_t retval =
10164 pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
10165
10166 /* The collection routine should only return if it needs attention. */
10167 assert(retval != OPS_COLLECT_NOTHING);
10168
10169 /* Gather information for next step from the return value. */
10170 mapping_collection_done = retval & OPS_COLLECT_RETURN_COMPLETED_PAGE;
10171 const bool call_sptm = retval & OPS_COLLECT_RETURN_FULL_TABLE;
10172
10173 if (call_sptm) {
10174 /* Call into SPTM with this SPTM ops table. */
10175 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
10176 /**
10177 * We may be submitting the batch and exiting the epoch partway through
10178 * processing the PV list for a page. That's fine, because in that case we'll
10179 * hold the PV lock for that page, which will prevent mappings of that page from
10180 * being disconnected and will prevent the completion of pmap_remove() against
10181 * any of those mappings, thus also guaranteeing the relevant page table pages
10182 * can't be freed. The epoch still protects mappings for any prior page in
10183 * the batch, whose PV locks are no longer held.
10184 */
10185 pmap_epoch_exit();
10186 /**
10187 * Balance out the explicit disable_preemption() made either at the beginning of
10188 * the function or on a prior iteration of the loop that placed the PVH lock in
10189 * sleep mode. Note that enable_preemption() decrements a per-thread counter,
10190 * so if we still happen to hold the PVH lock in spin mode preemption won't
10191 * actually be re-enabled until we switch the lock over to sleep mode on
10192 * the next iteration.
10193 */
10194 enable_preemption();
10195 preemption_disabled = false;
10196 pvh_lock_sleep_mode_needed = true;
10197
10198 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10199 tlb_flush_pass_needed = true;
10200 }
10201 }
10202
10203 /* We cannot be in a situation where we didn't call into SPTM while also having not finished walking the pv list. */
10204 assert(call_sptm || mapping_collection_done);
10205 } while (!mapping_collection_done);
10206
10207 /**
10208 * We could technically force the cache flush pass here when force_update is true, but
10209 * since the compressor mapping/unmapping path handles cache flushing itself, it's fine
10210 * leaving this as is.
10211 */
10212 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10213 rt_cache_flush_pass_needed = true;
10214 }
10215 }
10216
10217 pvh_unlock(&locked_pvh);
10218 }
10219
10220 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10221 assert(preemption_disabled);
10222 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10223 pmap_epoch_exit();
10224 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10225 tlb_flush_pass_needed = true;
10226 }
10227
10228 /**
		 * This is the final SPTM cache-attribute update call of this batch, so it's
		 * okay not to update the state variables.
10231 */
10232
10233 enable_preemption();
10234 } else if (preemption_disabled) {
10235 pmap_epoch_exit();
10236 enable_preemption();
10237 }
10238
10239 if (tlb_flush_pass_needed) {
10240 /* Sync the PTE writes before potential TLB/Cache flushes. */
10241 FLUSH_PTE_STRONG();
10242
10243 /**
10244 * Pass 2: for each physical page and for each mapping, we need to flush
10245 * the TLB for it.
10246 */
10247 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE2);
10248 for (unified_page_list_iterator_init(page_list, &iter);
10249 !unified_page_list_iterator_end(&iter);
10250 unified_page_list_iterator_next(&iter)) {
10251 bool is_fictitious = false;
10252 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10253 const pmap_paddr_t paddr = ptoa(pn);
10254
10255 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10256 continue;
10257 }
10258
10259 pmap_flush_tlb_for_paddr_async(paddr);
10260 }
10261
10262 #if HAS_FEAT_XS
10263 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10264 arm64_sync_tlb(false);
10265 #else
10266 /**
10267 * For targets that distinguish between mild and strong DSB, mild DSB
10268 * will not drain the prefetcher. This can lead to prefetch-driven
10269 * cache fills that defeat the uncacheable requirement of the RT memory type.
10270 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10271 */
10272 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10273 #endif
10274 }
10275
10276 if (rt_cache_flush_pass_needed) {
10277 /* Pass 3: Flush the cache if the page is recently set to RT */
10278 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE3);
10279 /**
10280 * We disable preemption to ensure we are not preempted
10281 * in the state where DC by VA instructions remain enabled.
10282 */
10283 disable_preemption();
10284
10285 assert(get_preemption_level() > 0);
10286
10287 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10288 /**
10289 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10290 * and the host will handle cache maintenance for it. So we don't need to
10291 * worry about enabling the ops here for AVP.
10292 */
10293 enable_dc_mva_ops();
10294 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10295 /**
10296 * DMB should be sufficient to ensure prior accesses to the memory in question are
10297 * correctly ordered relative to the upcoming cache maintenance operations.
10298 */
10299 __builtin_arm_dmb(DMB_SY);
10300
10301 for (unified_page_list_iterator_init(page_list, &iter);
10302 !unified_page_list_iterator_end(&iter);) {
10303 bool is_fictitious = false;
10304 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10305 const pmap_paddr_t paddr = ptoa(pn);
10306
10307 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10308 unified_page_list_iterator_next(&iter);
10309 continue;
10310 }
10311
10312 CleanPoC_DcacheRegion_Force_nopreempt_nohid_nobarrier(phystokv(paddr), PAGE_SIZE);
10313
10314 unified_page_list_iterator_next(&iter);
10315 if (__improbable(pmap_pending_preemption() && !unified_page_list_iterator_end(&iter))) {
10316 __builtin_arm_dsb(DSB_SY);
10317 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10318 disable_dc_mva_ops();
10319 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10320 enable_preemption();
10321 assert(preemption_enabled());
10322 disable_preemption();
10323 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10324 enable_dc_mva_ops();
10325 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10326 }
10327 }
10328
10329 /* Issue DSB to ensure cache maintenance is fully complete before subsequent accesses. */
10330 __builtin_arm_dsb(DSB_SY);
10331 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10332 disable_dc_mva_ops();
10333 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10334
10335 enable_preemption();
10336 }
10337
10338 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE4);
10339 }
10340
10341 /**
10342 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10343 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10344 *
10345 * @param page_list List of pages to be updated.
10346 * @param cacheattr The new cache attribute.
10347 */
10348 void
10349 pmap_batch_set_cache_attributes(
10350 const unified_page_list_t *page_list,
10351 unsigned int cacheattr)
10352 {
10353 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10354
10355 /* Verify we are being called from a preemptible context. */
10356 pmap_verify_preemptible();
10357
10358 pmap_batch_set_cache_attributes_internal(page_list, cacheattr, true);
10359
10360 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10361 }
10362
10363 MARK_AS_PMAP_TEXT void
10364 pmap_set_cache_attributes_internal(
10365 ppnum_t pn,
10366 unsigned int cacheattr,
10367 bool update_attr_table)
10368 {
10369 upl_page_info_t single_page_upl = { .phys_addr = pn };
10370 const unified_page_list_t page_list = {
10371 .upl = {.upl_info = &single_page_upl, .upl_size = 1},
10372 .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
10373 };
10374
10375 pmap_batch_set_cache_attributes_internal(&page_list, cacheattr, update_attr_table);
10376 }
10377
10378 void
10379 pmap_set_cache_attributes(
10380 ppnum_t pn,
10381 unsigned int cacheattr)
10382 {
10383 pmap_set_cache_attributes_internal(pn, cacheattr, true);
10384 }
10385
10386 void
10387 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10388 vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10389 {
10390 pmap_paddr_t data_pa = 0; // data address
10391 pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10392 pmap_paddr_t text_pa = 0; // text address
10393
10394 *kernel_data_addr = 0;
10395 *kernel_text_addr = 0;
10396 *user_text_addr = 0;
10397
10398 kern_return_t kr = pmap_page_alloc(&data_pa, PMAP_PAGE_ALLOCATE_NONE);
10399 assert(kr == KERN_SUCCESS);
10400
10401 kr = pmap_page_alloc(&ro_data_pa, PMAP_PAGE_ALLOCATE_NONE);
10402 assert(kr == KERN_SUCCESS);
10403
10404 #if CONFIG_ARM_PFZ
10405 kr = pmap_page_alloc(&text_pa, PMAP_PAGE_ALLOCATE_NONE);
10406 assert(kr == KERN_SUCCESS);
10407
10408 /**
10409 * User mapping of comm page text section for 64 bit mapping only
10410 *
10411 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10412 * user processes to get this page mapped in, they should never call into
10413 * this page.
10414 *
10415 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10416 * is slid in the same L3 as the data commpage. It is either outside the
10417 * max of user VA or is pre-reserved in vm_map_exec(). This means that
10418 * it is reserved and unavailable to mach VM for future mappings.
10419 */
10420 const int num_ptes = pt_attr_leaf_size(native_pt_attr) >> PTE_SHIFT;
10421
10422 do {
10423 const int text_leaf_index = random() % num_ptes;
10424
10425 /**
10426 * Generate a VA for the commpage text with the same root and twig index as data
		 * Generate a VA for the commpage text with the same root and twig index as the
		 * data comm page, but with the new leaf index we've just generated.
10429 commpage_text_user_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr));
10430 commpage_text_user_va |= (text_leaf_index << pt_attr_leaf_shift(native_pt_attr));
10431 } while ((commpage_text_user_va == _COMM_PAGE64_BASE_ADDRESS) ||
10432 (commpage_text_user_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10433
10434 *user_text_addr = commpage_text_user_va;
10435 *kernel_text_addr = phystokv(text_pa);
#endif /* CONFIG_ARM_PFZ */
10437
10438 /* For manipulation in kernel, go straight to physical page */
10439 commpage_data_pa = data_pa;
10440 *kernel_data_addr = phystokv(data_pa);
10441 assert(commpage_ro_data_pa == 0);
10442 commpage_ro_data_pa = ro_data_pa;
10443 *kernel_ro_data_addr = phystokv(ro_data_pa);
10444 assert(commpage_text_pa == 0);
10445 commpage_text_pa = text_pa;
10446 }
10447
10448
10449 /*
10450 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10451 * with user controlled TTEs for regions that aren't explicitly reserved by the
10452 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10453 */
10454 #if (ARM_PGSHIFT == 14)
10455 /**
10456 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10457 * commpage completely above the maximum 32-bit userspace VA.
10458 */
10459 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10460 static_assert(_COMM_PAGE64_NESTING_START == SPTM_ARM64_COMMPAGE_REGION_START);
10461 static_assert(_COMM_PAGE64_NESTING_SIZE == SPTM_ARM64_COMMPAGE_REGION_SIZE);
10462
10463 /**
10464 * Normally there'd be an assert to check that 64-bit devices with 64-bit
10465 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
10467 * commpage lives within the userspace VA range, but is protected by the VM as
10468 * a reserved region (see vm_reserved_regions[] definition for more info).
10469 */
10470
10471 #elif (ARM_PGSHIFT == 12)
10472 /**
10473 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10474 * above the maximum userspace VA.
10475 */
10476 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10477 #else
10478 #error Nested shared page mapping is unsupported on this config
10479 #endif
10480
10481 MARK_AS_PMAP_TEXT kern_return_t
10482 pmap_insert_commpage_internal(
10483 pmap_t pmap)
10484 {
10485 kern_return_t kr = KERN_SUCCESS;
10486 vm_offset_t commpage_vaddr;
10487 pt_entry_t *ttep;
10488 pmap_paddr_t commpage_table = commpage_default_table;
10489
10490 /* Validate the pmap input before accessing its data. */
10491 validate_pmap_mutable(pmap);
10492
10493 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10494 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10495
10496 #if __ARM_MIXED_PAGE_SIZE__
10497 #if !__ARM_16K_PG__
10498 /* The following code assumes that commpage_pmap_default is a 16KB pmap. */
10499 #error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10500 #endif /* !__ARM_16K_PG__ */
10501
10502 /* Choose the correct shared page pmap to use. */
10503 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
10504 if (pmap_page_size == 4096) {
10505 if (pmap_is_64bit(pmap)) {
10506 commpage_table = commpage_4k_table;
10507 } else {
10508 panic("32-bit 4k commpage not currently supported for SPTM configurations");
10509 //commpage_table = commpage32_4k_table;
10510 }
10511 } else if (pmap_page_size != 16384) {
10512 panic("No commpage table exists for the wanted page size: %llu", pmap_page_size);
10513 } else
10514 #endif /* __ARM_MIXED_PAGE_SIZE__ */
10515 {
10516 if (pmap_is_64bit(pmap)) {
10517 commpage_table = commpage_default_table;
10518 } else {
10519 commpage_table = commpage32_default_table;
10520 }
10521 }
10522
10523 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
10524 #error We assume a single page.
10525 #endif
10526
10527 if (pmap_is_64bit(pmap)) {
10528 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10529 } else {
10530 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10531 }
10532
10533
10534 pmap_lock(pmap, PMAP_LOCK_SHARED);
10535
10536 /*
10537 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
10538 * two (2MB) depending on the address space layout. For 16KB pages, each level
10539 * one entry is 64GB, so we must go to the second level entry (32MB) in order
10540 * to "nest".
10541 *
10542 * Note: This is not "nesting" in the shared cache sense. This definition of
10543 * nesting just means inserting pointers to pre-allocated tables inside of
10544 * the passed in pmap to allow us to share page tables (which map the shared
10545 * page) for every task. This saves at least one page of memory per process
10546 * compared to creating new page tables in every process for mapping the
10547 * shared page.
10548 */
10549
10550 /**
10551 * Allocate the twig page tables if needed, and slam a pointer to the shared
10552 * page's tables into place.
10553 */
10554 while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
10555 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10556
10557 kr = pmap_expand(pmap, commpage_vaddr, 0, commpage_level);
10558
10559 if (kr != KERN_SUCCESS) {
10560 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
10561 }
10562
10563 pmap_lock(pmap, PMAP_LOCK_SHARED);
10564 }
10565
10566 if (*ttep != ARM_PTE_EMPTY) {
10567 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
10568 }
10569
10570 sptm_map_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level,
10571 (commpage_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID);
10572
10573 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10574
10575 return kr;
10576 }
10577
10578 static void
10579 pmap_unmap_commpage(
10580 pmap_t pmap)
10581 {
10582 pt_entry_t *ptep;
10583 vm_offset_t commpage_vaddr;
10584
10585 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10586 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10587 __assert_only pmap_paddr_t commpage_pa = commpage_data_pa;
10588
10589 if (pmap_is_64bit(pmap)) {
10590 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10591 } else {
10592 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10593 }
10594
10595
10596 ptep = pmap_pte(pmap, commpage_vaddr);
10597
10598 if (ptep == NULL) {
10599 return;
10600 }
10601
10602 /* It had better be mapped to the shared page. */
10603 if (pte_to_pa(*ptep) != commpage_pa) {
10604 panic("%s: non-commpage PA 0x%llx mapped at VA 0x%llx in pmap %p; expected 0x%llx",
10605 __func__, (unsigned long long)pte_to_pa(*ptep), (unsigned long long)commpage_vaddr,
10606 pmap, (unsigned long long)commpage_pa);
10607 }
10608
10609 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level);
10610 }
10611
10612 void
10613 pmap_insert_commpage(
10614 pmap_t pmap)
10615 {
10616 pmap_insert_commpage_internal(pmap);
10617 }
10618
10619 static boolean_t
10620 pmap_is_64bit(
10621 pmap_t pmap)
10622 {
10623 return pmap->is_64bit;
10624 }
10625
10626 bool
10627 pmap_is_exotic(
10628 pmap_t pmap __unused)
10629 {
10630 return false;
10631 }
10632
10633
10634 /* ARMTODO -- an implementation that accounts for
10635 * holes in the physical map, if any.
10636 */
10637 boolean_t
10638 pmap_valid_page(
10639 ppnum_t pn)
10640 {
10641 return pa_valid(ptoa(pn));
10642 }
10643
10644 boolean_t
10645 pmap_bootloader_page(
10646 ppnum_t pn)
10647 {
10648 pmap_paddr_t paddr = ptoa(pn);
10649
10650 if (pa_valid(paddr)) {
10651 return FALSE;
10652 }
10653 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10654 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
10655 }
10656
10657 MARK_AS_PMAP_TEXT boolean_t
10658 pmap_is_empty_internal(
10659 pmap_t pmap,
10660 vm_map_offset_t va_start,
10661 vm_map_offset_t va_end)
10662 {
10663 vm_map_offset_t block_start, block_end;
10664 tt_entry_t *tte_p;
10665
10666 if (pmap == NULL) {
10667 return TRUE;
10668 }
10669
10670 validate_pmap(pmap);
10671
10672 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10673 unsigned int initial_not_in_kdp = not_in_kdp;
10674
10675 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10676 pmap_lock(pmap, PMAP_LOCK_SHARED);
10677 }
10678
10679
10680 /* TODO: This will be faster if we increment ttep at each level. */
10681 block_start = va_start;
10682
10683 while (block_start < va_end) {
10684 pt_entry_t *bpte_p, *epte_p;
10685 pt_entry_t *pte_p;
10686
10687 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10688 if (block_end > va_end) {
10689 block_end = va_end;
10690 }
10691
10692 tte_p = pmap_tte(pmap, block_start);
10693 if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
10694 pte_p = (pt_entry_t *) ttetokv(*tte_p);
10695 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
10696 epte_p = &pte_p[pte_index(pt_attr, block_end)];
10697
10698 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
10699 if (*pte_p != ARM_PTE_EMPTY) {
10700 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10701 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10702 }
10703 return FALSE;
10704 }
10705 }
10706 }
10707 block_start = block_end;
10708 }
10709
10710 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10711 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10712 }
10713
10714 return TRUE;
10715 }
10716
10717 boolean_t
10718 pmap_is_empty(
10719 pmap_t pmap,
10720 vm_map_offset_t va_start,
10721 vm_map_offset_t va_end)
10722 {
10723 return pmap_is_empty_internal(pmap, va_start, va_end);
10724 }
10725
10726 vm_map_offset_t
10727 pmap_max_offset(
10728 boolean_t is64,
10729 unsigned int option)
10730 {
10731 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
10732 }
10733
10734 vm_map_offset_t
10735 pmap_max_64bit_offset(
10736 __unused unsigned int option)
10737 {
10738 vm_map_offset_t max_offset_ret = 0;
10739
10740 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
10741 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10742 max_offset_ret = arm64_pmap_max_offset_default;
10743 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10744 max_offset_ret = min_max_offset;
10745 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10746 max_offset_ret = MACH_VM_MAX_ADDRESS;
10747 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10748 if (arm64_pmap_max_offset_default) {
10749 max_offset_ret = arm64_pmap_max_offset_default;
10750 } else if (max_mem > 0xC0000000) {
10751 // devices with > 3GB of memory
10752 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
10753 } else if (max_mem > 0x40000000) {
10754 // devices with > 1GB and <= 3GB of memory
10755 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
10756 } else {
10757 // devices with <= 1 GB of memory
10758 max_offset_ret = min_max_offset;
10759 }
10760 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10761 if (arm64_pmap_max_offset_default) {
10762 // Allow the boot-arg to override jumbo size
10763 max_offset_ret = arm64_pmap_max_offset_default;
10764 } else {
10765 max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
10766 }
10767 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
10768 } else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
10769 max_offset_ret = MACH_VM_MAX_ADDRESS;
10770 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
10771 } else {
10772 panic("pmap_max_64bit_offset illegal option 0x%x", option);
10773 }
10774
10775 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10776 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
10777 assert(max_offset_ret >= min_max_offset);
10778 }
10779
10780 return max_offset_ret;
10781 }
10782
10783 vm_map_offset_t
10784 pmap_max_32bit_offset(
10785 unsigned int option)
10786 {
10787 vm_map_offset_t max_offset_ret = 0;
10788
10789 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10790 max_offset_ret = arm_pmap_max_offset_default;
10791 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10792 max_offset_ret = VM_MAX_ADDRESS;
10793 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10794 max_offset_ret = VM_MAX_ADDRESS;
10795 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10796 if (arm_pmap_max_offset_default) {
10797 max_offset_ret = arm_pmap_max_offset_default;
10798 } else if (max_mem > 0x20000000) {
10799 max_offset_ret = VM_MAX_ADDRESS;
10800 } else {
10801 max_offset_ret = VM_MAX_ADDRESS;
10802 }
10803 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10804 max_offset_ret = VM_MAX_ADDRESS;
10805 } else {
10806 panic("pmap_max_32bit_offset illegal option 0x%x", option);
10807 }
10808
10809 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10810 return max_offset_ret;
10811 }
10812
10813 #if CONFIG_DTRACE
10814 /*
10815 * Constrain DTrace copyin/copyout actions
10816 */
10817 extern kern_return_t dtrace_copyio_preflight(addr64_t);
10818 extern kern_return_t dtrace_copyio_postflight(addr64_t);
10819
10820 kern_return_t
10821 dtrace_copyio_preflight(
10822 __unused addr64_t va)
10823 {
10824 if (current_map() == kernel_map) {
10825 return KERN_FAILURE;
10826 } else {
10827 return KERN_SUCCESS;
10828 }
10829 }
10830
10831 kern_return_t
10832 dtrace_copyio_postflight(
10833 __unused addr64_t va)
10834 {
10835 return KERN_SUCCESS;
10836 }
10837 #endif /* CONFIG_DTRACE */
10838
10839
10840 void
10841 pmap_flush_context_init(__unused pmap_flush_context *pfc)
10842 {
10843 }
10844
10845
10846 void
10847 pmap_flush(
10848 __unused pmap_flush_context *cpus_to_flush)
10849 {
10850 /* not implemented yet */
10851 return;
10852 }
10853
10854 /**
 * Perform basic validation checks on the destination (and its corresponding
 * offset/size) only, prior to writing to a read-only allocation.
10857 *
10858 * @note Should be called before writing to an allocation from the read
10859 * only allocator.
10860 *
10861 * @param zid The ID of the zone the allocation belongs to.
10862 * @param va VA of element being modified (destination).
10863 * @param offset Offset being written to, in the element.
10864 * @param new_data_size Size of modification.
10865 *
10866 */
10867
10868 MARK_AS_PMAP_TEXT static void
10869 pmap_ro_zone_validate_element_dst(
10870 zone_id_t zid,
10871 vm_offset_t va,
10872 vm_offset_t offset,
10873 vm_size_t new_data_size)
10874 {
10875 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
10876 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
10877 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
10878 }
10879
10880 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
10881
10882 /* Check element is from correct zone and properly aligned */
10883 zone_require_ro(zid, elem_size, (void*)va);
10884
10885 if (__improbable(new_data_size > (elem_size - offset))) {
10886 panic("%s: New data size %lu too large for elem size %lu at addr %p",
10887 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
10888 }
10889 if (__improbable(offset >= elem_size)) {
10890 panic("%s: Offset %lu too large for elem size %lu at addr %p",
10891 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
10892 }
10893 }
10894
10895
10896 /**
10897 * Perform basic validation checks on the source, destination and
10898 * corresponding offset/sizes prior to writing to a read only allocation.
10899 *
10900 * @note Should be called before writing to an allocation from the read
10901 * only allocator.
10902 *
10903 * @param zid The ID of the zone the allocation belongs to.
10904 * @param va VA of element being modified (destination).
10905 * @param offset Offset being written to, in the element.
10906 * @param new_data Pointer to new data (source).
10907 * @param new_data_size Size of modification.
10908 *
10909 */
10910
10911 MARK_AS_PMAP_TEXT static void
10912 pmap_ro_zone_validate_element(
10913 zone_id_t zid,
10914 vm_offset_t va,
10915 vm_offset_t offset,
10916 const vm_offset_t new_data,
10917 vm_size_t new_data_size)
10918 {
10919 vm_offset_t sum = 0;
10920
10921 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
10922 panic("%s: Integer addition overflow %p + %lu = %lu",
10923 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
10924 }
10925
10926 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
10927 }
10928
10929 /**
10930 * Function to configure RO zone access permissions for a forthcoming write operation.
10931 */
10932 static void
10933 pmap_ro_zone_prepare_write(void)
10934 {
10935 }
10936
10937 /**
10938 * Function to indicate that a preceding RO zone write operation is complete.
10939 */
10940 static void
10941 pmap_ro_zone_complete_write(void)
10942 {
10943 }
10944
10945 /**
10946 * Function to align an address or size to the required RO zone mapping alignment.
10947 *
10948 * For the SPTM the RO zone region must be aligned on a twig boundary so that at least
10949 * the last-level kernel pagetable can be of the appropriate SPTM RO zone table type,
10950 * which allows the SPTM to enforce RO zone mapping permission restrictions.
10951 *
10952 * @param value the address or size to be aligned.
10953 *
10954 * @return the aligned value
10955 */
10956 vm_offset_t
10957 pmap_ro_zone_align(vm_offset_t value)
10958 {
10959 const pt_attr_t * const pt_attr = pmap_get_pt_attr(kernel_pmap);
10960 return PMAP_ALIGN(value, pt_attr_twig_size(pt_attr));
10961 }
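/*
 * For example, assuming the default 16KB kernel page geometry (where a twig
 * table entry spans 32MB), pmap_ro_zone_align(1) returns 0x2000000, and any
 * value that is already 32MB-aligned is returned unchanged.
 */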
10962
10963 /**
10964 * Function to copy kauth_cred from new_data to kv.
10965 * Function defined in "kern_prot.c"
10966 *
10967 * @note Will be removed upon completion of
10968 * <rdar://problem/72635194> Compiler PAC support for memcpy.
10969 *
10970 * @param kv Address to copy new data to.
10971 * @param new_data Pointer to new data.
10972 *
10973 */
10974
10975 extern void
10976 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
10977
10978 /**
10979 * Zalloc-specific memcpy that writes through the physical aperture
10980 * and ensures the element being modified is from a read-only zone.
10981 *
10982 * @note Designed to work only with the zone allocator's read-only submap.
10983 *
10984 * @param zid The ID of the zone to allocate from.
10985 * @param va VA of element to be modified.
10986 * @param offset Offset from element.
10987 * @param new_data Pointer to new data.
10988 * @param new_data_size Size of modification.
10989 *
10990 */
10991
10992 void
10993 pmap_ro_zone_memcpy(
10994 zone_id_t zid,
10995 vm_offset_t va,
10996 vm_offset_t offset,
10997 const vm_offset_t new_data,
10998 vm_size_t new_data_size)
10999 {
11000 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11001 }
11002
11003 MARK_AS_PMAP_TEXT void
11004 pmap_ro_zone_memcpy_internal(
11005 zone_id_t zid,
11006 vm_offset_t va,
11007 vm_offset_t offset,
11008 const vm_offset_t new_data,
11009 vm_size_t new_data_size)
11010 {
11011 if (!new_data || new_data_size == 0) {
11012 return;
11013 }
11014
11015 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11016 const bool istate = ml_set_interrupts_enabled(FALSE);
11017 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11018 pmap_ro_zone_prepare_write();
11019 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11020 pmap_ro_zone_complete_write();
11021 ml_set_interrupts_enabled(istate);
11022 }
11023
11024 /**
11025 * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
11027 *
11028 * @note Designed to work only with the zone allocator's read-only submap.
11029 *
11030 * @param zid The ID of the zone the element belongs to.
11031 * @param va VA of element to be modified.
11032 * @param offset Offset in element.
11033 * @param op Atomic operation to perform.
11034 * @param value Mutation value.
11035 *
11036 */
11037
11038 uint64_t
11039 pmap_ro_zone_atomic_op(
11040 zone_id_t zid,
11041 vm_offset_t va,
11042 vm_offset_t offset,
11043 zro_atomic_op_t op,
11044 uint64_t value)
11045 {
11046 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11047 }
11048
11049 MARK_AS_PMAP_TEXT uint64_t
11050 pmap_ro_zone_atomic_op_internal(
11051 zone_id_t zid,
11052 vm_offset_t va,
11053 vm_offset_t offset,
11054 zro_atomic_op_t op,
11055 uint64_t value)
11056 {
11057 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
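	/* The low 4 bits of the op encode the size (in bytes) of the field being mutated. */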
11058 vm_size_t value_size = op & 0xf;
11059 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11060
11061 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11062 pmap_ro_zone_prepare_write();
11063 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11064 pmap_ro_zone_complete_write();
11065 ml_set_interrupts_enabled(istate);
11066
11067 return value;
11068 }
11069
11070 /**
 * bzero for allocations from read-only zones, writing through the
11072 * physical aperture.
11073 *
11074 * @note This is called by the zfree path of all allocations from read
11075 * only zones.
11076 *
11077 * @param zid The ID of the zone the allocation belongs to.
11078 * @param va VA of element to be zeroed.
11079 * @param offset Offset in the element.
11080 * @param size Size of allocation.
11081 *
11082 */
11083
11084 void
11085 pmap_ro_zone_bzero(
11086 zone_id_t zid,
11087 vm_offset_t va,
11088 vm_offset_t offset,
11089 vm_size_t size)
11090 {
11091 pmap_ro_zone_bzero_internal(zid, va, offset, size);
11092 }
11093
11094 MARK_AS_PMAP_TEXT void
11095 pmap_ro_zone_bzero_internal(
11096 zone_id_t zid,
11097 vm_offset_t va,
11098 vm_offset_t offset,
11099 vm_size_t size)
11100 {
11101 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11102 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11103 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11104 pmap_ro_zone_prepare_write();
11105 bzero((void*)phystokv(pa), size);
11106 pmap_ro_zone_complete_write();
11107 ml_set_interrupts_enabled(istate);
11108 }
11109
11110 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
11111
11112 MARK_AS_PMAP_TEXT mach_vm_size_t
11113 pmap_query_resident_internal(
11114 pmap_t pmap,
11115 vm_map_address_t start,
11116 vm_map_address_t end,
11117 mach_vm_size_t *compressed_bytes_p)
11118 {
11119 mach_vm_size_t resident_bytes = 0;
11120 mach_vm_size_t compressed_bytes = 0;
11121
11122 pt_entry_t *bpte, *epte;
11123 pt_entry_t *pte_p;
11124 tt_entry_t *tte_p;
11125
11126 if (pmap == NULL) {
11127 return PMAP_RESIDENT_INVALID;
11128 }
11129
11130 validate_pmap(pmap);
11131
11132 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11133
11134 /* Ensure that this request is valid, and addresses exactly one TTE. */
11135 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11136 (end % pt_attr_page_size(pt_attr)))) {
11137 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11138 }
11139
11140 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11141 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11142 }
11143
11144 pmap_lock(pmap, PMAP_LOCK_SHARED);
11145 tte_p = pmap_tte(pmap, start);
11146 if (tte_p == (tt_entry_t *) NULL) {
11147 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11148 return PMAP_RESIDENT_INVALID;
11149 }
11150 if (tte_is_valid_table(*tte_p)) {
11151 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11152 bpte = &pte_p[pte_index(pt_attr, start)];
11153 epte = &pte_p[pte_index(pt_attr, end)];
11154
11155 for (; bpte < epte; bpte++) {
11156 if (pte_is_compressed(*bpte, bpte)) {
11157 compressed_bytes += pt_attr_page_size(pt_attr);
11158 } else if (pa_valid(pte_to_pa(*bpte))) {
11159 resident_bytes += pt_attr_page_size(pt_attr);
11160 }
11161 }
11162 }
11163 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11164
11165 if (compressed_bytes_p) {
11166 *compressed_bytes_p += compressed_bytes;
11167 }
11168
11169 return resident_bytes;
11170 }
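
/**
 * Compute the number of resident (and, optionally, compressed) bytes within a
 * VA range of a pmap.
 *
 * The range is walked one twig-sized chunk at a time (e.g. 32MB for 16KB page
 * tables), since pmap_query_resident_internal() operates on at most one
 * last-level page table per call.
 *
 * @param pmap The pmap to query; may be PMAP_NULL, in which case 0 is returned.
 * @param start Page-aligned start of the VA range.
 * @param end Page-aligned end of the VA range.
 * @param compressed_bytes_p If non-NULL, receives the number of compressed
 *        bytes found in the range.
 *
 * @return The number of resident bytes found in the range.
 */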
11171
11172 mach_vm_size_t
11173 pmap_query_resident(
11174 pmap_t pmap,
11175 vm_map_address_t start,
11176 vm_map_address_t end,
11177 mach_vm_size_t *compressed_bytes_p)
11178 {
11179 mach_vm_size_t total_resident_bytes;
11180 mach_vm_size_t compressed_bytes;
11181 vm_map_address_t va;
11182
11183
11184 if (pmap == PMAP_NULL) {
11185 if (compressed_bytes_p) {
11186 *compressed_bytes_p = 0;
11187 }
11188 return 0;
11189 }
11190
11191 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11192
11193 total_resident_bytes = 0;
11194 compressed_bytes = 0;
11195
11196 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
11197 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
11198 VM_KERNEL_ADDRHIDE(end));
11199
11200 va = start;
11201 while (va < end) {
11202 vm_map_address_t l;
11203 mach_vm_size_t resident_bytes;
11204
11205 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
11206
11207 if (l > end) {
11208 l = end;
11209 }
11210 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
11211 if (resident_bytes == PMAP_RESIDENT_INVALID) {
11212 break;
11213 }
11214
11215 total_resident_bytes += resident_bytes;
11216
11217 va = l;
11218 }
11219
11220 if (compressed_bytes_p) {
11221 *compressed_bytes_p = compressed_bytes;
11222 }
11223
11224 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
11225 total_resident_bytes);
11226
11227 return total_resident_bytes;
11228 }
11229
11230 #if MACH_ASSERT
11231 static void
11232 pmap_check_ledgers(
11233 pmap_t pmap)
11234 {
11235 int pid;
11236 char *procname;
11237
11238 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
11239 /*
11240 * This pmap was not or is no longer fully associated
11241 * with a task (e.g. the old pmap after a fork()/exec() or
11242 * spawn()). Its "ledger" still points at a task that is
11243 * now using a different (and active) address space, so
11244 * we can't check that all the pmap ledgers are balanced here.
11245 *
11246 * If the "pid" is set, that means that we went through
11247 * pmap_set_process() in task_terminate_internal(), so
11248 * this task's ledger should not have been re-used and
11249 * all the pmap ledgers should be back to 0.
11250 */
11251 return;
11252 }
11253
11254 pid = pmap->pmap_pid;
11255 procname = pmap->pmap_procname;
11256
11257 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
11258 }
11259 #endif /* MACH_ASSERT */
11260
11261 void
11262 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
11263 {
11264 }
11265
11266 /**
11267 * The minimum shared region nesting size is used by the VM to determine when to
11268 * break up large mappings to nested regions. The smallest size that these
11269 * mappings can be broken into is determined by the page table level at which
11270 * those regions are nested and by the size of the page tables.
11271 *
11272 * For instance, if a nested region is nested at L2 for a process using 16KB
11273 * page tables, then the minimum nesting size would be 32MB (the size of an L2
11274 * block entry).
11275 *
11276 * @param pmap The target pmap, used to determine the block size based on
11277 * whether it uses 16KB or 4KB page tables.
11278 */
11279 uint64_t
11280 pmap_shared_region_size_min(__unused pmap_t pmap)
11281 {
11282 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11283
11284 /**
11285 * We always nest the shared region at L2 (32MB for 16KB pages, 8MB for
11286 * 4KB pages). This means that a target pmap will contain L2 entries that
11287 * point to shared L3 page tables in the shared region pmap.
11288 */
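/*
 * page_ratio accounts for pmaps whose native page size is smaller than the
 * kernel's PAGE_SIZE. Illustrative arithmetic: a 4KB-page pmap under a
 * 16KB-page kernel has an L2 twig size of 2MB and a page_ratio of 4, which
 * yields the 8MB minimum mentioned above.
 */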
11289 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
11290 return pt_attr_twig_size(pt_attr) * page_ratio;
11291 }
11292
11293 boolean_t
11294 pmap_enforces_execute_only(
11295 pmap_t pmap)
11296 {
11297 return pmap != kernel_pmap;
11298 }
11299
11300 MARK_AS_PMAP_TEXT void
11301 pmap_set_vm_map_cs_enforced_internal(
11302 pmap_t pmap,
11303 bool new_value)
11304 {
11305 validate_pmap_mutable(pmap);
11306 pmap->pmap_vm_map_cs_enforced = new_value;
11307 }
11308
11309 void
11310 pmap_set_vm_map_cs_enforced(
11311 pmap_t pmap,
11312 bool new_value)
11313 {
11314 pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
11315 }
11316
11317 extern int cs_process_enforcement_enable;
11318 bool
11319 pmap_get_vm_map_cs_enforced(
11320 pmap_t pmap)
11321 {
11322 if (cs_process_enforcement_enable) {
11323 return true;
11324 }
11325 return pmap->pmap_vm_map_cs_enforced;
11326 }
11327
11328 MARK_AS_PMAP_TEXT void
11329 pmap_set_jit_entitled_internal(
11330 __unused pmap_t pmap)
11331 {
11332 }
11333
11334 void
11335 pmap_set_jit_entitled(
11336 pmap_t pmap)
11337 {
11338 pmap_set_jit_entitled_internal(pmap);
11339 }
11340
11341 bool
11342 pmap_get_jit_entitled(
11343 __unused pmap_t pmap)
11344 {
11345 return false;
11346 }
11347
11348 MARK_AS_PMAP_TEXT void
11349 pmap_set_tpro_internal(
11350 __unused pmap_t pmap)
11351 {
11352 return;
11353 }
11354
11355 void
11356 pmap_set_tpro(
11357 pmap_t pmap)
11358 {
11359 pmap_set_tpro_internal(pmap);
11360 }
11361
11362 bool
11363 pmap_get_tpro(
11364 __unused pmap_t pmap)
11365 {
11366 return false;
11367 }
11368
11369
11370 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
11371
11372 MARK_AS_PMAP_TEXT kern_return_t
11373 pmap_query_page_info_internal(
11374 pmap_t pmap,
11375 vm_map_offset_t va,
11376 int *disp_p)
11377 {
11378 pmap_paddr_t pa;
11379 int disp;
11380 unsigned int pai;
11381 pt_entry_t *pte_p;
11382 pv_entry_t *pve_p;
11383
11384 if (pmap == PMAP_NULL || pmap == kernel_pmap) {
11385 *disp_p = 0;
11386 return KERN_INVALID_ARGUMENT;
11387 }
11388
11389 validate_pmap(pmap);
11390 pmap_lock(pmap, PMAP_LOCK_SHARED);
11391
11392 try_again:
11393 disp = 0;
11394
11395 pte_p = pmap_pte(pmap, va);
11396 if (pte_p == PT_ENTRY_NULL) {
11397 goto done;
11398 }
11399
11400 const pt_entry_t pte = os_atomic_load(pte_p, relaxed);
11401 pa = pte_to_pa(pte);
11402 if (pa == 0) {
11403 if (pte_is_compressed(pte, pte_p)) {
11404 disp |= PMAP_QUERY_PAGE_COMPRESSED;
11405 if (pte & ARM_PTE_COMPRESSED_ALT) {
11406 disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
11407 }
11408 }
11409 } else {
11410 disp |= PMAP_QUERY_PAGE_PRESENT;
11411 pai = pa_index(pa);
11412 if (!pa_valid(pa)) {
11413 goto done;
11414 }
11415 locked_pvh_t locked_pvh = pvh_lock(pai);
11416 if (__improbable(pte != os_atomic_load(pte_p, relaxed))) {
11417 /* something changed: try again */
11418 pvh_unlock(&locked_pvh);
11419 pmap_query_page_info_retries++;
11420 goto try_again;
11421 }
11422 pve_p = PV_ENTRY_NULL;
11423 int pve_ptep_idx = 0;
11424 if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
11425 unsigned int npves = 0;
11426 pve_p = pvh_pve_list(locked_pvh.pvh);
11427 while (pve_p != PV_ENTRY_NULL &&
11428 (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
11429 if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
11430 pvh_lock_enter_sleep_mode(&locked_pvh);
11431 }
11432 pve_p = pve_next(pve_p);
11433 npves++;
11434 }
11435 }
11436
11437 if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
11438 disp |= PMAP_QUERY_PAGE_ALTACCT;
11439 } else if (ppattr_test_reusable(pai)) {
11440 disp |= PMAP_QUERY_PAGE_REUSABLE;
11441 } else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
11442 disp |= PMAP_QUERY_PAGE_INTERNAL;
11443 }
11444 pvh_unlock(&locked_pvh);
11445 }
11446
11447 done:
11448 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11449 *disp_p = disp;
11450 return KERN_SUCCESS;
11451 }
11452
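/**
 * Return the disposition of the page mapped at the given VA in the given pmap,
 * expressed as a combination of PMAP_QUERY_PAGE_* flags (present, internal,
 * reusable, alternate-accounted, compressed).
 *
 * @param pmap The pmap to query; must not be PMAP_NULL or the kernel pmap.
 * @param va The VA whose mapping should be examined.
 * @param disp_p Output parameter for the disposition flags.
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a disallowed pmap.
 */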
11453 kern_return_t
11454 pmap_query_page_info(
11455 pmap_t pmap,
11456 vm_map_offset_t va,
11457 int *disp_p)
11458 {
11459 return pmap_query_page_info_internal(pmap, va, disp_p);
11460 }
11461
11462
11463
11464 uint32_t
11465 pmap_user_va_bits(pmap_t pmap __unused)
11466 {
11467 #if __ARM_MIXED_PAGE_SIZE__
11468 uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
11469 return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
11470 #else
11471 return 64 - T0SZ_BOOT;
11472 #endif
11473 }
11474
11475 uint32_t
11476 pmap_kernel_va_bits(void)
11477 {
11478 return 64 - T1SZ_BOOT;
11479 }
11480
11481 static vm_map_size_t
11482 pmap_user_va_size(pmap_t pmap)
11483 {
11484 return 1ULL << pmap_user_va_bits(pmap);
11485 }
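
/*
 * Worked example for the helpers above (illustrative values; the actual
 * T0SZ/T1SZ settings are configuration dependent): a T0SZ of 25 gives
 * 64 - 25 = 39 user VA bits, i.e. pmap_user_va_size() == 1ULL << 39 = 512GB.
 */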
11486
11487
11488
11489 bool
11490 pmap_in_ppl(void)
11491 {
11492 return false;
11493 }
11494
11495 MARK_AS_PMAP_TEXT void
11496 pmap_footprint_suspend_internal(
11497 vm_map_t map,
11498 boolean_t suspend)
11499 {
11500 #if DEVELOPMENT || DEBUG
11501 if (suspend) {
11502 current_thread()->pmap_footprint_suspended = TRUE;
11503 map->pmap->footprint_was_suspended = TRUE;
11504 } else {
11505 current_thread()->pmap_footprint_suspended = FALSE;
11506 }
11507 #else /* DEVELOPMENT || DEBUG */
11508 (void) map;
11509 (void) suspend;
11510 #endif /* DEVELOPMENT || DEBUG */
11511 }
11512
11513 void
11514 pmap_footprint_suspend(
11515 vm_map_t map,
11516 boolean_t suspend)
11517 {
11518 pmap_footprint_suspend_internal(map, suspend);
11519 }
11520
11521 void
11522 pmap_nop(pmap_t pmap)
11523 {
11524 validate_pmap_mutable(pmap);
11525 }
11526
11527 pmap_t
11528 pmap_txm_kernel_pmap(void)
11529 {
11530 return kernel_pmap;
11531 }
11532
11533 TXMAddressSpace_t*
11534 pmap_txm_addr_space(const pmap_t pmap)
11535 {
11536 if (pmap) {
11537 return pmap->txm_addr_space;
11538 }
11539
11540 /*
11541 * When the passed-in pmap is NULL, it means the caller wishes to operate
11542 * on the current_pmap(). We could resolve and return that, but it is actually
11543 * safer to return NULL, since these TXM interfaces also accept NULL inputs,
11544 * which causes TXM to resolve to the current_pmap() equivalent internally.
11545 */
11546 return NULL;
11547 }
11548
11549 void
11550 pmap_txm_set_addr_space(
11551 pmap_t pmap,
11552 TXMAddressSpace_t *txm_addr_space)
11553 {
11554 assert(pmap != NULL);
11555
11556 if (pmap->txm_addr_space && txm_addr_space) {
11557 /* Attempted to overwrite the address space in the PMAP */
11558 panic("attempted overwrite of TXM address space: %p | %p | %p",
11559 pmap, pmap->txm_addr_space, txm_addr_space);
11560 } else if (!pmap->txm_addr_space && !txm_addr_space) {
11561 /* This should never happen */
11562 panic("attempted NULL overwrite of TXM address space: %p", pmap);
11563 }
11564
11565 pmap->txm_addr_space = txm_addr_space;
11566 }
11567
11568 void
11569 pmap_txm_set_trust_level(
11570 pmap_t pmap,
11571 CSTrust_t trust_level)
11572 {
11573 assert(pmap != NULL);
11574
11575 CSTrust_t current_trust = pmap->txm_trust_level;
11576 if (current_trust != kCSTrustUntrusted) {
11577 panic("attempted to overwrite TXM trust on the pmap: %p", pmap);
11578 }
11579
11580 pmap->txm_trust_level = trust_level;
11581 }
11582
11583 kern_return_t
11584 pmap_txm_get_trust_level_kdp(
11585 pmap_t pmap,
11586 CSTrust_t *trust_level)
11587 {
11588 if (pmap == NULL) {
11589 return KERN_INVALID_ARGUMENT;
11590 } else if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11591 return KERN_INVALID_ARGUMENT;
11592 }
11593
11594 if (trust_level != NULL) {
11595 *trust_level = pmap->txm_trust_level;
11596 }
11597 return KERN_SUCCESS;
11598 }
11599
11600 kern_return_t
11601 pmap_txm_get_jit_address_range_kdp(
11602 pmap_t pmap,
11603 uintptr_t *jit_region_start,
11604 uintptr_t *jit_region_end)
11605 {
11606 if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11607 return KERN_INVALID_ARGUMENT;
11608 }
11609 TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
11610 if (NULL == txm_addr_space) {
11611 return KERN_INVALID_ARGUMENT;
11612 }
11613 if (ml_validate_nofault((vm_offset_t)txm_addr_space, sizeof(*txm_addr_space)) == false) {
11614 return KERN_INVALID_ARGUMENT;
11615 }
11616 /**
11617 * It's a bit gross that we're dereferencing what is supposed to be an abstract type.
11618 * If we were running in the TXM, we would always perform additional checks on txm_addr_space,
11619 * but this isn't necessary here, since we are running in the kernel and only using the results for
11620 * diagnostic purposes, rather than any policy enforcement.
11621 */
11622 if (txm_addr_space->jitRegion) {
11623 if (ml_validate_nofault((vm_offset_t)txm_addr_space->jitRegion, sizeof(txm_addr_space->jitRegion)) == false) {
11624 return KERN_INVALID_ARGUMENT;
11625 }
11626 if (txm_addr_space->jitRegion->addr && txm_addr_space->jitRegion->addrEnd) {
11627 *jit_region_start = txm_addr_space->jitRegion->addr;
11628 *jit_region_end = txm_addr_space->jitRegion->addrEnd;
11629 return KERN_SUCCESS;
11630 }
11631 }
11632 return KERN_NOT_FOUND;
11633 }
11634
11635 static pmap_t
11636 _pmap_txm_resolve_pmap(pmap_t pmap)
11637 {
11638 if (pmap == NULL) {
11639 pmap = current_pmap();
11640 if (pmap == kernel_pmap) {
11641 return NULL;
11642 }
11643 }
11644
11645 return pmap;
11646 }
11647
11648 void
11649 pmap_txm_acquire_shared_lock(pmap_t pmap)
11650 {
11651 pmap = _pmap_txm_resolve_pmap(pmap);
11652 if (!pmap) {
11653 return;
11654 }
11655
11656 lck_rw_lock_shared(&pmap->txm_lck);
11657 }
11658
11659 void
11660 pmap_txm_release_shared_lock(pmap_t pmap)
11661 {
11662 pmap = _pmap_txm_resolve_pmap(pmap);
11663 if (!pmap) {
11664 return;
11665 }
11666
11667 lck_rw_unlock_shared(&pmap->txm_lck);
11668 }
11669
11670 void
11671 pmap_txm_acquire_exclusive_lock(pmap_t pmap)
11672 {
11673 pmap = _pmap_txm_resolve_pmap(pmap);
11674 if (!pmap) {
11675 return;
11676 }
11677
11678 lck_rw_lock_exclusive(&pmap->txm_lck);
11679 }
11680
11681 void
11682 pmap_txm_release_exclusive_lock(pmap_t pmap)
11683 {
11684 pmap = _pmap_txm_resolve_pmap(pmap);
11685 if (!pmap) {
11686 return;
11687 }
11688
11689 lck_rw_unlock_exclusive(&pmap->txm_lck);
11690 }
11691
11692 static void
11693 _pmap_txm_transfer_page(const pmap_paddr_t addr)
11694 {
11695 sptm_retype_params_t retype_params = {
11696 .raw = SPTM_RETYPE_PARAMS_NULL
11697 };
11698
11699 /* Retype through the SPTM */
11700 sptm_retype(addr, XNU_DEFAULT, TXM_DEFAULT, retype_params);
11701 }
11702
11703 /**
11704 * Prepare a page for retyping to TXM_DEFAULT by clearing its
11705 * internal flags.
11706 *
11707 * @param pa Physical address of the page.
11708 */
11709 static inline void
11710 _pmap_txm_retype_prepare(const pmap_paddr_t pa)
11711 {
11712 const sptm_retype_params_t retype_params = {
11713 .raw = SPTM_RETYPE_PARAMS_NULL
11714 };
11715
11716 /**
11717 * SPTM allows XNU_DEFAULT pages to request deferral of TLB flushing
11718 * when their PTE is updated, which is an important performance
11719 * optimization. However, this also allows an attacker-controlled
11720 * XNU to exploit a read reference with a stale write-enabled PTE in
11721 * the TLB. This is fine as long as the page is not retyped, since the
11722 * damage remains contained within the XNU domain. However, when such a
11723 * page needs to be retyped, SPTM has to make sure that either there is no
11724 * outstanding reference or there is no history of deferred TLBIs. Internally,
11725 * SPTM maintains a flag tracking past deferred TLBIs that only gets
11726 * cleared on retyping with no outstanding reference. Therefore, we
11727 * do a dummy retype to XNU_DEFAULT itself to clear the internal flag
11728 * before we actually transfer this page to the TXM domain. To make sure
11729 * SPTM won't throw a violation, all mappings of the page have to
11730 * be removed before calling this.
11731 */
11732 sptm_retype(pa, XNU_DEFAULT, XNU_DEFAULT, retype_params);
11733 }
11734
11735 /**
11736 * Transfer an XNU owned page to TXM domain.
11737 *
11738 * @param addr Kernel virtual address of the page. It must be page-size
11739 * aligned.
11740 */
11741 void
11742 pmap_txm_transfer_page(const vm_address_t addr)
11743 {
11744 assert((addr & PAGE_MASK) == 0);
11745
11746 const pmap_paddr_t pa = kvtophys_nofail(addr);
11747 const unsigned int pai = pa_index(pa);
11748
11749 /* Lock the PVH lock to prevent concurrent updates to the mappings during the self retype below. */
11750 locked_pvh_t locked_pvh = pvh_lock(pai);
11751
11752 /* Disconnect the mapping to assure SPTM of no pending TLBI. */
11753 pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
11754 PMAP_OPTIONS_PPO_PENDING_RETYPE, &locked_pvh, NULL);
11755
11756 /* Self retype to clear the SPTM internal flags tracking delayed TLBIs for revoked writes. */
11757 _pmap_txm_retype_prepare(pa);
11758
11759 pvh_unlock(&locked_pvh);
11760
11761 /* XNU needs to hold an RO reference to the page despite the ownership being transferred to TXM. */
11762 pmap_enter_addr(kernel_pmap, addr, pa, VM_PROT_READ, VM_PROT_NONE, 0, true, PMAP_MAPPING_TYPE_INFER);
11763
11764 /* Finally, retype the page to TXM_DEFAULT. */
11765 _pmap_txm_transfer_page(pa);
11766 }
11767
11768 struct vm_object txm_vm_object_storage VM_PAGE_PACKED_ALIGNED;
11769 SECURITY_READ_ONLY_LATE(vm_object_t) txm_vm_object = &txm_vm_object_storage;
11770
11771 _Static_assert(sizeof(vm_map_address_t) == sizeof(pmap_paddr_t),
11772 "sizeof(vm_map_address_t) != sizeof(pmap_paddr_t)");
11773
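/**
 * Allocate, wire, and retype a page for use by TXM.
 *
 * The page is grabbed from the VM free list, wired, inserted into the
 * dedicated TXM VM object, and finally retyped to TXM_DEFAULT.
 *
 * @return The physical address of the page (carried in a vm_map_address_t;
 *         see the static assert above).
 */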
11774 vm_map_address_t
11775 pmap_txm_allocate_page(void)
11776 {
11777 pmap_paddr_t phys_addr = 0;
11778 vm_page_t page = VM_PAGE_NULL;
11779 boolean_t thread_vm_privileged = false;
11780
11781 /* We are allowed to allocate privileged memory */
11782 thread_vm_privileged = set_vm_privilege(true);
11783
11784 /* Allocate a page from the VM free list */
11785 vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
11786 while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
11787 VM_PAGE_WAIT();
11788 }
11789
11790 /* Wire all of the pages allocated for TXM */
11791 vm_page_lock_queues();
11792 vm_page_wire(page, VM_KERN_MEMORY_SECURITY, TRUE);
11793 vm_page_unlock_queues();
11794
11795 phys_addr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page));
11796 if (phys_addr == 0) {
11797 panic("invalid VM page allocated for TXM: %llu", phys_addr);
11798 }
11799
11800 /* Add the physical page to the TXM VM object */
11801 vm_object_lock(txm_vm_object);
11802 vm_page_insert_wired(
11803 page,
11804 txm_vm_object,
11805 phys_addr - gPhysBase,
11806 VM_KERN_MEMORY_SECURITY);
11807 vm_object_unlock(txm_vm_object);
11808
11809 /* Reset thread privilege */
11810 set_vm_privilege(thread_vm_privileged);
11811
11812 /* Retype the page */
11813 _pmap_txm_transfer_page(phys_addr);
11814
11815 return phys_addr;
11816 }
11817
11818 int
11819 pmap_cs_configuration(void)
11820 {
11821 code_signing_config_t config = 0;
11822
11823 /* Compute the code signing configuration */
11824 code_signing_configuration(NULL, &config);
11825
11826 return (int)config;
11827 }
11828
11829 bool
11830 pmap_performs_stage2_translations(
11831 __unused pmap_t pmap)
11832 {
11833 return false;
11834 }
11835
11836 bool
11837 pmap_has_iofilter_protected_write(void)
11838 {
11839 #if HAS_GUARDED_IO_FILTER
11840 return true;
11841 #else
11842 return false;
11843 #endif
11844 }
11845
11846 #if HAS_GUARDED_IO_FILTER
11847
11848 void
11849 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11850 {
11851 /**
11852 * Even though this is done from EL1/2 for an address potentially owned by Guarded
11853 * Mode, we should be fine, as mmu_kvtop uses "at s1e1r", which checks for
11854 * read access only.
11855 */
11856 const pmap_paddr_t pa = mmu_kvtop(addr);
11857
11858 if (!pa) {
11859 panic("%s: addr 0x%016llx doesn't have a valid kernel mapping", __func__, (uint64_t) addr);
11860 }
11861
11862 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
11863 if (frame_type == XNU_PROTECTED_IO) {
11864 bool is_hibernating = false;
11865 if (__improbable(is_hibernating)) {
11866 /**
11867 * Default to NO_PANICKING_DOMAIN rather than INVALID_DOMAIN, since
11868 * INVALID_DOMAIN is reserved for panics in the dispatch logic itself.
11869 */
11870 sptm_domain_t panic_source = NO_PANICKING_DOMAIN;
11871 (void)sptm_panic_source(&panic_source);
11872
11873 /**
11874 * If panic_source is invalid (NO_PANICKING_DOMAIN: sptm_panic_source() failed
11875 * or no panic occurred) OR if the panic_source is XNU_DOMAIN, then use the
11876 * hibernation-specific write.
11877 */
11878 if (panic_source == NO_PANICKING_DOMAIN || panic_source == XNU_DOMAIN) {
11879 sptm_hib_iofilter_protected_write(pa, value, width);
11880 } else {
11881 /* Panic source is valid (panic occurred) and not XNU_DOMAIN */
11882 sptm_iofilter_protected_write(pa, value, width);
11883 }
11884 } else {
11885 sptm_iofilter_protected_write(pa, value, width);
11886 }
11887 } else {
11888 /* The mapping is valid, but the page is not of a type protected by the I/O
11889 * filter. We still try accessing the address from kernel mode, which allows
11890 * addresses that are not owned by the SPTM to be accessed by this interface.
11891 */
11892 switch (width) {
11893 case 1:
11894 *(volatile uint8_t *)addr = (uint8_t) value;
11895 break;
11896 case 2:
11897 *(volatile uint16_t *)addr = (uint16_t) value;
11898 break;
11899 case 4:
11900 *(volatile uint32_t *)addr = (uint32_t) value;
11901 break;
11902 case 8:
11903 *(volatile uint64_t *)addr = (uint64_t) value;
11904 break;
11905 default:
11906 panic("%s: width %llu not supported", __func__, width);
11907 }
11908 }
11909 }
11910
11911 #else /* HAS_GUARDED_IO_FILTER */
11912
11913 __attribute__((__noreturn__))
11914 void
11915 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11916 {
11917 panic("%s called on an unsupported platform.", __FUNCTION__);
11918 }
11919
11920 #endif /* HAS_GUARDED_IO_FILTER */
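
/**
 * Illustrative sketch (not part of the build) of how a caller is expected to
 * pair the two entry points above; the register address and value are
 * hypothetical. The width argument is in bytes (1, 2, 4, or 8).
 *
 *     static void
 *     example_protected_reg_write(vm_address_t reg_va, uint32_t val)
 *     {
 *         if (pmap_has_iofilter_protected_write()) {
 *             // Routed through the SPTM I/O filter when the target page is
 *             // XNU_PROTECTED_IO; falls back to a plain store otherwise.
 *             pmap_iofilter_protected_write(reg_va, val, sizeof(val));
 *         } else {
 *             *(volatile uint32_t *)reg_va = val;
 *         }
 *     }
 */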
11921
11922 void * __attribute__((noreturn))
11923 pmap_claim_reserved_ppl_page(void)
11924 {
11925 panic("%s: function not supported in this environment", __FUNCTION__);
11926 }
11927
11928 void __attribute__((noreturn))
11929 pmap_free_reserved_ppl_page(void __unused *kva)
11930 {
11931 panic("%s: function not supported in this environment", __FUNCTION__);
11932 }
11933
11934 bool
11935 pmap_lookup_in_loaded_trust_caches(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11936 {
11937 kern_return_t kr = query_trust_cache(
11938 kTCQueryTypeLoadable,
11939 cdhash,
11940 NULL);
11941
11942 if (kr == KERN_SUCCESS) {
11943 return true;
11944 }
11945 return false;
11946 }
11947
11948 uint32_t
11949 pmap_lookup_in_static_trust_cache(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11950 {
11951 TrustCacheQueryToken_t query_token = {0};
11952 kern_return_t kr = KERN_NOT_FOUND;
11953 uint64_t flags = 0;
11954 uint8_t hash_type = 0;
11955
11956 kr = query_trust_cache(
11957 kTCQueryTypeStatic,
11958 cdhash,
11959 &query_token);
11960
11961 if (kr == KERN_SUCCESS) {
11962 amfi->TrustCache.queryGetFlags(&query_token, &flags);
11963 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
11964
11965 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
11966 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
11967 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
11968 }
11969
11970 return 0;
11971 }
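
/**
 * Illustrative sketch (not part of the build) of unpacking the result of
 * pmap_lookup_in_static_trust_cache(). It assumes each packed field occupies
 * 8 bits, matching the uint8_t values combined above.
 *
 *     const uint32_t res = pmap_lookup_in_static_trust_cache(cdhash);
 *     if (res == 0) {
 *         // Not present in the static trust cache.
 *     } else if (((res >> TC_LOOKUP_RESULT_SHIFT) & 0xff) == TC_LOOKUP_FOUND) {
 *         const uint8_t hash_type = (res >> TC_LOOKUP_HASH_TYPE_SHIFT) & 0xff;
 *         const uint8_t flags = (res >> TC_LOOKUP_FLAGS_SHIFT) & 0xff;
 *     }
 */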
11972
11973 #if DEVELOPMENT || DEBUG
11974
11975 struct page_table_dump_header {
11976 uint64_t pa;
11977 uint64_t num_entries;
11978 uint64_t start_va;
11979 uint64_t end_va;
11980 };
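
/*
 * Layout of the dump buffer produced below: for every page table visited at a
 * level selected by level_mask, a page_table_dump_header is emitted, followed
 * immediately by that table's num_entries raw tt_entry_t values. Tables are
 * emitted in pre-order as the recursion descends, so a consumer can walk the
 * buffer by repeatedly reading a header and then skipping
 * header->num_entries * sizeof(tt_entry_t) bytes.
 */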
11981
11982 static kern_return_t
11983 pmap_dump_page_tables_recurse(pmap_t pmap,
11984 const tt_entry_t *ttp,
11985 unsigned int cur_level,
11986 unsigned int level_mask,
11987 uint64_t start_va,
11988 void *buf_start,
11989 void *buf_end,
11990 size_t *bytes_copied)
11991 {
11992 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11993 uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
11994
11995 uint64_t size = pt_attr->pta_level_info[cur_level].size;
11996 uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
11997 uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
11998 uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
11999
12000 void *bufp = (uint8_t*)buf_start + *bytes_copied;
12001
12002 if (cur_level == pt_attr_root_level(pt_attr)) {
12003 start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
12004 num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
12005 }
12006
12007 uint64_t tt_size = num_entries * sizeof(tt_entry_t);
12008 const tt_entry_t *tt_end = &ttp[num_entries];
12009
12010 if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
12011 return KERN_INSUFFICIENT_BUFFER_SIZE;
12012 }
12013
12014 if (level_mask & (1U << cur_level)) {
12015 struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
12016 header->pa = kvtophys_nofail((vm_offset_t)ttp);
12017 header->num_entries = num_entries;
12018 header->start_va = start_va;
12019 header->end_va = start_va + (num_entries * size);
12020
12021 bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
12022 *bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
12023 }
12024 uint64_t current_va = start_va;
12025
12026 for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
12027 tt_entry_t tte = *ttep;
12028
12029 if (!(tte & valid_mask)) {
12030 continue;
12031 }
12032
12033 if ((tte & type_mask) == type_block) {
12034 continue;
12035 } else {
12036 if (cur_level >= pt_attr_leaf_level(pt_attr)) {
12037 panic("%s: corrupt entry %#llx at %p, "
12038 "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
12039 __FUNCTION__, tte, ttep,
12040 ttp, cur_level, bufp, buf_end);
12041 }
12042
12043 const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
12044
12045 kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
12046 level_mask, current_va, buf_start, buf_end, bytes_copied);
12047
12048 if (recurse_result != KERN_SUCCESS) {
12049 return recurse_result;
12050 }
12051 }
12052 }
12053
12054 return KERN_SUCCESS;
12055 }
12056
12057 kern_return_t
12058 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12059 {
12060 if (not_in_kdp) {
12061 panic("pmap_dump_page_tables must only be called from kernel debugger context");
12062 }
12063 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12064 level_mask, pmap->min, bufp, buf_end, bytes_copied);
12065 }
12066
12067 #else /* DEVELOPMENT || DEBUG */
12068
12069 kern_return_t
12070 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
12071 unsigned int level_mask __unused, size_t *bytes_copied __unused)
12072 {
12073 return KERN_NOT_SUPPORTED;
12074 }
12075 #endif /* !(DEVELOPMENT || DEBUG) */
12076
12077
12078 #ifdef CONFIG_XNUPOST
12079 static volatile bool pmap_test_took_fault = false;
12080
12081 static bool
12082 pmap_test_fault_handler(arm_saved_state_t * state)
12083 {
12084 bool retval = false;
12085 uint64_t esr = get_saved_state_esr(state);
12086 esr_exception_class_t class = ESR_EC(esr);
12087 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
12088
12089 if ((class == ESR_EC_DABORT_EL1) &&
12090 ((fsc == FSC_PERMISSION_FAULT_L3)
12091 || (fsc == FSC_ACCESS_FLAG_FAULT_L3)
12092 || (fsc == FSC_TRANSLATION_FAULT_L0))) {
12093 pmap_test_took_fault = true;
12094 /* Return to the instruction immediately after the faulting access. */
12095 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12096 retval = true;
12097 }
12098
12099 return retval;
12100 }
12101
12102 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
12103 static NOKASAN bool
12104 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
12105 {
12106 pmap_t old_pmap = NULL;
12107 thread_t thread = current_thread();
12108
12109 pmap_test_took_fault = false;
12110
12111 /*
12112 * We're potentially switching pmaps without using the normal thread
12113 * mechanism; disable interrupts and preemption to avoid any unexpected
12114 * memory accesses.
12115 */
12116 const boolean_t old_int_state = ml_set_interrupts_enabled(FALSE);
12117 mp_disable_preemption();
12118
12119 if (pmap != NULL) {
12120 old_pmap = current_pmap();
12121 pmap_switch(pmap, thread);
12122
12123 /* Disable PAN; pmap shouldn't be the kernel pmap. */
12124 #if __ARM_PAN_AVAILABLE__
12125 __builtin_arm_wsr("pan", 0);
12126 #endif /* __ARM_PAN_AVAILABLE__ */
12127 }
12128
12129 ml_expect_fault_begin(pmap_test_fault_handler, va);
12130
12131 if (is_write) {
12132 *((volatile uint64_t*)(va)) = 0xdec0de;
12133 } else {
12134 volatile uint64_t tmp = *((volatile uint64_t*)(va));
12135 (void)tmp;
12136 }
12137
12138 /* Save the fault bool, and undo the gross stuff we did. */
12139 bool took_fault = pmap_test_took_fault;
12140 ml_expect_fault_end();
12141
12142 if (pmap != NULL) {
12143 #if __ARM_PAN_AVAILABLE__
12144 __builtin_arm_wsr("pan", 1);
12145 #endif /* __ARM_PAN_AVAILABLE__ */
12146
12147 pmap_switch(old_pmap, thread);
12148 }
12149
12150 mp_enable_preemption();
12151 ml_set_interrupts_enabled(old_int_state);
12152 bool retval = (took_fault == should_fault);
12153 return retval;
12154 }
12155
12156 static bool
12157 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12158 {
12159 bool retval = pmap_test_access(pmap, va, should_fault, false);
12160
12161 if (!retval) {
12162 T_FAIL("%s: %s, "
12163 "pmap=%p, va=%p, should_fault=%u",
12164 __func__, should_fault ? "did not fault" : "faulted",
12165 pmap, (void*)va, (unsigned)should_fault);
12166 }
12167
12168 return retval;
12169 }
12170
12171 static bool
12172 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12173 {
12174 bool retval = pmap_test_access(pmap, va, should_fault, true);
12175
12176 if (!retval) {
12177 T_FAIL("%s: %s, "
12178 "pmap=%p, va=%p, should_fault=%u",
12179 __func__, should_fault ? "did not fault" : "faulted",
12180 pmap, (void*)va, (unsigned)should_fault);
12181 }
12182
12183 return retval;
12184 }
12185
12186 static bool
12187 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12188 {
12189 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12190 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12191
12192 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12193
12194 if (!retval) {
12195 T_FAIL("%s: bits=%u, "
12196 "pa=%p, should_be_set=%u",
12197 __func__, bits,
12198 (void*)pa, should_be_set);
12199 }
12200
12201 return retval;
12202 }
12203
12204 static __attribute__((noinline)) bool
12205 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12206 {
12207 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12208 return retval;
12209 }
12210
12211 static int
12212 pmap_test_test_config(unsigned int flags)
12213 {
12214 T_LOG("running pmap_test_test_config flags=0x%X", flags);
12215 unsigned int map_count = 0;
12216 unsigned long page_ratio = 0;
12217 pmap_t pmap = pmap_create_options(NULL, 0, flags);
12218
12219 if (!pmap) {
12220 panic("Failed to allocate pmap");
12221 }
12222
12223 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12224 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
12225 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
12226 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
12227
12228 if (pmap_page_size <= native_page_size) {
12229 page_ratio = native_page_size / pmap_page_size;
12230 } else {
12231 /*
12232 * We claim to support a page_ratio of less than 1, which is
12233 * not currently supported by the pmap layer; panic.
12234 */
12235 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
12236 "flags=%u",
12237 __func__, native_page_size, pmap_page_size,
12238 flags);
12239 }
12240
12241 if (PAGE_RATIO > 1) {
12242 /*
12243 * The kernel is deliberately pretending to have 16KB pages.
12244 * The pmap layer has code that supports this, so pretend the
12245 * page size is larger than it is.
12246 */
12247 pmap_page_size = PAGE_SIZE;
12248 native_page_size = PAGE_SIZE;
12249 }
12250
12251 /*
12252 * Get two pages from the VM; one to be mapped wired, and one to be
12253 * mapped nonwired.
12254 */
12255 vm_page_t unwired_vm_page = vm_page_grab();
12256 vm_page_t wired_vm_page = vm_page_grab();
12257
12258 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
12259 panic("Failed to grab VM pages");
12260 }
12261
12262 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
12263 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
12264
12265 pmap_paddr_t pa = ptoa(pn);
12266 pmap_paddr_t wired_pa = ptoa(wired_pn);
12267
12268 /*
12269 * We'll start mappings at the second twig TT. This keeps us from only
12270 * using the first entry in each TT, which would trivially be address
12271 * 0; one of the things we will need to test is retrieving the VA for
12272 * a given PTE.
12273 */
12274 vm_map_address_t va_base = pmap_twig_size;
12275 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
12276
12277 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
12278 /*
12279 * Not exactly a functional failure, but this test relies on
12280 * there being a spare PTE slot we can use to pin the TT.
12281 */
12282 panic("Cannot pin translation table");
12283 }
12284
12285 /*
12286 * Create the wired mapping; this will prevent the pmap layer from
12287 * reclaiming our test TTs, which would interfere with this test
12288 * ("interfere" -> "make it panic").
12289 */
12290 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true, PMAP_MAPPING_TYPE_INFER);
12291
12292 T_LOG("Validate that kernel cannot write to SPTM memory.");
12293 pt_entry_t * ptep = pmap_pte(pmap, va_base);
12294 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
12295
12296 /*
12297 * Create read-only mappings of the nonwired page; if the pmap does
12298 * not use the same page size as the kernel, create multiple mappings
12299 * so that the kernel page is fully mapped.
12300 */
12301 for (map_count = 0; map_count < page_ratio; map_count++) {
12302 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)),
12303 VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12304 }
12305
12306 /* Validate that all the PTEs have the expected PA and VA. */
12307 for (map_count = 0; map_count < page_ratio; map_count++) {
12308 ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
12309
12310 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
12311 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
12312 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
12313 }
12314
12315 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
12316 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
12317 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
12318 }
12319 }
12320
12321 T_LOG("Validate that reads to our mapping do not fault.");
12322 pmap_test_read(pmap, va_base, false);
12323
12324 T_LOG("Validate that writes to our mapping fault.");
12325 pmap_test_write(pmap, va_base, true);
12326
12327 T_LOG("Make the first mapping writable.");
12328 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12329
12330 T_LOG("Validate that writes to our mapping do not fault.");
12331 pmap_test_write(pmap, va_base, false);
12332
12333 /*
12334 * For page ratios of greater than 1: validate that writes to the other
12335 * mappings still fault. Remove the mappings afterwards (we're done
12336 * with page ratio testing).
12337 */
12338 for (map_count = 1; map_count < page_ratio; map_count++) {
12339 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
12340 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
12341 }
12342
12343 /* Remove remaining mapping */
12344 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12345
12346 T_LOG("Make the first mapping execute-only");
12347 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12348
12349
12350 T_LOG("Validate that reads to our mapping do not fault.");
12351 pmap_test_read(pmap, va_base, false);
12355
12356 T_LOG("Validate that writes to our mapping fault.");
12357 pmap_test_write(pmap, va_base, true);
12358
12359 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12360
12361 T_LOG("Mark the page unreferenced and unmodified.");
12362 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12363 pmap_test_check_refmod(pa, 0);
12364
12365 /*
12366 * Begin testing the ref/mod state machine. Re-enter the mapping with
12367 * different protection/fault_type settings, and confirm that the
12368 * ref/mod state matches our expectations at each step.
12369 */
12370 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
12371 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12372 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12373
12374 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
12375 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12376 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12377 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12378
12379 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
12380 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12381 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12382 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12383
12384 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
12385 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12386 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12387
12388 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
12389 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12390 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12391 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12392
12393 /*
12394 * Shared memory testing; we'll have two mappings; one read-only,
12395 * one read-write.
12396 */
12397 vm_map_address_t rw_base = va_base;
12398 vm_map_address_t ro_base = va_base + pmap_page_size;
12399
12400 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12401 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12402
12403 /*
12404 * Test that we take faults as expected for unreferenced/unmodified
12405 * pages. Also test the arm_fast_fault interface, to ensure that
12406 * mapping permissions change as expected.
12407 */
12408 T_LOG("!ref/!mod: expect no access");
12409 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12410 pmap_test_read_write(pmap, ro_base, false, false);
12411 pmap_test_read_write(pmap, rw_base, false, false);
12412
12413 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
12414 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
12415 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12416 pmap_test_read_write(pmap, ro_base, true, false);
12417 pmap_test_read_write(pmap, rw_base, true, false);
12418
12419 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
12420 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12421 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12422 pmap_test_read_write(pmap, ro_base, true, false);
12423 pmap_test_read_write(pmap, rw_base, true, true);
12424
12425 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
12426 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12427 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12428 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12429 pmap_test_read_write(pmap, ro_base, true, false);
12430 pmap_test_read_write(pmap, rw_base, true, true);
12431
12432 T_LOG("RW protect both mappings; should not change protections.");
12433 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12434 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12435 pmap_test_read_write(pmap, ro_base, true, false);
12436 pmap_test_read_write(pmap, rw_base, true, true);
12437
12438 T_LOG("Read protect both mappings; RW mapping should become RO.");
12439 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
12440 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
12441 pmap_test_read_write(pmap, ro_base, true, false);
12442 pmap_test_read_write(pmap, rw_base, true, false);
12443
12444 T_LOG("RW protect the page; mappings should not change protections.");
12445 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12446 pmap_page_protect(pn, VM_PROT_ALL);
12447 pmap_test_read_write(pmap, ro_base, true, false);
12448 pmap_test_read_write(pmap, rw_base, true, true);
12449
12450 T_LOG("Read protect the page; RW mapping should become RO.");
12451 pmap_page_protect(pn, VM_PROT_READ);
12452 pmap_test_read_write(pmap, ro_base, true, false);
12453 pmap_test_read_write(pmap, rw_base, true, false);
12454
12455 T_LOG("Validate that disconnect removes all known mappings of the page.");
12456 pmap_disconnect(pn);
12457 if (!pmap_verify_free(pn)) {
12458 T_FAIL("Page still has mappings");
12459 }
12460
12461 #if defined(ARM_LARGE_MEMORY)
12462 #define PMAP_TEST_LARGE_MEMORY_VA (64 * (1ULL << 40)) /* 64 TB */
12463 #if !defined(ARM_LARGE_MEMORY_KERNONLY)
12464
12465 T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY.");
12466 pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12467 pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true);
12468 pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size);
12469 #else /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12470 /* Using kernel-only large memory. Make sure user pmap will fail. */
12471 T_LOG("Expect wired mapping to fault in ARM_LARGE_MEMORY when using KERNONLY.");
12472
12473 /* The mapping should be rejected, it's outside of T0SZ */
12474 const kern_return_t kr = pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa,
12475 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12476 T_QUIET; T_ASSERT_NE_INT(kr, KERN_SUCCESS, NULL);
12477
12478 /* Addressing outside of T0SZ should result in a L0 xlate fault */
12479 const bool did_fault = pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, false, false);
12480 T_QUIET; T_ASSERT(did_fault, NULL);
12481 #endif /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12482 #endif /* ARM_LARGE_MEMORY */
12483
12484 T_LOG("Remove the wired mapping, so we can tear down the test map.");
12485 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
12486 pmap_destroy(pmap);
12487
12488 T_LOG("Release the pages back to the VM.");
12489 vm_page_lock_queues();
12490 vm_page_free(unwired_vm_page);
12491 vm_page_free(wired_vm_page);
12492 vm_page_unlock_queues();
12493
12494 T_LOG("Testing successful!");
12495 return 0;
12496 }
12497
12498 kern_return_t
12499 pmap_test(void)
12500 {
12501 T_LOG("Starting pmap_tests");
12502 int flags = 0;
12503 flags |= PMAP_CREATE_64BIT;
12504
12505 #if __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM
12506 T_LOG("Testing VM_PAGE_SIZE_4KB");
12507 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
12508 T_LOG("Testing VM_PAGE_SIZE_16KB");
12509 pmap_test_test_config(flags);
12510 #else /* __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM */
12511 pmap_test_test_config(flags);
12512 #endif /* __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM */
12513
12514 T_PASS("completed pmap_test successfully");
12515 return KERN_SUCCESS;
12516 }
12517 #endif /* CONFIG_XNUPOST */
12518
12519 /*
12520 * The following function should never make it to RELEASE code, since
12521 * it provides a way to get the PPL to modify text pages.
12522 */
12523 #if DEVELOPMENT || DEBUG
12524
12525 /**
12526 * Forcibly overwrite executable text with an illegal instruction.
12527 *
12528 * @note Only used for xnu unit testing.
12529 *
12530 * @param pa The physical address to corrupt.
12531 *
12532 * @return KERN_SUCCESS on success.
12533 */
12534 kern_return_t
12535 pmap_test_text_corruption(pmap_paddr_t pa __unused)
12536 {
12537 /*
12538 * SPTM TODO: implement an SPTM version of this.
12539 * The physical aperture is owned by the SPTM, and text
12540 * pages have RO physical aperture mappings.
12541 */
12542 return KERN_SUCCESS;
12543 }
12544
12545 #endif /* DEVELOPMENT || DEBUG */
12546
12547