1 /*
2 * Copyright (c) 2011-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/backtrace.h>
42 #include <kern/bits.h>
43 #include <kern/ecc.h>
44 #include <kern/thread.h>
45 #include <kern/sched.h>
46 #include <kern/zalloc.h>
47 #include <kern/zalloc_internal.h>
48 #include <kern/kalloc.h>
49 #include <kern/spl.h>
50 #include <kern/startup.h>
51 #include <kern/trap_telemetry.h>
52 #include <kern/trustcache.h>
53
54 #include <os/overflow.h>
55
56 #include <vm/pmap.h>
57 #include <vm/pmap_cs.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern.h>
60 #include <vm/vm_protos.h>
61 #include <vm/vm_object_internal.h>
62 #include <vm/vm_page_internal.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/cpm_internal.h>
65
66
67 #include <libkern/section_keywords.h>
68 #include <sys/errno.h>
69
70 #include <libkern/amfi/amfi.h>
71 #include <sys/trusted_execution_monitor.h>
72 #include <sys/trust_caches.h>
73 #include <sys/code_signing.h>
74
75 #include <machine/atomic.h>
76 #include <machine/thread.h>
77 #include <machine/lowglobals.h>
78
79 #include <arm/caches_internal.h>
80 #include <arm/cpu_data.h>
81 #include <arm/cpu_data_internal.h>
82 #include <arm/cpu_capabilities.h>
83 #include <arm/cpu_number.h>
84 #include <arm/machine_cpu.h>
85 #include <arm/misc_protos.h>
86 #include <arm/trap_internal.h>
87 #include <arm64/sptm/pmap/pmap_internal.h>
88 #include <arm64/sptm/sptm.h>
89
90 #include <arm64/proc_reg.h>
91 #include <pexpert/arm64/boot.h>
92 #include <arm64/ppl/uat.h>
93 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94 #include <arm64/amcc_rorgn.h>
95 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
96
97 #include <pexpert/device_tree.h>
98
99 #include <san/kasan.h>
100 #include <sys/cdefs.h>
101
102 #if defined(HAS_APPLE_PAC)
103 #include <ptrauth.h>
104 #endif
105
106 #ifdef CONFIG_XNUPOST
107 #include <tests/xnupost.h>
108 #endif
109
110
111 #if HIBERNATION
112 #include <IOKit/IOHibernatePrivate.h>
113 #endif /* HIBERNATION */
114
115 #ifdef __ARM64_PMAP_SUBPAGE_L1__
/**
 * Unlike the PPL, PMAP_ROOT_ALLOC_SIZE for subpage L1 devices is 128 bytes
 * rather than 64 bytes, because of the metadata the SPTM needs in order to
 * track the subpage L1 tables.
 */
121 #define PMAP_ROOT_ALLOC_SIZE SUBPAGE_USER_ROOT_TABLE_SIZE
122 #else
123 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
124 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
125
/*
 * Number of elements in the array [x]. [x] must be a true array (not a pointer
 * or an array-typed function parameter). The argument is parenthesized before
 * indexing so that any expression can be passed safely.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
127
128
129 /**
130 * Per-CPU data used to do setup and post-processing for SPTM calls.
131 * On the setup side, this structure is used to store parameters for batched SPTM operations.
132 * These parameters may be large (upwards of 1K), and given that SPTM calls are generally
133 * issued from preemption-disabled contexts anyway, it's better to store them in per-CPU
134 * data rather than the local stack.
135 * On the post-processing side, this structure exposes a pointer to the SPTM's per-CPU array
136 * of 'prev_ptes', that is the prior value encountered in each PTE at the time of the SPTM's
137 * atomic update of that PTE.
138 */
139 pmap_sptm_percpu_data_t PERCPU_DATA(pmap_sptm_percpu);
140
141 /**
142 * Reference group for global tracking of all outstanding pmap references.
143 */
144 os_refgrp_decl(static, pmap_refgrp, "pmap", NULL);
145
146 /* Boot-arg to enable/disable the use of XNU_KERNEL_RESTRICTED type in SPTM. */
147 TUNABLE(bool, use_xnu_restricted, "xnu_restricted", true);
148
149 extern u_int32_t random(void); /* from <libkern/libkern.h> */
150
151 static bool alloc_asid(pmap_t pmap);
152 static void free_asid(pmap_t pmap);
153 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
154 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
155
/*
 * Page table operations table built from the ASID/TLB/WIMG helpers declared
 * above. The pmap_pt_attr_* definitions that follow reference it via .pta_ops.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
163
/*
 * Per-level page table parameters built from the ARM_16K_TT_L<n>_* constants,
 * indexed by page table level (0-3). Levels 0-2 share the same valid/type
 * encodings; level 3 uses ARM_PTE_TYPE_VALID and ARM_TTE_TYPE_L3BLOCK instead.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size = ARM_16K_TT_L0_SIZE,
		.offmask = ARM_16K_TT_L0_OFFMASK,
		.shift = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_16K_TT_L1_SIZE,
		.offmask = ARM_16K_TT_L1_OFFMASK,
		.shift = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_16K_TT_L2_SIZE,
		.offmask = ARM_16K_TT_L2_OFFMASK,
		.shift = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	/* Level 3: note the different valid_mask and type_block encodings. */
	[3] = {
		.size = ARM_16K_TT_L3_SIZE,
		.offmask = ARM_16K_TT_L3_OFFMASK,
		.shift = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
203
/*
 * Per-level page table parameters built from the ARM_4K_TT_L<n>_* constants,
 * indexed by page table level (0-3). Levels 0-2 share the same valid/type
 * encodings; level 3 uses ARM_PTE_TYPE_VALID and ARM_TTE_TYPE_L3BLOCK instead.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	/* Level 3: note the different valid_mask and type_block encodings. */
	[3] = {
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
243
/*
 * Stage 2 variant of the 4K per-level table. It matches the 4K values above
 * except at level 1, whose index mask is the concatenated 40-bit mask when
 * ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK is defined. Level 0 is present
 * only to keep the array indexable by level; it is marked unused.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size = ARM_4K_TT_L0_SIZE,
		.offmask = ARM_4K_TT_L0_OFFMASK,
		.shift = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size = ARM_4K_TT_L1_SIZE,
		.offmask = ARM_4K_TT_L1_OFFMASK,
		.shift = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		/* Fall back to the ordinary L1 index mask when no concatenated mask exists. */
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size = ARM_4K_TT_L2_SIZE,
		.offmask = ARM_4K_TT_L2_OFFMASK,
		.shift = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	/* Level 3: note the different valid_mask and type_block encodings. */
	[3] = {
		.size = ARM_4K_TT_L3_SIZE,
		.offmask = ARM_4K_TT_L3_OFFMASK,
		.shift = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
287
288 const struct page_table_attr pmap_pt_attr_4k = {
289 .pta_level_info = pmap_table_level_info_4k,
290 .pta_root_level = (T0SZ_BOOT - 16) / 9,
291 #if __ARM_MIXED_PAGE_SIZE__
292 .pta_commpage_level = PMAP_TT_L2_LEVEL,
293 #else /* __ARM_MIXED_PAGE_SIZE__ */
294 #if __ARM_16K_PG__
295 .pta_commpage_level = PMAP_TT_L2_LEVEL,
296 #else /* __ARM_16K_PG__ */
297 .pta_commpage_level = PMAP_TT_L1_LEVEL,
298 #endif /* __ARM_16K_PG__ */
299 #endif /* __ARM_MIXED_PAGE_SIZE__ */
300 .pta_max_level = PMAP_TT_L3_LEVEL,
301 .pta_ops = &native_pt_ops,
302 .ap_ro = ARM_PTE_AP(AP_RORO),
303 .ap_rw = ARM_PTE_AP(AP_RWRW),
304 .ap_rona = ARM_PTE_AP(AP_RONA),
305 .ap_rwna = ARM_PTE_AP(AP_RWNA),
306 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
307 .ap_x = ARM_PTE_PNX,
308 #if __ARM_MIXED_PAGE_SIZE__
309 .pta_tcr_value = TCR_EL1_4KB,
310 #endif /* __ARM_MIXED_PAGE_SIZE__ */
311 .pta_page_size = 4096,
312 .pta_page_shift = 12,
313 .geometry_id = SPTM_PT_GEOMETRY_4K,
314 .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
315 };
316
/**
 * Page table attributes for the 16K page size, kernel flavor. Identical to
 * pmap_pt_attr_16k except for the geometry ID (SPTM_PT_GEOMETRY_16K_KERN) and
 * the valid-VA mask, which is derived from the T1 region rather than T0.
 */
const struct page_table_attr pmap_pt_attr_16k_kern = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size = 16384,
	.pta_page_shift = 14,
	.geometry_id = SPTM_PT_GEOMETRY_16K_KERN,
	/* T1 region mask, unlike the T0 mask used by pmap_pt_attr_16k. */
	.pta_va_valid_mask = ARM_PTE_T1_REGION_MASK(TCR_EL1_16KB),
};
337
/**
 * Page table attributes for the 16K page size (pta_page_size 16384,
 * pta_page_shift 14), with the valid-VA mask derived from the T0 region.
 * This is the native attribute set when __ARM_16K_PG__ is set.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size = 16384,
	.pta_page_shift = 14,
	.geometry_id = SPTM_PT_GEOMETRY_16K,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
};
358
359 #if __ARM_16K_PG__
360 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
361 #else /* !__ARM_16K_PG__ */
362 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
363 #endif /* !__ARM_16K_PG__ */
364
365
366 #if DEVELOPMENT || DEBUG
367 int vm_footprint_suspend_allowed = 1;
368
369 extern int pmap_ledgers_panic;
370 extern int pmap_ledgers_panic_leeway;
371
372 #endif /* DEVELOPMENT || DEBUG */
373
374 #if DEVELOPMENT || DEBUG
375 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
376 (current_thread()->pmap_footprint_suspended)
377 #else /* DEVELOPMENT || DEBUG */
378 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
379 #endif /* DEVELOPMENT || DEBUG */
380
381 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
382
383
384 /* Keeps track of whether the pmap has been bootstrapped */
385 SECURITY_READ_ONLY_LATE(bool) pmap_bootstrapped = false;
386
/*
 * Represents a TLB range that will be flushed before returning from the pmap.
 * Used by phys_attribute_clear_range to defer flushing pages in this range until
 * the end of the operation, and to accumulate batched operations for submission
 * to the SPTM as a performance optimization.
 */
typedef struct pmap_tlb_flush_range {
	/* Address space in which the flush region resides. */
	pmap_t ptfr_pmap;

	/* Page-aligned beginning of the flush region. */
	vm_map_address_t ptfr_start;

	/* Page-aligned, non-inclusive end of the flush region. */
	vm_map_address_t ptfr_end;

	/**
	 * Address of the current PTE position in ptfr_pmap's [ptfr_start, ptfr_end) region.
	 * This is meant to be set up by the caller of pmap_page_protect_options_with_flush_range()
	 * or arm_force_fast_fault_with_flush_range(), and used by those functions to determine
	 * when a given mapping can be added to the SPTM's per-CPU region templates array vs.
	 * the more complex task of adding it to the disjoint ops array.
	 */
	pt_entry_t *current_ptep;

	/**
	 * Starting VA for any not-yet-submitted per-CPU region templates. This is meant to be
	 * set up by the caller of pmap_page_protect_options_with_flush_range() or
	 * arm_force_fast_fault_with_flush_range() and used by pmap_multipage_op_submit_region()
	 * when issuing the SPTM call to purge any pending region ops.
	 */
	vm_map_address_t pending_region_start;

	/**
	 * Number of entries in the per-CPU SPTM region templates array which have not
	 * yet been submitted to the SPTM.
	 */
	unsigned int pending_region_entries;

	/**
	 * Indicates whether at least one region entry was added to the per-CPU region ops
	 * array since the last time this field was checked. The caller is expected to
	 * clear it after checking.
	 */
	bool region_entry_added;

	/**
	 * Marker for the current paddr "header" entry in the per-CPU SPTM disjoint ops array.
	 * This field is intended to be modified only by pmap_multipage_op_submit_disjoint()
	 * and pmap_multipage_op_add_page(), and should be treated as opaque by callers
	 * of those functions.
	 */
	sptm_update_disjoint_multipage_op_t *current_header;

	/**
	 * Position in the per-CPU SPTM ops array of the first ordinary
	 * sptm_disjoint_op_t entry following [current_header]. This is the starting
	 * point at which mappings should be inserted for the page described by
	 * [current_header].
	 */
	unsigned int current_header_first_mapping_index;

	/**
	 * Number of entries in the per-CPU SPTM disjoint ops array, including paddr headers,
	 * which have not yet been submitted to the SPTM.
	 */
	unsigned int pending_disjoint_entries;

	/**
	 * This field is used by the preemption check interval logic on the
	 * phys_attribute_clear_range() path to determine when sufficient
	 * forward progress has been made to check for and (if necessary)
	 * handle pending preemption.
	 */
	unsigned int processed_entries;

	/**
	 * Indicates whether the top-level caller needs to flush the TLB for
	 * the region in [ptfr_pmap] described by [ptfr_start, ptfr_end).
	 * This will be set if the SPTM indicates that it needed to alter
	 * any valid mapping within this region and SPTM_UPDATE_DEFER_TLBI
	 * was passed to the relevant SPTM call(s).
	 */
	bool ptfr_flush_needed;
} pmap_tlb_flush_range_t;
472
473
474
475 /* Virtual memory region for early allocation */
476 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
477 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
478 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
479
480 extern uint8_t bootstrap_pagetables[];
481
482 extern unsigned int not_in_kdp;
483
484 extern vm_offset_t first_avail;
485
486 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
487 extern vm_offset_t virtual_space_end; /* End of kernel address space */
488 extern vm_offset_t static_memory_end;
489
490 extern const vm_map_address_t physmap_base;
491 extern const vm_map_address_t physmap_end;
492
493 extern int maxproc, hard_maxproc;
494
495 extern bool sdsb_io_rgns_present;
496
497 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
498 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
499
500 /* The number of address bits one TTBR can cover. */
501 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
502
503 /*
504 * The bounds on our TTBRs. These are for sanity checking that
505 * an address is accessible by a TTBR before we attempt to map it.
506 */
507
508 /* The level of the root of a page table. */
509 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
510
511 /* The number of entries in the root TT of a page table. */
512 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
513
514 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
515 const pmap_t kernel_pmap = &kernel_pmap_store;
516
517 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
518
519 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
520 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
521
/*
 * Node of a singly linked list: each entry holds nothing but a pointer to the
 * next entry.
 */
typedef struct tt_free_entry {
	struct tt_free_entry *next;
} tt_free_entry_t;
525
526 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
527 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
528 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
529 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
530 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
531 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
532 _Atomic unsigned int inuse_iommu_pages_count[SPTM_IOMMUS_N_IDS] = {0}; /* number of active pages for each IOMMU class */
533
534 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
535 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
536
537 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
539
540 /* Lock group used for all pmap object locks. */
541 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
542
543 #if DEVELOPMENT || DEBUG
544 int nx_enabled = 1; /* enable no-execute protection */
545 int allow_data_exec = 0; /* No apps may execute data */
546 int allow_stack_exec = 0; /* No apps may execute from the stack */
547 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
548 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
549 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
550 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
551 #else /* DEVELOPMENT || DEBUG */
552 const int nx_enabled = 1; /* enable no-execute protection */
553 const int allow_data_exec = 0; /* No apps may execute data */
554 const int allow_stack_exec = 0; /* No apps may execute from the stack */
555 #endif /* DEVELOPMENT || DEBUG */
556
557
558 #if MACH_ASSERT
559 static void pmap_check_ledgers(pmap_t pmap);
560 #else
561 static inline void
pmap_check_ledgers(__unused pmap_t pmap)562 pmap_check_ledgers(__unused pmap_t pmap)
563 {
564 }
565 #endif /* MACH_ASSERT */
566
567 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
568
569 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
570 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
571
572 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
573
574 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
575
576 /* end of shared region + 512MB for various purposes */
577 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
578 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
579 "Minimum address space size outside allowable range");
580
581 // Max offset is 15.375GB for devices with "large" memory config
582 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
583 // Max offset is 11.375GB for devices with "small" memory config
584 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
585
586
587 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
588 "Large device address space size outside allowable range");
589 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
590 "Small device address space size outside allowable range");
591
592 # ifdef XNU_TARGET_OS_OSX
593 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
594 # else
595 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
596 # endif
597
598 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
599 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
600 #else
601 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
602 #endif
603
604 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
605 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
606 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
607 #if !HAS_16BIT_ASID
608 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
609 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
610 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
611 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
612 #else
613 static uint16_t last_allocated_asid = 0;
614 #endif /* !HAS_16BIT_ASID */
615
616
617 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table;
618 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table;
619 #if __ARM_MIXED_PAGE_SIZE__
620 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table;
621 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table;
622 #endif
623 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_data_pa = 0;
624 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_text_pa = 0;
625 SECURITY_READ_ONLY_LATE(static vm_map_address_t) commpage_text_user_va = 0;
626 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_ro_data_pa = 0;
627
628
629 #if (DEVELOPMENT || DEBUG)
630 /* Caches whether the SPTM sysreg API has been enabled by the SPTM */
631 SECURITY_READ_ONLY_LATE(static bool) sptm_sysreg_available = false;
632 #endif /* (DEVELOPMENT || DEBUG) */
633
634 /* PTE Define Macros */
635
636 #ifndef SPTM_PTE_IN_FLIGHT_MARKER
637 /* SPTM TODO: Get rid of this once we export SPTM_PTE_IN_FLIGHT_MARKER from the SPTM. */
638 #define SPTM_PTE_IN_FLIGHT_MARKER 0x80U
639 #endif /* SPTM_PTE_IN_FLIGHT_MARKER */
640
641 /**
642 * Determine whether a PTE has been marked as compressed. This function also panics if
643 * the PTE contains bits that shouldn't be present in a compressed PTE, which is most of them.
644 *
645 * @param pte the PTE contents to check
646 * @param ptep the address of the PTE contents, for diagnostic purposes only
647 *
648 * @return true if the PTE is compressed, false otherwise
649 */
650 static inline bool
pte_is_compressed(pt_entry_t pte,pt_entry_t * ptep)651 pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep)
652 {
653 const bool compressed = (!pte_is_valid(pte) && (pte & ARM_PTE_COMPRESSED));
654 /**
655 * Check for bits that shouldn't be present in a compressed PTE. This is everything except the
656 * compressed/compressed-alt bits, as well as the SPTM's in-flight marker which may be set while
657 * the SPTM is in the process of flushing the TLBs after marking a previously-valid PTE as
658 * compressed.
659 */
660 if (__improbable(compressed && (pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER)))) {
661 panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?",
662 ptep, pte, pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER));
663 }
664 return compressed;
665 }
666
/* True if the wired field of [pte] holds exactly the ARM_PTE_WIRED encoding. */
#define pte_is_wired(pte) \
	(ARM_PTE_WIRED == ((pte) & ARM_PTE_WIRED_MASK))

/* True if every bit of ARM_PTE_WRITEABLE is set in [pte]. */
#define pte_was_writeable(pte) \
	(ARM_PTE_WRITEABLE == ((pte) & ARM_PTE_WRITEABLE))

/* Set ARM_PTE_WRITEABLE in the lvalue [pte] if [was_writeable] is non-zero, else clear it. */
#define pte_set_was_writeable(pte, was_writeable) \
	do { \
	        if (!(was_writeable)) { \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        } else { \
	                (pte) |= ARM_PTE_WRITEABLE; \
	        } \
	} while (0)
681
682 /**
683 * Updated wired-mapping accountings in the PTD and ledger.
684 *
685 * @param pmap The pmap against which to update accounting
686 * @param pte_p The PTE whose wired state is being changed
687 * @param wired Indicates whether the PTE is being wired or unwired.
688 */
689 static inline void
pte_update_wiredcnt(pmap_t pmap,pt_entry_t * pte_p,boolean_t wired)690 pte_update_wiredcnt(pmap_t pmap, pt_entry_t *pte_p, boolean_t wired)
691 {
692 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
693 unsigned short *ptd_wiredcnt_ptr = &(ptep_get_info(pte_p)->wiredcnt);
694 if (wired) {
695 if (__improbable(os_atomic_inc_orig(ptd_wiredcnt_ptr, relaxed) == UINT16_MAX)) {
696 panic("pmap %p (pte %p): wired count overflow", pmap, pte_p);
697 }
698 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
699 } else {
700 if (__improbable(os_atomic_dec_orig(ptd_wiredcnt_ptr, relaxed) == 0)) {
701 panic("pmap %p (pte %p): wired count underflow", pmap, pte_p);
702 }
703 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
704 }
705 }
706
707 /*
708 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
709 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
710 * will observe the updated PTE.
711 */
/*
 * Deliberately no trailing ';' in the expansion: the caller's own ';' ends the
 * statement, so "if (x) FLUSH_PTE(); else ..." parses as intended.
 */
#define FLUSH_PTE() \
	__builtin_arm_dmb(DMB_ISH)
714
715 /*
716 * Synchronize updates to PTEs that were previously valid and thus may be cached in
717 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
718 * TLBI. This should only require a store-store barrier, as subsequent accesses in
719 * program order will not issue until the DSB completes. Prior loads may be reordered
720 * after the barrier, but their behavior should not be materially affected by the
721 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
722 * matter for loads until the access is re-driven well after the TLB update is
723 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
724 * we should be in a position to handle access faults. For "voluntary" PTE access
725 * restriction due to unmapping or protection, the decision to restrict access should
726 * have a data dependency on prior loads in order to avoid a data race.
727 */
/*
 * Deliberately no trailing ';' in the expansion: the caller's own ';' ends the
 * statement, so "if (x) FLUSH_PTE_STRONG(); else ..." parses as intended.
 */
#define FLUSH_PTE_STRONG() \
	__builtin_arm_dsb(DSB_ISHST)
730
731 /**
732 * Write enough page table entries to map a single VM page. On systems where the
733 * VM page size does not match the hardware page size, multiple page table
734 * entries will need to be written.
735 *
736 * @note This function does not emit a barrier to ensure these page table writes
737 * have completed before continuing. This is commonly needed. In the case
738 * where a DMB or DSB barrier is needed, then use the write_pte() and
739 * write_pte_strong() functions respectively instead of this one.
740 *
741 * @param ptep Pointer to the first page table entry to update.
742 * @param pte The value to write into each page table entry. In the case that
743 * multiple PTEs are updated to a non-empty value, then the address
744 * in this value will automatically be incremented for each PTE
745 * write.
746 */
747 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)748 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
749 {
750 /**
751 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
752 * systems, which is why it's checked at runtime instead of compile time.
753 * The "unreachable" warning needs to be suppressed because it still is a
754 * compile time constant on some systems.
755 */
756 __unreachable_ok_push
757 if (TEST_PAGE_RATIO_4) {
758 if (((uintptr_t)ptep) & 0x1f) {
759 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
760 __func__, ptep, (void*)pte);
761 }
762
763 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
764 /**
765 * If we're writing an empty/compressed PTE value, then don't
766 * auto-increment the address for each PTE write.
767 */
768 *ptep = pte;
769 *(ptep + 1) = pte;
770 *(ptep + 2) = pte;
771 *(ptep + 3) = pte;
772 } else {
773 *ptep = pte;
774 *(ptep + 1) = pte | 0x1000;
775 *(ptep + 2) = pte | 0x2000;
776 *(ptep + 3) = pte | 0x3000;
777 }
778 } else {
779 *ptep = pte;
780 }
781 __unreachable_ok_pop
782 }
783
784 /**
785 * Writes enough page table entries to map a single VM page and then ensures
786 * those writes complete by executing a Data Memory Barrier.
787 *
788 * @note The DMB issued by this function is not strong enough to protect against
789 * TLB invalidates from being reordered above the PTE writes. If a TLBI
790 * instruction is going to immediately be called after this write, it's
791 * recommended to call write_pte_strong() instead of this function.
792 *
793 * See the function header for write_pte_fast() for more details on the
794 * parameters.
795 */
796 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)797 write_pte(pt_entry_t *ptep, pt_entry_t pte)
798 {
799 write_pte_fast(ptep, pte);
800 FLUSH_PTE();
801 }
802
803 /**
804 * Retrieve the pmap structure for the thread running on the current CPU.
805 */
806 pmap_t
current_pmap()807 current_pmap()
808 {
809 const pmap_t current = vm_map_pmap(current_thread()->map);
810 assert(current != NULL);
811 return current;
812 }
813
814 #if DEVELOPMENT || DEBUG
815
816 /*
817 * Trace levels are controlled by a bitmask in which each
818 * level can be enabled/disabled by the (1<<level) position
819 * in the boot arg
820 * Level 0: PPL extension functionality
821 * Level 1: pmap lifecycle (create/destroy/switch)
822 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
823 * Level 3: internal state management (attributes/fast-fault)
824 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
825 */
826
827 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
828
/*
 * Invoke KDBG_RELEASE(...) if bit [level] is set in pmap_trace_mask.
 * The body is wrapped in do/while(0) so that the macro expands to exactly one
 * statement and cannot capture a following "else". The shift is unsigned to
 * match the type of pmap_trace_mask.
 */
#define PMAP_TRACE(level, ...) \
	do { \
	        if (__improbable((1U << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__); \
	        } \
	} while (0)
833 #else /* DEVELOPMENT || DEBUG */
834
835 #define PMAP_TRACE(level, ...)
836
837 #endif /* DEVELOPMENT || DEBUG */
838
839
/*
 * Internal function prototypes (forward declarations).
 *
 * The definitions of these functions are outside this part of the file; the
 * declarations here only make the names visible to code that precedes them.
 */

static vm_map_size_t pmap_user_va_size(pmap_t pmap);

static void pmap_set_reference(ppnum_t pn);

pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);

static kern_return_t pmap_expand(
	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);

static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t);

static tt_entry_t *pmap_tt1_allocate(pmap_t, vm_size_t, uint8_t);

static void pmap_tt1_deallocate(pmap_t, tt_entry_t *, vm_size_t);

static kern_return_t pmap_tt_allocate(
	pmap_t, tt_entry_t **, pt_desc_t **, unsigned int, unsigned int);

/*
 * Externally-visible copies of compile-time pmap sizes.
 * NOTE(review): presumably consumed by debugging/introspection tooling that
 * cannot evaluate the macros directly -- confirm against the users.
 */
const unsigned int arm_hardware_page_size = ARM_PGBYTES;
const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;

static void pmap_unmap_commpage(
	pmap_t pmap);

static boolean_t
pmap_is_64bit(pmap_t);


static void pmap_flush_tlb_for_paddr_async(pmap_paddr_t);

static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);

static boolean_t arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	uintptr_t pvh,
	pt_entry_t *pte_p,
	pp_attr_t attrs_to_clear);

static void pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	tt_entry_t *ttep,
	unsigned int level,
	bool pmap_locked);


/*
 * Temporary prototypes, while we wait for pmap_enter to move to taking an
 * address instead of a page number.
 *
 * pmap_enter() takes a physical page number (ppnum_t), whereas the *_addr()
 * variants below take a full physical address (pmap_paddr_t).
 */
kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_address_t v,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	pmap_mapping_type_t mapping_type);

static kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	pmap_mapping_type_t mapping_type);

kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	pmap_mapping_type_t mapping_type);

/* Entry point of the pmap self-test; only declared when CONFIG_XNUPOST is set. */
#ifdef CONFIG_XNUPOST
kern_return_t pmap_test(void);
#endif /* CONFIG_XNUPOST */
934
/*
 * Declarations of the pmap entry points, one PMAP_SUPPORT_PROTOTYPES()
 * invocation per function.  Each invocation supplies the return type, the
 * function name, the parenthesized parameter list and an index constant.
 *
 * NOTE(review): PMAP_SUPPORT_PROTOTYPES is defined outside this part of the
 * file; presumably it expands to the prototype of the corresponding
 * "_internal" implementation and associates it with the given index --
 * confirm against the macro definition.
 */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	arm_fast_fault, (pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	bool from_user), ARM_FAST_FAULT_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	boolean_t,
	arm_force_fast_fault, (ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options), ARM_FORCE_FAST_FAULT_INDEX);

/* Plain forward declaration (not routed through PMAP_SUPPORT_PROTOTYPES). */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	locked_pvh_t *locked_pvh,
	pp_attr_t bits_to_clear,
	pmap_tlb_flush_range_t *flush_range);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_batch_set_cache_attributes, (
		const unified_page_list_t * page_list,
		unsigned int cacheattr,
		bool update_attr_table), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_change_wiring, (pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	pmap_t,
	pmap_create_options, (ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t * kr), PMAP_CREATE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_enter_options, (pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	pmap_mapping_type_t mapping_type), PMAP_ENTER_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	pmap_paddr_t,
	pmap_find_pa, (pmap_t pmap,
	addr64_t va), PMAP_FIND_PA_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	boolean_t,
	pmap_is_empty, (pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	unsigned int,
	pmap_map_cpu_windows_copy, (ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_ro_zone_memcpy, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	uint64_t,
	pmap_ro_zone_atomic_op, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_ro_zone_bzero, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_nest, (pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size), PMAP_NEST_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_page_protect_options, (ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	pmap_protect_options, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	void *args), PMAP_PROTECT_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_query_page_info, (pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	mach_vm_size_t,
	pmap_query_resident, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	pmap_remove_options, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options), PMAP_REMOVE_OPTIONS_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_cache_attributes, (ppnum_t pn,
	unsigned int cacheattr,
	bool update_attr_table), PMAP_SET_CACHE_ATTRIBUTES_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_update_compressor_page, (ppnum_t pn,
	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);

/* Only declared on assert-enabled builds. */
#if MACH_ASSERT
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_process, (pmap_t pmap,
	int pid,
	char *procname), PMAP_SET_PROCESS_INDEX);
#endif

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_unnest_options, (pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	phys_attribute_set, (ppnum_t pn,
	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	phys_attribute_clear, (ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);

/* The ranged variant is only declared when __ARM_RANGE_TLBI__ is set. */
#if __ARM_RANGE_TLBI__
PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	phys_attribute_clear_range, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
#endif /* __ARM_RANGE_TLBI__ */


PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_switch, (pmap_t pmap, thread_t thread), PMAP_SWITCH_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);

#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_trim, (pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size), PMAP_TRIM_INDEX);

/* User pointer signing/authentication is only declared when HAS_APPLE_PAC is set. */
#if HAS_APPLE_PAC
PMAP_SUPPORT_PROTOTYPES(
	void *,
	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
PMAP_SUPPORT_PROTOTYPES(
	void *,
	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
#endif /* HAS_APPLE_PAC */


/* The public wrapper is declared explicitly here, ahead of its PMAP_SUPPORT_PROTOTYPES entry. */
void pmap_footprint_suspend(vm_map_t    map,
    boolean_t   suspend);
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_footprint_suspend, (vm_map_t map,
	boolean_t suspend),
	PMAP_FOOTPRINT_SUSPEND_INDEX);
1195
1196
1197
1198
1199
/*
 * The low global vector page is mapped at a fixed alias.
 * Since the page size is 16k for H8 and newer we map the globals to a 16k
 * aligned address. Readers of the globals (e.g. lldb, panic server) need
 * to check both addresses anyway for backward compatibility. So for now
 * we leave H6 and H7 where they were.
 */
#if (ARM_PGSHIFT == 14)
/* 16K pages: the alias sits 0x4000 (one 16K page) above the base address. */
#define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
#else
/* Other page sizes keep the historical offset of 0x2000. */
#define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
#endif
1212
/**
 * Credit `bytes` to the tkm_private entry of the ledger associated with the
 * given pmap.
 *
 * @param pmap The pmap whose ledger is credited.
 * @param bytes The amount to credit.
 */
static inline void
PMAP_ZINFO_PALLOC(
	pmap_t pmap, int bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
}
1219
/**
 * Debit `bytes` from the tkm_private entry of the ledger associated with the
 * given pmap; the counterpart of PMAP_ZINFO_PALLOC().
 *
 * @param pmap The pmap whose ledger is debited.
 * @param bytes The amount to debit.
 */
static inline void
PMAP_ZINFO_PFREE(
	pmap_t pmap,
	int bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
}
1227
1228 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1229 pmap_tt_ledger_credit(
1230 pmap_t pmap,
1231 vm_size_t size)
1232 {
1233 if (pmap != kernel_pmap) {
1234 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1235 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1236 }
1237 }
1238
1239 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1240 pmap_tt_ledger_debit(
1241 pmap_t pmap,
1242 vm_size_t size)
1243 {
1244 if (pmap != kernel_pmap) {
1245 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1246 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1247 }
1248 }
1249
1250 static inline void
pmap_update_plru(uint16_t asid_index __unused)1251 pmap_update_plru(uint16_t asid_index __unused)
1252 {
1253 #if !HAS_16BIT_ASID
1254 if (__probable(pmap_asid_plru)) {
1255 unsigned plru_index = asid_index >> 6;
1256 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1257 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1258 asid_plru_bitmap[plru_index] = ((plru_index == 0) ? ~1ULL : UINT64_MAX);
1259 }
1260 }
1261 #endif /* !HAS_16BIT_ASID */
1262 }
1263
/**
 * Allocate a virtual ASID for a pmap and store it in pmap->asid.
 *
 * A set bit in asid_bitmap denotes a free virtual ASID; the chosen bit is
 * cleared here and set again by free_asid(). The search and the bitmap update
 * run with asid_lock held.
 *
 * Without HAS_16BIT_ASID, and when pmap_asid_plru is enabled, the search first
 * consults the pseudo-LRU state: the pLRU word with the lowest generation is
 * selected and the matching words of asid_bitmap are probed. With
 * HAS_16BIT_ASID the search starts from last_allocated_asid instead. If
 * neither approach yields a candidate, the lowest free bit of asid_bitmap is
 * used.
 *
 * @param pmap The pmap that receives the allocated ASID.
 *
 * @return true if an ASID was allocated, false if no free ASID exists.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		/* Find the pLRU word with the lowest generation number. */
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Probe every asid_bitmap word that sits at the same offset as the
		 * selected pLRU word, stepping by one hardware ASID space
		 * (MAX_HW_ASIDS bits) at a time.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += (MAX_HW_ASIDS >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No free bit anywhere in asid_bitmap. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Claim the ASID while still holding the lock. */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
	/* MAX_HW_ASIDS is a multiple of 64, so the mask keeps the low bits of the virtual ASID. */
	const uint16_t hw_asid = (uint16_t)(vasid & (MAX_HW_ASIDS - 1));
#if HAS_16BIT_ASID
	last_allocated_asid = hw_asid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	assert(hw_asid != 0); // Should never alias kernel ASID
	pmap->asid = (uint16_t)vasid;
	pmap_update_plru(hw_asid);
	return true;
}
1333
1334 static void
free_asid(pmap_t pmap)1335 free_asid(pmap_t pmap)
1336 {
1337 const uint16_t vasid = os_atomic_xchg(&pmap->asid, 0, relaxed);
1338 if (__improbable(vasid == 0)) {
1339 return;
1340 }
1341
1342 #if !HAS_16BIT_ASID
1343 if (pmap_asid_plru) {
1344 const uint16_t hw_asid = vasid & (MAX_HW_ASIDS - 1);
1345 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1346 }
1347 #endif /* !HAS_16BIT_ASID */
1348 pmap_simple_lock(&asid_lock);
1349 assert(!bitmap_test(&asid_bitmap[0], vasid));
1350 bitmap_set(&asid_bitmap[0], vasid);
1351 pmap_simple_unlock(&asid_lock);
1352 }
1353
1354
/**
 * Function wrapper around pa_valid().
 *
 * @param addr The physical address to check.
 *
 * @return The result of pa_valid(addr).
 */
boolean_t
pmap_valid_address(
	pmap_paddr_t addr)
{
	return pa_valid(addr);
}
1361
1362
1363
1364
1365
1366
1367 /*
1368 * Map memory at initialization. The physical addresses being
1369 * mapped are not managed and are never unmapped.
1370 *
1371 * For now, VM is already on, we only need to map the
1372 * specified memory.
1373 */
1374 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1375 pmap_map(
1376 vm_map_address_t virt,
1377 vm_offset_t start,
1378 vm_offset_t end,
1379 vm_prot_t prot,
1380 unsigned int flags)
1381 {
1382 kern_return_t kr;
1383 vm_size_t ps;
1384
1385 ps = PAGE_SIZE;
1386 while (start < end) {
1387 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1388 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1389
1390 if (kr != KERN_SUCCESS) {
1391 panic("%s: failed pmap_enter, "
1392 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1393 __FUNCTION__,
1394 (void *) virt, (void *) start, (void *) end, prot, flags);
1395 }
1396
1397 virt += ps;
1398 start += ps;
1399 }
1400
1401
1402 return virt;
1403 }
1404
#if HAS_SPTM_SYSCTL
/*
 * When true, pmap_force_pte_kernel_ro_if_protected_io() returns its PTE
 * template unmodified instead of checking the frame type.
 */
bool disarm_protected_io = false;
#endif /* HAS_SPTM_SYSCTL */
1408
1409 /**
1410 * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type.
1411 *
1412 * @param paddr The physical address of the page.
1413 * @param tmplate The PTE value to be evaluated.
1414 *
1415 * @return A new PTE value with permission bits modified.
1416 */
1417 static inline
1418 pt_entry_t
pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr,pt_entry_t tmplate)1419 pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate)
1420 {
1421 #if HAS_SPTM_SYSCTL
1422 if (__improbable(disarm_protected_io)) {
1423 /* Make sure disarm_protected_io is read before its counterpart in SPTM */
1424 os_atomic_thread_fence(acquire);
1425 return tmplate;
1426 }
1427
1428 #endif /* HAS_SPTM_SYSCTL */
1429
1430 /**
1431 * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade
1432 * the mapping to RO. This is required because IOKit relies on this
1433 * behavior currently in the PPL.
1434 */
1435 const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
1436 if (frame_type == XNU_PROTECTED_IO) {
1437 /* SPTM to own the page by converting KERN_RW to PPL_RW. */
1438 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1439 switch (xprr_perm) {
1440 case XPRR_KERN_RO_PERM:
1441 break;
1442 case XPRR_KERN_RW_PERM:
1443 tmplate &= ~ARM_PTE_XPRR_MASK;
1444 tmplate |= xprr_perm_to_pte(XPRR_KERN_RO_PERM);
1445 break;
1446 default:
1447 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1448 }
1449 }
1450
1451 return tmplate;
1452 }
1453
1454 vm_map_address_t
pmap_map_bd_with_options(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,int32_t options)1455 pmap_map_bd_with_options(
1456 vm_map_address_t virt,
1457 vm_offset_t start,
1458 vm_offset_t end,
1459 vm_prot_t prot,
1460 int32_t options)
1461 {
1462 pt_entry_t tmplate;
1463 vm_map_address_t vaddr;
1464 vm_offset_t paddr;
1465 pt_entry_t mem_attr;
1466
1467 if (__improbable(start & PAGE_MASK)) {
1468 panic("%s: start 0x%lx is not page aligned", __func__, start);
1469 }
1470
1471 if (__improbable(end & PAGE_MASK)) {
1472 panic("%s: end 0x%lx is not page aligned", __func__, end);
1473 }
1474
1475 if (__improbable(!gDramBase || !gDramSize)) {
1476 panic("%s: gDramBase/gDramSize not initialized", __func__);
1477 }
1478
1479 bool first_page_is_dram = is_dram_addr(start);
1480 for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
1481 if (first_page_is_dram != is_dram_addr(pa)) {
1482 panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
1483 __func__, pa, first_page_is_dram ? "is not" : "is");
1484 }
1485 }
1486
1487 switch (options & PMAP_MAP_BD_MASK) {
1488 case PMAP_MAP_BD_WCOMB:
1489 if (is_dram_addr(start)) {
1490 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1491 } else {
1492 #if HAS_FEAT_XS
1493 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
1494 #else /* HAS_FEAT_XS */
1495 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1496 #endif /* HAS_FEAT_XS */
1497 #if DEBUG || DEVELOPMENT
1498 pmap_wcrt_on_non_dram_count_increment_atomic();
1499 #endif /* DEBUG || DEVELOPMENT */
1500 }
1501 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1502 break;
1503 case PMAP_MAP_BD_POSTED:
1504 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1505 break;
1506 case PMAP_MAP_BD_POSTED_REORDERED:
1507 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1508 break;
1509 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1510 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1511 break;
1512 default:
1513 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1514 break;
1515 }
1516
1517 tmplate = ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1518 mem_attr | ARM_PTE_TYPE_VALID | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1519
1520 #if __ARM_KERNEL_PROTECT__
1521 tmplate |= ARM_PTE_NG;
1522 #endif /* __ARM_KERNEL_PROTECT__ */
1523
1524 vaddr = virt;
1525 paddr = start;
1526 while (paddr < end) {
1527 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, vaddr, pmap_force_pte_kernel_ro_if_protected_io(paddr, tmplate) | pa_to_pte(paddr));
1528 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1529
1530 vaddr += PAGE_SIZE;
1531 paddr += PAGE_SIZE;
1532 }
1533
1534 return vaddr;
1535 }
1536
/*
 *      Back-door routine for mapping kernel VM at initialization.
 *      Useful for mapping memory outside the range
 *      [vm_first_phys, vm_last_phys] (i.e., devices).
 *      Otherwise like pmap_map.
 *
 *      Equivalent to pmap_map_bd_with_options() with options == 0, and
 *      returns whatever that call returns.
 */
vm_map_address_t
pmap_map_bd(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot)
{
	return pmap_map_bd_with_options(virt, start, end, prot, 0);
}
1552
1553 /*
1554 * Back-door routine for mapping kernel VM at initialization.
1555 * Useful for mapping memory specific physical addresses in early
1556 * boot (i.e., before kernel_map is initialized).
1557 *
1558 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1559 */
1560
1561 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1562 pmap_map_high_window_bd(
1563 vm_offset_t pa_start,
1564 vm_size_t len,
1565 vm_prot_t prot)
1566 {
1567 pt_entry_t *ptep, pte;
1568 vm_map_address_t va_start = VREGION1_START;
1569 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1570 vm_map_address_t va_end;
1571 vm_map_address_t va;
1572 vm_size_t offset;
1573
1574 offset = pa_start & PAGE_MASK;
1575 pa_start -= offset;
1576 len += offset;
1577
1578 if (len > (va_max - va_start)) {
1579 panic("%s: area too large, "
1580 "pa_start=%p, len=%p, prot=0x%x",
1581 __FUNCTION__,
1582 (void*)pa_start, (void*)len, prot);
1583 }
1584
1585 scan:
1586 for (; va_start < va_max; va_start += PAGE_SIZE) {
1587 ptep = pmap_pte(kernel_pmap, va_start);
1588 assert(!pte_is_compressed(*ptep, ptep));
1589 if (!pte_is_valid(*ptep)) {
1590 break;
1591 }
1592 }
1593 if (va_start > va_max) {
1594 panic("%s: insufficient pages, "
1595 "pa_start=%p, len=%p, prot=0x%x",
1596 __FUNCTION__,
1597 (void*)pa_start, (void*)len, prot);
1598 }
1599
1600 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1601 ptep = pmap_pte(kernel_pmap, va_end);
1602 assert(!pte_is_compressed(*ptep, ptep));
1603 if (pte_is_valid(*ptep)) {
1604 va_start = va_end + PAGE_SIZE;
1605 goto scan;
1606 }
1607 }
1608
1609 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1610 ptep = pmap_pte(kernel_pmap, va);
1611 pte = pa_to_pte(pa_start)
1612 | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1613 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1614 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT)
1615 | ARM_PTE_SH(SH_OUTER_MEMORY);
1616 #if __ARM_KERNEL_PROTECT__
1617 pte |= ARM_PTE_NG;
1618 #endif /* __ARM_KERNEL_PROTECT__ */
1619 __assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, va, pte);
1620 assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1621 }
1622 #if KASAN
1623 kasan_notify_address(va_start, len);
1624 #endif
1625 return va_start;
1626 }
1627
1628 /*
1629 * pmap_get_arm64_prot
1630 *
1631 * return effective armv8 VMSA block protections including
1632 * table AP/PXN/XN overrides of a pmap entry
1633 *
1634 */
1635
1636 uint64_t
pmap_get_arm64_prot(pmap_t pmap,vm_offset_t addr)1637 pmap_get_arm64_prot(
1638 pmap_t pmap,
1639 vm_offset_t addr)
1640 {
1641 tt_entry_t tte = 0;
1642 unsigned int level = 0;
1643 uint64_t effective_prot_bits = 0;
1644 uint64_t aggregate_tte = 0;
1645 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1646 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1647
1648 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1649 tte = *pmap_ttne(pmap, level, addr);
1650
1651 if (!(tte & ARM_TTE_VALID)) {
1652 return 0;
1653 }
1654
1655 if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
1656 /* Block or page mapping; both have the same protection bit layout. */
1657 break;
1658 } else if (tte_is_table(tte)) {
1659 /* All of the table bits we care about are overrides, so just OR them together. */
1660 aggregate_tte |= tte;
1661 }
1662 }
1663
1664 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1665 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1666 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1667
1668 /* Start with the PTE bits. */
1669 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1670
1671 /* Table AP bits mask out block/page AP bits */
1672 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1673
1674 /* XN/PXN bits can be OR'd in. */
1675 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1676 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1677
1678 return effective_prot_bits;
1679 }
1680
/**
 * Helper macros for accessing the "unnested" and "in-progress" bits in
 * pmap->nested_region_unnested_table_bitmap.
 *
 * Each index owns two adjacent bits of the bitmap: the even bit (2 * index)
 * is the "unnested" bit and the odd bit (2 * index + 1) is the "in-progress"
 * bit.
 */
#define UNNEST_BIT(index) ((index) * 2)
#define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
1687
1688 /*
1689 * Bootstrap the system enough to run with virtual memory.
1690 *
1691 * The early VM initialization code has already allocated
1692 * the first CPU's translation table and made entries for
1693 * all the one-to-one mappings to be found there.
1694 *
1695 * We must set up the kernel pmap structures, the
1696 * physical-to-virtual translation lookup tables for the
1697 * physical memory to be managed (between avail_start and
1698 * avail_end).
1699 *
1700 * Map the kernel's code and data, and allocate the system page table.
1701 * Page_size must already be set.
1702 *
1703 * Parameters:
1704 * first_avail first available physical page -
1705 * after kernel page tables
1706 * avail_start PA of first managed physical page
1707 * avail_end PA of last managed physical page
1708 */
1709
1710 void
pmap_bootstrap(vm_offset_t vstart)1711 pmap_bootstrap(
1712 vm_offset_t vstart)
1713 {
1714 vm_map_offset_t maxoffset;
1715
1716 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
1717
1718 #if DEVELOPMENT || DEBUG
1719 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
1720 kprintf("Kernel traces for pmap operations enabled\n");
1721 }
1722 #endif
1723
1724 /*
1725 * Initialize the kernel pmap.
1726 */
1727 #if ARM_PARAMETERIZED_PMAP
1728 kernel_pmap->pmap_pt_attr = &pmap_pt_attr_16k_kern;
1729 #endif /* ARM_PARAMETERIZED_PMAP */
1730 #if HAS_APPLE_PAC
1731 kernel_pmap->disable_jop = 0;
1732 #endif /* HAS_APPLE_PAC */
1733 kernel_pmap->tte = cpu_tte;
1734 kernel_pmap->ttep = cpu_ttep;
1735 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
1736 kernel_pmap->max = UINTPTR_MAX;
1737 os_ref_init_count_raw(&kernel_pmap->ref_count, &pmap_refgrp, 1);
1738 kernel_pmap->nx_enabled = TRUE;
1739 kernel_pmap->is_64bit = TRUE;
1740 #if CONFIG_ROSETTA
1741 kernel_pmap->is_rosetta = FALSE;
1742 #endif
1743
1744 kernel_pmap->nested_region_addr = 0x0ULL;
1745 kernel_pmap->nested_region_size = 0x0ULL;
1746 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
1747 kernel_pmap->type = PMAP_TYPE_KERNEL;
1748
1749 kernel_pmap->asid = 0;
1750
1751 /**
1752 * The kernel pmap lock is no longer needed; init it and then destroy it to
1753 * place it in a known-invalid state that will cause any attempt to use it
1754 * to fail.
1755 */
1756 pmap_lock_init(kernel_pmap);
1757 pmap_lock_destroy(kernel_pmap);
1758
1759 pmap_max_asids = SPTMArgs->num_asids;
1760
1761 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
1762
1763 /**
1764 * Bootstrap the core pmap data structures (e.g., pv_head_table,
1765 * pp_attr_table, etc). This function will use `avail_start` to allocate
1766 * space for these data structures.
1767 * */
1768 pmap_data_bootstrap();
1769
1770 /**
1771 * Don't make any assumptions about the alignment of avail_start before this
1772 * point (i.e., pmap_data_bootstrap() performs allocations).
1773 */
1774 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
1775
1776 const pmap_paddr_t pmap_struct_start = avail_start;
1777
1778 asid_bitmap = (bitmap_t*)phystokv(avail_start);
1779 avail_start = round_page(avail_start + asid_table_size);
1780
1781 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
1782
1783 queue_init(&map_pmap_list);
1784 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
1785
1786 virtual_space_start = vstart;
1787 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
1788
1789 bitmap_full(&asid_bitmap[0], pmap_max_asids);
1790 /* Clear the ASIDs which will alias the reserved kernel ASID of 0. */
1791 for (unsigned int i = 0; i < pmap_max_asids; i += MAX_HW_ASIDS) {
1792 bitmap_clear(&asid_bitmap[0], i);
1793 }
1794
1795
1796 #if !HAS_16BIT_ASID
1797 /**
1798 * Align the range of available hardware ASIDs to a multiple of 64 to enable the
1799 * masking used by the PLRU scheme. This means we must handle the case in which
1800 * the returned hardware ASID is 0, which we do by clearing all vASIDs that will
1801 * alias the kernel ASID.
1802 */
1803 pmap_max_asids = pmap_max_asids & ~63ul;
1804 if (__improbable(pmap_max_asids == 0)) {
1805 panic("%s: insufficient number of ASIDs (%u) supplied by SPTM", __func__, (unsigned int)pmap_max_asids);
1806 }
1807 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
1808 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
1809 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
1810 _Static_assert((MAX_HW_ASIDS % 64) == 0, "MAX_HW_ASIDS is not divisible by 64");
1811 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
1812 bitmap_clear(&asid_plru_bitmap[0], 0);
1813 #endif /* !HAS_16BIT_ASID */
1814
1815
1816 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
1817 maxoffset = trunc_page(maxoffset);
1818 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
1819 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
1820 arm_pmap_max_offset_default = maxoffset;
1821 }
1822 }
1823 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
1824 maxoffset = trunc_page(maxoffset);
1825 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
1826 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
1827 arm64_pmap_max_offset_default = maxoffset;
1828 }
1829 }
1830
1831 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
1832
1833
1834 #if DEVELOPMENT || DEBUG
1835 PE_parse_boot_argn("vm_footprint_suspend_allowed",
1836 &vm_footprint_suspend_allowed,
1837 sizeof(vm_footprint_suspend_allowed));
1838 #endif /* DEVELOPMENT || DEBUG */
1839
1840 #if KASAN
1841 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
1842 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
1843 #endif /* KASAN */
1844
1845 /**
1846 * Ensure that avail_start is always left on a page boundary. The calling
1847 * code might not perform any alignment before allocating page tables so
1848 * this is important.
1849 */
1850 avail_start = round_page(avail_start);
1851
1852
1853 #if (DEVELOPMENT || DEBUG)
1854 (void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available);
1855 #endif /* (DEVELOPMENT || DEBUG) */
1856
1857 #if __ARM64_PMAP_SUBPAGE_L1__
1858 /* Initialize the Subpage User Root Table subsystem. */
1859 surt_init();
1860 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
1861
1862 /* Signal that the pmap has been bootstrapped */
1863 pmap_bootstrapped = true;
1864 }
1865
1866 /**
1867 * Helper for creating a populated commpage table
1868 *
1869 * In order to avoid burning extra pages on mapping the commpage, we create a
1870 * dedicated table hierarchy for the commpage. We forcibly nest the translation tables from
1871 * this pmap into other pmaps. The level we will nest at depends on the MMU configuration (page
1872 * size, TTBR range, etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
1873 *
1874 * @note that this is NOT "the nested pmap" (which is used to nest the shared cache).
1875 *
1876 * @param rw_va Virtual address at which to insert a mapping to the kernel R/W commpage
1877 * @param ro_va Virtual address at which to insert a mapping to the kernel R/O commpage
1878 * @param rw_pa Physical address of kernel R/W commpage
1879 * @param ro_pa Physical address of kernel R/O commpage, may be 0 if not supported in this
1880 * configuration
1881 * @param rx_pa Physical address of user executable (and kernel R/O) commpage, may be 0 if
1882 * not supported in this configuration
1883 * @param pmap_create_flags Control flags for the temporary pmap created by this function
1884 *
1885 * @return the physical address of the created commpage table, typed as
1886 * XNU_PAGE_TABLE_COMMPAGE and containing all relevant commpage mappings.
1887 */
1888 static pmap_paddr_t
pmap_create_commpage_table(vm_map_address_t rw_va,vm_map_address_t ro_va,pmap_paddr_t rw_pa,pmap_paddr_t ro_pa,pmap_paddr_t rx_pa,unsigned int pmap_create_flags)1889 pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va,
1890 pmap_paddr_t rw_pa, pmap_paddr_t ro_pa, pmap_paddr_t rx_pa, unsigned int pmap_create_flags)
1891 {
1892 pmap_t temp_commpage_pmap = pmap_create_options(NULL, 0, pmap_create_flags);
1893 assert(temp_commpage_pmap != NULL);
1894 assert(rw_pa != 0);
1895 const pt_attr_t *pt_attr = pmap_get_pt_attr(temp_commpage_pmap);
1896
1897 /*
1898 * We only use pmap_expand to expand the pmap up to the commpage nesting level. At that level
1899 * and beyond, all the newly created tables will be nested directly into the userspace region
1900 * for each process, and as such they must be of the dedicated SPTM commpage table type so that
1901 * the SPTM can enforce the commpage security model which forbids random replacement of commpage
1902 * mappings.
1903 */
1904 kern_return_t kr = pmap_expand(temp_commpage_pmap, rw_va, 0, pt_attr_commpage_level(pt_attr));
1905 assert(kr == KERN_SUCCESS);
1906
1907 pmap_paddr_t commpage_table_pa = 0;
1908 for (unsigned int i = pt_attr_commpage_level(pt_attr); i < pt_attr_leaf_level(pt_attr); i++) {
1909 pmap_paddr_t new_table = 0;
1910 kr = pmap_page_alloc(&new_table, 0);
1911 assert((kr == KERN_SUCCESS) && (new_table != 0));
1912 if (commpage_table_pa == 0) {
1913 commpage_table_pa = new_table;
1914 }
1915
1916 pt_desc_t *ptdp = ptd_alloc(temp_commpage_pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
1917 assert(ptdp);
1918
1919 const unsigned int pai = pa_index(new_table);
1920 locked_pvh_t locked_pvh = pvh_lock(pai);
1921 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
1922
1923 ptd_info_init(ptdp, temp_commpage_pmap, pt_attr_align_va(pt_attr, i, rw_va), i + 1, NULL);
1924
1925 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1926 retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
1927 sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params);
1928
1929 const sptm_tte_t table_tte = (new_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1930
1931 sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va),
1932 (sptm_pt_level_t)i, table_tte);
1933
1934 ptd_info_finalize(ptdp);
1935
1936 /* The PTD's assoicated pmap temp_commpage_pmap is to be destroyed, so set it to NULL here. */
1937 ptdp->pmap = NULL;
1938
1939 pvh_unlock(&locked_pvh);
1940 }
1941
1942 /*
1943 * Note the lack of ARM_PTE_NG here: commpage mappings are at fixed addresses and
1944 * frequently accessed, so we map them global to avoid unnecessary TLB pressure.
1945 */
1946 static const sptm_pte_t commpage_pte_template = ARM_PTE_TYPE_VALID
1947 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK)
1948 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX
1949 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF;
1950
1951 sptm_return_t sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, rw_va,
1952 commpage_pte_template | ARM_PTE_NX | pa_to_pte(rw_pa));
1953 assert(sptm_ret == SPTM_SUCCESS);
1954
1955 if (ro_pa != 0) {
1956 assert((ro_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1957 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, ro_va,
1958 commpage_pte_template | ARM_PTE_NX | pa_to_pte(ro_pa));
1959 assert(sptm_ret == SPTM_SUCCESS);
1960 }
1961
1962 if (rx_pa != 0) {
1963 assert((commpage_text_user_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1964 assert((commpage_text_user_va != rw_va) && (commpage_text_user_va != ro_va));
1965 sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, commpage_text_user_va, commpage_pte_template | pa_to_pte(rx_pa));
1966 assert(sptm_ret == SPTM_SUCCESS);
1967 }
1968
1969
1970 /* Unmap the commpage table here so that it won't be deallocated by pmap_destroy(). */
1971 sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va),
1972 (sptm_pt_level_t)pt_attr_commpage_level(pt_attr));
1973 pmap_destroy(temp_commpage_pmap);
1974
1975 return commpage_table_pa;
1976 }
1977
1978 /**
1979 * Helper for creating all commpage tables applicable to the current configuration.
1980 *
1981 * @note This function is intended to be called during bootstrap.
1982 * @note This function assumes that pmap_create_commpages has already executed, and therefore
1983 * the commpage_*_pa variables have been assigned to their final values. commpage_data_pa
1984 * is the kernel RW commpage and is assumed to be present on all configurations, so it
1985 * therefore must be non-zero at this point. The other variables are considered optional
1986 * depending upon configuration and may be zero.
1987 */
1988 void pmap_prepare_commpages(void);
1989 void
pmap_prepare_commpages(void)1990 pmap_prepare_commpages(void)
1991 {
1992 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1993 assert(commpage_data_pa != 0);
1994 sptm_retype(commpage_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RW, retype_params);
1995 if (commpage_ro_data_pa != 0) {
1996 sptm_retype(commpage_ro_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RO, retype_params);
1997 }
1998 if (commpage_text_pa != 0) {
1999 sptm_retype(commpage_text_pa, XNU_DEFAULT, XNU_COMMPAGE_RX, retype_params);
2000 }
2001
2002 /*
2003 * User mapping of comm page text section for 64 bit mapping only
2004 *
2005 * We don't insert the text commpage into the 32 bit mapping because we don't want
2006 * 32-bit user processes to get this page mapped in, they should never call into
2007 * this page.
2008 */
2009 commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2010 commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, PMAP_CREATE_64BIT);
2011
2012 /*
2013 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2014 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2015 */
2016 commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2017 commpage_data_pa, commpage_ro_data_pa, 0, 0);
2018
2019 #if __ARM_MIXED_PAGE_SIZE__
2020 commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2021 commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_64BIT | PMAP_CREATE_FORCE_4K_PAGES);
2022
2023 /*
2024 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2025 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2026 * commpage32_4k_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2027 * commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
2028 */
2029 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2030
2031 }
2032
2033 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2034 pmap_virtual_space(
2035 vm_offset_t *startp,
2036 vm_offset_t *endp
2037 )
2038 {
2039 *startp = virtual_space_start;
2040 *endp = virtual_space_end;
2041 }
2042
2043
2044 boolean_t
pmap_virtual_region(unsigned int region_select,vm_map_offset_t * startp,vm_map_size_t * size)2045 pmap_virtual_region(
2046 unsigned int region_select,
2047 vm_map_offset_t *startp,
2048 vm_map_size_t *size
2049 )
2050 {
2051 boolean_t ret = FALSE;
2052 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
2053 if (region_select == 0) {
2054 /*
2055 * In this config, the bootstrap mappings should occupy their own L2
2056 * TTs, as they should be immutable after boot. Having the associated
2057 * TTEs and PTEs in their own pages allows us to lock down those pages,
2058 * while allowing the rest of the kernel address range to be remapped.
2059 */
2060 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2061 #if defined(ARM_LARGE_MEMORY)
2062 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2063 #else
2064 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2065 #endif
2066 ret = TRUE;
2067 }
2068
2069 #if defined(ARM_LARGE_MEMORY)
2070 if (region_select == 1) {
2071 *startp = VREGION1_START;
2072 *size = VREGION1_SIZE;
2073 ret = TRUE;
2074 }
2075 #endif
2076 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
2077 #if defined(ARM_LARGE_MEMORY)
2078 /* For large memory systems with no KTRR/CTRR */
2079 if (region_select == 0) {
2080 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2081 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2082 ret = TRUE;
2083 }
2084
2085 if (region_select == 1) {
2086 *startp = VREGION1_START;
2087 *size = VREGION1_SIZE;
2088 ret = TRUE;
2089 }
2090 #else /* !defined(ARM_LARGE_MEMORY) */
2091 unsigned long low_global_vr_mask = 0;
2092 vm_map_size_t low_global_vr_size = 0;
2093
2094 if (region_select == 0) {
2095 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2096 if (!TEST_PAGE_SIZE_4K) {
2097 *startp = gVirtBase & 0xFFFFFFFFFE000000;
2098 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2099 } else {
2100 *startp = gVirtBase & 0xFFFFFFFFFF800000;
2101 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2102 }
2103 ret = TRUE;
2104 }
2105 if (region_select == 1) {
2106 *startp = VREGION1_START;
2107 *size = VREGION1_SIZE;
2108 ret = TRUE;
2109 }
2110 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2111 if (!TEST_PAGE_SIZE_4K) {
2112 low_global_vr_mask = 0xFFFFFFFFFE000000;
2113 low_global_vr_size = 0x2000000;
2114 } else {
2115 low_global_vr_mask = 0xFFFFFFFFFF800000;
2116 low_global_vr_size = 0x800000;
2117 }
2118
2119 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2120 *startp = LOW_GLOBAL_BASE_ADDRESS;
2121 *size = low_global_vr_size;
2122 ret = TRUE;
2123 }
2124
2125 if (region_select == 3) {
2126 /* In this config, we allow the bootstrap mappings to occupy the same
2127 * page table pages as the heap.
2128 */
2129 *startp = VM_MIN_KERNEL_ADDRESS;
2130 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2131 ret = TRUE;
2132 }
2133 #endif /* defined(ARM_LARGE_MEMORY) */
2134 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
2135 return ret;
2136 }
2137
2138 /*
2139 * Routines to track and allocate physical pages during early boot.
2140 * On most systems that memory runs from first_avail through to avail_end
2141 * with no gaps.
2142 *
2143 * If the system supports ECC and ecc_bad_pages_count > 0, we
2144 * need to skip those pages.
2145 */
2146
/* Pages still available to pmap_next_page(); computed by initialize_ram_ranges(). */
static unsigned int avail_page_count = 0;

/* True until initialize_ram_ranges() has run once; it is invoked lazily by the accessors below. */
static bool need_ram_ranges_init = true;
2149
2150
2151 /**
2152 * Checks to see if a given page is in
2153 * the array of known bad pages
2154 *
2155 * @param ppn page number to check
2156 */
2157 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2158 pmap_is_bad_ram(__unused ppnum_t ppn)
2159 {
2160 return false;
2161 }
2162
2163 /**
2164 * Prepare bad ram pages to be skipped.
2165 */
2166 #if HAS_MTE
2167
2168 /*
2169 * Things to track use of MTE tag pages.
2170 *
2171 * The tag storage region starts at mte_tag_storage_start, and ends at
2172 * mte_tag_storage_end. The tag storage region should consist of
2173 * mte_tag_storage_count pages.
2174 *
2175 * mte_tag_storage_start_pnum is just the physical page number of the first
2176 * page in the tag storage region.
2177 */
2178 SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_start;
2179 SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_end;
2180 SECURITY_READ_ONLY_LATE(ppnum_t) mte_tag_storage_start_pnum;
2181 SECURITY_READ_ONLY_LATE(uint_t) mte_tag_storage_count;
2182
2183 /*
2184 * Bounds for calculating which portions of the tag storage range that won't be
2185 * used for tag storage
2186 *
2187 * We currently expect DRAM to look (very roughly) like this (unless the maxmem
2188 * boot-arg is being used):
2189 *
2190 * +-----------+---------+-------------+-----------------+-----------+
2191 * | Unmanaged | Managed | Tag Storage | Managed (maybe) | Unmanaged |
2192 * +-----------+---------+-------------+-----------------+-----------+
2193 *
2194 * The system will never tag the unmanaged pages, as it will not have data
2195 * structures for those pages. The system will also never tag the tag storage
2196 * pages, as this is forbidden by the hardware.
2197 *
2198 * The maxmem boot-arg may grow the size of the ending unmanaged region
2199 * (potentially extended into or past the tag storage region).
2200 *
2201 * As far as the terminology goes, "recursive" tag storage is the tag storage
2202 * range that covers the tag storage region. The "managed" tag storage is the
2203 * tag storage range that covers managed memory (which includes the tag storage
2204 * range itself, unless the maxmem boot-arg is involved). This implicitly
2205 * means that the "unmanaged" tag storage range is all tag storage outside the
2206 * "managed" range.
2207 */
2208 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_start_pnum;
2209 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_end_pnum;
2210 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_managed_start_pnum;
2211 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_managed_end_pnum;
2212 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_discarded_start_pnum;
2213 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_discarded_end_pnum;
2214 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_discarded_end_pnum;
2215
2216 static inline void
pmap_tag_op(const unified_page_list_t * page_list,bool tag_not_untag,__assert_only bool panic_on_redundant_calls)2217 pmap_tag_op(const unified_page_list_t *page_list, bool tag_not_untag, __assert_only bool panic_on_redundant_calls)
2218 {
2219 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
2220 sptm_paddr_t *paddr_list = NULL;
2221
2222 unsigned int num_paddrs = 0;
2223
2224 /**
2225 * Drain the epochs to ensure any lingering batched operations that may have taken
2226 * an in-flight reference to these pages are complete. sptm_tag_papt_multipage(),
2227 * much like sptm_retype(), takes exclusive guards on each physical page, so this
2228 * is needed as a precaution to avoid a race with (for example) a concurrent
2229 * pmap_remove() which may still hold a lingering shared guard on a page in this
2230 * list after removing a mapping. The VM layer should guarantee that all existing
2231 * mappings have been disconnected and no new mappings should be incoming for the
2232 * pages when this function is called.
2233 */
2234 pmap_epoch_prepare_drain();
2235 pmap_epoch_drain();
2236
2237 unified_page_list_iterator_t iter;
2238
2239 for (unified_page_list_iterator_init(page_list, &iter);
2240 !unified_page_list_iterator_end(&iter);
2241 unified_page_list_iterator_next(&iter)) {
2242 bool is_fictitious = false;
2243 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
2244 const pmap_paddr_t paddr = ptoa(pn);
2245 vm_page_t page;
2246
2247 /**
2248 * The VM may pass a fictitious or guard page here, which doesn't have a valid
2249 * managed PA.
2250 */
2251 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
2252 continue;
2253 }
2254
2255 page = unified_page_list_iterator_vm_page(&iter);
2256 if (page == VM_PAGE_NULL) {
2257 /*
2258 * all pages that we tag or untag are managed, meaning
2259 * that resolution should always succeed once we're past
2260 * bootstrap.
2261 *
2262 * Before bootstrap it means that callers must be sure
2263 * there's work to do.
2264 */
2265 assert(startup_phase < STARTUP_SUB_KMEM);
2266 } else if (page->vmp_using_mte == tag_not_untag) {
2267 /* pmap_tag_op shoudldn't be called with no effect while
2268 * panic_on_redundant_calls is set. Hence assert below */
2269 assert(!panic_on_redundant_calls);
2270 continue;
2271 }
2272
2273 const unsigned int pai = pa_index(paddr);
2274 pp_attr_t pp_attr_current, pp_attr_template;
2275 unsigned int cacheattr = (tag_not_untag ? VM_WIMG_MTE : VM_WIMG_DEFAULT);
2276
2277 /**
2278 * We should not need the PVH lock here as the VM should not be issuing any concurrent
2279 * mappings requests against these pages.
2280 */
2281 os_atomic_rmw_loop(&pp_attr_table[pai], pp_attr_current, pp_attr_template, relaxed, {
2282 if (tag_not_untag) {
2283 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
2284 assert3u(pp_attr_current & PP_ATTR_WIMG_MASK, ==, VM_WIMG_DEFAULT);
2285 }
2286 } else {
2287 assert3u(pp_attr_current & PP_ATTR_WIMG_MASK, ==, VM_WIMG_MTE);
2288 }
2289 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
2290 });
2291
2292 if (num_paddrs == 0) {
2293 disable_preemption();
2294 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
2295 paddr_list = sptm_pcpu->sptm_paddrs;
2296 }
2297 paddr_list[num_paddrs++] = paddr;
2298 if (num_paddrs == SPTM_MAPPING_LIMIT) {
2299 if (tag_not_untag) {
2300 sptm_tag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs, 0);
2301 } else {
2302 sptm_untag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs);
2303 }
2304 enable_preemption();
2305 num_paddrs = 0;
2306 }
2307 }
2308
2309 if (num_paddrs != 0) {
2310 if (tag_not_untag) {
2311 sptm_tag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs, 0);
2312 } else {
2313 sptm_untag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs);
2314 }
2315 enable_preemption();
2316 }
2317 }
2318
2319 void
pmap_make_tagged_pages(const unified_page_list_t * page_list)2320 pmap_make_tagged_pages(const unified_page_list_t *page_list)
2321 {
2322 pmap_tag_op(page_list, true, false);
2323 }
2324
2325 void
pmap_make_tagged_page(ppnum_t pnum)2326 pmap_make_tagged_page(ppnum_t pnum)
2327 {
2328 upl_page_info_t single_page_upl = { .phys_addr = pnum };
2329 const unified_page_list_t page_list = {
2330 .upl = {.upl_info = &single_page_upl, .upl_size = 1},
2331 .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
2332 };
2333 pmap_tag_op(&page_list, true, true);
2334 }
2335
2336 void
pmap_unmake_tagged_pages(const unified_page_list_t * page_list)2337 pmap_unmake_tagged_pages(const unified_page_list_t *page_list)
2338 {
2339 pmap_tag_op(page_list, false, false);
2340 }
2341
2342 void
pmap_unmake_tagged_page(ppnum_t pnum)2343 pmap_unmake_tagged_page(ppnum_t pnum)
2344 {
2345 upl_page_info_t single_page_upl = { .phys_addr = pnum };
2346 const unified_page_list_t page_list = {
2347 .upl = {.upl_info = &single_page_upl, .upl_size = 1},
2348 .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
2349 };
2350 pmap_tag_op(&page_list, false, true);
2351 }
2352
2353 bool
pmap_is_tag_storage_page(ppnum_t pnum)2354 pmap_is_tag_storage_page(ppnum_t pnum)
2355 {
2356 return sptm_get_frame_type(ptoa(pnum)) == XNU_TAG_STORAGE;
2357 }
2358
2359 bool
pmap_in_tag_storage_range(ppnum_t pnum)2360 pmap_in_tag_storage_range(ppnum_t pnum)
2361 {
2362 pmap_paddr_t addr = ptoa(pnum);
2363
2364 return (mte_tag_storage_start <= addr) && (addr < mte_tag_storage_end);
2365 }
2366
2367 bool
pmap_tag_storage_is_recursive(ppnum_t pnum)2368 pmap_tag_storage_is_recursive(ppnum_t pnum)
2369 {
2370 assert(pmap_in_tag_storage_range(pnum));
2371
2372 return (mte_tag_storage_recursive_start_pnum <= pnum) &&
2373 (pnum < mte_tag_storage_recursive_end_pnum);
2374 }
2375
2376 bool
pmap_tag_storage_is_unmanaged(ppnum_t pnum)2377 pmap_tag_storage_is_unmanaged(ppnum_t pnum)
2378 {
2379 assert(pmap_in_tag_storage_range(pnum));
2380
2381 return (pnum < mte_tag_storage_managed_start_pnum) ||
2382 (mte_tag_storage_managed_end_pnum <= pnum);
2383 }
2384
2385 /*
2386 * Returns whether a physical page is MTE-tagged.
2387 *
2388 * Being a "tagged" page means the following:
2389 * 1. The cache attribute of the backing page is VM_WIMG_MTE.
2390 * 2. There is at least one MTE-enabled mapping of this physical page (the
2391 * physical aperture mapping, which is explicitly managed alongside the
2392 * page's cache attributes).
2393 *
2394 * IMPORTANT: this means that even if the mapping that "you" (the caller) have
2395 * is MTE-disabled, this function may still return true.
2396 */
2397 bool
pmap_tag_storage_is_discarded(ppnum_t pnum)2398 pmap_tag_storage_is_discarded(ppnum_t pnum)
2399 {
2400 assert(pmap_in_tag_storage_range(pnum));
2401
2402 return mte_tag_storage_discarded_start_pnum && (((pnum >= mte_tag_storage_discarded_start_pnum) &&
2403 (pnum < mte_tag_storage_discarded_end_pnum)) || ((pnum >= mte_tag_storage_recursive_start_pnum) && (pnum < mte_tag_storage_recursive_discarded_end_pnum)));
2404 }
2405
2406 bool
pmap_is_tagged_page(ppnum_t pnum)2407 pmap_is_tagged_page(ppnum_t pnum)
2408 {
2409 const pmap_paddr_t pa = ptoa(pnum);
2410
2411 if (!pmap_valid_address(pa)) {
2412 return false;
2413 }
2414
2415 unsigned int wimg = pmap_cache_attributes(pnum);
2416 return (wimg & VM_WIMG_MASK) == VM_WIMG_MTE;
2417 }
2418
2419 /*
2420 * Returns whether or not the specific translation corresponding to a given
2421 * virtual address is an MTE-enabled translation.
2422 */
2423 bool
pmap_is_tagged_mapping(pmap_t pmap,vm_map_offset_t va)2424 pmap_is_tagged_mapping(pmap_t pmap, vm_map_offset_t va)
2425 {
2426 pt_entry_t *ptep = pmap_pte(pmap, va);
2427 return ptep && (*ptep & ARM_PTE_ATTRINDX(CACHE_ATTRINDX_MTE));
2428 }
2429
2430 void
pmap_make_tag_storage_page(ppnum_t pnum)2431 pmap_make_tag_storage_page(ppnum_t pnum)
2432 {
2433 /**
2434 * Drain the epochs to ensure any lingering batched operations that may have taken
2435 * an in-flight reference to this page are complete.
2436 */
2437 pmap_epoch_prepare_drain();
2438 const pmap_paddr_t pa = ptoa(pnum);
2439 const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2440 pmap_epoch_drain();
2441 sptm_retype(pa, XNU_DEFAULT, XNU_TAG_STORAGE, retype_params);
2442 }
2443
2444 void
pmap_unmake_tag_storage_page(ppnum_t pnum)2445 pmap_unmake_tag_storage_page(ppnum_t pnum)
2446 {
2447 /**
2448 * Drain the epochs to ensure any lingering batched operations that may have operated
2449 * on just-removed mappings to this tag storage page have completed and are thus no
2450 * longer holding an in-flight reference to this page.
2451 */
2452 pmap_epoch_prepare_drain();
2453 const pmap_paddr_t pa = ptoa(pnum);
2454 const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2455 pmap_epoch_drain();
2456 sptm_retype(pa, XNU_TAG_STORAGE, XNU_DEFAULT, retype_params);
2457 }
2458
2459 /*
2460 * Given a physical address, calculates the physical page number of the tag
2461 * storage page that covers it.
2462 */
2463 static ppnum_t
map_paddr_to_tag_ppnum(pmap_paddr_t paddr)2464 map_paddr_to_tag_ppnum(pmap_paddr_t paddr)
2465 {
2466 uint64_t tag_page_index;
2467
2468 assert((paddr >= gDramBase) && (paddr < (gDramBase + gDramSize)));
2469 assert((paddr & PAGE_MASK) == 0);
2470
2471 tag_page_index = atop(paddr - gDramBase) / MTE_PAGES_PER_TAG_PAGE;
2472 return mte_tag_storage_start_pnum + tag_page_index;
2473 }
2474
2475 /*
2476 * Given the physical page number of a tag storage page, calculates the physical
2477 * page number of the first page covered by it.
2478 */
2479 ppnum_t
map_tag_ppnum_to_first_covered_ppnum(ppnum_t tag_ppnum)2480 map_tag_ppnum_to_first_covered_ppnum(ppnum_t tag_ppnum)
2481 {
2482 assert((mte_tag_storage_start_pnum <= tag_ppnum) && (tag_ppnum <= (mte_tag_storage_start_pnum + mte_tag_storage_count)));
2483
2484 uint64_t tag_page_index = tag_ppnum - mte_tag_storage_start_pnum;
2485 return atop(ptoa(tag_page_index * MTE_PAGES_PER_TAG_PAGE) + gDramBase);
2486 }
2487
2488 #endif /* HAS_MTE */
2489
2490 /*
2491 * Initialize the count of available pages. No lock needed here,
2492 * as this code is called while kernel boot up is single threaded.
2493 */
2494 static void
initialize_ram_ranges(void)2495 initialize_ram_ranges(void)
2496 {
2497 __assert_only pmap_paddr_t first = first_avail;
2498 pmap_paddr_t end = avail_end;
2499
2500 assert(first <= end);
2501 assert(first == (first & ~PAGE_MASK));
2502 assert(end == (end & ~PAGE_MASK));
2503
2504 need_ram_ranges_init = false;
2505
2506 #if HAS_MTE
2507 if (is_mte_enabled) {
2508 assert3u(atop(gDramSize) / MTE_PAGES_PER_TAG_PAGE, ==,
2509 SPTMArgs->n_tag_storage_frames);
2510 /* Export MTE tag region boundaries to the VM */
2511 mte_tag_storage_start = SPTMArgs->first_tag_storage_paddr;
2512 mte_tag_storage_count = SPTMArgs->n_tag_storage_frames;
2513 mte_tag_storage_end = mte_tag_storage_start + ptoa(mte_tag_storage_count);
2514 mte_tag_storage_start_pnum = atop(mte_tag_storage_start);
2515
2516 /*
2517 * Calculate the bounds of the recursive and managed tag
2518 * storage regions. These will be used to determine which tag
2519 * storage pages will never be used to store tags.
2520 */
2521 mte_tag_storage_recursive_start_pnum = map_paddr_to_tag_ppnum(mte_tag_storage_start);
2522 mte_tag_storage_recursive_end_pnum = map_paddr_to_tag_ppnum(mte_tag_storage_end);
2523 mte_tag_storage_managed_start_pnum = map_paddr_to_tag_ppnum(gPhysBase);
2524 mte_tag_storage_managed_end_pnum = map_paddr_to_tag_ppnum(gPhysBase + mem_size);
2525
2526 /*
2527 * If a capped memory override has been set via maxmem= / hw.memsize,
2528 * discard the remainder of memory and adjust the number of tag pages
2529 * available to the system by discarding them.
2530 */
2531 if (max_mem != mem_size) {
2532 #define TAG_STORAGE_MASK ((PAGE_SIZE * MTE_PAGES_PER_TAG_PAGE) - 1)
2533 assert(max_mem <= mem_size);
2534 assert(!(max_mem & TAG_STORAGE_MASK));
2535 // Make sure we do not retire a tag page that might have tagged pages associated
2536 first_avail = (first_avail + TAG_STORAGE_MASK) & ~TAG_STORAGE_MASK;
2537
2538 uint64_t discarding = mte_tag_storage_start - (max_mem - max_mem / MTE_PAGES_PER_TAG_PAGE) - avail_start;
2539 /*
2540 * Also align how much we discard up to a ~512KiB boundary. We might be
2541 * over/under discarding +=512KiB here (which is fine accuracy wise because
2542 * ml_static_mfree will also release different amount of memory depending on the
2543 * actual device config)
2544 */
2545 discarding &= ~TAG_STORAGE_MASK;
2546
2547 mte_tag_storage_discarded_start_pnum = map_paddr_to_tag_ppnum(first_avail);
2548
2549 first_avail += discarding;
2550
2551 mte_tag_storage_discarded_end_pnum = map_paddr_to_tag_ppnum(first_avail);
2552
2553 /*
2554 * Also adjust the number of recursive dead tag storage pages to match
2555 * the capped memory size
2556 */
2557 mte_tag_storage_recursive_discarded_end_pnum = mte_tag_storage_recursive_end_pnum - (mte_tag_storage_recursive_end_pnum - mte_tag_storage_recursive_start_pnum) * max_mem / mem_size;
2558 }
2559 } else {
2560 assert3u(SPTMArgs->n_tag_storage_frames, ==, 0);
2561 }
2562
2563 #if DEVELOPMENT || DEBUG
2564 printf("MTE [0x%llx, 0x%llx]\n", mte_tag_storage_start, mte_tag_storage_end);
2565 printf("MTE tag storage 0x%x\n", mte_tag_storage_count);
2566 #endif /* DEVELOPMENT || DEBUG */
2567 #endif /* HAS_MTE */
2568 avail_page_count = atop(end - first_avail);
2569 }
2570
2571 unsigned int
pmap_free_pages(void)2572 pmap_free_pages(
2573 void)
2574 {
2575 if (need_ram_ranges_init) {
2576 initialize_ram_ranges();
2577 }
2578 return avail_page_count;
2579 }
2580
2581 unsigned int
pmap_free_pages_span(void)2582 pmap_free_pages_span(
2583 void)
2584 {
2585 if (need_ram_ranges_init) {
2586 initialize_ram_ranges();
2587 }
2588 return (unsigned int)atop(avail_end - first_avail);
2589 }
2590
2591
2592 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2593 pmap_next_page_hi(
2594 ppnum_t * pnum,
2595 __unused boolean_t might_free)
2596 {
2597 return pmap_next_page(pnum);
2598 }
2599
2600
2601 boolean_t
pmap_next_page(ppnum_t * pnum)2602 pmap_next_page(
2603 ppnum_t *pnum)
2604 {
2605 if (need_ram_ranges_init) {
2606 initialize_ram_ranges();
2607 }
2608
2609
2610 if (first_avail != avail_end) {
2611 *pnum = (ppnum_t)atop(first_avail);
2612 first_avail += PAGE_SIZE;
2613 assert(avail_page_count > 0);
2614 --avail_page_count;
2615 return TRUE;
2616 }
2617 assert(avail_page_count == 0);
2618 return FALSE;
2619 }
2620
2621
2622
2623
2624 /**
2625 * Helper function to check wheter the given physical
2626 * page number is a restricted page.
2627 *
2628 * @param pn the physical page number to query.
2629 */
2630 bool
pmap_is_page_restricted(ppnum_t pn)2631 pmap_is_page_restricted(ppnum_t pn)
2632 {
2633 sptm_frame_type_t frame_type = sptm_get_frame_type(ptoa(pn));
2634 return frame_type == XNU_KERNEL_RESTRICTED;
2635 }
2636
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 *
 *	In order, this: removes all access to page zero of the kernel map,
 *	marks the pmap layer as initialized, creates the zone that backs
 *	struct pmap, sets up the pmap and TXM VM objects, and clamps the
 *	process limits to the number of available ASIDs.
 */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 *	ZC_ZFREE_CLEARMEM is requested for elements of this zone.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 *	Initialize the TXM VM object in the same way as the
	 *	PMAP VM object.
	 */
	_vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL);
	txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they do not exceed the value of pmap_max_asids.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2689
2690 /**
2691 * Verify that a given physical page contains no mappings (outside of the
2692 * default physical aperture mapping).
2693 *
2694 * @param ppnum Physical page number to check there are no mappings to.
2695 *
2696 * @return True if there are no mappings, false otherwise or if the page is not
2697 * kernel-managed.
2698 */
2699 bool
pmap_verify_free(ppnum_t ppnum)2700 pmap_verify_free(ppnum_t ppnum)
2701 {
2702 const pmap_paddr_t pa = ptoa(ppnum);
2703
2704 assert(pa != vm_page_fictitious_addr);
2705
2706 /* Only mappings to kernel-managed physical memory are tracked. */
2707 if (!pa_valid(pa)) {
2708 return false;
2709 }
2710
2711 const unsigned int pai = pa_index(pa);
2712
2713 return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL);
2714 }
2715
2716
#if __ARM64_PMAP_SUBPAGE_L1__
/**
 * Tell whether a root table size is exactly the size of a subpage L1 root
 * table, i.e. eight translation table entries.
 *
 * @param root_size Root table size in bytes.
 *
 * @return True if root_size equals 8 * sizeof(tt_entry_t).
 */
static inline bool
pmap_user_root_size_matches_subpage_l1(vm_size_t root_size)
{
	const vm_size_t subpage_l1_root_size = 8 * sizeof(tt_entry_t);

	return root_size == subpage_l1_root_size;
}
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2724
2725 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2726 pmap_root_alloc_size(pmap_t pmap)
2727 {
2728 #pragma unused(pmap)
2729 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2730 const unsigned int root_level = pt_attr_root_level(pt_attr);
2731 const uint64_t index = pt_attr_va_valid_mask(pt_attr);
2732 return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2733 }
2734
2735 /*
2736 * Create and return a physical map.
2737 *
2738 * If the size specified for the map
2739 * is zero, the map is an actual physical
2740 * map, and may be referenced by the
2741 * hardware.
2742 *
2743 * If the size specified is non-zero,
2744 * the map will be used in software only, and
2745 * is bounded by that size.
2746 */
2747 MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(ledger_t ledger,vm_map_size_t size,unsigned int flags,kern_return_t * kr)2748 pmap_create_options_internal(
2749 ledger_t ledger,
2750 vm_map_size_t size,
2751 unsigned int flags,
2752 kern_return_t *kr)
2753 {
2754 pmap_t p;
2755 bool is_64bit = flags & PMAP_CREATE_64BIT;
2756 #if defined(HAS_APPLE_PAC)
2757 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2758 #endif /* defined(HAS_APPLE_PAC) */
2759 kern_return_t local_kr = KERN_SUCCESS;
2760 __unused uint8_t sptm_root_flags = SPTM_ROOT_PT_FLAGS_DEFAULT;
2761 TXMAddressSpaceFlags_t txm_flags = kTXMAddressSpaceFlagInit;
2762 const bool is_stage2 = false;
2763
2764 if (size != 0) {
2765 {
2766 // Size parameter should only be set for stage 2.
2767 return PMAP_NULL;
2768 }
2769 }
2770
2771 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2772 return PMAP_NULL;
2773 }
2774
2775 /*
2776 * Allocate a pmap struct from the pmap_zone. Then allocate
2777 * the translation table of the right size for the pmap.
2778 */
2779 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2780 local_kr = KERN_RESOURCE_SHORTAGE;
2781 goto pmap_create_fail;
2782 }
2783
2784 p->ledger = ledger;
2785
2786
2787 p->pmap_vm_map_cs_enforced = false;
2788 p->min = 0;
2789
2790
2791 #if CONFIG_ROSETTA
2792 if (flags & PMAP_CREATE_ROSETTA) {
2793 p->is_rosetta = TRUE;
2794 } else {
2795 p->is_rosetta = FALSE;
2796 }
2797 #endif /* CONFIG_ROSETTA */
2798 #if defined(HAS_APPLE_PAC)
2799 p->disable_jop = disable_jop;
2800
2801 if (p->disable_jop) {
2802 sptm_root_flags &= ~SPTM_ROOT_PT_FLAG_JOP;
2803 }
2804 #endif /* defined(HAS_APPLE_PAC) */
2805
2806 p->nested_region_true_start = 0;
2807 p->nested_region_true_end = ~0;
2808
2809 p->nx_enabled = true;
2810 p->is_64bit = is_64bit;
2811
2812 if (!is_64bit) {
2813 sptm_root_flags |= SPTM_ROOT_PT_FLAG_ARM64_32;
2814 }
2815
2816 p->nested_pmap = PMAP_NULL;
2817 p->type = PMAP_TYPE_USER;
2818
2819 #if ARM_PARAMETERIZED_PMAP
2820 /* Default to the native pt_attr */
2821 p->pmap_pt_attr = native_pt_attr;
2822 #endif /* ARM_PARAMETERIZED_PMAP */
2823 #if __ARM_MIXED_PAGE_SIZE__
2824 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2825 p->pmap_pt_attr = &pmap_pt_attr_4k;
2826 }
2827 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2828 p->max = pmap_user_va_size(p);
2829
2830 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2831 local_kr = KERN_NO_SPACE;
2832 goto id_alloc_fail;
2833 }
2834
2835 /**
2836 * We expect top level translation tables to always fit into a single
2837 * physical page. This would also catch a misconfiguration if 4K
2838 * concatenated page tables needed more than one physical tt1 page.
2839 */
2840 vm_size_t pmap_root_size = pmap_root_alloc_size(p);
2841 if (__improbable(pmap_root_size > PAGE_SIZE)) {
2842 panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size);
2843 }
2844
2845 #if __ARM64_PMAP_SUBPAGE_L1__
2846 /**
2847 * Identify the case where the root qualifies for SURT, and update the
2848 * root size to the TTEs + the SPTM metadata, reflecting the actual
2849 * space taken by this subpage root table.
2850 */
2851 if (!(flags & PMAP_CREATE_NESTED) && pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
2852 pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
2853 }
2854 #endif
2855
2856 pmap_lock_init(p);
2857
2858 p->tte = pmap_tt1_allocate(p, pmap_root_size, sptm_root_flags);
2859 if (!(p->tte)) {
2860 local_kr = KERN_RESOURCE_SHORTAGE;
2861 goto tt1_alloc_fail;
2862 }
2863
2864 p->ttep = kvtophys_nofail((vm_offset_t)p->tte);
2865 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2866
2867 /*
2868 * initialize the rest of the structure
2869 */
2870 p->nested_region_addr = 0x0ULL;
2871 p->nested_region_size = 0x0ULL;
2872 p->nested_region_unnested_table_bitmap = NULL;
2873
2874 p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE;
2875 #if HAS_MTE
2876 p->restrict_receiving_aliases_to_tagged_memory = false;
2877 #endif /* HAS_MTE */
2878
2879 #if MACH_ASSERT
2880 p->pmap_pid = 0;
2881 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
2882 #endif /* MACH_ASSERT */
2883 #if DEVELOPMENT || DEBUG
2884 p->footprint_was_suspended = FALSE;
2885 #endif /* DEVELOPMENT || DEBUG */
2886
2887 os_ref_init_count_raw(&p->ref_count, &pmap_refgrp, 1);
2888 pmap_simple_lock(&pmaps_lock);
2889 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
2890 pmap_simple_unlock(&pmaps_lock);
2891
2892 /**
2893 * The SPTM pmap's concurrency model can sometimes allow ledger balances to transiently
2894 * go negative. Note that we still check overall ledger balance on pmap destruction.
2895 */
2896 ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
2897 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
2898 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
2899 ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
2900 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
2901 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
2902 ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
2903 ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
2904 ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
2905
2906 if (!is_stage2) {
2907 /*
2908 * Complete initialization for the TXM address space. This needs to be done
2909 * after the SW ASID has been registered with the SPTM.
2910 * TXM enforcement does not apply to virtual machines.
2911 */
2912 if (flags & PMAP_CREATE_TEST) {
2913 txm_flags |= kTXMAddressSpaceFlagTest;
2914 }
2915
2916 pmap_txmlock_init(p);
2917 txm_register_address_space(p, p->asid, txm_flags);
2918 p->txm_trust_level = kCSTrustUntrusted;
2919 }
2920
2921 return p;
2922
2923 tt1_alloc_fail:
2924 pmap_get_pt_ops(p)->free_id(p);
2925 id_alloc_fail:
2926 zfree(pmap_zone, p);
2927 pmap_create_fail:
2928 *kr = local_kr;
2929 return PMAP_NULL;
2930 }
2931
2932 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)2933 pmap_create_options(
2934 ledger_t ledger,
2935 vm_map_size_t size,
2936 unsigned int flags)
2937 {
2938 pmap_t pmap;
2939 kern_return_t kr = KERN_SUCCESS;
2940
2941 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
2942
2943 ledger_reference(ledger);
2944
2945 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
2946
2947 if (pmap == PMAP_NULL) {
2948 ledger_dereference(ledger);
2949 }
2950
2951 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2952
2953 return pmap;
2954 }
2955
#if MACH_ASSERT
/**
 * Record the owning process' pid and name in a pmap (MACH_ASSERT builds only).
 *
 * Nothing is recorded if the pmap is NULL or if its recorded pid is -1.
 *
 * @param pmap     The pmap to annotate.
 * @param pid      The pid to store in the pmap.
 * @param procname The process name; copied (and truncated if necessary) into
 *                 the pmap's fixed-size name buffer.
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
	if (pmap == NULL) {
		return;
	}

	if (pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
}
#endif /* MACH_ASSERT */
2973
#if MACH_ASSERT
/**
 * Record the owning process' pid and name in a pmap (MACH_ASSERT builds only).
 *
 * Thin wrapper that forwards its arguments unchanged to
 * pmap_set_process_internal().
 *
 * @param pmap     The pmap to annotate.
 * @param pid      The pid to record.
 * @param procname The process name to record.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
	pmap_set_process_internal(pmap, pid, procname);
}
#endif /* MACH_ASSERT */
2984
2985 /*
2986 * pmap_deallocate_all_leaf_tts:
2987 *
2988 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
2989 * removing and deallocating all TTEs.
2990 */
2991 MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap,tt_entry_t * first_ttep,vm_map_address_t start_va,unsigned level)2992 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_address_t start_va, unsigned level)
2993 {
2994 tt_entry_t tte = ARM_TTE_EMPTY;
2995 tt_entry_t * ttep = NULL;
2996 tt_entry_t * last_ttep = NULL;
2997
2998 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2999 const uint64_t size = pt_attr->pta_level_info[level].size;
3000
3001 assert(level < pt_attr_leaf_level(pt_attr));
3002
3003 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3004
3005 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3006 vm_map_address_t va = start_va;
3007 for (ttep = first_ttep; ttep <= last_ttep; ttep += page_ratio, va += (size * page_ratio)) {
3008 if (!(*ttep & ARM_TTE_VALID)) {
3009 continue;
3010 }
3011
3012 for (unsigned i = 0; i < page_ratio; i++) {
3013 tte = ttep[i];
3014
3015 if (!(tte & ARM_TTE_VALID)) {
3016 panic("%s: found unexpectedly invalid tte, ttep=%p, tte=%p, "
3017 "pmap=%p, first_ttep=%p, level=%u",
3018 __FUNCTION__, ttep + i, (void *)tte,
3019 pmap, first_ttep, level);
3020 }
3021
3022 if (tte_is_block(tte)) {
3023 panic("%s: found block mapping, ttep=%p, tte=%p, "
3024 "pmap=%p, first_ttep=%p, level=%u",
3025 __FUNCTION__, ttep + i, (void *)tte,
3026 pmap, first_ttep, level);
3027 }
3028
3029 /* Must be valid, type table */
3030 if (level < pt_attr_twig_level(pt_attr)) {
3031 /* If we haven't reached the twig level, recurse to the next level. */
3032 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK),
3033 va + (size * i), level + 1);
3034 }
3035 }
3036
3037 /* Remove the TTE. */
3038 pmap_tte_deallocate(pmap, va, ttep, level, false);
3039 }
3040 }
3041
3042 /*
3043 * We maintain stats and ledgers so that a task's physical footprint is:
3044 * phys_footprint = ((internal - alternate_accounting)
3045 * + (internal_compressed - alternate_accounting_compressed)
3046 * + iokit_mapped
3047 * + purgeable_nonvolatile
3048 * + purgeable_nonvolatile_compressed
3049 * + page_table)
3050 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3051 */
3052
/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 *
 *	Drops one reference on the pmap. Only when that was the last reference
 *	is the pmap actually torn down: TXM state is unregistered, the commpage
 *	is unmapped, the pmap is unlinked from map_pmap_list, all page tables
 *	and the root table are released, the ID is freed (non-nested pmaps
 *	only), ledgers are checked and the structure is returned to pmap_zone.
 *
 *	A PMAP_NULL argument is accepted and ignored.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const bool is_stage2_pmap = false;

	/* Other references remain; nothing more to do. */
	if (os_ref_release_raw(&pmap->ref_count, &pmap_refgrp) > 0) {
		return;
	}

	if (!is_stage2_pmap) {
		/*
		 * Complete all clean up required for TXM. This needs to happen before the
		 * SW ASID has been unregistered with the SPTM.
		 */
		txm_unregister_address_space(pmap);
		pmap_txmlock_destroy(pmap);
	}

	/**
	 * Drain any concurrent retype-sensitive SPTM operations.  This is needed to
	 * ensure that we don't unmap and retype the page tables while those operations
	 * are still finishing on other CPUs, leading to an SPTM violation.  In particular,
	 * the multipage batched cacheability/attribute update code may issue SPTM calls
	 * without holding the relevant PVH or pmap locks, so we can't guarantee those
	 * calls have actually completed despite observing refcnt == 0.
	 *
	 * At this point, we CAN guarantee that:
	 * 1) All prior PTE removals required to empty the pmap have completed and
	 *    been synchronized with DSB, *except* the commpage removal which doesn't
	 *    involve pages that can ever be retyped.  Subsequent calls not already
	 *    in the pmap epoch will no longer observe these mappings.
	 * 2) The pmap now has a zero refcount, so in a correctly functioning system
	 *    no further mappings will be requested for it.
	 */
	pmap_epoch_prepare_drain();

	if (!is_stage2_pmap) {
		pmap_unmap_commpage(pmap);
	}

	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/* Complete the drain started by pmap_epoch_prepare_drain() above. */
	pmap_epoch_drain();

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 *
	 *	NOTE(review): pmap->tte is handed to the table walk here before the
	 *	NULL check below; presumably a live pmap always has a root table.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr));

	if (pmap->tte) {
		vm_size_t pmap_root_size = pmap_root_alloc_size(pmap);
#if __ARM64_PMAP_SUBPAGE_L1__
		/**
		 * Like in the allocation path, identify the case where the root table
		 * qualifies for SURT.
		 */
		if (pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
			/**
			 * Nested tables cannot use SURT, so the allocated size has to be
			 * PAGE_SIZE.
			 */
			if (pmap_is_nested(pmap)) {
				pmap_root_size = PAGE_SIZE;
			} else {
				/**
				 * Note: with SPTM, the kernel pmap is never supposed to be
				 * destroyed because the SPTM relies on the existence of the
				 * kernel root table. Also, the commpage-typed pmap doesn't
				 * exist. Not only is the pmap associated with a commpage
				 * table transient and destroyed right after the commpage
				 * table is setup, but also the pmap is just a plain
				 * PMAP_TYPE_USER typed pmap.
				 */
				assert(pmap->type == PMAP_TYPE_USER);
				pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
			}
		}
#endif
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_size);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	if (pmap->type != PMAP_TYPE_NESTED) {
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	/* Nested pmaps may own a bitmap tracking unnested tables; release it. */
	if ((pmap->type == PMAP_TYPE_NESTED) && (pmap->nested_region_unnested_table_bitmap != NULL)) {
		bitmap_free(pmap->nested_region_unnested_table_bitmap,
		    (pmap->nested_region_size >> (pt_attr_twig_shift(pt_attr) - 1)));
	}

	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
}
3171
3172 void
pmap_destroy(pmap_t pmap)3173 pmap_destroy(
3174 pmap_t pmap)
3175 {
3176 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
3177
3178 ledger_t ledger = pmap->ledger;
3179
3180 pmap_destroy_internal(pmap);
3181
3182 ledger_dereference(ledger);
3183
3184 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3185 }
3186
3187
3188 /*
3189 * Add a reference to the specified pmap.
3190 */
3191 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3192 pmap_reference_internal(
3193 pmap_t pmap)
3194 {
3195 if (pmap != PMAP_NULL) {
3196 validate_pmap_mutable(pmap);
3197 os_ref_retain_raw(&pmap->ref_count, &pmap_refgrp);
3198 }
3199 }
3200
/**
 * Add a reference to the specified pmap.
 *
 * Thin wrapper that forwards to pmap_reference_internal().
 *
 * @param pmap The pmap to retain.
 */
void
pmap_reference(
	pmap_t pmap)
{
	pmap_reference_internal(pmap);
}
3207
3208 static sptm_frame_type_t
get_sptm_pt_type(pmap_t pmap)3209 get_sptm_pt_type(pmap_t pmap)
3210 {
3211 const bool is_stage2_pmap = false;
3212 if (is_stage2_pmap) {
3213 assert(pmap->type != PMAP_TYPE_NESTED);
3214 return XNU_STAGE2_PAGE_TABLE;
3215 } else {
3216 return pmap->type == PMAP_TYPE_NESTED ? XNU_PAGE_TABLE_SHARED : XNU_PAGE_TABLE;
3217 }
3218 }
3219
/**
 * Allocate a root (TT1) translation table for a pmap.
 *
 * When subpage L1 tables are enabled and the requested size is below
 * PAGE_SIZE, a table is first sought from the existing SURT pages; if none is
 * available, a fresh page is allocated, retyped as a SURT container, and its
 * first table is claimed. Otherwise a whole page is allocated and retyped as a
 * root table.
 *
 * On success the root-table counter and the pmap's page table ledger are
 * credited with the (possibly rounded) size.
 *
 * @param pmap            The pmap the root table is for.
 * @param size            Requested size in bytes. Sizes below PAGE_SIZE other
 *                        than PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param sptm_root_flags Root table flags handed to the SPTM.
 *
 * @return Kernel virtual address of the root table, or 0 (NULL) if no page
 *         could be allocated.
 */
static tt_entry_t *
pmap_tt1_allocate(pmap_t pmap, vm_size_t size, uint8_t sptm_root_flags)
{
	pmap_paddr_t pa = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const bool is_stage2_pmap = false;

	/**
	 * Allocate the entire page for root-level page table unless it is subpage
	 * L1 table, where size will be exactly PMAP_ROOT_ALLOC_SIZE.
	 */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

#if __ARM64_PMAP_SUBPAGE_L1__
	/**
	 * At this moment, the allocation size is smaller than the page size only
	 * when it is a subpage L1 table. We will try to allocate a root table
	 * from the SURTs (SUbpage Root Tables).
	 */
	const bool use_surt = (size < PAGE_SIZE);
	if (use_surt) {
		/* It has to be a user pmap. */
		assert(pmap->type == PMAP_TYPE_USER);

		/**
		 * Subpage stage 2 root table is not supported. This is guaranteed by
		 * the stage 2 pmaps using a different pmap geometry than the stage
		 * 1 pmaps.
		 */
		assert(!is_stage2_pmap);

		/* Try allocating a SURT from the SURT page queue. */
		pa = surt_try_alloc();

		/* If there is one SURT available, call SPTM to claim the SURT. */
		if (pa) {
			sptm_surt_alloc(surt_page_pa_from_surt_pa(pa),
			    surt_index_from_surt_pa(pa),
			    pt_attr->geometry_id,
			    sptm_root_flags,
			    pmap->asid);

			/* We don't need to allocate a new page, so skip to the end. */
			goto ptt1a_done;
		}
	}
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */

	/**
	 * Either the root table size is not suitable for SURT or SURT is out of
	 * tables. In either case, a page needs to be allocated.
	 */
	const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL);

	/* No page is allocated, so return 0 to signal failure. */
	if (ret != KERN_SUCCESS) {
		return (tt_entry_t *)0;
	}

	/**
	 * Drain the epochs to ensure any lingering batched operations that may have
	 * taken an in-flight reference to this page are complete.
	 * The drain is only started here; it is completed by pmap_epoch_drain()
	 * immediately before the page is retyped below.
	 */
	pmap_epoch_prepare_drain();

	assert(pa);

#if __ARM64_PMAP_SUBPAGE_L1__
	if (use_surt) {
		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};

		pmap_epoch_drain();

		/**
		 * The allocated page is retyped to XNU_SUBPAGE_USER_ROOT_TABLES as the
		 * container of the SURTs.
		 */
		sptm_retype(pa, XNU_DEFAULT, XNU_SUBPAGE_USER_ROOT_TABLES, retype_params);

		/**
		 * Before we add the page to the SURT page queue, claim the first SURT
		 * for ourselves. This is safe since we are the only one accessing this
		 * page at this moment.
		 */
		sptm_surt_alloc(pa, 0, pt_attr->geometry_id, sptm_root_flags, pmap->asid);

		/**
		 * Add the newly allocated SURT page to the page queue.
		 */
		surt_feed_page_with_first_table_allocated(pa);
	} else
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
	{
		/* Whole-page root table: describe the table to the SPTM via the retype parameters. */
		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
		retype_params.attr_idx = pt_attr->geometry_id;
		retype_params.flags = sptm_root_flags;
		if (is_stage2_pmap) {
			retype_params.vmid = pmap->vmid;
		} else {
			retype_params.asid = pmap->asid;
		}

		pmap_epoch_drain();

		sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE,
		    retype_params);
	}

#if __ARM64_PMAP_SUBPAGE_L1__
ptt1a_done:
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3340
/**
 * Release a root (TT1) translation table previously obtained from
 * pmap_tt1_allocate().
 *
 * For a subpage L1 table, the table is first released to the SPTM and to the
 * SURT bookkeeping; the containing page is only retyped and freed when
 * surt_free() reports that it held the last table on that page. For a
 * whole-page root table, the page is retyped back to XNU_DEFAULT and freed.
 *
 * In all cases the root-table counter and the pmap's page table ledger are
 * debited with the (possibly rounded) size.
 *
 * @param pmap The pmap that owned the root table.
 * @param tt   Kernel virtual address of the root table.
 * @param size Size in bytes that was used for the allocation. Sizes below
 *             PAGE_SIZE other than PMAP_ROOT_ALLOC_SIZE are rounded up to
 *             PAGE_SIZE.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size)
{
	pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt);
	const bool is_stage2_pmap = false;

	/**
	 * Free the entire page unless it is subpage L1 table, where size will be
	 * exactly PMAP_ROOT_ALLOC_SIZE.
	 */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

#if __ARM64_PMAP_SUBPAGE_L1__
	/**
	 * At this moment, the free size is smaller than the page size only
	 * when it is a subpage L1 table. We will try to free the root table
	 * from the SURT page.
	 */
	const bool use_surt = (size < PAGE_SIZE);
	if (use_surt) {
		/* It has to be a user pmap. */
		assert(pmap->type == PMAP_TYPE_USER);

		/* Subpage stage 2 root table is not supported. */
		assert(!is_stage2_pmap);

		/* Before we do anything in pmap, tell SPTM that the SURT is free. */
		sptm_surt_free(surt_page_pa_from_surt_pa(pa),
		    surt_index_from_surt_pa(pa));

		/**
		 * Make sure the SURT bitmap update is not reordered before the SPTM
		 * rw guard release.
		 */
		os_atomic_thread_fence(release);

		/**
		 * Free the SURT in pmap scope, if surt_free() returns false, there
		 * are still other SURTs on the page. In such case, do not retype
		 * or free the page; just skip to the end to finish accounting.
		 */
		if (!surt_free(pa)) {
			goto ptt1d_done;
		}

		/**
		 * Make sure the SURT bitmap read is not reordered after the SPTM
		 * rw guard exclusive acquire in the retype case.
		 */
		os_atomic_thread_fence(acquire);
	}
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */

	/* Determine the frame type the page currently has, so it can be retyped back to XNU_DEFAULT. */
	sptm_frame_type_t page_type;
#if __ARM64_PMAP_SUBPAGE_L1__
	if (use_surt) {
		page_type = XNU_SUBPAGE_USER_ROOT_TABLES;
	} else
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
	if (is_stage2_pmap) {
		page_type = XNU_STAGE2_ROOT_TABLE;
	} else if (pmap->type == PMAP_TYPE_NESTED) {
		page_type = XNU_SHARED_ROOT_TABLE;
	} else {
		page_type = XNU_USER_ROOT_TABLE;
	}

	/* pa may point inside a SURT page, so mask down to the page base. */
	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
	sptm_retype(pa & ~PAGE_MASK, page_type, XNU_DEFAULT, retype_params);
	pmap_page_free(pa & ~PAGE_MASK);

#if __ARM64_PMAP_SUBPAGE_L1__
ptt1d_done:
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
	/* Accounting mirrors pmap_tt1_allocate(): units of PMAP_ROOT_ALLOC_SIZE. */
	OSAddAtomic(-(int32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	pmap_tt_ledger_debit(pmap, size);
}
3423
/**
 * Allocate a non-root page table page, together with its page table
 * descriptor, for a pmap.
 *
 * The new page's PV head is pointed at the descriptor, the in-use counters,
 * ledger and zone info are updated, and the page is retyped from XNU_DEFAULT
 * to the pmap's page table frame type.
 *
 * @param pmap     The pmap the table is for.
 * @param ttp      Output: kernel virtual address of the new table page.
 * @param ptdp_out Output: the page table descriptor allocated for the page.
 * @param level    Level passed to the SPTM in the retype parameters. It also
 *                 selects which in-use counter is incremented (TTE pages for
 *                 levels below the leaf level, PTE pages otherwise).
 * @param options  If PMAP_TT_ALLOCATE_NOWAIT is set, PMAP_PAGE_ALLOCATE_NOWAIT
 *                 is passed to the page and descriptor allocators.
 *
 * @return KERN_SUCCESS on success, KERN_RESOURCE_SHORTAGE if either the page
 *         or the descriptor could not be allocated (outputs are not written).
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	pt_desc_t **ptdp_out,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	const unsigned int alloc_flags =
	    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;

	/* Allocate a VM page to be used as the page table. */
	if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
		return KERN_RESOURCE_SHORTAGE;
	}

	pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags);
	if (ptdp == NULL) {
		/* Don't leak the page allocated above. */
		pmap_page_free(pa);
		return KERN_RESOURCE_SHORTAGE;
	}

	unsigned int pai = pa_index(pa);
	locked_pvh_t locked_pvh = pvh_lock(pai);
	assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p",
	    __func__, (void*)locked_pvh.pvh);

	/**
	 * Drain the epochs to ensure any lingering batched operations that may have taken
	 * an in-flight reference to this page are complete.
	 * The drain is only started here; it is completed by pmap_epoch_drain()
	 * immediately before the page is retyped below.
	 */
	pmap_epoch_prepare_drain();

	if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
		OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
	} else {
		OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
	}

	pmap_tt_ledger_credit(pmap, PAGE_SIZE);

	PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

	/* Record the descriptor in the page's PV head before releasing the PVH lock. */
	pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
	pvh_unlock(&locked_pvh);

	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
	retype_params.level = (sptm_pt_level_t)level;

	/**
	 * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages
	 * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above.
	 */
	pmap_epoch_drain();

	sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params);

	*ptdp_out = ptdp;
	*ttp = (tt_entry_t *)phystokv(pa);

	return KERN_SUCCESS;
}
3487
3488 static void
pmap_tt_deallocate(pmap_t pmap,tt_entry_t * ttp,unsigned int level)3489 pmap_tt_deallocate(
3490 pmap_t pmap,
3491 tt_entry_t *ttp,
3492 unsigned int level)
3493 {
3494 pt_desc_t *ptdp;
3495 vm_offset_t free_page = 0;
3496 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3497
3498 ptdp = ptep_get_ptd(ttp);
3499 ptdp->va = (vm_offset_t)-1;
3500
3501 const uint16_t refcnt = sptm_get_page_table_refcnt(kvtophys_nofail((vm_offset_t)ttp));
3502
3503 if (__improbable(refcnt != 0)) {
3504 panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, refcnt);
3505 }
3506
3507 free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3508 if (free_page != 0) {
3509 pmap_paddr_t pa = kvtophys_nofail(free_page);
3510 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3511 sptm_retype(pa, get_sptm_pt_type(pmap), XNU_DEFAULT, retype_params);
3512 ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3513
3514 unsigned int pai = pa_index(pa);
3515 locked_pvh_t locked_pvh = pvh_lock(pai);
3516 assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTDP), "%s: non-PTD PVH %p",
3517 __func__, (void*)locked_pvh.pvh);
3518 pvh_update_head(&locked_pvh, NULL, PVH_TYPE_NULL);
3519 pvh_unlock(&locked_pvh);
3520 pmap_page_free(pa);
3521 if (level < pt_attr_leaf_level(pt_attr)) {
3522 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3523 } else {
3524 OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3525 }
3526 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3527 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3528 }
3529 }
3530
/**
 * Check table refcounts after clearing a translation table entry pointing to that table
 *
 * @note If the cleared TTE points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 *
 * @note On MACH_ASSERT builds the table contents are always scanned and any
 *       non-empty entry is fatal; on other builds the scan only runs (and
 *       always panics) when a leaf table still has a non-zero refcnt.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param tte Value stored in the TTE prior to clearing it
 * @param level The level of the page table that contains the TTE being removed
 */
static void
pmap_tte_check_refcounts(
	pmap_t pmap,
	tt_entry_t tte,
	unsigned int level)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/* Only populated for leaf tables; stays zero otherwise. */
	unsigned short refcnt = 0;

	/**
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by draining the epochs to ensure other cores are no longer
	 * consuming the page table we're preparing to delete.
	 */
	if (remove_leaf_table) {
		pmap_epoch_prepare_drain();
		pmap_epoch_drain();
		refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte));
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 *
		 * Note that we use PAGE_SIZE to govern the range of the table check,
		 * because even for 4K processes we still allocate a 16K page for each
		 * page table; we simply map it using 4 adjacent TTEs for the 4K case.
		 */
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(PAGE_SIZE - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short wiredcnt = ptep_get_info((pt_entry_t*)ttetokv(tte))->wiredcnt;
		unsigned short non_empty = 0, valid = 0, comp = 0;
		/* Tally every entry in the page; the counts are only used for the panic below. */
		for (unsigned int i = 0; i < (PAGE_SIZE / sizeof(*ptep)); i++, ptep++) {
			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}

			if (__improbable(pte_is_compressed(*ptep, ptep))) {
				comp++;
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && ((refcnt != 0) || (wiredcnt != 0))))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, wiredcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, wiredcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3649
3650 /**
3651 * Remove translation table entry pointing to a nested shared region table
3652 *
3653 * @note The TTE to clear out is expected to point to a leaf table with a refcnt
3654 * of zero.
3655 *
3656 * @param pmap The user pmap containing the nested page table whose TTE is being removed.
3657 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3658 * @param ttep Pointer to the TTE that should be cleared out.
3659 */
3660 static void
3661 pmap_tte_trim(
3662 pmap_t pmap,
3663 vm_offset_t va_start,
3664 tt_entry_t *ttep)
3665 {
3666 assert(ttep != NULL);
3667 const tt_entry_t tte = *ttep;
3668 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3669
3670 if (__improbable(tte == ARM_TTE_EMPTY)) {
3671 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3672 "stomper? pmap=%p ttep=%p", __func__, pt_attr_twig_level(pt_attr), pmap, ttep);
3673 }
3674
3675 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3676 sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift);
3677
3678 pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr));
3679 }
3680
3681 /**
3682 * Remove a translation table entry.
3683 *
3684 * @note If the TTE to clear out points to a leaf table, then that leaf table
3685 * must have a mapping refcount of zero before the TTE can be removed.
3686 * @note If locked_pvh is non-NULL, this function expects to be called with
3687 * the PVH lock held and will return with it unlocked. Otherwise it
3688 * expects pmap to be locked exclusive, and will return with pmap unlocked.
3689 *
3690 * @param pmap The pmap containing the page table whose TTE is being removed.
3691 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3692 * @param ttep Pointer to the TTE that should be cleared out.
3693 * @param level The level of the page table that contains the TTE to be removed.
3694 * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3695 * be dropped after removing the table entry.
3696 */
3697 static void
3698 pmap_tte_remove(
3699 pmap_t pmap,
3700 vm_offset_t va_start,
3701 tt_entry_t *ttep,
3702 unsigned int level,
3703 bool pmap_locked)
3704 {
3705 assert(ttep != NULL);
3706 const tt_entry_t tte = *ttep;
3707
3708 if (__improbable(tte == ARM_TTE_EMPTY)) {
3709 panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3710 "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3711 }
3712
3713 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level);
3714
3715 if (pmap_locked) {
3716 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3717 }
3718
3719 pmap_tte_check_refcounts(pmap, tte, level);
3720 }
3721
3722 /**
3723 * Given a pointer to an entry within a `level` page table, delete the
3724 * page table at `level` + 1 that is represented by that entry. For instance,
3725 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3726 * contains the PA of the L3 table, and `level` would be "2".
3727 *
3728 * @note If the table getting deallocated is a leaf table, then that leaf table
3729 * must have a mapping refcount of zero before getting deallocated.
3730 * @note If locked_pvh is non-NULL, this function expects to be called with
3731 * the PVH lock held and will return with it unlocked. Otherwise it
3732 * expects pmap to be locked exclusive, and will return with pmap unlocked.
3733 *
3734 * @param pmap The pmap that owns the page table to be deallocated.
3735 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3736 * @param ttep Pointer to the `level` TTE to remove.
3737 * @param level The level of the table that contains an entry pointing to the
3738 * table to be removed. The deallocated page table will be a
3739 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3740 * deleted).
3741 * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3742 * be dropped after removing the table entry.
3743 */
3744 static void
3745 pmap_tte_deallocate(
3746 pmap_t pmap,
3747 vm_offset_t va_start,
3748 tt_entry_t *ttep,
3749 unsigned int level,
3750 bool pmap_locked)
3751 {
3752 tt_entry_t tte = *ttep;
3753
3754 if (tte_get_ptd(tte)->pmap != pmap) {
3755 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3756 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3757 }
3758
3759 assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
3760 (unsigned long long)tte);
3761
3762 /* pmap_tte_remove() will drop the pmap lock if necessary. */
3763 pmap_tte_remove(pmap, va_start, ttep, level, pmap_locked);
3764
3765 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3766 }
3767
3768 /*
3769 * Remove a range of hardware page-table entries.
3770 * The range is given as the first (inclusive)
3771 * and last (exclusive) virtual addresses mapped by
3772 * the PTE region to be removed.
3773 *
3774 * The pmap must be locked shared.
3775 * If the pmap is not the kernel pmap, the range must lie
3776 * entirely within one pte-page. Assumes that the pte-page exists.
3777 *
3778 * Returns the number of PTE changed
3779 */
3780 MARK_AS_PMAP_TEXT static void
3781 pmap_remove_range(
3782 pmap_t pmap,
3783 vm_map_address_t va,
3784 vm_map_address_t end)
3785 {
3786 pmap_remove_range_options(pmap, va, end, PMAP_OPTIONS_REMOVE);
3787 }
3788
3789 MARK_AS_PMAP_TEXT void
3790 pmap_remove_range_options(
3791 pmap_t pmap,
3792 vm_map_address_t start,
3793 vm_map_address_t end,
3794 int options)
3795 {
3796 const unsigned int sptm_flags = ((options & PMAP_OPTIONS_REMOVE) ? SPTM_REMOVE_COMPRESSED : 0);
3797 unsigned int num_removed = 0;
3798 unsigned int num_external = 0, num_internal = 0, num_reusable = 0;
3799 unsigned int num_alt_internal = 0;
3800 unsigned int num_compressed = 0, num_alt_compressed = 0;
3801 unsigned short num_unwired = 0;
3802 bool need_strong_sync = false;
3803
3804 /*
3805 * The pmap lock should be held here. It will only be held shared in most if not all cases.
3806 */
3807 pmap_assert_locked(pmap, PMAP_LOCK_HELD);
3808
3809 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3810 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
3811 const uint64_t pmap_page_shift = pt_attr_leaf_shift(pt_attr);
3812 vm_map_address_t va = start;
3813 pt_entry_t *cpte = pmap_pte(pmap, va);
3814 assert(cpte != NULL);
3815
3816 while (va < end) {
3817 /**
3818 * We may need to sleep when taking the PVH lock below, and our pmap_pv_remove()
3819 * call below may also place the lock in sleep mode if processing a large PV list.
3820 * We therefore can't leave preemption disabled across that code, which means we
3821 * can't directly use the per-CPU prev_ptes array in that code. Since that code
3822 * only cares about the physical address stored in each prev_ptes entry, we'll
3823 * use a local array to stash off only the 4-byte physical address index in order
3824 * to reduce stack usage.
3825 */
3826 unsigned int pai_list[SPTM_MAPPING_LIMIT];
3827 _Static_assert(SPTM_MAPPING_LIMIT <= 64,
3828 "SPTM_MAPPING_LIMIT value causes excessive stack usage for pai_list");
3829
3830 unsigned int num_mappings = (end - va) >> pmap_page_shift;
3831 if (num_mappings > SPTM_MAPPING_LIMIT) {
3832 num_mappings = SPTM_MAPPING_LIMIT;
3833 }
3834
3835 /**
3836 * Disable preemption to ensure that we can safely access per-CPU mapping data after
3837 * issuing the SPTM call.
3838 */
3839 disable_preemption();
3840 /**
3841 * Enter the pmap epoch for the batched unmap operation. This is necessary because we
3842 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
3843 * call, so a concurrent pmap_page_protect() operation against one of those pages may
3844 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
3845 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
3846 * if it does not first drain our epoch.
3847 */
3848 pmap_epoch_enter();
3849 sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags);
3850 pmap_epoch_exit();
3851
3852 sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes;
3853 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3854 const pt_entry_t prev_pte = prev_ptes[i];
3855
3856 if (pte_is_compressed(prev_pte, cpte)) {
3857 if (options & PMAP_OPTIONS_REMOVE) {
3858 ++num_compressed;
3859 if (prev_pte & ARM_PTE_COMPRESSED_ALT) {
3860 ++num_alt_compressed;
3861 }
3862 }
3863 pai_list[i] = INVALID_PAI;
3864 continue;
3865 } else if (!pte_is_valid(prev_pte)) {
3866 pai_list[i] = INVALID_PAI;
3867 continue;
3868 }
3869
3870 if (pte_is_wired(prev_pte)) {
3871 num_unwired++;
3872 }
3873
3874 const pmap_paddr_t pa = pte_to_pa(prev_pte);
3875
3876 if (__improbable(!pa_valid(pa))) {
3877 pai_list[i] = INVALID_PAI;
3878 continue;
3879 }
3880 pai_list[i] = pa_index(pa);
3881 }
3882
3883 enable_preemption();
3884 cpte -= num_mappings;
3885
3886 for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3887 if (pai_list[i] == INVALID_PAI) {
3888 continue;
3889 }
3890 locked_pvh_t locked_pvh;
3891 if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
3892 locked_pvh = pvh_lock_nopreempt(pai_list[i]);
3893 } else {
3894 locked_pvh = pvh_lock(pai_list[i]);
3895 }
3896
3897 bool is_internal, is_altacct;
3898 pv_remove_return_t remove_status = pmap_remove_pv(pmap, cpte, &locked_pvh, &is_internal, &is_altacct);
3899
3900 switch (remove_status) {
3901 case PV_REMOVE_SUCCESS:
3902 ++num_removed;
3903 if (is_altacct) {
3904 assert(is_internal);
3905 num_internal++;
3906 num_alt_internal++;
3907 } else if (is_internal) {
3908 if (ppattr_test_reusable(pai_list[i])) {
3909 num_reusable++;
3910 } else {
3911 num_internal++;
3912 }
3913 } else {
3914 num_external++;
3915 }
3916 break;
3917 default:
3918 /*
3919 * PVE already removed; this can happen due to a concurrent pmap_disconnect()
3920 * executing before we grabbed the PVH lock.
3921 */
3922 break;
3923 }
3924
3925 pvh_unlock(&locked_pvh);
3926 }
3927
3928 va += (num_mappings << pmap_page_shift);
3929 }
3930
3931 if (__improbable(need_strong_sync)) {
3932 arm64_sync_tlb(true);
3933 }
3934
3935 /*
3936 * Update the counts
3937 */
3938 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
3939
3940 if (pmap != kernel_pmap) {
3941 if (num_unwired != 0) {
3942 ptd_info_t * const ptd_info = ptep_get_info(cpte - 1);
3943 if (__improbable(os_atomic_sub_orig(&ptd_info->wiredcnt, num_unwired, relaxed) < num_unwired)) {
3944 panic("%s: pmap %p VA [0x%llx, 0x%llx) (ptd info %p) wired count underflow", __func__, pmap,
3945 (unsigned long long)start, (unsigned long long)end, ptd_info);
3946 }
3947 }
3948
3949 /* update ledgers */
3950 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
3951 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
3952 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
3953 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
3954 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
3955 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
3956 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
3957 /* make needed adjustments to phys_footprint */
3958 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
3959 ((num_internal -
3960 num_alt_internal) +
3961 (num_compressed -
3962 num_alt_compressed)) * pmap_page_size);
3963 }
3964 }
3965
3966
3967 /*
3968 * Remove the given range of addresses
3969 * from the specified map.
3970 *
3971 * It is assumed that the start and end are properly
3972 * rounded to the hardware page size.
3973 */
3974 void
3975 pmap_remove(
3976 pmap_t pmap,
3977 vm_map_address_t start,
3978 vm_map_address_t end)
3979 {
3980 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
3981 }
3982
3983 MARK_AS_PMAP_TEXT vm_map_address_t
3984 pmap_remove_options_internal(
3985 pmap_t pmap,
3986 vm_map_address_t start,
3987 vm_map_address_t end,
3988 int options)
3989 {
3990 vm_map_address_t eva = end;
3991 tt_entry_t *tte_p;
3992 bool unlock = true;
3993
3994 if (__improbable(end < start)) {
3995 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
3996 }
3997 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3998 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
3999 }
4000
4001 validate_pmap_mutable(pmap);
4002
4003 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4004
4005 pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
4006 pmap_lock(pmap, lock_mode);
4007
4008 tte_p = pmap_tte(pmap, start);
4009
4010 if ((tte_p == NULL) || ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_FAULT)) {
4011 goto done;
4012 }
4013
4014 assertf(tte_is_table(*tte_p), "%s: invalid TTE %p (0x%llx) for pmap %p va 0x%llx",
4015 __func__, tte_p, (unsigned long long)*tte_p, pmap, (unsigned long long)start);
4016
4017 pmap_remove_range_options(pmap, start, end, options);
4018
4019 if (pmap->type != PMAP_TYPE_USER) {
4020 goto done;
4021 }
4022
4023 uint16_t refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
4024 if (__improbable(refcnt == 0)) {
4025 ptd_info_t *ptd_info = ptep_get_info((pt_entry_t*)ttetokv(*tte_p));
4026 os_atomic_inc(&ptd_info->wiredcnt, relaxed); // Prevent someone else from freeing the table if we need to drop the lock
4027 if (!pmap_lock_shared_to_exclusive(pmap)) {
4028 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4029 }
4030 lock_mode = PMAP_LOCK_EXCLUSIVE;
4031 refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
4032 if ((os_atomic_dec(&ptd_info->wiredcnt, relaxed) == 0) && (refcnt == 0)) {
4033 /**
4034 * Drain any concurrent retype-sensitive SPTM operations. This is needed to
4035 * ensure that we don't unmap the page table and retype it while those operations
4036 * are still finishing on other CPUs, leading to an SPTM violation. In particular,
4037 * the multipage batched cacheability/attribute update code may issue SPTM calls
4038 * without holding the relevant PVH or pmap locks, so we can't guarantee those
4039 * calls have actually completed despite observing refcnt == 0.
4040 *
4041 * At this point, we CAN guarantee that:
4042 * 1) All prior PTE removals required to produce refcnt == 0 have
4043 * completed and been synchronized for all observers by DSB, and the
4044 * relevant PV list entries removed. Subsequent calls not already in the
4045 * pmap epoch will no longer observe these mappings.
4046 * 2) We now hold the pmap lock exclusive, so there will be no further attempt
4047 * to enter mappings in this page table before it is unmapped.
4048 */
4049 pmap_epoch_prepare_drain();
4050 pmap_epoch_drain();
4051 pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr), true);
4052 unlock = false; // pmap_tte_deallocate() has dropped the lock
4053 }
4054 }
4055 done:
4056 if (unlock) {
4057 pmap_unlock(pmap, lock_mode);
4058 }
4059
4060 return eva;
4061 }
4062
4063 void
4064 pmap_remove_options(
4065 pmap_t pmap,
4066 vm_map_address_t start,
4067 vm_map_address_t end,
4068 int options)
4069 {
4070 vm_map_address_t va;
4071
4072 if (pmap == PMAP_NULL) {
4073 return;
4074 }
4075
4076 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4077
4078 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4079 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4080 VM_KERNEL_ADDRHIDE(end));
4081
4082 #if MACH_ASSERT
4083 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4084 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4085 pmap, (uint64_t)start, (uint64_t)end);
4086 }
4087 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4088 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4089 pmap, (uint64_t)start, (uint64_t)end);
4090 }
4091 #endif
4092
4093 /*
4094 * We allow single-page requests to execute non-preemptibly,
4095 * as it doesn't make sense to sample AST_URGENT for a single-page
4096 * operation, and there are a couple of special use cases that
4097 * require a non-preemptible single-page operation.
4098 */
4099 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4100 pmap_verify_preemptible();
4101 }
4102
4103 /*
4104 * Invalidate the translation buffer first
4105 */
4106 va = start;
4107 while (va < end) {
4108 vm_map_address_t l;
4109
4110 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4111 if (l > end) {
4112 l = end;
4113 }
4114
4115 va = pmap_remove_options_internal(pmap, va, l, options);
4116 }
4117
4118 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4119 }
4120
4121
4122 /*
4123 * Remove phys addr if mapped in specified map
4124 */
4125 void
4126 pmap_remove_some_phys(
4127 __unused pmap_t map,
4128 __unused ppnum_t pn)
4129 {
4130 /* Implement to support working set code */
4131 }
4132
4133 /*
4134 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4135 * switch a thread onto a new vm_map.
4136 */
4137 void
4138 pmap_switch_user(thread_t thread, vm_map_t new_map)
4139 {
4140 pmap_t new_pmap = new_map->pmap;
4141
4142
4143 thread->map = new_map;
4144 pmap_set_pmap(new_pmap, thread);
4145
4146 }
4147 void
4148 pmap_set_pmap(
4149 pmap_t pmap,
4150 thread_t thread)
4151 {
4152 pmap_switch(pmap, thread);
4153 }
4154
4155 MARK_AS_PMAP_TEXT void
4156 pmap_switch_internal(
4157 pmap_t pmap,
4158 thread_t thread)
4159 {
4160 validate_pmap_mutable(pmap);
4161 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4162 const uint16_t asid_index = PMAP_HWASID(pmap);
4163 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4164 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4165 }
4166
4167 #if __ARM_KERNEL_PROTECT__
4168 asid_index >>= 1;
4169 #endif
4170
4171 if (asid_index > 0) {
4172 pmap_update_plru(asid_index);
4173 }
4174
4175 __unused sptm_return_t sptm_return;
4176 #if HAS_MTE
4177 if (ml_thread_get_sec_override(thread)) {
4178 assert(pmap != kernel_pmap);
4179 sptm_return = sptm_switch_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_MTE);
4180 #else
4181 #pragma unused(thread)
4182 if (0) {
4183 #endif
4184 } else {
4185 sptm_return = sptm_switch_root(pmap->ttep, 0, 0);
4186 }
4187
4188 #if DEVELOPMENT || DEBUG
4189 if (__improbable(sptm_return & SPTM_SWITCH_ASID_TLBI_FLUSH)) {
4190 os_atomic_inc(&pmap_asid_flushes, relaxed);
4191 }
4192
4193 if (__improbable(sptm_return & SPTM_SWITCH_RCTX_FLUSH)) {
4194 os_atomic_inc(&pmap_speculation_restrictions, relaxed);
4195 }
4196 #endif /* DEVELOPMENT || DEBUG */
4197 }
4198
4199 void
4200 pmap_switch(
4201 pmap_t pmap,
4202 thread_t thread)
4203 {
4204 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
4205 pmap_switch_internal(pmap, thread);
4206 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4207 }
4208
4209 void
4210 pmap_page_protect(
4211 ppnum_t ppnum,
4212 vm_prot_t prot)
4213 {
4214 pmap_page_protect_options(ppnum, prot, 0, NULL);
4215 }
4216
4217 /**
4218 * Helper function for performing per-mapping accounting following an SPTM disjoint unmap request.
4219 *
4220 * @note [pmap] cannot be the kernel pmap. This is because we do not maintain a ledger in the
4221 * kernel pmap.
4222 *
4223 * @param pmap The pmap that contained the mapping
4224 * @param pai The physical page index mapped by the mapping
4225 * @param is_compressed Indicates whether the operation was an unmap-to-compress vs. a full unmap
4226 * @param is_internal Indicates whether the mapping was for an internal (aka anonymous) VM page
4227 * @param is_altacct Indicates whether the mapping was subject to alternate accounting.
4228 */
4229 static void
4230 pmap_disjoint_unmap_accounting(pmap_t pmap, unsigned int pai, bool is_compressed, bool is_internal, bool is_altacct)
4231 {
4232 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4233 pvh_assert_locked(pai);
4234
4235 assert(pmap != kernel_pmap);
4236
4237 if (is_internal &&
4238 !is_altacct &&
4239 ppattr_test_reusable(pai)) {
4240 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4241 } else if (!is_internal) {
4242 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4243 }
4244
4245 if (is_altacct) {
4246 assert(is_internal);
4247 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4248 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4249 if (is_compressed) {
4250 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4251 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4252 }
4253 } else if (ppattr_test_reusable(pai)) {
4254 assert(is_internal);
4255 if (is_compressed) {
4256 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4257 /* was not in footprint, but is now */
4258 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4259 }
4260 } else if (is_internal) {
4261 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4262
4263 /*
4264 * Update all stats related to physical footprint, which only
4265 * deals with internal pages.
4266 */
4267 if (is_compressed) {
4268 /*
4269 * This removal is only being done so we can send this page to
4270 * the compressor; therefore it mustn't affect total task footprint.
4271 */
4272 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4273 } else {
4274 /*
4275 * This internal page isn't going to the compressor, so adjust stats to keep
4276 * phys_footprint up to date.
4277 */
4278 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4279 }
4280 } else {
4281 /* external page: no impact on ledgers */
4282 }
4283 }
4284
4285 /**
4286 * Helper function for issuing a disjoint unmap request to the SPTM and performing
4287 * related accounting. This function uses the 'prev_ptes' list generated by
4288 * the sptm_unmap_disjoint() call to determine whether said call altered the
4289 * relevant PTEs in a manner that would require accounting updates.
4290 *
4291 * @param pa The physical address against which the disjoint unmap will be issued.
4292 * @param num_mappings The number of disjoint mappings for the SPTM to update.
4293 * The per-CPU sptm_ops array should contain the same number
4294 * of individual disjoint requests.
4295 */
4296 static void
4297 pmap_disjoint_unmap(pmap_paddr_t pa, unsigned int num_mappings)
4298 {
4299 const unsigned int pai = pa_index(pa);
4300
4301 pvh_assert_locked(pai);
4302
4303 assert(num_mappings <= SPTM_MAPPING_LIMIT);
4304
4305 assert(get_preemption_level() > 0);
4306 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
4307
4308 sptm_unmap_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings);
4309
4310 for (unsigned int cur_mapping = 0; cur_mapping < num_mappings; ++cur_mapping) {
4311 pt_entry_t prev_pte = sptm_pcpu->sptm_prev_ptes[cur_mapping];
4312
4313 pt_desc_t * const ptdp = sptm_pcpu->sptm_ptds[cur_mapping];
4314 const pmap_t pmap = ptdp->pmap;
4315
4316 assertf(!pte_is_valid(prev_pte) ||
4317 ((pte_to_pa(prev_pte) & ~PAGE_MASK) == pa), "%s: prev_pte 0x%llx does not map pa 0x%llx",
4318 __func__, (unsigned long long)prev_pte, (unsigned long long)pa);
4319
4320 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4321 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4322
4323 if (pmap != kernel_pmap) {
4324 /*
4325 * If the prior PTE is invalid (which may happen due to a concurrent remove operation),
4326 * the compressed marker won't be written so we shouldn't account the mapping as compressed.
4327 */
4328 const bool is_compressed = (pte_is_valid(prev_pte) &&
4329 ((sptm_pcpu->sptm_ops[cur_mapping].pte_template & ARM_PTE_COMPRESSED_MASK) != 0));
4330 const bool is_internal = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_INTERNAL) != 0;
4331 const bool is_altacct = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_ALTACCT) != 0;
4332
4333 /*
4334 * The rule is that accounting related to PTE contents (wired, PTD refcount)
4335 * must be updated by whoever clears the PTE, while accounting related to physical page
4336 * attributes must be updated by whoever clears the PVE. We therefore always call
4337 * pmap_disjoint_unmap_accounting() here since we're removing the PVE, but only update
4338 * wired/PTD accounting if the prior PTE was valid.
4339 */
4340 pmap_disjoint_unmap_accounting(pmap, pai, is_compressed, is_internal, is_altacct);
4341
4342 if (!pte_is_valid(prev_pte)) {
4343 continue;
4344 }
4345
4346 if (pte_is_wired(prev_pte)) {
4347 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4348 if (__improbable(os_atomic_dec_orig(&sptm_pcpu->sptm_ptd_info[cur_mapping]->wiredcnt, relaxed) == 0)) {
4349 panic("%s: over-unwire of ptdp %p, ptd info %p", __func__,
4350 ptdp, sptm_pcpu->sptm_ptd_info[cur_mapping]);
4351 }
4352 }
4353 }
4354 }
4355 }
4356
4357 /**
4358 * The following two functions, pmap_multipage_op_submit_disjoint() and
4359 * pmap_multipage_op_add_page(), are intended to allow callers to manage batched SPTM
4360 * operations that may span multiple physical pages. They are intended to operate in
4361 * a way that allows callers such as pmap_page_protect_options_with_flush_range() to
4362 * insert mappings into the per-CPU SPTM disjoint ops array in the same manner that
4363 * they would for an ordinary single-page operation.
4364 * Functions such as pmap_page_protect_options_with_flush_range() operate on a single
4365 * physical page but may be passed a non-NULL flush_range object to indicate that the
4366 * call is part of a larger batched operation which may span multiple physical pages.
4367 * In that scenario, these functions are intended to be used as follows:
4368 * 1) Call pmap_multipage_op_add_page() to insert a "header" for the page into the per-
4369 * CPU SPTM ops array. Use the return value from this call as the starting index
4370 * at which to add ordinary mapping entries into the same array.
4371 * 2) Insert sptm_disjoint_op_t entries into the ops array in the normal manner until
4372 * the array is full, the SPTM options required for the upcoming sequence of pages
4373 * need to change, or the current mapping matches flush_range->current_ptep.
4374 * In the latter case, pmap_insert_flush_range_template() may instead be used
4375 * to insert the mapping into the per-CPU SPTM region templates array. See the
4376 * documentation for pmap_insert_flush_range_template() below.
4377 * 3) If the array is full, call pmap_multipage_op_submit_disjoint() and return to step 1).
4378 * 4) If the SPTM options need to change, call pmap_multipage_op_add_page() to insert
4379 * a new header with the updated options and, using the return value as the new
4380 * insertion point for the ops array, resume step 2).
4381 * 5) Upon completion, if there are any pending not-yet-submitted mappings, do not
4382 * submit those mappings to the SPTM as would ordinarily be done for a single-page
4383 * call. These trailing mappings will be submitted as part of the next batch,
4384 * or by the next-higher caller if the range operation is complete.
4385 *
 * Note that, as a performance optimization, the caller may track the insertion
 * point in the disjoint ops array locally (i.e. without incrementing
 * flush_range->pending_disjoint_entries on every iteration), as long as it takes
 * care to do the following:
4390 * 1) Initialize and update that insertion point as described in steps 1) and 4) above.
4391 * 2) Pass the updated insertion point as the 'pending_disjoint_entries' parameter into the calls
4392 * in steps 3) and 4) above.
4393 * 3) Update flush_range->pending_disjoint_entries with the locally-maintained value along with
4394 * step 5) above.
4395 */
4396
4397 /**
4398 * Submit any pending disjoint multi-page mapping updates to the SPTM.
4399 *
4400 * @note This function must be called with preemption disabled, and will drop
4401 * the preemption-disable count upon submitting to the SPTM.
4402 * @note [pending_disjoint_entries] must include *all* pending entries in the SPTM ops array,
4403 * including physical address "header" entries.
4404 * @note This function automatically updates the per_paddr_header.num_mappings field
4405 * for the most recent physical address header in the SPTM ops array to its final
4406 * value.
4407 *
4408 * @param pending_disjoint_entries The number of not-yet-submitted mappings according to the caller.
4409 * This value may be greater than [flush_range]->pending_disjoint_entries if
4410 * the caller has inserted mappings into the ops array without
4411 * updating [flush_range]->pending_disjoint_entries, in which case this
4412 * function will update [flush_range]->pending_disjoint_entries with the
4413 * caller's value.
4414 * @param flush_range The object tracking the current state of the multipage disjoint
4415 * operation.
4416 */
4417 static inline void
4418 pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tlb_flush_range_t *flush_range)
4419 {
4420 /**
4421 * Reconcile the number of pending entries as tracked by the caller with the
4422 * number of pending entries tracked by flush_range. If the caller's value is
4423 * greater, we assume the caller has inserted locally-tracked mappings into the
4424 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
4425 * assume the caller has no locally-tracked mappings and is simply trying to
4426 * purge any pending mappings from a prior call sequence.
4427 */
4428 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4429 flush_range->pending_disjoint_entries = pending_disjoint_entries;
4430 } else {
4431 assert(pending_disjoint_entries == 0);
4432 }
4433 if (flush_range->pending_disjoint_entries != 0) {
4434 assert(get_preemption_level() > 0);
4435 /**
4436 * Compute the correct number of mappings for the most recent paddr
4437 * header based on the current position in the SPTM ops array.
4438 */
4439 flush_range->current_header->per_paddr_header.num_mappings =
4440 flush_range->pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4441 const sptm_return_t sptm_return = sptm_update_disjoint_multipage(
4442 PERCPU_GET(pmap_sptm_percpu)->sptm_ops_pa, flush_range->pending_disjoint_entries);
4443
4444 /**
4445 * We may be submitting the batch and exiting the epoch partway through
4446 * processing the PV list for a page. That's fine, because in that case we'll
4447 * hold the PV lock for that page, which will prevent mappings of that page from
4448 * being disconnected and will prevent the completion of pmap_remove() against
4449 * any of those mappings, thus also guaranteeing the relevant page table pages
4450 * can't be freed. The epoch still protects mappings for any prior page in
4451 * the batch, whose PV locks are no longer held.
4452 */
4453 pmap_epoch_exit();
4454 enable_preemption();
4455 if (flush_range->pending_region_entries != 0) {
4456 flush_range->processed_entries += flush_range->pending_disjoint_entries;
4457 } else {
4458 flush_range->processed_entries = 0;
4459 }
4460 flush_range->pending_disjoint_entries = 0;
4461 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4462 flush_range->ptfr_flush_needed = true;
4463 }
4464 }
4465 }
4466
4467 /**
4468 * Insert a new physical address "header" entry into the per-CPU SPTM ops array for a
4469 * multi-page SPTM operation. It is expected that the caller will subsequently add
4470 * mapping entries for this physical address into the array.
4471 *
4472 * @note This function will disable preemption upon creation of the first paddr header
4473 * (index 0 in the per-CPU SPTM ops array) and it is expected that
4474 * pmap_multipage_op_submit() will subsequently be called on the same CPU.
4475 * @note Before inserting the new header, this function automatically updates the
4476 * per_paddr_header.num_mappings field for the previous physical address header
4477 * (if present) in the SPTM ops array to its final value.
4478 *
4479 * @param phys The physical address for which to insert a header entry.
4480 * @param inout_pending_disjoint_entries
4481 * [input] The number of not-yet-submitted mappings according to the caller.
4482 * This value may be greater than [flush_range]->pending_disjoint_entries if
4483 * the caller has inserted mappings into the ops array without
4484 * updating [flush_range]->pending_disjoint_entries, in which case this
4485 * function will update [flush_range]->pending_disjoint_entries with the
4486 * caller's value.
4487 * [output] Returns the starting index at which the caller should insert mapping
4488 * entries into the per-CPU SPTM ops array.
4489 * @param sptm_update_options SPTM_UPDATE_* flags to pass to the SPTM call.
4490 * SPTM_UPDATE_SKIP_PAPT is automatically inserted by this
4491 * function.
4492 * @param flush_range The object tracking the current state of the multipage operation.
4493 *
 * @return True if the disjoint operation was submitted to the SPTM due to the ops array already
4495 * being full, false otherwise. In the former case, the new header will not be added
4496 * to the array; the caller will need to re-invoke this function after taking any
4497 * necessary post-submission action (such as enabling preemption).
4498 */
4499 static inline bool
4500 pmap_multipage_op_add_page(
4501 pmap_paddr_t phys,
4502 unsigned int *inout_pending_disjoint_entries,
4503 uint32_t sptm_update_options,
4504 pmap_tlb_flush_range_t *flush_range)
4505 {
4506 unsigned int pending_disjoint_entries = *inout_pending_disjoint_entries;
4507
4508 /**
4509 * Reconcile the number of pending entries as tracked by the caller with the
4510 * number of pending entries tracked by flush_range. If the caller's value is
4511 * greater, we assume the caller has inserted locally-tracked mappings into the
4512 * array without directly updating flush_range->pending_disjoint_entries. Otherwise, we
4513 * assume the caller has no locally-tracked mappings and is adding its paddr
4514 * header for the first time.
4515 */
4516 if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4517 flush_range->pending_disjoint_entries = pending_disjoint_entries;
4518 } else {
4519 assert(pending_disjoint_entries == 0);
4520 }
4521 if (flush_range->pending_disjoint_entries >= (SPTM_MAPPING_LIMIT - 1)) {
4522 /**
4523 * If the SPTM ops array is either full or only has space for the paddr
4524 * header, there won't be room for mapping entries, so submit the pending
4525 * mappings to the SPTM now, and return to allow the caller to take
4526 * any necessary post-submission action.
4527 */
4528 pmap_multipage_op_submit_disjoint(pending_disjoint_entries, flush_range);
4529 *inout_pending_disjoint_entries = 0;
4530 return true;
4531 }
4532 pending_disjoint_entries = flush_range->pending_disjoint_entries;
4533
4534 sptm_update_options |= SPTM_UPDATE_SKIP_PAPT;
4535 if (pending_disjoint_entries == 0) {
4536 disable_preemption();
4537 /**
4538 * Enter the pmap epoch while we gather the disjoint update arguments
4539 * and issue the SPTM call. Since this operation may cover multiple physical
4540 * pages, we may construct the argument array and invoke the SPTM without holding
4541 * all relevant PVH locks or pmap locks. We therefore need to record that we are
4542 * collecting and modifying mapping state so that e.g. pmap_page_protect() does
4543 * not attempt to retype the underlying pages and pmap_remove() does not attempt
4544 * to free the page tables used for these mappings without first draining our epoch.
4545 */
4546 pmap_epoch_enter();
4547 flush_range->pending_disjoint_entries = 1;
4548 } else {
4549 /**
4550 * Before inserting the new header, update the prior header's number
4551 * of paddr-specific mappings to its final value.
4552 */
4553 assert(flush_range->current_header != NULL);
4554 flush_range->current_header->per_paddr_header.num_mappings =
4555 pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4556 }
4557 sptm_disjoint_op_t *sptm_ops = PERCPU_GET(pmap_sptm_percpu)->sptm_ops;
4558 flush_range->current_header = (sptm_update_disjoint_multipage_op_t*)&sptm_ops[pending_disjoint_entries];
4559 flush_range->current_header_first_mapping_index = ++pending_disjoint_entries;
4560 flush_range->current_header->per_paddr_header.paddr = phys;
4561 flush_range->current_header->per_paddr_header.num_mappings = 0;
4562 flush_range->current_header->per_paddr_header.options = sptm_update_options;
4563
4564 *inout_pending_disjoint_entries = pending_disjoint_entries;
4565 return false;
4566 }
4567
4568 /**
4569 * The following two functions, pmap_multipage_op_submit_region() and
4570 * pmap_insert_flush_range_template(), are meant to be used in a similar fashion
4571 * to pmap_multipage_op_submit_disjoint() and pmap_multipage_op_add_page(),
4572 * but for the specific case in which a given mapping within a PV list happens
4573 * to map the current VA within a VA region being operated on by
4574 * phys_attribute_clear_range(). This allows the pmap to further optimize
4575 * the SPTM calls by using sptm_update_region() to modify all mappings within
4576 * the VA region, which requires far fewer table walks than a disjoint operation.
4577 * Since the starting VA of the region, the owning pmap, and the insertion point
4578 * within the per-CPU region templates array are already known, these functions
4579 * don't require the special "header" entry or the complex array position tracking
4580 * of their disjoint equivalents above.
4581 * Note that these functions may be used together with the disjoint functions above;
4582 * these functions can be used for the "primary" mappings corresponding to the VA
4583 * region being manipulated by the VM layer, while the disjoint functions can be
4584 * used for any alias mappings of the underlying pages which fall outside that
4585 * VA region.
4586 */
4587
4588 /**
4589 * Submit any pending region-based templates for the specified flush_range.
4590 *
4591 * @note This function must be called with preemption disabled, and will drop
4592 * the preemption-disable count upon submitting to the SPTM.
4593 *
4594 * @param flush_range The object tracking the current state of the region operation.
4595 */
4596 static inline void
4597 pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range)
4598 {
4599 if (flush_range->pending_region_entries != 0) {
4600 assert(get_preemption_level() > 0);
4601 pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED);
4602 /**
4603 * If there are any pending disjoint entries, we're already in a pmap epoch.
4604 * For disjoint entries, we need to hold the epoch during the entire time we
4605 * construct the disjoint ops array because those ops may point to some arbitrary
4606 * pmap and we need to ensure the relevant page tables and even the pmap itself
4607 * aren't concurrently reclaimed while our ops array points to them.
4608 * But for a region op like this, we know we already hold the relevant pmap lock
4609 * so none of the above can happen concurrently. We therefore only need to hold
4610 * the epoch across the SPTM call itself to prevent a concurrent unmap operation
4611 * from attempting to retype the mapped pages while our SPTM call has them in-
4612 * flight.
4613 */
4614 if (flush_range->pending_disjoint_entries == 0) {
4615 pmap_epoch_enter();
4616 }
4617 const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep,
4618 flush_range->pending_region_start, flush_range->pending_region_entries,
4619 PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa,
4620 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI);
4621 if (flush_range->pending_disjoint_entries == 0) {
4622 pmap_epoch_exit();
4623 }
4624 enable_preemption();
4625 if (flush_range->pending_disjoint_entries != 0) {
4626 flush_range->processed_entries += flush_range->pending_region_entries;
4627 } else {
4628 flush_range->processed_entries = 0;
4629 }
4630 flush_range->pending_region_start += (flush_range->pending_region_entries <<
4631 pmap_get_pt_attr(flush_range->ptfr_pmap)->pta_page_shift);
4632 flush_range->pending_region_entries = 0;
4633 if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4634 flush_range->ptfr_flush_needed = true;
4635 }
4636 }
4637 }
4638
4639 /**
4640 * Insert a PTE template into the per-CPU SPTM region ops array.
4641 * This is meant to be used as a performance optimization for the case in which a given
4642 * mapping being processed by a function such as pmap_page_protect_options_with_flush_range()
4643 * happens to map the current iteration position within [flush_range]'s VA region.
4644 * In this case the mapping can be inserted as a region-based template rather than a disjoint
4645 * operation as would be done in the general case. The idea is that region-based SPTM
4646 * operations are significantly less expensive than disjoint operations, because each region
4647 * operation only requires a single page table walk at the beginning vs. a table walk for
4648 * each mapping in the disjoint case. Since the majority of mappings processed by a flush
4649 * range operation belong to the main flush range VA region (i.e. alias mappings outside
4650 * the region are less common), the performance improvement can be significant.
4651 *
4652 * @note This function will disable preemption upon inserting the first entry into the
4653 * per-CPU templates array, and will re-enable preemption upon submitting the region
4654 * operation to the SPTM.
4655 *
4656 * @param template The PTE template to insert into the per-CPU templates array.
4657 * @param flush_range The object tracking the current state of the region operation.
4658 *
4659 * @return True if the region operation was submitted to the SPTM, false otherwise.
4660 */
4661 static inline bool
4662 pmap_insert_flush_range_template(pt_entry_t template, pmap_tlb_flush_range_t *flush_range)
4663 {
4664 if (flush_range->pending_region_entries == 0) {
4665 disable_preemption();
4666 }
4667 flush_range->region_entry_added = true;
4668 PERCPU_GET(pmap_sptm_percpu)->sptm_templates[flush_range->pending_region_entries++] = template;
4669 if (flush_range->pending_region_entries == SPTM_MAPPING_LIMIT) {
4670 pmap_multipage_op_submit_region(flush_range);
4671 return true;
4672 }
4673 return false;
4674 }
4675
4676 /**
4677 * Wrapper function for submitting any pending operations, region-based or disjoint,
4678 * tracked by a flush range object. This is meant to be used by the top-level caller that
4679 * iterates over the flush range's VA region and calls functions such as
4680 * pmap_page_protect_options_with_flush_range() or arm_force_fast_fault_with_flush_range()
4681 * to construct the relevant SPTM operations arrays.
4682 *
4683 * @param flush_range The object tracking the current state of region and/or disjoint operations.
4684 */
static inline void
pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range)
{
	/**
	 * Passing 0 indicates that this caller tracks no disjoint mappings of its
	 * own; only the entries already recorded in
	 * flush_range->pending_disjoint_entries (if any) are submitted.
	 */
	pmap_multipage_op_submit_disjoint(0, flush_range);
	/* Submit any pending region templates; this does nothing if none are pending. */
	pmap_multipage_op_submit_region(flush_range);
}
4691
/**
 * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range()
 * is removing/updating all mappings in preparation for a retype operation. In this case
 * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held
 * by the caller, and will perform the necessary pmap epoch drain and retype the page back to XNU_DEFAULT
 * prior to returning.
 */
#define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000
/* Compile-time check that the flag's bit overlaps PMAP_OPTIONS_RESERVED_MASK. */
_Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK,
    "PMAP_OPTIONS_PPO_PENDING_RETYPE outside reserved encoding space");
4702
4703 /**
4704 * Lower the permission for all mappings to a given page. If VM_PROT_NONE is specified,
4705 * the mappings will be removed.
4706 *
4707 * @param ppnum Page number to lower the permission of.
4708 * @param prot The permission to lower to.
4709 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
4710 * PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is
 * already locked and a pmap epoch drain should be performed, along with
4712 * retyping [ppnum] back to XNU_DEFAULT.
4713 * PMAP_OPTIONS_COMPRESSOR indicates the function is called by the
4714 * VM compressor.
4715 * PMAP_OPTIONS_RETYPE requests the [ppnum] be retyped back to XNU_DEFAULT,
4716 * along with an epoch drain; like PMAP_OPTIONS_PPO_PENDING_RETYPE but without
4717 * the PVH lock being held by the caller.
4718 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
4719 * by the caller. This is an input/output parameter which may be updated
4720 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
4721 * @param flush_range When present, this function will skip the TLB flush for the
4722 * mappings that are covered by the range, leaving that to be
4723 * done later by the caller. It may also avoid submitting mapping
4724 * updates directly to the SPTM, instead accumulating them in a
4725 * per-CPU array to be submitted later by the caller.
4726 *
4727 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4728 */
4729 MARK_AS_PMAP_TEXT static void
4730 pmap_page_protect_options_with_flush_range(
4731 ppnum_t ppnum,
4732 vm_prot_t prot,
4733 unsigned int options,
4734 locked_pvh_t *locked_pvh,
4735 pmap_tlb_flush_range_t *flush_range)
4736 {
4737 pmap_paddr_t phys = ptoa(ppnum);
4738 locked_pvh_t local_locked_pvh = {.pvh = 0};
4739 pv_entry_t *pve_p = NULL;
4740 pv_entry_t *pveh_p = NULL;
4741 pv_entry_t *pvet_p = NULL;
4742 pt_entry_t *pte_p = NULL;
4743 pv_entry_t *new_pve_p = NULL;
4744 pt_entry_t *new_pte_p = NULL;
4745
4746 bool remove = false;
4747 unsigned int pvh_cnt = 0;
4748 unsigned int num_mappings = 0, num_skipped_mappings = 0;
4749
4750 assert(ppnum != vm_page_fictitious_addr);
4751
4752 /**
4753 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4754 *
4755 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
4757 * semantics conflict with each other, so assert they are not both true.
4758 */
4759 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
4760
4761 /* Only work with managed pages. */
4762 if (!pa_valid(phys)) {
4763 return;
4764 }
4765
4766 /*
4767 * Determine the new protection.
4768 */
4769 switch (prot) {
4770 case VM_PROT_ALL:
4771 return; /* nothing to do */
4772 case VM_PROT_READ:
4773 case VM_PROT_READ | VM_PROT_EXECUTE:
4774 break;
4775 default:
4776 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4777 options = options & ~PMAP_OPTIONS_NOFLUSH;
4778 remove = true;
4779 break;
4780 }
4781
4782 /**
4783 * We don't support cross-page batching (indicated by flush_range being non-NULL) for removals,
4784 * as removals must use the SPTM prev_ptes array for accounting, which isn't supported for cross-
4785 * page batches.
4786 */
4787 assert((flush_range == NULL) || !remove);
4788
4789 unsigned int pai = pa_index(phys);
4790 if (__probable(locked_pvh == NULL)) {
4791 if (flush_range != NULL) {
4792 /**
4793 * If we're partway through processing a multi-page batched call,
4794 * preemption will already be disabled so we can't simply call
4795 * pvh_lock() which may block. Instead, we first try to acquire
4796 * the lock without waiting, which in most cases should succeed.
4797 * If it fails, we submit the pending batched operations to re-
4798 * enable preemption and then acquire the lock normally.
4799 */
4800 local_locked_pvh = pvh_try_lock(pai);
4801 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
4802 pmap_multipage_op_submit(flush_range);
4803 local_locked_pvh = pvh_lock(pai);
4804 }
4805 } else {
4806 local_locked_pvh = pvh_lock(pai);
4807 }
4808 } else {
4809 local_locked_pvh = *locked_pvh;
4810 assert(pai == local_locked_pvh.pai);
4811 }
4812 assert(local_locked_pvh.pvh != 0);
4813 pvh_assert_locked(pai);
4814
4815 bool pvh_lock_sleep_mode_needed = false;
4816 bool clear_epoch = false;
4817
4818 /*
4819 * PVH should be locked before accessing per-CPU data, as we're relying on the lock
4820 * to disable preemption.
4821 */
4822 pmap_cpu_data_t *pmap_cpu_data = NULL;
4823 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4824 sptm_disjoint_op_t *sptm_ops = NULL;
4825 pt_desc_t **sptm_ptds = NULL;
4826 ptd_info_t **sptm_ptd_info = NULL;
4827
4828 /* BEGIN IGNORE CODESTYLE */
4829
4830 /**
4831 * This would also work as a block, with the above variables declared using the
4832 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
4833 * dereferencing __block variables through stack forwarding pointers) isn't needed
4834 * here, as we never need to use this code sequence as a closure.
4835 */
4836 #define PPO_PERCPU_INIT() do { \
4837 disable_preemption(); \
4838 pmap_cpu_data = pmap_get_cpu_data(); \
4839 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
4840 sptm_ops = sptm_pcpu->sptm_ops; \
4841 sptm_ptds = sptm_pcpu->sptm_ptds; \
4842 sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \
4843 if (remove) { \
4844 clear_epoch = true; \
4845 pmap_epoch_enter(); \
4846 } \
4847 } while (0)
4848
4849 /* END IGNORE CODESTYLE */
4850
4851
4852 PPO_PERCPU_INIT();
4853
4854 pv_entry_t **pve_pp = NULL;
4855
4856 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
4857 pte_p = pvh_ptep(local_locked_pvh.pvh);
4858 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4859 pve_p = pvh_pve_list(local_locked_pvh.pvh);
4860 pveh_p = pve_p;
4861 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
4862 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
4863 }
4864
4865 int pve_ptep_idx = 0;
4866 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4867
4868 /*
4869 * We need to keep track of whether a particular PVE list contains IOMMU
4870 * mappings when removing entries, because we should only remove CPU
4871 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4872 * it around.
4873 */
4874 bool iommu_mapping_in_pve = false;
4875
4876 /**
4877 * With regard to TLBI, there are three cases:
4878 *
4879 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
4880 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
4881 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
4882 * mapping is out of the range.
4883 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
4884 * let SPTM handle TLBI flushing.
4885 */
4886 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
4887 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
4888
4889 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4890 if (__improbable(pvh_lock_sleep_mode_needed)) {
4891 assert((num_mappings == 0) && (num_skipped_mappings == 0));
4892 if (clear_epoch) {
4893 pmap_epoch_exit();
4894 clear_epoch = false;
4895 }
4896 /**
 * Undo the explicit preemption disable done in the last call to PPO_PERCPU_INIT().
4898 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
4899 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
4900 * core while processing SPTM per-CPU data. At the same time, we also want preemption
4901 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
4902 * urgent ASTs can be handled.
4903 */
4904 enable_preemption();
4905 pvh_lock_enter_sleep_mode(&local_locked_pvh);
4906 pvh_lock_sleep_mode_needed = false;
4907 PPO_PERCPU_INIT();
4908 }
4909
4910 if (pve_p != PV_ENTRY_NULL) {
4911 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4912 if (pte_p == PT_ENTRY_NULL) {
4913 goto protect_skip_pve;
4914 }
4915 }
4916
4917 #ifdef PVH_FLAG_IOMMU
4918 if (pvh_ptep_is_iommu(pte_p)) {
4919 iommu_mapping_in_pve = true;
4920 if (__improbable(remove && (options & PMAP_OPTIONS_COMPRESSOR))) {
4921 const iommu_instance_t iommu = ptep_get_iommu(pte_p);
4922 panic("%s: attempt to compress ppnum 0x%x owned by iommu driver "
4923 "%u (token: %#x), pve_p=%p", __func__, ppnum, GET_IOMMU_ID(iommu),
4924 GET_IOMMU_TOKEN(iommu), pve_p);
4925 }
4926 if (remove && (pve_p == PV_ENTRY_NULL)) {
4927 /*
4928 * We've found an IOMMU entry and it's the only entry in the PV list.
4929 * We don't discard IOMMU entries, so simply set up the new PV list to
4930 * contain the single IOMMU PTE and exit the loop.
4931 */
4932 new_pte_p = pte_p;
4933 break;
4934 }
4935 ++num_skipped_mappings;
4936 goto protect_skip_pve;
4937 }
4938 #endif
4939
4940 const pt_entry_t spte = os_atomic_load(pte_p, relaxed);
4941
4942 if (__improbable(!remove && !pte_is_valid(spte))) {
4943 ++num_skipped_mappings;
4944 goto protect_skip_pve;
4945 }
4946
4947 pt_desc_t *ptdp = NULL;
4948 pmap_t pmap = NULL;
4949 vm_map_address_t va = 0;
4950
4951 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
4952 /**
4953 * If the current mapping matches the flush range's current iteration position,
4954 * there's no need to do the work of getting the PTD. We already know the pmap,
4955 * and the VA is implied by flush_range->pending_region_start.
4956 */
4957 pmap = flush_range->ptfr_pmap;
4958 } else {
4959 ptdp = ptep_get_ptd(pte_p);
4960 pmap = ptdp->pmap;
4961 va = ptd_get_va(ptdp, pte_p);
4962 }
4963
4964 /**
4965 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
4966 * pending disjoint ops, so we don't need to do flush range disjoint op management.
4967 */
4968 if ((flush_range != NULL) && (ptdp != NULL)) {
4969 /**
4970 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
4971 * We do this in three cases:
4972 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
4973 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
4974 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
4975 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
4976 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
4977 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
4978 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
4979 */
4980 uint32_t per_mapping_sptm_update_options = sptm_update_options;
4981 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4982 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
4983 }
4984 if ((num_mappings == 0) ||
4985 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
4986 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
4987 /**
4988 * If we needed to submit the pending disjoint ops to make room for the new page,
4989 * flush any pending region ops to reenable preemption and restart the loop with
4990 * the lock in sleep mode. This prevents preemption from being held disabled
4991 * for an arbitrary amount of time in the pathological case in which we have
4992 * both pending region ops and an excessively long PV list that repeatedly
4993 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
4994 */
4995 pmap_multipage_op_submit_region(flush_range);
4996 assert(num_mappings == 0);
4997 num_skipped_mappings = 0;
4998 pvh_lock_sleep_mode_needed = true;
4999 continue;
5000 }
5001 }
5002 }
5003
5004 if (__improbable((pmap == NULL) ||
5005 (pte_is_valid(spte) && (atop(pte_to_pa(spte)) != ppnum)))) {
5006 #if MACH_ASSERT
5007 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
5008 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
5009 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
5010 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5011
5012 pv_entry_t *check_pvep = pve_p;
5013
5014 do {
5015 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
5016 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
5017 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, (void*)local_locked_pvh.pvh, pve_p, pai);
5018 }
5019 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
5020
5021 /* Restore previous PTEP value. */
5022 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
5023 }
5024 #endif
5025 panic("%s: bad PVE pte_p=%p pmap=%p prot=%d options=%u, pvh=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
5026 __func__, pte_p, pmap, prot, options, (void*)local_locked_pvh.pvh, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5027 }
5028
5029 pt_entry_t pte_template = ARM_PTE_EMPTY;
5030
5031 if (ptdp != NULL) {
5032 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
5033 sptm_ops[num_mappings].vaddr = va;
5034 }
5035
5036 /* Remove the mapping if new protection is NONE */
5037 if (remove) {
5038 sptm_ptds[num_mappings] = ptdp;
5039 sptm_ptd_info[num_mappings] = ptd_get_info(ptdp);
5040 sptm_pcpu->sptm_acct_flags[num_mappings] = 0;
5041 if (pmap != kernel_pmap) {
5042 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5043 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5044
5045 if (is_internal) {
5046 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_INTERNAL;
5047 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5048 }
5049 if (is_altacct) {
5050 sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_ALTACCT;
5051 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5052 }
5053 if (compress && is_internal) {
5054 pte_template = ARM_PTE_COMPRESSED;
5055 if (is_altacct) {
5056 pte_template |= ARM_PTE_COMPRESSED_ALT;
5057 }
5058 }
5059 }
5060 /* Remove this CPU mapping from PVE list. */
5061 if (pve_p != PV_ENTRY_NULL) {
5062 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5063 }
5064 } else {
5065 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5066
5067 if (pmap == kernel_pmap) {
5068 pte_template = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5069 } else {
5070 pte_template = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5071 }
5072
5073 /*
5074 * We must at least clear the 'was writeable' flag, as we're at least revoking write access,
5075 * meaning that the VM is effectively requesting that subsequent write accesses to these mappings
5076 * go through vm_fault() instead of being handled by arm_fast_fault().
5077 */
5078 pte_set_was_writeable(pte_template, false);
5079
5080 /*
5081 * While the naive implementation of this would serve to add execute
5082 * permission, this is not how the VM uses this interface, or how
5083 * x86_64 implements it. So ignore requests to add execute permissions.
5084 */
5085 #if DEVELOPMENT || DEBUG
5086 if ((!(prot & VM_PROT_EXECUTE) && nx_enabled && pmap->nx_enabled) ||
5087 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
5088 #else
5089 if (!(prot & VM_PROT_EXECUTE) ||
5090 (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
5091 #endif
5092 {
5093 pte_template |= pt_attr_leaf_xn(pt_attr);
5094 }
5095 }
5096
5097 if (ptdp != NULL) {
5098 sptm_ops[num_mappings].pte_template = pte_template;
5099 ++num_mappings;
5100 } else if (pmap_insert_flush_range_template(pte_template, flush_range)) {
5101 /**
5102 * We submit both the pending disjoint and pending region ops whenever
5103 * either category reaches the mapping limit. Having pending operations
5104 * in either category will keep preemption disabled, and we want to ensure
5105 * that we can at least temporarily re-enable preemption roughly every
5106 * SPTM_MAPPING_LIMIT mappings.
5107 */
5108 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
5109 pvh_lock_sleep_mode_needed = true;
5110 num_mappings = num_skipped_mappings = 0;
5111 }
5112
5113 protect_skip_pve:
5114 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
5115 if (flush_range != NULL) {
5116 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
5117 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
5118 pmap_multipage_op_submit_region(flush_range);
5119 } else if (num_mappings > 0) {
5120 if (remove) {
5121 pmap_disjoint_unmap(phys, num_mappings);
5122 } else {
5123 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
5124 }
5125 }
5126 pvh_lock_sleep_mode_needed = true;
5127 num_mappings = num_skipped_mappings = 0;
5128 }
5129 pte_p = PT_ENTRY_NULL;
5130 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5131 pve_ptep_idx = 0;
5132
5133 if (remove) {
5134 /**
5135 * If there are any IOMMU mappings in the PVE list, preserve
5136 * those mappings in a new PVE list (new_pve_p) which will later
5137 * become the new PVH entry. Keep track of the CPU mappings in
5138 * pveh_p/pvet_p so they can be deallocated later.
5139 */
5140 if (iommu_mapping_in_pve) {
5141 iommu_mapping_in_pve = false;
5142 pv_entry_t *temp_pve_p = pve_next(pve_p);
5143 pve_remove(&local_locked_pvh, pve_pp, pve_p);
5144 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
5145 pveh_p = pvh_pve_list(local_locked_pvh.pvh);
5146 } else {
5147 assert(pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL));
5148 pveh_p = PV_ENTRY_NULL;
5149 }
5150 pve_p->pve_next = new_pve_p;
5151 new_pve_p = pve_p;
5152 pve_p = temp_pve_p;
5153 continue;
5154 } else {
5155 pvet_p = pve_p;
5156 pvh_cnt++;
5157 }
5158 }
5159
5160 pve_pp = pve_next_ptr(pve_p);
5161 pve_p = pve_next(pve_p);
5162 iommu_mapping_in_pve = false;
5163 }
5164 }
5165
5166 if (num_mappings != 0) {
5167 if (remove) {
5168 pmap_disjoint_unmap(phys, num_mappings);
5169 } else if (flush_range == NULL) {
5170 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
5171 } else {
5172 /* Resync the pending mapping state in flush_range with our local state. */
5173 assert(num_mappings >= flush_range->pending_disjoint_entries);
5174 flush_range->pending_disjoint_entries = num_mappings;
5175 }
5176 }
5177
5178 if (clear_epoch) {
5179 pmap_epoch_exit();
5180 }
5181
5182 /**
5183 * Undo the explicit disable_preemption() done in PPO_PERCPU_INIT().
5184 * Note that enable_preemption() decrements a per-thread counter, so if
5185 * we happen to still hold the PVH lock in spin mode then preemption won't
5186 * actually be re-enabled until we drop the lock (which also decrements
5187 * the per-thread counter.
5188 */
5189 enable_preemption();
5190
5191 /* if we removed a bunch of entries, take care of them now */
5192 if (remove) {
5193 /**
5194 * If a retype is going to be needed here and/or by our caller, drain
5195 * the epochs to ensure that concurrent calls to batched operations such as
5196 * pmap_remove() and the various multipage attribute update functions have
5197 * finished consuming mappings of this page.
5198 */
5199 bool retype_needed = false;
5200 sptm_frame_type_t frame_type = XNU_DEFAULT;
5201 if (options & (PMAP_OPTIONS_PPO_PENDING_RETYPE | PMAP_OPTIONS_RETYPE)) {
5202 /**
5203 * If the frame type isn't currently XNU_DEFAULT, retype it back either
5204 * to satisfy the caller's request (PMAP_OPTIONS_RETYPE) or to ensure
5205 * the caller's subsequent retype will work as not all non-default types
5206 * can be directly retyped to one another without going through XNU_DEFAULT.
5207 */
5208 frame_type = sptm_get_frame_type(phys);
5209 retype_needed = (frame_type != XNU_DEFAULT);
5210 }
5211 /**
5212 * If the caller is indicating that it will subsequently retype the page
5213 * by passing PMAP_OPTIONS_PPO_PENDING_RETYPE, then we'll need to drain the epochs
5214 * regardless of current frame type to prepare for the caller's retype.
5215 */
5216 const bool drain_needed = retype_needed || !!(options & PMAP_OPTIONS_PPO_PENDING_RETYPE);
5217 if (__improbable(drain_needed)) {
5218 pmap_epoch_prepare_drain();
5219 }
5220 if (new_pve_p != PV_ENTRY_NULL) {
5221 pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP);
5222 } else if (new_pte_p != PT_ENTRY_NULL) {
5223 pvh_update_head(&local_locked_pvh, new_pte_p, PVH_TYPE_PTEP);
5224 } else {
5225 pvh_set_flags(&local_locked_pvh, 0);
5226 pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
5227 }
5228
5229 if (__improbable(drain_needed)) {
5230 pmap_epoch_drain();
5231 }
5232 if (__improbable(retype_needed)) {
5233 const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5234 sptm_retype(phys, frame_type, XNU_DEFAULT, retype_params);
5235 }
5236 }
5237
5238 if (__probable(locked_pvh == NULL)) {
5239 pvh_unlock(&local_locked_pvh);
5240 } else {
5241 *locked_pvh = local_locked_pvh;
5242 }
5243
5244 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5245 assert(pveh_p != PV_ENTRY_NULL);
5246 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5247 }
5248
5249 if ((flush_range != NULL) && !preemption_enabled()) {
5250 flush_range->processed_entries += num_skipped_mappings;
5251 }
5252 }
5253
5254 MARK_AS_PMAP_TEXT void
5255 pmap_page_protect_options_internal(
5256 ppnum_t ppnum,
5257 vm_prot_t prot,
5258 unsigned int options,
5259 void *arg)
5260 {
5261 if (arg != NULL) {
5262 /*
5263 * This is a legacy argument from pre-ARM era that the VM layer passes in to hint that it will call
5264 * pmap_flush() later to flush the TLB. On ARM platforms, however, pmap_flush() is not implemented,
5265 * as it's typically more efficient to perform the TLB flushing inline with the page table updates
5266 * themselves. Therefore, if the argument is non-NULL, pmap will take care of TLB flushing itself
5267 * by clearing PMAP_OPTIONS_NOFLUSH.
5268 */
5269 options &= ~PMAP_OPTIONS_NOFLUSH;
5270 }
5271 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL, NULL);
5272 }
5273
5274 void
5275 pmap_page_protect_options(
5276 ppnum_t ppnum,
5277 vm_prot_t prot,
5278 unsigned int options,
5279 void *arg)
5280 {
5281 pmap_paddr_t phys = ptoa(ppnum);
5282
5283 assert(ppnum != vm_page_fictitious_addr);
5284
5285 /* Only work with managed pages. */
5286 if (!pa_valid(phys)) {
5287 return;
5288 }
5289
5290 /*
5291 * Determine the new protection.
5292 */
5293 if (prot == VM_PROT_ALL) {
5294 return; /* nothing to do */
5295 }
5296
5297 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5298
5299 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5300
5301 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5302 }
5303
5304
5305 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5306 MARK_AS_PMAP_TEXT void
5307 pmap_disable_user_jop_internal(pmap_t pmap)
5308 {
5309 if (pmap == kernel_pmap) {
5310 panic("%s: called with kernel_pmap", __func__);
5311 }
5312 validate_pmap_mutable(pmap);
5313 sptm_configure_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_JOP);
5314 pmap->disable_jop = true;
5315 }
5316
5317 void
5318 pmap_disable_user_jop(pmap_t pmap)
5319 {
5320 pmap_disable_user_jop_internal(pmap);
5321 }
5322 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5323
5324 /*
5325 * Indicates if the pmap layer enforces some additional restrictions on the
5326 * given set of protections.
5327 */
5328 bool
5329 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5330 {
5331 return false;
5332 }
5333
5334 static inline bool
5335 pmap_allows_xo(pmap_t pmap __unused)
5336 {
5337 return true;
5338 }
5339
5340 /*
5341 * Set the physical protection on the
5342 * specified range of this map as requested.
5343 * VERY IMPORTANT: Will not increase permissions.
5344 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5345 */
5346 void
5347 pmap_protect(
5348 pmap_t pmap,
5349 vm_map_address_t b,
5350 vm_map_address_t e,
5351 vm_prot_t prot)
5352 {
5353 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5354 }
5355
5356 static bool
5357 pmap_protect_strong_sync(unsigned int num_mappings __unused)
5358 {
5359 return false;
5360 }
5361
5362 MARK_AS_PMAP_TEXT vm_map_address_t
5363 pmap_protect_options_internal(
5364 pmap_t pmap,
5365 vm_map_address_t start,
5366 vm_map_address_t end,
5367 vm_prot_t prot,
5368 unsigned int options,
5369 __unused void *args)
5370 {
5371 pt_entry_t *pte_p;
5372 bool set_NX = true;
5373 bool set_XO = false;
5374 bool should_have_removed = false;
5375 bool need_strong_sync = false;
5376
5377 /* Validate the pmap input before accessing its data. */
5378 validate_pmap_mutable(pmap);
5379
5380 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5381
5382 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5383 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5384 }
5385
5386 #if DEVELOPMENT || DEBUG
5387 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5388 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5389 should_have_removed = true;
5390 }
5391 } else
5392 #endif
5393 {
5394 /* Determine the new protection. */
5395 switch (prot) {
5396 case VM_PROT_READ:
5397 case VM_PROT_READ | VM_PROT_EXECUTE:
5398 break;
5399 case VM_PROT_READ | VM_PROT_WRITE:
5400 case VM_PROT_ALL:
5401 return end; /* nothing to do */
5402 case VM_PROT_EXECUTE:
5403 set_XO = true;
5404 if (pmap_allows_xo(pmap)) {
5405 break;
5406 }
5407 /* Fall through and panic if this pmap shouldn't be allowed to have XO mappings. */
5408 OS_FALLTHROUGH;
5409 default:
5410 should_have_removed = true;
5411 }
5412 }
5413
5414 if (__improbable(should_have_removed)) {
5415 panic("%s: should have been a remove operation, "
5416 "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5417 __FUNCTION__,
5418 pmap, (void *)start, (void *)end, prot, options, args);
5419 }
5420
5421 #if DEVELOPMENT || DEBUG
5422 bool force_write = false;
5423 if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5424 force_write = true;
5425 }
5426 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5427 #else
5428 if ((prot & VM_PROT_EXECUTE))
5429 #endif
5430 {
5431 set_NX = false;
5432 } else {
5433 set_NX = true;
5434 }
5435
5436 const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5437 vm_map_address_t va = start;
5438 vm_map_address_t sptm_start_va = start;
5439 unsigned int num_mappings = 0;
5440
5441 pmap_lock(pmap, PMAP_LOCK_SHARED);
5442
5443 pte_p = pmap_pte(pmap, start);
5444
5445 if (pte_p == NULL) {
5446 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5447 return end;
5448 }
5449
5450 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
5451 #if DEVELOPMENT || DEBUG
5452 if (!force_write)
5453 #endif
5454 {
5455 disable_preemption();
5456 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5457 }
5458
5459 pt_entry_t tmplate = ARM_PTE_EMPTY;
5460
5461 if (pmap == kernel_pmap) {
5462 #if DEVELOPMENT || DEBUG
5463 if (force_write) {
5464 tmplate = ARM_PTE_AP(AP_RWNA);
5465 } else
5466 #endif
5467 {
5468 tmplate = ARM_PTE_AP(AP_RONA);
5469 }
5470 } else {
5471 #if DEVELOPMENT || DEBUG
5472 if (force_write) {
5473 assert(pmap->type != PMAP_TYPE_NESTED);
5474 tmplate = pt_attr_leaf_rw(pt_attr);
5475 } else
5476 #endif
5477 if (__improbable(set_XO)) {
5478 tmplate = pt_attr_leaf_rona(pt_attr);
5479 } else {
5480 tmplate = pt_attr_leaf_ro(pt_attr);
5481 }
5482 }
5483
5484 if (set_NX) {
5485 tmplate |= pt_attr_leaf_xn(pt_attr);
5486 }
5487
5488 while (va < end) {
5489 pt_entry_t spte = ARM_PTE_EMPTY;
5490
5491 /**
5492 * Removing "NX" would grant "execute" access immediately, bypassing any
5493 * checks VM might want to do in its soft fault path.
5494 * pmap_protect() and co. are not allowed to increase access permissions,
5495 * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case.
5496 * Therefore, if we are not explicitly clearing execute permissions, inherit
5497 * the existing permissions.
5498 */
5499 if (!set_NX) {
5500 spte = os_atomic_load(pte_p, relaxed);
5501 if (__improbable(!pte_is_valid(spte))) {
5502 tmplate |= pt_attr_leaf_xn(pt_attr);
5503 } else {
5504 tmplate |= (spte & ARM_PTE_XMASK);
5505 }
5506 }
5507
5508 #if DEVELOPMENT || DEBUG
5509 /*
5510 * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to
5511 * provide a "backdoor" to allow normally write-protected compressor pages to be
5512 * be temporarily written without triggering expensive write faults.
5513 */
5514 while (force_write) {
5515 if (spte == ARM_PTE_EMPTY) {
5516 spte = os_atomic_load(pte_p, relaxed);
5517 }
5518 const pt_entry_t prev_pte = spte;
5519
5520 /* A concurrent disconnect may have cleared the PTE. */
5521 if (__improbable(!pte_is_valid(spte))) {
5522 break;
5523 }
5524
5525 /* Inherit permissions and "was_writeable" from the template. */
5526 spte = (spte & ~(ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE)) |
5527 (tmplate & (ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE));
5528
5529 /* Access flag should be set for any immediate change in protections */
5530 spte |= ARM_PTE_AF;
5531 const pmap_paddr_t pa = pte_to_pa(spte);
5532 const unsigned int pai = pa_index(pa);
5533 locked_pvh_t locked_pvh;
5534 if (pa_valid(pa)) {
5535 locked_pvh = pvh_lock(pai);
5536
5537 /**
5538 * The VM may concurrently call pmap_disconnect() on the compressor
5539 * page in question, e.g. if relocating the page to satisfy a precious
5540 * allocation. Now that we hold the PVH lock, re-check the PTE and
5541 * restart the loop if it's different from the value we read before
5542 * we held the lock.
5543 */
5544 if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) {
5545 pvh_unlock(&locked_pvh);
5546 spte = ARM_PTE_EMPTY;
5547 continue;
5548 }
5549 ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT,
5550 PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5551 }
5552
5553 __assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte);
5554
5555 /**
5556 * We don't expect the VM to be concurrently calling pmap_remove() against these
5557 * compressor mappings. If it does for some reason, that could cause the above
5558 * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING.
5559 */
5560 assert3u(sptm_status, ==, SPTM_MAP_VALID);
5561
5562 if (pa_valid(pa)) {
5563 pvh_unlock(&locked_pvh);
5564 }
5565 break;
5566 }
5567
5568 #endif /* DEVELOPMENT || DEBUG */
5569
5570 va += pmap_page_size;
5571 ++pte_p;
5572
5573 #if DEVELOPMENT || DEBUG
5574 if (!force_write)
5575 #endif
5576 {
5577 sptm_pcpu->sptm_templates[num_mappings] = tmplate;
5578 ++num_mappings;
5579 if (num_mappings == SPTM_MAPPING_LIMIT) {
5580 /**
5581 * Enter the pmap epoch for the batched update operation. This is necessary because we
5582 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
5583 * call, so a concurrent pmap_page_protect() operation against one of those pages may
5584 * race this call. That should be perfectly fine as far as the PTE updates are concerned,
5585 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
5586 * if it does not first drain our epoch.
5587 */
5588 pmap_epoch_enter();
5589 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5590 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5591 pmap_epoch_exit();
5592 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5593
5594 /* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */
5595 enable_preemption();
5596 num_mappings = 0;
5597 sptm_start_va = va;
5598 disable_preemption();
5599 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5600 }
5601 }
5602 }
5603
5604 /* This won't happen in the force_write case as we should never increment num_mappings. */
5605 if (num_mappings != 0) {
5606 pmap_epoch_enter();
5607 sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5608 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5609 pmap_epoch_exit();
5610 need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5611 }
5612
5613 #if DEVELOPMENT || DEBUG
5614 if (!force_write)
5615 #endif
5616 {
5617 enable_preemption();
5618 }
5619 pmap_unlock(pmap, PMAP_LOCK_SHARED);
5620 if (__improbable(need_strong_sync)) {
5621 arm64_sync_tlb(true);
5622 }
5623 return va;
5624 }
5625
5626 void
5627 pmap_protect_options(
5628 pmap_t pmap,
5629 vm_map_address_t b,
5630 vm_map_address_t e,
5631 vm_prot_t prot,
5632 unsigned int options,
5633 __unused void *args)
5634 {
5635 vm_map_address_t l, beg;
5636
5637 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5638
5639 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5640 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5641 pmap, (uint64_t)b, (uint64_t)e);
5642 }
5643
5644 /*
5645 * We allow single-page requests to execute non-preemptibly,
5646 * as it doesn't make sense to sample AST_URGENT for a single-page
5647 * operation, and there are a couple of special use cases that
5648 * require a non-preemptible single-page operation.
5649 */
5650 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5651 pmap_verify_preemptible();
5652 }
5653
5654 #if DEVELOPMENT || DEBUG
5655 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5656 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5657 pmap_remove_options(pmap, b, e, options);
5658 return;
5659 }
5660 } else
5661 #endif
5662 {
5663 /* Determine the new protection. */
5664 switch (prot) {
5665 case VM_PROT_READ:
5666 case VM_PROT_READ | VM_PROT_EXECUTE:
5667 break;
5668 case VM_PROT_READ | VM_PROT_WRITE:
5669 case VM_PROT_ALL:
5670 return; /* nothing to do */
5671 case VM_PROT_EXECUTE:
5672 if (pmap_allows_xo(pmap)) {
5673 break;
5674 }
5675 /* Fall through and remove the mapping if XO is requested and [pmap] doesn't allow it. */
5676 OS_FALLTHROUGH;
5677 default:
5678 pmap_remove_options(pmap, b, e, options);
5679 return;
5680 }
5681 }
5682
5683 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5684 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5685 VM_KERNEL_ADDRHIDE(e));
5686
5687 beg = b;
5688
5689 while (beg < e) {
5690 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5691
5692 if (l > e) {
5693 l = e;
5694 }
5695
5696 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5697 }
5698
5699
5700 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5701 }
5702
5703 /**
5704 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5705 *
5706 * @param pmap pmap to insert the pages into.
5707 * @param va virtual address to map the pages into.
5708 * @param pa page number of the first physical page to map.
5709 * @param size block size, in number of pages.
5710 * @param prot mapping protection attributes.
5711 * @param attr flags to pass to pmap_enter().
5712 *
5713 * @return KERN_SUCCESS.
5714 */
5715 kern_return_t
5716 pmap_map_block(
5717 pmap_t pmap,
5718 addr64_t va,
5719 ppnum_t pa,
5720 uint32_t size,
5721 vm_prot_t prot,
5722 int attr,
5723 unsigned int flags)
5724 {
5725 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5726 }
5727
5728 /**
5729 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5730 * As opposed to pmap_map_block(), this function takes
5731 * a physical address as an input and operates using the
5732 * page size associated with the input pmap.
5733 *
5734 * @param pmap pmap to insert the pages into.
5735 * @param va virtual address to map the pages into.
5736 * @param pa physical address of the first physical page to map.
5737 * @param size block size, in number of pages.
5738 * @param prot mapping protection attributes.
5739 * @param attr flags to pass to pmap_enter().
5740 *
5741 * @return KERN_SUCCESS.
5742 */
5743 kern_return_t
5744 pmap_map_block_addr(
5745 pmap_t pmap,
5746 addr64_t va,
5747 pmap_paddr_t pa,
5748 uint32_t size,
5749 vm_prot_t prot,
5750 int attr,
5751 unsigned int flags)
5752 {
5753 #if __ARM_MIXED_PAGE_SIZE__
5754 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5755 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5756 #else
5757 const uint64_t pmap_page_size = PAGE_SIZE;
5758 #endif
5759
5760 for (ppnum_t page = 0; page < size; page++) {
5761 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER) != KERN_SUCCESS) {
5762 panic("%s: failed pmap_enter_addr, "
5763 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5764 __FUNCTION__,
5765 pmap, va, (uint64_t)pa, size, prot, flags);
5766 }
5767
5768 va += pmap_page_size;
5769 pa += pmap_page_size;
5770 }
5771
5772
5773 return KERN_SUCCESS;
5774 }
5775
5776 kern_return_t
5777 pmap_enter_addr(
5778 pmap_t pmap,
5779 vm_map_address_t v,
5780 pmap_paddr_t pa,
5781 vm_prot_t prot,
5782 vm_prot_t fault_type,
5783 unsigned int flags,
5784 boolean_t wired,
5785 pmap_mapping_type_t mapping_type)
5786 {
5787 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, mapping_type);
5788 }
5789
5790 /*
5791 * Insert the given physical page (p) at
5792 * the specified virtual address (v) in the
5793 * target physical map with the protection requested.
5794 *
5795 * If specified, the page will be wired down, meaning
5796 * that the related pte can not be reclaimed.
5797 *
5798 * NB: This is the only routine which MAY NOT lazy-evaluate
5799 * or lose information. That is, this routine must actually
5800 * insert this page into the given map eventually (must make
5801 * forward progress eventually.
5802 */
5803 kern_return_t
5804 pmap_enter(
5805 pmap_t pmap,
5806 vm_map_address_t v,
5807 ppnum_t pn,
5808 vm_prot_t prot,
5809 vm_prot_t fault_type,
5810 unsigned int flags,
5811 boolean_t wired,
5812 pmap_mapping_type_t mapping_type)
5813 {
5814 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type);
5815 }
5816
5817 /**
5818 * Helper function for determining the frame type that will be required for a physical page given
5819 * a set of mapping constraints.
5820 *
5821 * @param pmap The address space in which the page will be mapped.
5822 * @param pte The fully-configured page table entry, including permissions and output address, that
5823 * will be used for the mapping.
5824 * @param vaddr The virtual address that will be mapped using [pte]
5825 * @param options Extra mapping options that would be passed to pmap_enter() when performing the mapping
5826 * @param mapping_type The mapping type enum that would be passed to pmap_enter() when performing the mapping
5827 * @param prev_frame_type Output param that will store the existing frame type for the physical page
5828 * mapped by [pte]. As an optimization, this will only be queried if [*new_frame_type]
5829 * is determined to be something other than XNU_DEFAULT, otherwise it will be assumed
5830 * to be XNU_DEFAULT
5831 * @param new_frame_type Output param that will store the new frame type that will be required for the
5832 * physical page mapped by [pte]
5833 */
5834 static inline void
5835 pmap_frame_type_for_pte(
5836 pmap_t pmap __assert_only,
5837 pt_entry_t pte,
5838 vm_map_address_t vaddr __assert_only,
5839 unsigned int options,
5840 pmap_mapping_type_t mapping_type,
5841 sptm_frame_type_t *prev_frame_type,
5842 sptm_frame_type_t *new_frame_type)
5843 {
5844 const pmap_paddr_t paddr = pte_to_pa(pte) & ~PAGE_MASK;
5845 assert(prev_frame_type != NULL);
5846 assert(new_frame_type != NULL);
5847 *prev_frame_type = *new_frame_type = XNU_DEFAULT;
5848
5849 const uint64_t pte_perms = pte_to_xprr_perm(pte);
5850 /*
5851 * If the caller specified a mapping type of PMAP_MAPPINGS_TYPE_INFER, then we
5852 * keep the existing logic of deriving the SPTM frame type from the XPRR permissions.
5853 *
5854 * If the caller specified another mapping type, we simply follow that. This refactor was
5855 * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise at
5856 * what we want. It's better to let the caller specify the mapping type rather than use the
5857 * permissions for that.
5858 *
5859 * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323.
5860 */
5861 if (__improbable(mapping_type != PMAP_MAPPING_TYPE_INFER)) {
5862 switch (mapping_type) {
5863 case PMAP_MAPPING_TYPE_DEFAULT:
5864 *new_frame_type = (sptm_frame_type_t)mapping_type;
5865 break;
5866 case PMAP_MAPPING_TYPE_ROZONE:
5867 assert(((pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + pt_attr_page_size(pmap_get_pt_attr(pmap)))));
5868 *new_frame_type = (sptm_frame_type_t)mapping_type;
5869 break;
5870 case PMAP_MAPPING_TYPE_RESTRICTED:
5871 if (use_xnu_restricted) {
5872 *new_frame_type = (sptm_frame_type_t)mapping_type;
5873 } else {
5874 *new_frame_type = XNU_DEFAULT;
5875 }
5876 break;
5877 default:
5878 panic("invalid mapping type: %d", mapping_type);
5879 }
5880 } else if (__improbable(pte_perms == XPRR_USER_JIT_PERM)) {
5881 /*
5882 * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using
5883 * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other
5884 * flags which the VM may have provided.
5885 *
5886 * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering
5887 * this case. We can't do this for now because this might trigger on some macOS
5888 * systems where applications use MAP_JIT with RW/RX permissions, and then later
5889 * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG
5890 * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can
5891 * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application
5892 * switches to RWX, then we can start asserting this requirement.
5893 */
5894 *new_frame_type = XNU_USER_JIT;
5895 } else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) {
5896 /*
5897 * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must
5898 * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the
5899 * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type.
5900 */
5901 *new_frame_type = XNU_USER_DEBUG;
5902 } else if (pte_perms == XPRR_USER_RX_PERM) {
5903 *new_frame_type = XNU_USER_EXEC;
5904 } else if ((pte_perms == XPRR_USER_RW_PERM) ||
5905 (pte_was_writeable(pte) && (pte_perms == XPRR_USER_RO_PERM))) {
5906 /**
5907 * Allow retyping from user executable types (except XNU_USER_DEBUG, which already
5908 * allows user RW mappings) back to XNU_DEFAULT if a writable mapping is requested.
5909 * Our retype logic will disconnect all existing mappings, so future attempts to
5910 * execute these pages will fault, retype back to exec, and go back through any
5911 * needed CS validation. For all other current frame types, just leave the previous
5912 * and new frame types unchanged; for most other types attempting to add a user RW
5913 * mapping is a bug and we should just let the SPTM throw a violation.
5914 */
5915 const sptm_frame_type_t cur_frame_type = sptm_get_frame_type(paddr);
5916 if (__improbable(sptm_type_is_user_executable(cur_frame_type) &&
5917 (cur_frame_type != XNU_USER_DEBUG))) {
5918 *prev_frame_type = cur_frame_type;
5919 }
5920 }
5921
5922 if (__improbable(*new_frame_type != XNU_DEFAULT)) {
5923 *prev_frame_type = sptm_get_frame_type(paddr);
5924 }
5925 }
5926
5927 /*
5928 * Construct a PTE (and the physical page attributes) for the given virtual to
5929 * physical mapping.
5930 *
5931 * @param pmap The pmap representing the address space for which to construct
5932 * the mapping.
5933 * @param pa The physical address to be mapped by the new PTE.
5934 * @param prot Access permissions to apply to the new PTE.
5935 * @param fault_type The type of access fault that is triggering the request
5936 * to construct the new PTE.
5937 * @param wired Whether the new PTE should have the wired bit set.
5938 * @param options The extra mapping options passed to pmap_enter().
5939 * @param pp_attr_bits Output parameter that will return the physical page attributes
5940 * to apply to pp_attr_table for the new mapping.
5941 *
5942 * This function has no side effects and is safe to call while attempting a
5943 * pmap_enter transaction.
5944 */
5945 MARK_AS_PMAP_TEXT static pt_entry_t
5946 pmap_construct_pte(
5947 const pmap_t pmap,
5948 pmap_paddr_t pa,
5949 vm_prot_t prot,
5950 vm_prot_t fault_type,
5951 boolean_t wired,
5952 unsigned int options __unused,
5953 uint16_t *pp_attr_bits /* OUTPUT */
5954 )
5955 {
5956 const pt_attr_t* const pt_attr = pmap_get_pt_attr(pmap);
5957 bool set_NX = false, set_XO = false, set_TPRO = false;
5958 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
5959 assert(pp_attr_bits != NULL);
5960 *pp_attr_bits = 0;
5961
5962 if (wired) {
5963 pte |= ARM_PTE_WIRED;
5964 }
5965
5966 #if DEVELOPMENT || DEBUG
5967 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5968 #else
5969 if ((prot & VM_PROT_EXECUTE))
5970 #endif
5971 {
5972 set_NX = false;
5973 } else {
5974 set_NX = true;
5975 }
5976
5977 if (__improbable(prot == VM_PROT_EXECUTE)) {
5978 set_XO = true;
5979 if (!pmap_allows_xo(pmap)) {
5980 panic("%s: attempted execute-only mapping", __func__);
5981 }
5982 }
5983
5984 if (set_NX) {
5985 pte |= pt_attr_leaf_xn(pt_attr);
5986 } else {
5987 if (pmap == kernel_pmap) {
5988 pte |= ARM_PTE_NX;
5989 } else {
5990 pte |= pt_attr_leaf_x(pt_attr);
5991 }
5992 }
5993
5994 if (pmap == kernel_pmap) {
5995 #if __ARM_KERNEL_PROTECT__
5996 pte |= ARM_PTE_NG;
5997 #endif /* __ARM_KERNEL_PROTECT__ */
5998 if (prot & VM_PROT_WRITE) {
5999 pte |= ARM_PTE_AP(AP_RWNA);
6000 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6001 } else {
6002 pte |= ARM_PTE_AP(AP_RONA);
6003 *pp_attr_bits |= PP_ATTR_REFERENCED;
6004 }
6005 } else {
6006 if (pmap->type != PMAP_TYPE_NESTED) {
6007 pte |= ARM_PTE_NG;
6008 }
6009 if (set_TPRO) {
6010 pte |= pt_attr_leaf_rona(pt_attr);
6011 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6012 } else if (prot & VM_PROT_WRITE) {
6013 assert(pmap->type != PMAP_TYPE_NESTED);
6014 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6015 if (fault_type & VM_PROT_WRITE) {
6016 pte |= pt_attr_leaf_rw(pt_attr);
6017 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6018 } else {
6019 pte |= pt_attr_leaf_ro(pt_attr);
6020 /*
6021 * Mark the page as MODFAULT so that a subsequent write
6022 * may be handled through arm_fast_fault().
6023 */
6024 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6025 pte_set_was_writeable(pte, true);
6026 }
6027 } else {
6028 pte |= pt_attr_leaf_rw(pt_attr);
6029 *pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
6030 }
6031 } else {
6032 if (__improbable(set_XO)) {
6033 pte |= pt_attr_leaf_rona(pt_attr);
6034 } else {
6035 pte |= pt_attr_leaf_ro(pt_attr);
6036 }
6037 *pp_attr_bits |= PP_ATTR_REFERENCED;
6038 }
6039 }
6040
6041 pte |= ARM_PTE_AF;
6042 return pte;
6043 }
6044
6045 /**
6046 * This function allows the VM to query whether a mapping operation will result in a page being
6047 * retyped, without actually performing the mapping operation. It's useful for the VM to know
6048 * this when performing up-front page validation under the VM object lock.
6049 *
6050 * @param pmap The address space in which the mapping will occur
6051 * @param vaddr The virtual address that will be mapped
6052 * @param pn The physical page number to be mapped by [vaddr]
6053 * @param prot The permissions to be used for the mapping
6054 * @param options The extra mapping options that would be passed to pmap_enter() if the
6055 * mapping operation were performed
6056 * @param mapping_type The mapping type enum that would be passed to pmap_enter() if the
6057 * mapping operation were performed
6058 *
6059 * @return True if the mapping operation would produce a retype of the page at [pn],
6060 * False otherwise
6061 */
6062 bool
6063 pmap_will_retype(
6064 pmap_t pmap,
6065 vm_map_address_t vaddr,
6066 ppnum_t pn,
6067 vm_prot_t prot,
6068 unsigned int options,
6069 pmap_mapping_type_t mapping_type)
6070 {
6071 const pmap_paddr_t paddr = ptoa(pn);
6072 uint16_t pp_attr_bits;
6073 pt_entry_t pte = pmap_construct_pte(pmap, paddr, prot, prot, false, options, &pp_attr_bits);
6074 sptm_frame_type_t prev_frame_type, new_frame_type;
6075 pmap_frame_type_for_pte(pmap, pte, vaddr, options, mapping_type, &prev_frame_type, &new_frame_type);
6076
6077 return new_frame_type != prev_frame_type;
6078 }
6079
/*
 * Attempt to update a PTE constructed by pmap_enter_options().
 *
 * The frame backing the new PTE is retyped first if the mapping requires a different SPTM
 * frame type, and the PTE is then installed through sptm_map_page().
 *
 * @note performs no page table or accounting modifications, nor any lasting SPTM page type modification, on failure.
 * @note expects to be called with preemption disabled to guarantee safe access to SPTM per-CPU data.
 *
 * @param pmap The pmap representing the address space in which to store the new PTE
 * @param pte_p The physical aperture KVA of the PTE to store
 * @param new_pte The new value to store in *pte_p
 * @param locked_pvh Input/Output parameter pointing to a wrapped pv_head_table entry returned by
 *                   a previous call to pvh_lock().  *locked_pvh will be updated if existing mappings
 *                   need to be disconnected prior to retyping.  pmap_enter_options_internal()
 *                   passes NULL here for non-managed physical pages.
 * @param old_pte Returns the prior PTE contents, iff the PTE is successfully updated
 * @param v The virtual address mapped by pte_p
 * @param options bitmask of PMAP_OPTIONS_* flags passed to pmap_enter_options().
 * @param mapping_type The type of the new mapping, this defines which SPTM frame type to use.
 *
 * @return SPTM_SUCCESS iff able to successfully update *pte_p to new_pte via sptm_map_page(),
 *         SPTM_MAP_VALID if an existing mapping was successfully upgraded via sptm_map_page(),
 *         SPTM_MAP_FLUSH_PENDING if the TLB flush of a previous mapping is still in-flight and
 *         the mapping operation should be retried, or if the mapping operation should be retried
 *         because we had to temporarily re-enable preemption which would invalidate caller-held
 *         per-CPU data.
 *         Otherwise an appropriate SPTM or TXM error code; in these cases the mapping should not be
 *         retried and the caller should return an error.
 */
static inline sptm_return_t
pmap_enter_pte(
	pmap_t pmap,
	pt_entry_t *pte_p,
	pt_entry_t new_pte,
	locked_pvh_t *locked_pvh,
	pt_entry_t *old_pte,
	vm_map_address_t v,
	unsigned int options,
	pmap_mapping_type_t mapping_type)
{
	sptm_pte_t prev_pte;
	bool changed_wiring = false;

	assert(pte_p != NULL);
	assert(old_pte != NULL);

	/* SPTM TODO: handle PAGE_RATIO_4 configurations if those devices remain supported. */

	assert(get_preemption_level() > 0);
	/* Page-aligned physical address of the frame that new_pte maps. */
	const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK;
	sptm_frame_type_t prev_frame_type;
	sptm_frame_type_t new_frame_type;

	/* Determine the frame's current type and the type this mapping requires. */
	pmap_frame_type_for_pte(pmap, new_pte, v, options, mapping_type, &prev_frame_type, &new_frame_type);

	if (__improbable(new_frame_type != prev_frame_type)) {
		/**
		 * Remove all existing mappings prior to retyping, so that we can safely retype without having to worry
		 * about a concurrent operation on one of those mappings triggering an SPTM violation.  In particular,
		 * pmap_remove() may clear a mapping to this page without holding its PVH lock.  This approach works
		 * because we hold the PVH lock during this call, and any attempt to enter a new mapping for the page
		 * will also need to grab the PVH lock and call this function.
		 */
		pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
		    PMAP_OPTIONS_PPO_PENDING_RETYPE, locked_pvh, NULL);
		/**
		 * In the unlikely event that pmap_page_protect_options_with_flush_range() had to process
		 * an excessively long PV list, it will have enabled preemption by placing the PVH lock
		 * in sleep mode.  In this case, we may have been migrated to a different CPU, and caller
		 * assumptions about the state of per-CPU data (such as per-CPU PVE availability) will no
		 * longer hold true.  Ask the caller to retry by pretending we encountered a pending flush.
		 */
		if (__improbable(preemption_enabled())) {
			return SPTM_MAP_FLUSH_PENDING;
		}
		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
		/* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */
		prev_frame_type = sptm_get_frame_type(pa);
		if (new_frame_type != prev_frame_type) {
			sptm_retype(pa, prev_frame_type, new_frame_type, retype_params);
		}
	}

	if (pmap->type == PMAP_TYPE_NESTED) {
		/**
		 * Enter the epoch before we check the unnesting state of the leaf page table, so that a
		 * concurrent pmap_unnest() operation can guarantee that we either observe the unnested
		 * table state and install a non-global mapping, or have finished installing a global mapping
		 * before it marks all existing mappings as non-global.
		 */
		pmap_epoch_enter();
		vm_map_offset_t nested_region_size = os_atomic_load(&pmap->nested_region_size, acquire);
		if (nested_region_size && (v >= pmap->nested_region_addr) && (v < (pmap->nested_region_addr + nested_region_size))) {
			assert(pmap->nested_region_addr != 0);
			assert(pmap->nested_region_unnested_table_bitmap != NULL);
			/* Index of the twig-level entry covering [v] within the nested region. */
			unsigned int index = (unsigned int)((v - pmap->nested_region_addr) >>
			    pt_attr_twig_shift(pmap_get_pt_attr(pmap)));

			/* The table covering [v] is being (or has been) unnested: the mapping must be non-global. */
			if ((bitmap_test(pmap->nested_region_unnested_table_bitmap, UNNEST_IN_PROGRESS_BIT(index)))) {
				new_pte |= ARM_PTE_NG;
			}
		}
	}
	const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte);
	/* Pairs with the pmap_epoch_enter() above, which is only taken for nested pmaps. */
	if (pmap->type == PMAP_TYPE_NESTED) {
		pmap_epoch_exit();
	}
	if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) {
		/*
		 * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as
		 * opposed to a TXM error.  In the case of SPTM_MAP_FLUSH_PENDING, pmap_enter() will drop the PVH
		 * lock before turning around to retry the mapping operation.  It may then be possible for the
		 * mapping state of the page to change such that our next attempt to map it will fail with a TXM
		 * error, so if we were to leave the new type in place here we would then have lost our record
		 * of the previous type and would effectively leave the page in an inconsistent state.
		 */
		if (__improbable(new_frame_type != prev_frame_type)) {
			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
			sptm_retype(pa, new_frame_type, prev_frame_type, retype_params);
		}
		return sptm_status;
	}

	/* The prior PTE contents are read back from slot 0 of the per-CPU sptm_prev_ptes array. */
	*old_pte = prev_pte = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes[0];

	if (prev_pte != new_pte) {
		/*
		 * A compressed marker carries no wiring of its own, so replacing one changes
		 * wiring exactly when the new PTE is wired; otherwise compare the wired bits.
		 */
		changed_wiring = pte_is_compressed(prev_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (prev_pte & ARM_PTE_WIRED);

		/* Wired counts are only maintained for non-kernel pmaps. */
		if ((pmap != kernel_pmap) && changed_wiring) {
			pte_update_wiredcnt(pmap, pte_p, (new_pte & ARM_PTE_WIRED) != 0);
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pmap_get_pt_attr(pmap)), PMAP_CODE(PMAP__TTE),
		    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v),
		    VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)), new_pte);
	}

	return sptm_status;
}
6218
6219 MARK_AS_PMAP_TEXT static pt_entry_t
6220 wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
6221 {
6222 pt_entry_t pte;
6223
6224 switch (wimg & (VM_WIMG_MASK)) {
6225 case VM_WIMG_IO:
6226 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
6227 // Device-nGnRnE. On H14+, accesses to them can be reordered by
6228 // AP, while preserving the security benefits of using device
6229 // mapping against side-channel attacks. On pre-H14 platforms,
6230 // the accesses will still be strongly ordered.
6231 if (is_dram_addr(pa)) {
6232 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6233 } else {
6234 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
6235 #if HAS_FEAT_XS
6236 pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
6237 if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
6238 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
6239 }
6240 #endif /* HAS_FEAT_XS */
6241 }
6242 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6243 break;
6244 case VM_WIMG_RT:
6245 if (is_dram_addr(pa)) {
6246 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
6247 } else {
6248 #if HAS_FEAT_XS
6249 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6250 #else /* HAS_FEAT_XS */
6251 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6252 #endif /* HAS_FEAT_XS */
6253 #if DEBUG || DEVELOPMENT
6254 pmap_wcrt_on_non_dram_count_increment_atomic();
6255 #endif /* DEBUG || DEVELOPMENT */
6256 }
6257 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6258 break;
6259 case VM_WIMG_POSTED:
6260 if (is_dram_addr(pa)) {
6261 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6262 } else {
6263 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
6264 }
6265 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6266 break;
6267 case VM_WIMG_POSTED_REORDERED:
6268 if (is_dram_addr(pa)) {
6269 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6270 } else {
6271 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
6272 }
6273 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6274 break;
6275 case VM_WIMG_POSTED_COMBINED_REORDERED:
6276 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6277 #if HAS_FEAT_XS
6278 if (!is_dram_addr(pa)) {
6279 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6280 }
6281 #endif /* HAS_FEAT_XS */
6282 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6283 break;
6284 case VM_WIMG_WCOMB:
6285 if (is_dram_addr(pa)) {
6286 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
6287 } else {
6288 #if HAS_FEAT_XS
6289 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6290 #else /* HAS_FEAT_XS */
6291 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6292 #endif /* HAS_FEAT_XS */
6293 #if DEBUG || DEVELOPMENT
6294 pmap_wcrt_on_non_dram_count_increment_atomic();
6295 #endif /* DEBUG || DEVELOPMENT */
6296 }
6297 pte |= ARM_PTE_NX | ARM_PTE_PNX;
6298 break;
6299 case VM_WIMG_WTHRU:
6300 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
6301 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6302 break;
6303 case VM_WIMG_COPYBACK:
6304 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
6305 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6306 break;
6307 #if HAS_MTE
6308 case VM_WIMG_MTE:
6309 assert(is_mte_enabled);
6310
6311 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_MTE);
6312 pte |= ARM_PTE_SH(SH_MTE);
6313 break;
6314 #else /* HAS_MTE */
6315 case VM_WIMG_INNERWBACK:
6316 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
6317 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
6318 break;
6319 #endif /* HAS_MTE */
6320 default:
6321 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
6322 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6323 }
6324
6325 return pte;
6326 }
6327
/**
 * Enter a mapping of physical page [pa] at virtual address [v] in [pmap].
 *
 * The page tables are first expanded to cover [v] if needed.  The mapping is then installed
 * inside a retry loop ("transaction"): for managed pages (pa_valid()) the PVH lock is taken,
 * per-CPU PV entries are pre-allocated, the PTE is installed via pmap_enter_pte(), and only
 * after that succeeds are the PV list, physical-page attribute bits, and ledgers updated.
 *
 * @param pmap The address space in which to create the mapping.
 * @param v The virtual address to map; panics if it is not aligned to the leaf page size, or
 *          if it lies within [CPUWINDOWS_BASE, CPUWINDOWS_TOP) for the kernel pmap.
 * @param pa The physical address to map; panics if it is not aligned to the leaf page size.
 * @param prot The protections for the mapping.
 * @param fault_type Passed through to pmap_construct_pte().
 * @param flags VM_WIMG_* cache-attribute bits (and VM_MEM_MAP_MTE when HAS_MTE); when no
 *              WIMG bits are given, pmap_cache_attributes() supplies them.
 * @param wired Passed through to pmap_construct_pte().
 * @param options PMAP_OPTIONS_* flags.  Those examined here are PMAP_OPTIONS_NOENTER (return
 *                once the page tables cover [v]), PMAP_OPTIONS_NOPREEMPT (selects
 *                pvh_lock_nopreempt()), and PMAP_OPTIONS_INTERNAL / PMAP_OPTIONS_ALT_ACCT
 *                (accounting); the value is also forwarded to the helpers called below.
 * @param mapping_type Forwarded to pmap_enter_pte().
 *
 * @return KERN_SUCCESS if the mapping was entered (or PMAP_OPTIONS_NOENTER was requested and
 *         the page tables cover [v]),
 *         KERN_PROTECTION_FAILURE for an execute-only request the pmap does not allow,
 *         the error returned by pmap_expand() if expansion failed,
 *         KERN_RESOURCE_SHORTAGE if PV entries could not be allocated,
 *         KERN_CODESIGN_ERROR if the SPTM reported SPTM_MAP_CODESIGN_ERROR,
 *         KERN_FAILURE for any other SPTM failure on a managed page, or for an executable
 *         mapping of a non-managed page.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	pmap_mapping_type_t mapping_type)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t *pte_p;
	unsigned int wimg_bits;
	bool committed = false;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

	/**
	 * Prepare for the SPTM call early by prefetching the relevant FTEs. Cache misses
	 * in SPTM accessing these turn out to contribute to a large portion of delay on
	 * the critical path. Technically, sptm_prefetch_fte may not find an FTE associated
	 * with pa and return LIBSPTM_FAILURE. However, we are okay with that as it's only
	 * a best-effort performance optimization.
	 */
	sptm_prefetch_fte(pmap->ttep);
	sptm_prefetch_fte(pa);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* The VA must be aligned to the leaf page size. */
	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p v 0x%llx",
		    pmap, (uint64_t)v);
	}

	/* Kernel mappings may not be entered in the CPU windows range. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	/* The PA must be aligned to the leaf page size as well. */
	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	/*
	 * Executable mappings are never allowed in the kernel pmap.  The only exception is
	 * the CTRR test page, and only in configurations that build the XNUPOST tests.
	 */
	if (__improbable((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap))) {
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	/* Execute-only mappings are refused unless the pmap allows them. */
	if (__improbable((prot == VM_PROT_EXECUTE) && !pmap_allows_xo(pmap))) {
		return KERN_PROTECTION_FAILURE;
	}

	assert(pn != vm_page_fictitious_addr);

	pmap_lock(pmap, PMAP_LOCK_SHARED);

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, PMAP_LOCK_SHARED);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}

	/* The caller only wanted the page tables to exist; do not install a PTE. */
	if (options & PMAP_OPTIONS_NOENTER) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, the PTE update below is
	 * performed as a retryable transaction (the while loop further down).
	 * We need to be careful about modifying non-local data structures before committing
	 * the new pte since we may need to re-do the transaction.
	 */
	const pt_entry_t prev_pte = os_atomic_load(pte_p, relaxed);

	if (pte_is_valid(prev_pte) && (pte_to_pa(prev_pte) != pa)) {
		/*
		 * There is already a mapping here & it's for a different physical page.
		 * First remove that mapping.
		 * We assume that we can leave the pmap lock held for shared access rather
		 * than exclusive access here, because we assume that the VM won't try to
		 * simultaneously map the same VA to multiple different physical pages.
		 * If that assumption is violated, sptm_map_page() will panic as the architecture
		 * does not allow the output address of a mapping to be changed without a break-
		 * before-make sequence.
		 */
		pmap_remove_range(pmap, v, v + PAGE_SIZE);
	}

	/* Base PTE without cache attributes; those are OR'ed in on each loop iteration below. */
	const pt_entry_t pte = pmap_construct_pte(pmap, pa, prot, fault_type, wired, options, &pp_attr_bits);

	while (!committed) {
		/* Prior PTE contents as reported by pmap_enter_pte(); only meaningful once committed. */
		pt_entry_t spte = ARM_PTE_EMPTY;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		bool skip_footprint_debit = false;

		if (pa_valid(pa)) {
			/* Managed page: the mapping is tracked in the page's PV list and in the ledgers. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);
			locked_pvh_t locked_pvh;

			if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
				locked_pvh = pvh_lock_nopreempt(pai);
			} else {
				locked_pvh = pvh_lock(pai);
			}

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. At this point, preemption has either
			 * been disabled by the caller or by pvh_lock() above.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
			assert(get_preemption_level() > 0);
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			/*
			 * No PV entry is needed when the PV head is empty, or when it already
			 * points directly at this very PTE.
			 */
			const bool allocation_required = !pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL) &&
			    !(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP) && pvh_ptep(locked_pvh.pvh) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;
				volatile uint16_t *wiredcnt = NULL;
				if (pmap != kernel_pmap) {
					ptd_info_t *ptd_info = ptep_get_info(pte_p);
					wiredcnt = &ptd_info->wiredcnt;
				}

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, PMAP_LOCK_SHARED, options, &new_pve_p[new_allocated_pves], &locked_pvh, wiredcnt);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/*
				 * Hand the entries to pv_free(); presumably this returns them to the
				 * free list that pmap_enter_pv() draws from below -- TODO confirm.
				 */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(&locked_pvh);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(&locked_pvh);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				continue;
			}

			/*
			 * Pick the cache attributes: explicit WIMG bits in [flags] win, otherwise use
			 * the page's own attributes.  (With HAS_MTE, VM_MEM_MAP_MTE overrides both.)
			 */
#if HAS_MTE
			if (flags & VM_MEM_MAP_MTE) {
				wimg_bits = VM_WIMG_MTE;
			} else
#endif /* HAS_MTE */
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/**
			 * We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so update PTE cache attributes on each loop iteration.
			 */
			const pt_entry_t new_pte = pte | pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);


			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, new_pte, &locked_pvh, &spte, v, options, mapping_type);
			assert(committed == false);
			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
				committed = true;
			} else if (sptm_status == SPTM_MAP_FLUSH_PENDING) {
				/* Retryable: drop the PVH lock and run the transaction again. */
				pvh_unlock(&locked_pvh);
				continue;
			} else if (sptm_status == SPTM_MAP_CODESIGN_ERROR) {
				pvh_unlock(&locked_pvh);
				kr = KERN_CODESIGN_ERROR;
				break;
			} else {
				pvh_unlock(&locked_pvh);
				kr = KERN_FAILURE;
				break;
			}
			/*
			 * SPTM_MAP_VALID means a valid mapping already existed at this PTE, so the
			 * PV entry and ledger credits below are skipped for it.
			 */
			const bool had_valid_mapping = (sptm_status == SPTM_MAP_VALID);
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, options, PMAP_LOCK_SHARED, &locked_pvh, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					/* Classify the mapping for the ledger credits applied after unlocking. */
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(&locked_pvh);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (pte_is_compressed(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-managed page: no PV tracking or ledger credits, and never executable. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			/* Start from the page's own attributes; WIMG bits in [flags] replace the masked part. */
			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pt_entry_t new_pte = pte | pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);


			/**
			 * pmap_enter_pte() expects to be called with preemption disabled so it can access
			 * the per-CPU prev_ptes array.
			 */
			disable_preemption();
			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, new_pte, NULL, &spte, v, options, mapping_type);
			enable_preemption();
			assert(committed == false);
			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
				committed = true;

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 *
				 * NOTE(review): no code accompanies this comment in this branch; it
				 * looks like a leftover from an earlier implementation -- confirm.
				 */
			} else if (__improbable(sptm_status != SPTM_MAP_FLUSH_PENDING)) {
				/* SPTM_MAP_FLUSH_PENDING falls through and retries; anything else is fatal here. */
				panic("%s: Unexpected SPTM return code %u for non-managed PA 0x%llx", __func__, (unsigned int)sptm_status, (unsigned long long)pa);
			}
		}
		if (committed) {
			/* The new mapping replaced a compressed marker: undo the compressed accounting. */
			if (pte_is_compressed(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		}
	}

	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (kr == KERN_CODESIGN_ERROR) {
		/* Print any logs from TXM */
		txm_print_logs();
	}
	return kr;
}
6711
6712 kern_return_t
6713 pmap_enter_options_addr(
6714 pmap_t pmap,
6715 vm_map_address_t v,
6716 pmap_paddr_t pa,
6717 vm_prot_t prot,
6718 vm_prot_t fault_type,
6719 unsigned int flags,
6720 boolean_t wired,
6721 unsigned int options,
6722 __unused void *arg,
6723 pmap_mapping_type_t mapping_type)
6724 {
6725 kern_return_t kr = KERN_FAILURE;
6726
6727
6728 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6729 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6730
6731 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options, mapping_type);
6732
6733 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6734
6735 return kr;
6736 }
6737
6738 kern_return_t
6739 pmap_enter_options(
6740 pmap_t pmap,
6741 vm_map_address_t v,
6742 ppnum_t pn,
6743 vm_prot_t prot,
6744 vm_prot_t fault_type,
6745 unsigned int flags,
6746 boolean_t wired,
6747 unsigned int options,
6748 __unused void *arg,
6749 pmap_mapping_type_t mapping_type)
6750 {
6751 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot,
6752 fault_type, flags, wired, options, arg, mapping_type);
6753 }
6754
6755 /*
6756 * Routine: pmap_change_wiring
6757 * Function: Change the wiring attribute for a map/virtual-address
6758 * pair.
6759 * In/out conditions:
6760 * The mapping must already exist in the pmap.
6761 */
6762 MARK_AS_PMAP_TEXT void
6763 pmap_change_wiring_internal(
6764 pmap_t pmap,
6765 vm_map_address_t v,
6766 boolean_t wired)
6767 {
6768 pt_entry_t *pte_p, prev_pte;
6769
6770 validate_pmap_mutable(pmap);
6771
6772 pmap_lock(pmap, PMAP_LOCK_SHARED);
6773
6774 const pt_entry_t new_wiring = (wired ? ARM_PTE_WIRED : 0);
6775
6776 pte_p = pmap_pte(pmap, v);
6777 if (pte_p == PT_ENTRY_NULL) {
6778 if (!wired) {
6779 /*
6780 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6781 * may have been freed by a remove operation.
6782 */
6783 goto pmap_change_wiring_return;
6784 } else {
6785 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6786 }
6787 }
6788
6789 disable_preemption();
6790 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
6791 sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring;
6792
6793 pmap_epoch_enter();
6794 sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED);
6795 pmap_epoch_exit();
6796
6797 prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed);
6798 enable_preemption();
6799
6800 if (!pte_is_valid(prev_pte)) {
6801 goto pmap_change_wiring_return;
6802 }
6803
6804 if ((pmap != kernel_pmap) && (wired != pte_is_wired(prev_pte))) {
6805 pte_update_wiredcnt(pmap, pte_p, wired);
6806 }
6807
6808 pmap_change_wiring_return:
6809 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6810 }
6811
/**
 * Public entry point for changing the wiring attribute of a mapping; forwards
 * directly to pmap_change_wiring_internal().
 *
 * @param pmap The pmap containing the mapping.
 * @param v The virtual address of the mapping.
 * @param wired The new wiring state for the mapping.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	pmap_change_wiring_internal(pmap, v, wired);
}
6820
6821 MARK_AS_PMAP_TEXT pmap_paddr_t
6822 pmap_find_pa_internal(
6823 pmap_t pmap,
6824 addr64_t va)
6825 {
6826 pmap_paddr_t pa = 0;
6827
6828 validate_pmap(pmap);
6829
6830 if (pmap != kernel_pmap) {
6831 pmap_lock(pmap, PMAP_LOCK_SHARED);
6832 }
6833
6834 pa = pmap_vtophys(pmap, va);
6835
6836 if (pmap != kernel_pmap) {
6837 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6838 }
6839
6840 return pa;
6841 }
6842
6843 pmap_paddr_t
6844 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6845 {
6846 pmap_paddr_t pa = 0;
6847
6848 if (pmap == kernel_pmap) {
6849 pa = mmu_kvtop(va);
6850 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6851 /*
6852 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6853 * translation even if PAN would prevent kernel access through the translation.
6854 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6855 */
6856 pa = mmu_uvtop(va);
6857 }
6858 return pa;
6859 }
6860
6861 pmap_paddr_t
6862 pmap_find_pa(
6863 pmap_t pmap,
6864 addr64_t va)
6865 {
6866 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6867
6868 if (pa != 0) {
6869 return pa;
6870 }
6871
6872 if (not_in_kdp) {
6873 return pmap_find_pa_internal(pmap, va);
6874 } else {
6875 return pmap_vtophys(pmap, va);
6876 }
6877 }
6878
6879 ppnum_t
6880 pmap_find_phys_nofault(
6881 pmap_t pmap,
6882 addr64_t va)
6883 {
6884 ppnum_t ppn;
6885 ppn = atop(pmap_find_pa_nofault(pmap, va));
6886 return ppn;
6887 }
6888
6889 ppnum_t
6890 pmap_find_phys(
6891 pmap_t pmap,
6892 addr64_t va)
6893 {
6894 ppnum_t ppn;
6895 ppn = atop(pmap_find_pa(pmap, va));
6896 return ppn;
6897 }
6898
6899 /**
6900 * Translate a kernel virtual address into a physical address.
6901 *
6902 * @param va The kernel virtual address to translate. Does not work on user
6903 * virtual addresses.
6904 *
6905 * @return The physical address if the translation was successful, or zero if
6906 * no valid mappings were found for the given virtual address.
6907 */
6908 pmap_paddr_t
6909 kvtophys(vm_offset_t va)
6910 {
6911 sptm_paddr_t pa;
6912
6913 if (sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS) {
6914 return 0;
6915 }
6916
6917 return pa;
6918 }
6919
6920 /**
6921 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6922 * points to a non-kernel-managed physical page, then this call will panic().
6923 *
6924 * @note The output of this function is guaranteed to be a kernel-managed
6925 * physical page, which means it's safe to pass the output directly to
6926 * pa_index() to create a physical address index for various pmap data
6927 * structures.
6928 *
6929 * @param va The kernel virtual address to translate. Does not work on user
6930 * virtual addresses.
6931 *
6932 * @return The translated physical address for the given virtual address.
6933 */
6934 pmap_paddr_t
6935 kvtophys_nofail(vm_offset_t va)
6936 {
6937 pmap_paddr_t pa;
6938
6939 if (__improbable(sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS)) {
6940 panic("%s: VA->PA translation failed for va %p", __func__, (void *)va);
6941 }
6942
6943 return pa;
6944 }
6945
6946 pmap_paddr_t
6947 pmap_vtophys(
6948 pmap_t pmap,
6949 addr64_t va)
6950 {
6951 if ((va < pmap->min) || (va >= pmap->max)) {
6952 return 0;
6953 }
6954
6955 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6956
6957 tt_entry_t * ttp = NULL;
6958 tt_entry_t * ttep = NULL;
6959 tt_entry_t tte = ARM_TTE_EMPTY;
6960 pmap_paddr_t pa = 0;
6961 unsigned int cur_level;
6962
6963 ttp = pmap->tte;
6964
6965 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6966 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6967
6968 tte = *ttep;
6969
6970 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6971 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6972 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6973 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6974
6975 if ((tte & valid_mask) != valid_mask) {
6976 return (pmap_paddr_t) 0;
6977 }
6978
6979 /* This detects both leaf entries and intermediate block mappings. */
6980 if ((tte & type_mask) == type_block) {
6981 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6982 break;
6983 }
6984
6985 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6986 }
6987
6988 return pa;
6989 }
6990
6991 /*
6992 * pmap_init_pte_page - Initialize a page table page.
6993 */
6994 MARK_AS_PMAP_TEXT void
6995 pmap_init_pte_page(
6996 pmap_t pmap,
6997 pt_entry_t *pte_p,
6998 vm_offset_t va,
6999 unsigned int ttlevel,
7000 boolean_t alloc_ptd)
7001 {
7002 pt_desc_t *ptdp = NULL;
7003 unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
7004 const uintptr_t pvh = pai_to_pvh(pai);
7005
7006 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
7007 if (alloc_ptd) {
7008 /*
7009 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
7010 * on 4KB hardware, we may already have allocated a page table descriptor for a
7011 * bootstrap request, so we check for an existing PTD here.
7012 */
7013 ptdp = ptd_alloc(pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
7014 if (ptdp == NULL) {
7015 panic("%s: unable to allocate PTD", __func__);
7016 }
7017 locked_pvh_t locked_pvh = pvh_lock(pai);
7018 pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
7019 pvh_unlock(&locked_pvh);
7020 } else {
7021 panic("pmap_init_pte_page(): no PTD for pte_p %p", pte_p);
7022 }
7023 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
7024 ptdp = pvh_ptd(pvh);
7025 } else {
7026 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
7027 }
7028
7029 // pagetable zero-fill and barrier should be guaranteed by the SPTM
7030 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
7031 }
7032
/*
 * Ensure that [pmap] has page tables in place to map [vaddr], down to [level].
 *
 * This function guarantees that a pmap has the necessary page tables in place
 * to map the specified VA.  If necessary, it will allocate new tables at any
 * non-root level in the hierarchy (the root table is always already allocated
 * and stored in the pmap).
 *
 * @note This function is expected to be called without any pmap or PVH lock
 *       held.
 *
 * @note It is possible for an L3 table newly allocated by this function to be
 *       deleted by another thread before control returns to the caller, iff that
 *       table is an ordinary userspace table.  Callers that use this function
 *       to allocate new user L3 tables are therefore expected to keep calling
 *       this function until they observe a successful L3 PTE lookup with the pmap
 *       lock held.  As long as it does not drop the pmap lock, the caller may
 *       then safely use the looked-up L3 table.  See the use of this function in
 *       pmap_enter_options_internal() for an example.
 *
 * @param pmap The pmap for which to ensure mapping space is present.
 * @param vaddr The virtual address for which to ensure mapping space is present
 *              in [pmap].
 * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be
 *                allocated (PMAP_PAGE_NOZEROFILL is always added by this function).
 *                The only valid option is PMAP_OPTIONS_NOWAIT, which
 *                specifies that the allocation must not block.
 * @param level The maximum paging level for which to ensure a table is present.
 *
 * @return KERN_INVALID_ADDRESS if [vaddr] is outside the pmap's mappable range,
 *         KERN_RESOURCE_SHORTAGE if a new table can't be allocated,
 *         KERN_SUCCESS otherwise.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t vaddr,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	/* Start the walk at the root table, which is always present in the pmap. */
	pmap_paddr_t table_pa = pmap->ttep;
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	/* Non-zero only when the pmap's page size is smaller than PAGE_SIZE. */
	const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1;
	unsigned int ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t *table_ttep = pmap->tte;
	tt_entry_t *ttep;
	tt_entry_t old_tte = ARM_TTE_EMPTY;

	for (; ttlevel < level; ttlevel++) {
		/**
		 * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE.
		 * Doing this step at the beginning of the loop instead of the end (which would make it part of
		 * the prior iteration) avoids the possibility of executing this step to extract an L3 table KVA
		 * from an L2 TTE, which would be useless because there would be no next iteration to make use
		 * of the table KVA.
		 */
		if (table_ttep == NULL) {
			assert(tte_is_valid_table(old_tte));
			table_pa = old_tte & ARM_TTE_TABLE_MASK;
			table_ttep = (tt_entry_t*)phystokv(table_pa);
		}

		vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr);

		/**
		 * We don't need to hold the pmap lock while walking the paging hierarchy.  Only L3 tables are
		 * allowed to be dynamically removed, and only for regular user pmaps at that.  We may allocate
		 * a new L3 table below, but we will only access L0-L2 tables, so there's no risk of a table
		 * being deleted while we are using it for the next level(s) of lookup.
		 */
		ttep = &table_ttep[ttn_index(pt_attr, vaddr, ttlevel)];
		old_tte = os_atomic_load(ttep, relaxed);
		/* Cleared here; only set again below if this iteration installs a new table. */
		table_ttep = NULL;
		if (!tte_is_valid_table(old_tte)) {
			tt_entry_t new_tte, *new_ttep;
			pt_desc_t *new_ptdp;
			/* Allocate before taking the PVH lock; retry until success unless NOWAIT was requested. */
			while (pmap_tt_allocate(pmap, &new_ttep, &new_ptdp, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					return KERN_RESOURCE_SHORTAGE;
				}
				VM_PAGE_WAIT();
			}
			assert(pa_valid(table_pa));
			/**
			 * Grab the lower-level table's PVH lock (i.e. the lock for the table containing [ttep])
			 * to ensure we don't try to concurrently map different tables at the same TTE.
			 */
			locked_pvh_t locked_pvh = pvh_lock(pa_index(table_pa));
			/* Re-check under the lock: another thread may have installed a table in the meantime. */
			old_tte = os_atomic_load(ttep, relaxed);
			if (!tte_is_valid_table(old_tte)) {
				/**
				 * This call must be issued prior to sptm_map_table() so that the page table's
				 * PTD info is valid by the time the new table becomes visible in the paging
				 * hierarchy.  sptm_map_table() is expected to issue a barrier that effectively
				 * guarantees the PTD update will be visible to concurrent observers as soon as
				 * the new table becomes visible in the paging hierarchy.
				 */
				pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE);
				pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)new_ttep);
				/*
				 * If the table is going to map a kernel RO zone VA region, then we must
				 * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE.  The SPTM's type system
				 * requires the table to be transitioned through XNU_DEFAULT for refcount
				 * enforcement, which is fine since this path is expected to execute only
				 * once during boot.
				 */
				if (__improbable(ttlevel == pt_attr_twig_level(pt_attr)) &&
				    (pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE)) {
					sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
					sptm_retype(pa, XNU_PAGE_TABLE, XNU_DEFAULT, retype_params);
					retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
					sptm_retype(pa, XNU_DEFAULT, XNU_PAGE_TABLE_ROZONE, retype_params);
				}
				new_tte = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte);
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte);

				/**
				 * Now that we've fully mapped the table, do final initialization of PTD
				 * state, which includes dropping the wired count to allow future reclamation
				 * of the page table page.
				 */
				ptd_info_finalize(new_ptdp);

				table_pa = pa;
				/**
				 * If we need to set up multiple TTEs mapping different parts of the same page
				 * (e.g. because we're carving multiple 4K page tables out of a 16K native page),
				 * determine which of the grouped TTEs is the one that we need to follow for the
				 * next level of the table walk.
				 */
				table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) *
				    (pmap_page_size / sizeof(tt_entry_t)));
				/* Ownership of the new table has passed to the paging hierarchy. */
				new_ttep = (tt_entry_t *)NULL;
			}
			pvh_unlock(&locked_pvh);

			/* We lost the race to install a table at this TTE, so release our unused allocation. */
			if (new_ttep != (tt_entry_t *)NULL) {
				pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1);
				new_ttep = (tt_entry_t *)NULL;
			}
		}
	}

	return KERN_SUCCESS;
}
7182
/*
 *	Routine:        pmap_gc
 *	Function:
 *              Pmap garbage collection
 *		Called by the pageout daemon when pages are scarce.
 *
 *		This implementation is currently a no-op; see the TODO in the body.
 */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, it may make sense to scan the pmap VM
	 * object here, and for each page consult the SPTM frame table and if necessary
	 * the PTD in the PV head table.  If the frame table indicates the page is a leaf
	 * page table page and the PTD indicates it has no wired mappings, we can call
	 * pmap_remove() on the VA region mapped by the page and therein return the page
	 * to the VM.
	 */
}
7204
/*
 * By default, don't attempt pmap GC more frequently
 * than once per minute.
 *
 * This implementation is a no-op: the body is empty and [arg] is ignored.
 */

void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7215
7216 /*
7217 * pmap_attribute_cache_sync(vm_offset_t pa)
7218 *
7219 * Invalidates all of the instruction cache on a physical page and
7220 * pushes any dirty data from the data cache for the same physical page
7221 */
7222
7223 kern_return_t
7224 pmap_attribute_cache_sync(
7225 ppnum_t pp,
7226 vm_size_t size,
7227 __unused vm_machine_attribute_t attribute,
7228 __unused vm_machine_attribute_val_t * value)
7229 {
7230 if (size > PAGE_SIZE) {
7231 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7232 } else {
7233 cache_sync_page(pp);
7234 }
7235
7236 return KERN_SUCCESS;
7237 }
7238
7239 /*
7240 * pmap_sync_page_data_phys(ppnum_t pp)
7241 *
7242 * Invalidates all of the instruction cache on a physical page and
7243 * pushes any dirty data from the data cache for the same physical page.
7244 * Not required on SPTM systems, because the SPTM automatically performs
7245 * the invalidate operation when retyping to one of the types that allow
7246 * for executable permissions.
7247 */
7248 void
7249 pmap_sync_page_data_phys(
7250 __unused ppnum_t pp)
7251 {
7252 return;
7253 }
7254
7255 /*
7256 * pmap_sync_page_attributes_phys(ppnum_t pp)
7257 *
7258 * Write back and invalidate all cachelines on a physical page.
7259 */
7260 void
7261 pmap_sync_page_attributes_phys(
7262 ppnum_t pp)
7263 {
7264 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7265 }
7266
#if CONFIG_COREDUMP
/*
 * temporary workaround
 *
 * Decide whether the page mapped at [va] in [map] may be included in a core
 * dump.  Returns FALSE when there is no PTE for [va], or when the map entry
 * is backed by a device pager.  Otherwise returns TRUE only when the PTE's
 * attribute index is CACHE_ATTRINDX_DEFAULT (or CACHE_ATTRINDX_MTE when
 * HAS_MTE is enabled).
 */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	pt_entry_t * const ptep = pmap_pte(map->pmap, va);

	if (ptep == NULL) {
		return FALSE;
	}
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}

	const pt_entry_t pte = *ptep;

	if (ARM_PTE_EXTRACT_ATTRINDX(pte) == CACHE_ATTRINDX_DEFAULT) {
		return TRUE;
	}
#if HAS_MTE
	if (ARM_PTE_EXTRACT_ATTRINDX(pte) == CACHE_ATTRINDX_MTE) {
		return TRUE;
	}
#endif /* HAS_MTE */
	return FALSE;
}
#endif
7292
7293 void
7294 fillPage(
7295 ppnum_t pn,
7296 unsigned int fill)
7297 {
7298 unsigned int *addr;
7299 int count;
7300
7301 addr = (unsigned int *) phystokv(ptoa(pn));
7302 count = PAGE_SIZE / sizeof(unsigned int);
7303 while (count--) {
7304 *addr++ = fill;
7305 }
7306 }
7307
extern void mapping_set_mod(ppnum_t pn);

/*
 * Mark the physical page [pn] as modified.
 * Thin wrapper around pmap_set_modify().
 */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7316
extern void mapping_set_ref(ppnum_t pn);

/*
 * Mark the physical page [pn] as referenced.
 * Thin wrapper around pmap_set_reference().
 */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7325
7326 /*
7327 * Clear specified attribute bits.
7328 *
7329 * Try to force an arm_fast_fault() for all mappings of
7330 * the page - to force attributes to be set again at fault time.
7331 * If the forcing succeeds, clear the cached bits at the head.
7332 * Otherwise, something must have been wired, so leave the cached
7333 * attributes alone.
7334 */
7335 MARK_AS_PMAP_TEXT static void
7336 phys_attribute_clear_with_flush_range(
7337 ppnum_t pn,
7338 unsigned int bits,
7339 int options,
7340 void *arg,
7341 pmap_tlb_flush_range_t *flush_range)
7342 {
7343 pmap_paddr_t pa = ptoa(pn);
7344 vm_prot_t allow_mode = VM_PROT_ALL;
7345
7346 if ((arg != NULL) || (flush_range != NULL)) {
7347 options = options & ~PMAP_OPTIONS_NOFLUSH;
7348 }
7349
7350 if (__improbable((options & PMAP_OPTIONS_FF_WIRED) != 0)) {
7351 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7352 "invalid options",
7353 pn, bits, options, arg, flush_range);
7354 }
7355
7356 if (__improbable((bits & PP_ATTR_MODIFIED) &&
7357 (options & PMAP_OPTIONS_NOFLUSH))) {
7358 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7359 "should not clear 'modified' without flushing TLBs",
7360 pn, bits, options, arg, flush_range);
7361 }
7362
7363 assert(pn != vm_page_fictitious_addr);
7364
7365 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7366 assert(bits == PP_ATTR_MODIFIED);
7367
7368 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, NULL, flush_range);
7369 /*
7370 * We short circuit this case; it should not need to
7371 * invoke arm_force_fast_fault, so just clear the modified bit.
7372 * pmap_page_protect has taken care of resetting
7373 * the state so that we'll see the next write as a fault to
7374 * the VM (i.e. we don't want a fast fault).
7375 */
7376 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7377 return;
7378 }
7379 if (bits & PP_ATTR_REFERENCED) {
7380 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7381 }
7382 if (bits & PP_ATTR_MODIFIED) {
7383 allow_mode &= ~VM_PROT_WRITE;
7384 }
7385
7386 if (bits == PP_ATTR_NOENCRYPT) {
7387 /*
7388 * We short circuit this case; it should not need to
7389 * invoke arm_force_fast_fault, so just clear and
7390 * return. On ARM, this bit is just a debugging aid.
7391 */
7392 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7393 return;
7394 }
7395
7396 arm_force_fast_fault_with_flush_range(pn, allow_mode, options, NULL, (pp_attr_t)bits, flush_range);
7397 }
7398
/*
 * Clear the attribute [bits] for physical page [pn] without a caller-supplied
 * flush range: forwards to phys_attribute_clear_with_flush_range() with a NULL
 * flush_range.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7408
7409 #if __ARM_RANGE_TLBI__
7410
7411 MARK_AS_PMAP_TEXT static vm_map_address_t
7412 phys_attribute_clear_twig_internal(
7413 pmap_t pmap,
7414 vm_map_address_t start,
7415 vm_map_address_t end,
7416 unsigned int bits,
7417 unsigned int options,
7418 pmap_tlb_flush_range_t *flush_range)
7419 {
7420 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7421 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7422 assert(end >= start);
7423 assert((end - start) <= pt_attr_twig_size(pt_attr));
7424 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7425 vm_map_address_t va = start;
7426 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7427 tt_entry_t *tte_p;
7428 tte_p = pmap_tte(pmap, start);
7429
7430 /**
7431 * It's possible that this portion of our VA region has never been paged in, in which case
7432 * there may not be a valid twig or leaf table here.
7433 */
7434 if ((tte_p == (tt_entry_t *) NULL) || !tte_is_valid_table(*tte_p)) {
7435 assert(flush_range->pending_region_entries == 0);
7436 return end;
7437 }
7438
7439 pte_p = (pt_entry_t *) ttetokv(*tte_p);
7440
7441 start_pte_p = &pte_p[pte_index(pt_attr, start)];
7442 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7443 assert(end_pte_p >= start_pte_p);
7444 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7445 if (flush_range->pending_region_entries == 0) {
7446 flush_range->pending_region_start = va;
7447 } else {
7448 assertf((flush_range->pending_region_start +
7449 (flush_range->pending_region_entries * pmap_page_size)) == va,
7450 "pending_region_start 0x%llx + 0x%lx pages != va 0%llx",
7451 (unsigned long long)flush_range->pending_region_start,
7452 (unsigned long)flush_range->pending_region_entries,
7453 (unsigned long long)va);
7454 }
7455 flush_range->current_ptep = curr_pte_p;
7456 const pt_entry_t spte = os_atomic_load(curr_pte_p, relaxed);
7457 const pmap_paddr_t pa = pte_to_pa(spte);
7458 if (pte_is_valid(spte) && pa_valid(pa)) {
7459 /* The PTE maps a managed page, so do the appropriate PV list-based permission changes. */
7460 const ppnum_t pn = (ppnum_t) atop(pa);
7461 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7462 if (__probable(flush_range->region_entry_added)) {
7463 flush_range->region_entry_added = false;
7464 } else {
7465 /**
7466 * It's possible that some other thread removed the mapping between our check
7467 * of the PTE above and taking the PVH lock in the
7468 * phys_attribute_clear_with_flush_range() path. In that case we have a
7469 * discontinuity in the region to update, so just submit any pending region
7470 * templates and start a new region op on the next iteration.
7471 */
7472 pmap_multipage_op_submit_region(flush_range);
7473 }
7474 } else if (__improbable(!pte_is_valid(spte))) {
7475 /**
7476 * We've found an invalid mapping, so we have a discontinuity in the the region to
7477 * update. Handle this by submitting any pending region templates and starting a new
7478 * region on the next iteration. In theory we could instead handle this by installing
7479 * a "safe" (AF bit cleared, minimal permissions) PTE template; the SPTM would just
7480 * ignore the update on finding an invalid mapping in the PTE. But we don't know
7481 * what a "safe" template will be in all cases: for example, JIT regions require all
7482 * mappings to either be invalid or to have full RWX permissions.
7483 */
7484 pmap_multipage_op_submit_region(flush_range);
7485 } else if (pmap_insert_flush_range_template(spte, flush_range)) {
7486 /**
7487 * We've found a mapping to a non-managed page, so just insert the existing
7488 * PTE into the pending region ops since we don't manage attributes for non-managed
7489 * pages.
7490 * If pmap_insert_flush_range_template() returns true, indicating that it reached
7491 * the mapping limit and submitted the SPTM call, then we also submit any pending
7492 * disjoint ops. Having pending operations in either category will keep preemption
7493 * disabled, and we want to ensure that we can at least temporarily
7494 * re-enable preemption every SPTM_MAPPING_LIMIT mappings.
7495 */
7496 pmap_multipage_op_submit_disjoint(0, flush_range);
7497 }
7498
7499 /**
7500 * If the total number of pending + processed entries exceeds the mapping threshold,
7501 * we may need to submit all pending operations to avoid excessive preemption latency.
7502 * Otherwise, a small number of pending disjoint or region ops can hold preemption
7503 * disabled across an arbitrary number of total processed entries.
7504 * As an optimization, we may be able to avoid submitting if no urgent AST is
7505 * pending on the local CPU, but only if we aren't currently in an epoch. If we are
7506 * in an epoch, failure to submit in a timely manner can cause another CPU to wait
7507 * too long for our epoch to drain.
7508 */
7509 if (((flush_range->processed_entries + flush_range->pending_disjoint_entries +
7510 flush_range->pending_region_entries) >= SPTM_MAPPING_LIMIT) &&
7511 (pmap_in_epoch() || pmap_pending_preemption())) {
7512 pmap_multipage_op_submit(flush_range);
7513 assert(preemption_enabled());
7514 }
7515 }
7516
7517 /* SPTM region ops can't span L3 table boundaries, so submit any pending region templates now. */
7518 pmap_multipage_op_submit_region(flush_range);
7519 return end;
7520 }
7521
7522 MARK_AS_PMAP_TEXT vm_map_address_t
7523 phys_attribute_clear_range_internal(
7524 pmap_t pmap,
7525 vm_map_address_t start,
7526 vm_map_address_t end,
7527 unsigned int bits,
7528 unsigned int options)
7529 {
7530 if (__improbable(end < start)) {
7531 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7532 }
7533 validate_pmap_mutable(pmap);
7534
7535 vm_map_address_t va = start;
7536 pmap_tlb_flush_range_t flush_range = {
7537 .ptfr_pmap = pmap,
7538 .ptfr_start = start,
7539 .ptfr_end = end,
7540 .current_ptep = NULL,
7541 .pending_region_start = 0,
7542 .pending_region_entries = 0,
7543 .region_entry_added = false,
7544 .current_header = NULL,
7545 .current_header_first_mapping_index = 0,
7546 .processed_entries = 0,
7547 .pending_disjoint_entries = 0,
7548 .ptfr_flush_needed = false
7549 };
7550
7551 pmap_lock(pmap, PMAP_LOCK_SHARED);
7552 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7553
7554 while (va < end) {
7555 vm_map_address_t curr_end;
7556
7557 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7558 if (curr_end > end) {
7559 curr_end = end;
7560 }
7561
7562 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7563 }
7564 pmap_multipage_op_submit(&flush_range);
7565 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7566 assert((flush_range.pending_disjoint_entries == 0) && (flush_range.pending_region_entries == 0));
7567 if (flush_range.ptfr_flush_needed) {
7568 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7569 flush_range.ptfr_start,
7570 flush_range.ptfr_end - flush_range.ptfr_start,
7571 flush_range.ptfr_pmap,
7572 true);
7573 sync_tlb_flush();
7574 }
7575 return va;
7576 }
7577
7578 static void
7579 phys_attribute_clear_range(
7580 pmap_t pmap,
7581 vm_map_address_t start,
7582 vm_map_address_t end,
7583 unsigned int bits,
7584 unsigned int options)
7585 {
7586 /*
7587 * We allow single-page requests to execute non-preemptibly,
7588 * as it doesn't make sense to sample AST_URGENT for a single-page
7589 * operation, and there are a couple of special use cases that
7590 * require a non-preemptible single-page operation.
7591 */
7592 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7593 pmap_verify_preemptible();
7594 }
7595 __assert_only const int preemption_level = get_preemption_level();
7596
7597 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7598
7599 phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7600
7601 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7602
7603 assert(preemption_level == get_preemption_level());
7604 }
7605 #endif /* __ARM_RANGE_TLBI__ */
7606
/*
 * Traced entry point for clearing attribute [bits] on physical page [pn]:
 * brackets phys_attribute_clear_internal() with start/end trace points.
 */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

	phys_attribute_clear_internal(pn, bits, options, arg);

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7624
7625 /*
7626 * Set specified attribute bits.
7627 *
7628 * Set cached value in the pv head because we have
7629 * no per-mapping hardware support for referenced and
7630 * modify bits.
7631 */
7632 MARK_AS_PMAP_TEXT void
7633 phys_attribute_set_internal(
7634 ppnum_t pn,
7635 unsigned int bits)
7636 {
7637 pmap_paddr_t pa = ptoa(pn);
7638 assert(pn != vm_page_fictitious_addr);
7639
7640 ppattr_pa_set_bits(pa, (uint16_t)bits);
7641
7642 return;
7643 }
7644
/*
 * Set the attribute [bits] on physical page [pn].
 * Thin wrapper around phys_attribute_set_internal().
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
	phys_attribute_set_internal(pn, bits);
}
7652
7653
7654 /*
7655 * Check specified attribute bits.
7656 *
7657 * use the software cached bits (since no hw support).
7658 */
7659 static boolean_t
7660 phys_attribute_test(
7661 ppnum_t pn,
7662 unsigned int bits)
7663 {
7664 pmap_paddr_t pa = ptoa(pn);
7665 assert(pn != vm_page_fictitious_addr);
7666 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7667 }
7668
7669
/*
 *	Set the modify bit on the specified physical page.
 */
void
pmap_set_modify(ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7678
7679
/*
 *	Clear the modify bit on the specified physical page
 *	(no options, no caller argument).
 */
void
pmap_clear_modify(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7689
7690
/*
 *	pmap_is_modified:
 *
 *	Return whether or not the specified physical page is modified
 *	by any physical maps, as recorded in the software-cached
 *	PP_ATTR_MODIFIED bit.
 */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7703
7704
/*
 *	Set the reference bit (PP_ATTR_REFERENCED) on the specified physical page.
 */
static void
pmap_set_reference(
	ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7714
/*
 *	Clear the reference bit on the specified physical page
 *	(no options, no caller argument).
 */
void
pmap_clear_reference(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7724
7725
/*
 *	pmap_is_referenced:
 *
 *	Return whether or not the specified physical page is referenced
 *	by any physical maps, as recorded in the software-cached
 *	PP_ATTR_REFERENCED bit.
 */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7738
7739 /*
7740 * pmap_get_refmod(phys)
7741 * returns the referenced and modified bits of the specified
7742 * physical page.
7743 */
7744 unsigned int
7745 pmap_get_refmod(
7746 ppnum_t pn)
7747 {
7748 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7749 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7750 }
7751
7752 static inline unsigned int
7753 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7754 {
7755 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7756 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7757 }
7758
7759 /*
7760 * pmap_clear_refmod(phys, mask)
7761 * clears the referenced and modified bits as specified by the mask
7762 * of the specified physical page.
7763 */
7764 void
7765 pmap_clear_refmod_options(
7766 ppnum_t pn,
7767 unsigned int mask,
7768 unsigned int options,
7769 void *arg)
7770 {
7771 unsigned int bits;
7772
7773 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7774 phys_attribute_clear(pn, bits, options, arg);
7775 }
7776
7777 /*
7778 * Perform pmap_clear_refmod_options on a virtual address range.
7779 * The operation will be performed in bulk & tlb flushes will be coalesced
7780 * if possible.
7781 *
7782 * Returns true if the operation is supported on this platform.
7783 * If this function returns false, the operation is not supported and
7784 * nothing has been modified in the pmap.
7785 */
7786 bool
7787 pmap_clear_refmod_range_options(
7788 pmap_t pmap __unused,
7789 vm_map_address_t start __unused,
7790 vm_map_address_t end __unused,
7791 unsigned int mask __unused,
7792 unsigned int options __unused)
7793 {
7794 #if __ARM_RANGE_TLBI__
7795 unsigned int bits;
7796 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7797 phys_attribute_clear_range(pmap, start, end, bits, options);
7798 return true;
7799 #else /* __ARM_RANGE_TLBI__ */
7800 #pragma unused(pmap, start, end, mask, options)
7801 /*
7802 * This operation allows the VM to bulk modify refmod bits on a virtually
7803 * contiguous range of addresses. This is large performance improvement on
7804 * platforms that support ranged tlbi instructions. But on older platforms,
7805 * we can only flush per-page or the entire asid. So we currently
7806 * only support this operation on platforms that support ranged tlbi.
7807 * instructions. On other platforms, we require that
7808 * the VM modify the bits on a per-page basis.
7809 */
7810 return false;
7811 #endif /* __ARM_RANGE_TLBI__ */
7812 }
7813
/*
 * pmap_clear_refmod(phys, mask)
 *  clears the referenced and modified bits as specified by the mask
 *  of the specified physical page, with no options and no caller argument.
 */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7821
7822 unsigned int
7823 pmap_disconnect_options(
7824 ppnum_t pn,
7825 unsigned int options,
7826 void *arg)
7827 {
7828 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7829 /*
7830 * On ARM, the "modified" bit is managed by software, so
7831 * we know up-front if the physical page is "modified",
7832 * without having to scan all the PTEs pointing to it.
7833 * The caller should have made the VM page "busy" so noone
7834 * should be able to establish any new mapping and "modify"
7835 * the page behind us.
7836 */
7837 if (pmap_is_modified(pn)) {
7838 /*
7839 * The page has been modified and will be sent to
7840 * the VM compressor.
7841 */
7842 options |= PMAP_OPTIONS_COMPRESSOR;
7843 } else {
7844 /*
7845 * The page hasn't been modified and will be freed
7846 * instead of compressed.
7847 */
7848 }
7849 }
7850
7851 /* disconnect the page */
7852 pmap_page_protect_options(pn, 0, options, arg);
7853
7854 /* return ref/chg status */
7855 return pmap_get_refmod(pn);
7856 }
7857
/*
 *	Routine:
 *		pmap_disconnect
 *
 *	Function:
 *		Disconnect all mappings for this page and return reference and change status
 *		in generic format.
 *
 *	The status is read after the mappings have been removed.
 */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7874
7875 boolean_t
7876 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7877 {
7878 if (ptoa(first) >= vm_last_phys) {
7879 return FALSE;
7880 }
7881 if (ptoa(last) < vm_first_phys) {
7882 return FALSE;
7883 }
7884
7885 return TRUE;
7886 }
7887
7888 /*
7889 * The state maintained by the noencrypt functions is used as a
7890 * debugging aid on ARM. This incurs some overhead on the part
7891 * of the caller. A special case check in phys_attribute_clear
7892 * (the most expensive path) currently minimizes this overhead,
7893 * but stubbing these functions out on RELEASE kernels yields
7894 * further wins.
7895 */
7896 boolean_t
7897 pmap_is_noencrypt(
7898 ppnum_t pn)
7899 {
7900 #if DEVELOPMENT || DEBUG
7901 boolean_t result = FALSE;
7902
7903 if (!pa_valid(ptoa(pn))) {
7904 return FALSE;
7905 }
7906
7907 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7908
7909 return result;
7910 #else
7911 #pragma unused(pn)
7912 return FALSE;
7913 #endif
7914 }
7915
7916 void
7917 pmap_set_noencrypt(
7918 ppnum_t pn)
7919 {
7920 #if DEVELOPMENT || DEBUG
7921 if (!pa_valid(ptoa(pn))) {
7922 return;
7923 }
7924
7925 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7926 #else
7927 #pragma unused(pn)
7928 #endif
7929 }
7930
7931 void
7932 pmap_clear_noencrypt(
7933 ppnum_t pn)
7934 {
7935 #if DEVELOPMENT || DEBUG
7936 if (!pa_valid(ptoa(pn))) {
7937 return;
7938 }
7939
7940 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7941 #else
7942 #pragma unused(pn)
7943 #endif
7944 }
7945
7946 void
7947 pmap_lock_phys_page(ppnum_t pn)
7948 {
7949 unsigned int pai;
7950 pmap_paddr_t phys = ptoa(pn);
7951
7952 if (pa_valid(phys)) {
7953 pai = pa_index(phys);
7954 __unused const locked_pvh_t locked_pvh = pvh_lock(pai);
7955 } else {
7956 simple_lock(&phys_backup_lock, LCK_GRP_NULL);
7957 }
7958 }
7959
7960
7961 void
7962 pmap_unlock_phys_page(ppnum_t pn)
7963 {
7964 unsigned int pai;
7965 pmap_paddr_t phys = ptoa(pn);
7966
7967 if (pa_valid(phys)) {
7968 pai = pa_index(phys);
7969 locked_pvh_t locked_pvh = {.pvh = pai_to_pvh(pai), .pai = pai};
7970 pvh_unlock(&locked_pvh);
7971 } else {
7972 simple_unlock(&phys_backup_lock);
7973 }
7974 }
7975
/*
 * Point the MMU translation table base at the invalid table: calls
 * set_mmu_ttb() with invalid_ttep masked by TTBR_BADDR_MASK.
 */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7981
/*
 * Traced entry point for pmap_clear_user_ttb_internal(): brackets the call
 * with start/end trace points.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
	pmap_clear_user_ttb_internal();
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7989
7990 /**
7991 * Set up a "fast fault", or a page fault that won't go through the VM layer on
7992 * a page. This is primarily used to manage ref/mod bits in software. Depending
7993 * on the value of allow_mode, the next read and/or write of the page will fault
7994 * and the ref/mod bits will be updated.
7995 *
7996 * @param ppnum Page number to set up a fast fault on.
7997 * @param allow_mode VM_PROT_NONE will cause the next read and write access to
7998 * fault.
7999 * VM_PROT_READ will only cause the next write access to fault.
8000 * Other values are undefined.
8001 * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
8002 * PMAP_OPTIONS_FF_WIRED forces a fast fault even on wired pages.
8003 * PMAP_OPTIONS_SET_REUSABLE/PMAP_OPTIONS_CLEAR_REUSABLE updates
8004 * the global reusable bit of the page.
8005 * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
8006 * by the caller. This is an input/output parameter which may be updated
8007 * to reflect a new PV head value to be passed to a later call to pvh_unlock().
8008 * @param bits_to_clear Mask of additional pp_attr_t bits to clear for the physical
8009 * page, iff this function completes successfully and returns
8010 * TRUE. This is typically some combination of
8011 * the referenced, modified, and noencrypt bits.
8012 * @param flush_range When present, this function will skip the TLB flush for the
8013 * mappings that are covered by the range, leaving that to be
8014 * done later by the caller. It may also avoid submitting mapping
8015 * updates directly to the SPTM, instead accumulating them in a
8016 * per-CPU array to be submitted later by the caller.
8017 *
8018 * @return TRUE if the fast fault was successfully configured for all mappings
8019 * of the page, FALSE otherwise (e.g. if wired mappings are present and
8020 * PMAP_OPTIONS_FF_WIRED was not passed).
8021 *
8022 * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
8023 *
8024 * @warning PMAP_OPTIONS_FF_WIRED should only be used with pages accessible from
8025 * EL0. The kernel may assume that accesses to wired, kernel-owned pages
8026 * won't fault.
8027 */
8028 MARK_AS_PMAP_TEXT static boolean_t
8029 arm_force_fast_fault_with_flush_range(
8030 ppnum_t ppnum,
8031 vm_prot_t allow_mode,
8032 int options,
8033 locked_pvh_t *locked_pvh,
8034 pp_attr_t bits_to_clear,
8035 pmap_tlb_flush_range_t *flush_range)
8036 {
8037 pmap_paddr_t phys = ptoa(ppnum);
8038 pv_entry_t *pve_p;
8039 pt_entry_t *pte_p;
8040 unsigned int pai;
8041 boolean_t result;
8042 unsigned int num_mappings = 0, num_skipped_mappings = 0;
8043 bool ref_fault;
8044 bool mod_fault;
8045 bool clear_write_fault = false;
8046 bool ref_aliases_mod = false;
8047
8048 assert(ppnum != vm_page_fictitious_addr);
8049
8050 /**
8051 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
8052 *
8053 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
8054 * flush_range indicates the caller requests deferral of the TLB flushing. Fundemantally, the two
8055 * semantics conflict with each other, so assert they are not both true.
8056 */
8057 assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
8058
8059 if (!pa_valid(phys)) {
8060 return FALSE; /* Not a managed page. */
8061 }
8062
8063 result = TRUE;
8064 ref_fault = false;
8065 mod_fault = false;
8066 pai = pa_index(phys);
8067 locked_pvh_t local_locked_pvh = {.pvh = 0};
8068 if (__probable(locked_pvh == NULL)) {
8069 if (flush_range != NULL) {
8070 /**
8071 * If we're partway through processing a multi-page batched call,
8072 * preemption will already be disabled so we can't simply call
8073 * pvh_lock() which may block. Instead, we first try to acquire
8074 * the lock without waiting, which in most cases should succeed.
8075 * If it fails, we submit the pending batched operations to re-
8076 * enable preemption and then acquire the lock normally.
8077 */
8078 local_locked_pvh = pvh_try_lock(pai);
8079 if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
8080 pmap_multipage_op_submit(flush_range);
8081 local_locked_pvh = pvh_lock(pai);
8082 }
8083 } else {
8084 local_locked_pvh = pvh_lock(pai);
8085 }
8086 } else {
8087 local_locked_pvh = *locked_pvh;
8088 assert(pai == local_locked_pvh.pai);
8089 }
8090 assert(local_locked_pvh.pvh != 0);
8091 pvh_assert_locked(pai);
8092
8093 pte_p = PT_ENTRY_NULL;
8094 pve_p = PV_ENTRY_NULL;
8095 if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
8096 pte_p = pvh_ptep(local_locked_pvh.pvh);
8097 } else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
8098 pve_p = pvh_pve_list(local_locked_pvh.pvh);
8099 } else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
8100 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
8101 }
8102
8103 const bool is_reusable = ppattr_test_reusable(pai);
8104
8105 bool pvh_lock_sleep_mode_needed = false;
8106 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
8107 sptm_disjoint_op_t *sptm_ops = NULL;
8108
8109 /**
8110 * This would also work as a block, with the above variables declared using the
8111 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
8112 * dereferencing __block variables through stack forwarding pointers) isn't needed
8113 * here, as we never need to use this code sequence as a closure.
8114 */
8115 #define FFF_PERCPU_INIT() do { \
8116 disable_preemption(); \
8117 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
8118 sptm_ops = sptm_pcpu->sptm_ops; \
8119 } while (0)
8120
8121 FFF_PERCPU_INIT();
8122
8123 int pve_ptep_idx = 0;
8124
8125 /**
8126 * With regard to TLBI, there are three cases:
8127 *
8128 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
8129 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
8130 * itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
8131 * mapping is out of the range.
8132 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
8133 * let SPTM handle TLBI flushing.
8134 */
8135 const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
8136 const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
8137
8138 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8139 pt_entry_t spte;
8140 pt_entry_t tmplate;
8141
8142 if (__improbable(pvh_lock_sleep_mode_needed)) {
8143 assert((num_mappings == 0) && (num_skipped_mappings == 0));
8144 /**
8145 * Undo the explicit preemption disable done in the last call to FFF_PER_CPU_INIT().
8146 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
8147 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
8148 * core while processing SPTM per-CPU data. At the same time, we also want preemption
8149 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
8150 * urgent ASTs can be handled.
8151 */
8152 enable_preemption();
8153 pvh_lock_enter_sleep_mode(&local_locked_pvh);
8154 pvh_lock_sleep_mode_needed = false;
8155 FFF_PERCPU_INIT();
8156 }
8157
8158 if (pve_p != PV_ENTRY_NULL) {
8159 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8160 if (pte_p == PT_ENTRY_NULL) {
8161 goto fff_skip_pve;
8162 }
8163 }
8164
8165 #ifdef PVH_FLAG_IOMMU
8166 if (pvh_ptep_is_iommu(pte_p)) {
8167 ++num_skipped_mappings;
8168 goto fff_skip_pve;
8169 }
8170 #endif
8171 spte = os_atomic_load(pte_p, relaxed);
8172 if (pte_is_compressed(spte, pte_p)) {
8173 panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8174 }
8175
8176 pt_desc_t *ptdp = NULL;
8177 pmap_t pmap = NULL;
8178 vm_map_address_t va = 0;
8179
8180 if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
8181 /**
8182 * If the current mapping matches the flush range's current iteration position,
8183 * there's no need to do the work of getting the PTD. We already know the pmap,
8184 * and the VA is implied by flush_range->pending_region_start.
8185 */
8186 pmap = flush_range->ptfr_pmap;
8187 } else {
8188 ptdp = ptep_get_ptd(pte_p);
8189 pmap = ptdp->pmap;
8190 va = ptd_get_va(ptdp, pte_p);
8191 assert(va >= pmap->min && va < pmap->max);
8192 }
8193
8194 bool skip_pte = pte_is_wired(spte) &&
8195 ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8196
8197 if (skip_pte) {
8198 result = FALSE;
8199 }
8200
8201 // A concurrent pmap_remove() may have cleared the PTE
8202 if (__improbable(!pte_is_valid(spte))) {
8203 skip_pte = true;
8204 }
8205
8206 /**
8207 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
8208 * pending disjoint ops, so we don't need to do flush range disjoint op management.
8209 */
8210 if ((flush_range != NULL) && (ptdp != NULL) && !skip_pte) {
8211 /**
8212 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
8213 * We do this in three cases:
8214 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
8215 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
8216 * for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
8217 * 3) We need to change the options passed to the SPTM for a run of one or more mappings. Specifically,
8218 * if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
8219 * belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
8220 * the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
8221 */
8222 uint32_t per_mapping_sptm_update_options = sptm_update_options;
8223 if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8224 per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
8225 }
8226 if ((num_mappings == 0) ||
8227 (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
8228 if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
8229 /**
8230 * If we needed to submit the pending disjoint ops to make room for the new page,
8231 * flush any pending region ops to reenable preemption and restart the loop with
8232 * the lock in sleep mode. This prevents preemption from being held disabled
8233 * for an arbitrary amount of time in the pathological case in which we have
8234 * both pending region ops and an excessively long PV list that repeatedly
8235 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
8236 */
8237 pmap_multipage_op_submit_region(flush_range);
8238 assert(num_mappings == 0);
8239 num_skipped_mappings = 0;
8240 pvh_lock_sleep_mode_needed = true;
8241 continue;
8242 }
8243 }
8244 }
8245
8246 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8247
8248 /* update pmap stats and ledgers */
8249 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8250 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8251 if (is_altacct) {
8252 /*
8253 * We do not track "reusable" status for
8254 * "alternate accounting" mappings.
8255 */
8256 } else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8257 is_reusable &&
8258 is_internal &&
8259 pmap != kernel_pmap) {
8260 /* one less "reusable" */
8261 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8262 /* one more "internal" */
8263 pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8264 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8265
8266 /*
8267 * Since the page is being marked non-reusable, we assume that it will be
8268 * modified soon. Avoid the cost of another trap to handle the fast
8269 * fault when we next write to this page.
8270 */
8271 clear_write_fault = true;
8272 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8273 !is_reusable &&
8274 is_internal &&
8275 pmap != kernel_pmap) {
8276 /* one more "reusable" */
8277 pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8278 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8279 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8280 }
8281
8282 if (skip_pte) {
8283 ++num_skipped_mappings;
8284 goto fff_skip_pve;
8285 }
8286
8287 tmplate = spte;
8288
8289 if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8290 /* read protection sets the pte to fault */
8291 tmplate = tmplate & ~ARM_PTE_AF;
8292 ref_fault = true;
8293 }
8294 if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8295 /* take away write permission if set */
8296 if (pmap == kernel_pmap) {
8297 if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8298 tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8299 pte_set_was_writeable(tmplate, true);
8300 mod_fault = true;
8301 }
8302 } else {
8303 if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8304 tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8305 pte_set_was_writeable(tmplate, true);
8306 mod_fault = true;
8307 }
8308 }
8309 }
8310
8311 if (ptdp != NULL) {
8312 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
8313 sptm_ops[num_mappings].vaddr = va;
8314 sptm_ops[num_mappings].pte_template = tmplate;
8315 ++num_mappings;
8316 } else if (pmap_insert_flush_range_template(tmplate, flush_range)) {
8317 /**
8318 * We submit both the pending disjoint and pending region ops whenever
8319 * either category reaches the mapping limit. Having pending operations
8320 * in either category will keep preemption disabled, and we want to ensure
8321 * that we can at least temporarily re-enable preemption roughly every
8322 * SPTM_MAPPING_LIMIT mappings.
8323 */
8324 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
8325 pvh_lock_sleep_mode_needed = true;
8326 num_mappings = num_skipped_mappings = 0;
8327 }
8328 fff_skip_pve:
8329 if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
8330 if (flush_range != NULL) {
8331 /* See comment above for why we submit both disjoint and region ops when we hit the limit. */
8332 pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
8333 pmap_multipage_op_submit_region(flush_range);
8334 } else if (num_mappings > 0) {
8335 sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
8336 }
8337 pvh_lock_sleep_mode_needed = true;
8338 num_mappings = num_skipped_mappings = 0;
8339 }
8340 pte_p = PT_ENTRY_NULL;
8341 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8342 pve_ptep_idx = 0;
8343 pve_p = pve_next(pve_p);
8344 }
8345 }
8346
8347 if (num_mappings != 0) {
8348 sptm_return_t sptm_ret;
8349
8350 if (flush_range == NULL) {
8351 sptm_ret = sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
8352 } else {
8353 /* Resync the pending mapping state in flush_range with our local state. */
8354 assert(num_mappings >= flush_range->pending_disjoint_entries);
8355 flush_range->pending_disjoint_entries = num_mappings;
8356 }
8357 }
8358
8359 /**
8360 * Undo the explicit disable_preemption() done in FFF_PERCPU_INIT().
8361 * Note that enable_preemption() decrements a per-thread counter, so if
8362 * we happen to still hold the PVH lock in spin mode then preemption won't
8363 * actually be re-enabled until we drop the lock (which also decrements
8364 * the per-thread counter.
8365 */
8366 enable_preemption();
8367
8368 /*
8369 * If we are using the same approach for ref and mod
8370 * faults on this PTE, do not clear the write fault;
8371 * this would cause both ref and mod to be set on the
8372 * page again, and prevent us from taking ANY read/write
8373 * fault on the mapping.
8374 */
8375 if (clear_write_fault && !ref_aliases_mod) {
8376 arm_clear_fast_fault(ppnum, VM_PROT_WRITE, local_locked_pvh.pvh, PT_ENTRY_NULL, 0);
8377 }
8378
8379 pp_attr_t attrs_to_clear = (result ? bits_to_clear : 0);
8380 pp_attr_t attrs_to_set = 0;
8381 /* update global "reusable" status for this page */
8382 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8383 attrs_to_clear |= PP_ATTR_REUSABLE;
8384 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8385 attrs_to_set |= PP_ATTR_REUSABLE;
8386 }
8387
8388 if (mod_fault) {
8389 attrs_to_set |= PP_ATTR_MODFAULT;
8390 }
8391 if (ref_fault) {
8392 attrs_to_set |= PP_ATTR_REFFAULT;
8393 }
8394
8395 if (attrs_to_set | attrs_to_clear) {
8396 ppattr_modify_bits(pai, attrs_to_clear, attrs_to_set);
8397 }
8398
8399 if (__probable(locked_pvh == NULL)) {
8400 pvh_unlock(&local_locked_pvh);
8401 } else {
8402 *locked_pvh = local_locked_pvh;
8403 }
8404 if ((flush_range != NULL) && !preemption_enabled()) {
8405 flush_range->processed_entries += num_skipped_mappings;
8406 }
8407 return result;
8408 }
8409
8410 MARK_AS_PMAP_TEXT boolean_t
8411 arm_force_fast_fault_internal(
8412 ppnum_t ppnum,
8413 vm_prot_t allow_mode,
8414 int options)
8415 {
8416 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8417 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8418 }
8419 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL, 0, NULL);
8420 }
8421
8422 /*
8423 * Routine: arm_force_fast_fault
8424 *
8425 * Function:
8426 * Force all mappings for this page to fault according
8427 * to the access modes allowed, so we can gather ref/modify
8428 * bits again.
8429 */
8430
8431 boolean_t
8432 arm_force_fast_fault(
8433 ppnum_t ppnum,
8434 vm_prot_t allow_mode,
8435 int options,
8436 __unused void *arg)
8437 {
8438 pmap_paddr_t phys = ptoa(ppnum);
8439
8440 assert(ppnum != vm_page_fictitious_addr);
8441
8442 if (!pa_valid(phys)) {
8443 return FALSE; /* Not a managed page. */
8444 }
8445
8446 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8447 }
8448
8449 /**
8450 * Clear pending force fault for at most SPTM_MAPPING_LIMIT mappings for this
8451 * page based on the observed fault type, and update the appropriate ref/modify
8452 * bits for the physical page. This typically involves adding write permissions
8453 * back for write faults and setting the Access Flag for both read/write faults
8454 * (since the lack of those things is what caused the fault in the first place).
8455 *
8456 * @note Only SPTM_MAPPING_LIMIT number of mappings can be modified in a single
8457 * arm_clear_fast_fault() call to prevent excessive PVH lock contention as
8458 * the PVH lock should be held for `ppnum` already. If a fault is
8459 * subsequently taken on a mapping we haven't processed, arm_fast_fault()
8460 * will call this function with a non-NULL pte_p to perform a targeted
8461 * fixup.
8462 *
8463 * @param ppnum Page number of the page to clear a pending force fault on.
8464 * @param fault_type The type of access/fault that triggered us wanting to clear
8465 * the pending force fault status. This determines how we
8466 * modify the PTE to not cause a fault in the future and also
8467 * whether we mark the PTE as referenced or modified.
8468 * Typically a write fault would cause the page to be marked
8469 * as referenced and modified, and a read fault would only
8470 * cause the page to be marked as referenced.
8471 * @param pvh pv_head_table entry value for [ppnum] returned by a previous call
8472 * to pvh_lock().
8473 * @param pte_p If this value is non-PT_ENTRY_NULL then only this specified PTE
8474 * will be modified. If it is PT_ENTRY_NULL, then every mapping to
8475 * `ppnum` will be modified.
8476 * @param attrs_to_clear Mask of additional pp_attr_t bits to clear for the physical
8477 * page upon completion of this function. This is typically
8478 * some combination of the REFFAULT and MODFAULT bits.
8479 *
8480 * @return TRUE if any PTEs were modified, FALSE otherwise.
8481 */
8482 MARK_AS_PMAP_TEXT static boolean_t
8483 arm_clear_fast_fault(
8484 ppnum_t ppnum,
8485 vm_prot_t fault_type,
8486 uintptr_t pvh,
8487 pt_entry_t *pte_p,
8488 pp_attr_t attrs_to_clear)
8489 {
8490 const pmap_paddr_t pa = ptoa(ppnum);
8491 pv_entry_t *pve_p;
8492 boolean_t result;
8493 unsigned int num_mappings = 0, num_skipped_mappings = 0;
8494 pp_attr_t attrs_to_set = 0;
8495
8496 assert(ppnum != vm_page_fictitious_addr);
8497
8498 if (!pa_valid(pa)) {
8499 return FALSE; /* Not a managed page. */
8500 }
8501
8502 result = FALSE;
8503 pve_p = PV_ENTRY_NULL;
8504 if (pte_p == PT_ENTRY_NULL) {
8505 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
8506 pte_p = pvh_ptep(pvh);
8507 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
8508 pve_p = pvh_pve_list(pvh);
8509 } else if (__improbable(!pvh_test_type(pvh, PVH_TYPE_NULL))) {
8510 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)pvh, (uint64_t)pa);
8511 }
8512 }
8513
8514 disable_preemption();
8515 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8516 sptm_disjoint_op_t *sptm_ops = sptm_pcpu->sptm_ops;
8517
8518 int pve_ptep_idx = 0;
8519
8520 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8521 pt_entry_t spte;
8522 pt_entry_t tmplate;
8523
8524 if (pve_p != PV_ENTRY_NULL) {
8525 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8526 if (pte_p == PT_ENTRY_NULL) {
8527 goto cff_skip_pve;
8528 }
8529 }
8530
8531 #ifdef PVH_FLAG_IOMMU
8532 if (pvh_ptep_is_iommu(pte_p)) {
8533 ++num_skipped_mappings;
8534 goto cff_skip_pve;
8535 }
8536 #endif
8537 spte = os_atomic_load(pte_p, relaxed);
8538 // A concurrent pmap_remove() may have cleared the PTE
8539 if (__improbable(!pte_is_valid(spte))) {
8540 ++num_skipped_mappings;
8541 goto cff_skip_pve;
8542 }
8543
8544 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8545 const pmap_t pmap = ptdp->pmap;
8546
8547 tmplate = spte;
8548
8549 if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8550 assert(pmap);
8551 {
8552 if (pmap == kernel_pmap) {
8553 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8554 } else {
8555 assert(pmap->type != PMAP_TYPE_NESTED);
8556 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8557 }
8558 }
8559
8560 tmplate |= ARM_PTE_AF;
8561
8562 pte_set_was_writeable(tmplate, false);
8563 attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8564 } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8565 assert(pmap);
8566 tmplate = spte | ARM_PTE_AF;
8567
8568 {
8569 attrs_to_set |= PP_ATTR_REFERENCED;
8570 }
8571 }
8572
8573 assert(spte != ARM_PTE_EMPTY);
8574
8575 if (spte != tmplate) {
8576 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8577 assert(va >= pmap->min && va < pmap->max);
8578
8579 sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
8580 sptm_ops[num_mappings].vaddr = va;
8581 sptm_ops[num_mappings].pte_template = tmplate;
8582 ++num_mappings;
8583 result = TRUE;
8584 }
8585
8586 cff_skip_pve:
8587 if ((num_mappings + num_skipped_mappings) == SPTM_MAPPING_LIMIT) {
8588 if (num_mappings != 0) {
8589 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8590 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8591 num_mappings = 0;
8592 }
8593 /*
8594 * We've reached the limit of mappings that can be processed in a single arm_clear_fast_fault()
8595 * call. Bail out here to avoid excessive PVH lock duration on the fault path. If a fault is
8596 * subsequently taken on a mapping we haven't processed, arm_fast_fault() will call this
8597 * function with a non-NULL pte_p to perform a targeted fixup.
8598 */
8599 break;
8600 }
8601
8602 pte_p = PT_ENTRY_NULL;
8603 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8604 pve_ptep_idx = 0;
8605 pve_p = pve_next(pve_p);
8606 }
8607 }
8608
8609 if (num_mappings != 0) {
8610 assert(result == TRUE);
8611 sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8612 SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8613 }
8614
8615 if (attrs_to_set | attrs_to_clear) {
8616 ppattr_modify_bits(pa_index(pa), attrs_to_clear, attrs_to_set);
8617 }
8618 enable_preemption();
8619
8620 return result;
8621 }
8622
8623 /*
8624 * Determine if the fault was induced by software tracking of
8625 * modify/reference bits. If so, re-enable the mapping (and set
8626 * the appropriate bits).
8627 *
8628 * Returns KERN_SUCCESS if the fault was induced and was
8629 * successfully handled.
8630 *
8631 * Returns KERN_FAILURE if the fault was not induced and
8632 * the function was unable to deal with it.
8633 *
8634 * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8635 * disallows this type of access.
8636 */
8637 MARK_AS_PMAP_TEXT kern_return_t
8638 arm_fast_fault_internal(
8639 pmap_t pmap,
8640 vm_map_address_t va,
8641 vm_prot_t fault_type,
8642 __unused bool was_af_fault,
8643 __unused bool from_user)
8644 {
8645 kern_return_t result = KERN_FAILURE;
8646 pt_entry_t *ptep;
8647 pt_entry_t spte = ARM_PTE_EMPTY;
8648 locked_pvh_t locked_pvh = {.pvh = 0};
8649 unsigned int pai;
8650 pmap_paddr_t pa;
8651 validate_pmap_mutable(pmap);
8652
8653 if (__probable(preemption_enabled())) {
8654 pmap_lock(pmap, PMAP_LOCK_SHARED);
8655 } else if (__improbable(!pmap_try_lock(pmap, PMAP_LOCK_SHARED))) {
8656 /**
8657 * In certain cases, arm_fast_fault() may be invoked with preemption disabled
8658 * on the copyio path. In theses cases the (in-kernel) caller expects that any
8659 * faults taken against the user address may not be handled successfully
8660 * (vm_fault() allows non-preemptible callers with the possibility that the
8661 * fault may not be successfully handled) and will result in the copyio operation
8662 * returning EFAULT. It is then the caller's responsibility to retry the copyio
8663 * operation in a preemptible context.
8664 *
8665 * For these cases attempting to acquire the sleepable lock will panic, so
8666 * we simply make a best effort and return failure just as the VM does if we
8667 * can't acquire the lock without sleeping.
8668 */
8669 return result;
8670 }
8671
8672 /*
8673 * If the entry doesn't exist, is completely invalid, or is already
8674 * valid, we can't fix it here.
8675 */
8676
8677 const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8678 ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8679 if (ptep != PT_ENTRY_NULL) {
8680 while (true) {
8681 spte = os_atomic_load(ptep, relaxed);
8682
8683 pa = pte_to_pa(spte);
8684
8685 if ((spte == ARM_PTE_EMPTY) || pte_is_compressed(spte, ptep)) {
8686 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8687 return result;
8688 }
8689
8690 if (!pa_valid(pa)) {
8691 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
8692 if (frame_type == XNU_PROTECTED_IO) {
8693 result = KERN_PROTECTION_FAILURE;
8694 }
8695 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8696 return result;
8697 }
8698 pai = pa_index(pa);
8699 /**
8700 * Check for preemption disablement and in that case use pvh_try_lock()
8701 * for the same reason we use pmap_try_lock() above.
8702 */
8703 if (__probable(preemption_enabled())) {
8704 locked_pvh = pvh_lock(pai);
8705 } else {
8706 locked_pvh = pvh_try_lock(pai);
8707 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
8708 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8709 return result;
8710 }
8711 }
8712 assert(locked_pvh.pvh != 0);
8713 if (os_atomic_load(ptep, relaxed) == spte) {
8714 /*
8715 * Double-check the spte value, as we care about the AF bit.
8716 * It's also possible that pmap_page_protect() transitioned the
8717 * PTE to compressed/empty before we grabbed the PVH lock.
8718 */
8719 break;
8720 }
8721 pvh_unlock(&locked_pvh);
8722 }
8723 } else {
8724 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8725 return result;
8726 }
8727
8728
8729 if (result == KERN_SUCCESS) {
8730 goto ff_cleanup;
8731 }
8732
8733 pp_attr_t attrs = os_atomic_load(&pp_attr_table[pai], relaxed);
8734 if ((attrs & PP_ATTR_REFFAULT) || ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT))) {
8735 /*
8736 * An attempted access will always clear ref/mod fault state, as
8737 * appropriate for the fault type. arm_clear_fast_fault will
8738 * update the associated PTEs for the page as appropriate; if
8739 * any PTEs are updated, we redrive the access. If the mapping
8740 * does not actually allow for the attempted access, the
8741 * following fault will (hopefully) fail to update any PTEs, and
8742 * thus cause arm_fast_fault to decide that it failed to handle
8743 * the fault.
8744 */
8745 pp_attr_t attrs_to_clear = 0;
8746 if (attrs & PP_ATTR_REFFAULT) {
8747 attrs_to_clear |= PP_ATTR_REFFAULT;
8748 }
8749 if ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT)) {
8750 attrs_to_clear |= PP_ATTR_MODFAULT;
8751 }
8752
8753 if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, PT_ENTRY_NULL, attrs_to_clear)) {
8754 /*
8755 * Should this preserve KERN_PROTECTION_FAILURE? The
8756 * cost of not doing so is a another fault in a case
8757 * that should already result in an exception.
8758 */
8759 result = KERN_SUCCESS;
8760 }
8761 }
8762
8763 /*
8764 * If the PTE already has sufficient permissions, we can report the fault as handled.
8765 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8766 * on mappings of the same page
8767 */
8768 if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8769 uintptr_t ap_ro, ap_rw, ap_x;
8770 if (pmap == kernel_pmap) {
8771 ap_ro = ARM_PTE_AP(AP_RONA);
8772 ap_rw = ARM_PTE_AP(AP_RWNA);
8773 ap_x = ARM_PTE_NX;
8774 } else {
8775 ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8776 ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8777 ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8778 }
8779 /*
8780 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8781 * hardware they may be xPRR-protected, in which case they'll be handled
8782 * by the is_pte_xprr_protected() case above. Additionally, the exception
8783 * handling path currently does not call arm_fast_fault() without at least
8784 * VM_PROT_READ in fault_type.
8785 */
8786 if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8787 (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8788 if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8789 result = KERN_SUCCESS;
8790 }
8791 }
8792 }
8793
8794 if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, ptep, 0)) {
8795 /*
8796 * A prior arm_clear_fast_fault() operation may have returned early due to
8797 * another pending PV list operation or an excessively large PV list.
8798 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8799 * taking a fault on the same mapping.
8800 */
8801 result = KERN_SUCCESS;
8802 }
8803
8804 ff_cleanup:
8805
8806 pvh_unlock(&locked_pvh);
8807 pmap_unlock(pmap, PMAP_LOCK_SHARED);
8808 return result;
8809 }
8810
8811 kern_return_t
8812 arm_fast_fault(
8813 pmap_t pmap,
8814 vm_map_address_t va,
8815 vm_prot_t fault_type,
8816 bool was_af_fault,
8817 __unused bool from_user)
8818 {
8819 kern_return_t result = KERN_FAILURE;
8820
8821 if (va < pmap->min || va >= pmap->max) {
8822 return result;
8823 }
8824
8825 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8826 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8827 from_user);
8828
8829
8830 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8831
8832 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8833
8834 return result;
8835 }
8836
8837 void
8838 pmap_copy_page(
8839 ppnum_t psrc,
8840 ppnum_t pdst,
8841 int options)
8842 {
8843 bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8844 (addr64_t) (ptoa(pdst)),
8845 PAGE_SIZE,
8846 options);
8847 }
8848
8849
8850 /*
8851 * pmap_copy_page copies the specified (machine independent) pages.
8852 */
8853 void
8854 pmap_copy_part_page(
8855 ppnum_t psrc,
8856 vm_offset_t src_offset,
8857 ppnum_t pdst,
8858 vm_offset_t dst_offset,
8859 vm_size_t len)
8860 {
8861 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8862 (addr64_t) (ptoa(pdst) + dst_offset),
8863 len);
8864 }
8865
8866
8867 /*
8868 * pmap_zero_page zeros the specified (machine independent) page.
8869 */
8870 void
8871 pmap_zero_page(
8872 ppnum_t pn)
8873 {
8874 assert(pn != vm_page_fictitious_addr);
8875 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8876 }
8877
8878 /*
8879 * pmap_zero_page_with_options allows to specify further operations
8880 * to perform with the zeroing.
8881 */
8882 void
8883 pmap_zero_page_with_options(
8884 ppnum_t pn,
8885 int options)
8886 {
8887 assert(pn != vm_page_fictitious_addr);
8888 bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8889 }
8890
8891 /*
8892 * pmap_zero_part_page
8893 * zeros the specified (machine independent) part of a page.
8894 */
8895 void
8896 pmap_zero_part_page(
8897 ppnum_t pn,
8898 vm_offset_t offset,
8899 vm_size_t len)
8900 {
8901 assert(pn != vm_page_fictitious_addr);
8902 assert(offset + len <= PAGE_SIZE);
8903 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8904 }
8905
8906 void
8907 pmap_map_globals(
8908 void)
8909 {
8910 pt_entry_t pte;
8911
8912 pte = pa_to_pte(kvtophys_nofail((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX |
8913 ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
8914 #if __ARM_KERNEL_PROTECT__
8915 pte |= ARM_PTE_NG;
8916 #endif /* __ARM_KERNEL_PROTECT__ */
8917 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8918 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8919 sptm_map_page(kernel_pmap->ttep, LOWGLOBAL_ALIAS, pte);
8920
8921
8922 #if KASAN
8923 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8924 #endif
8925 }
8926
/**
 * Computes the kernel virtual address of one of a CPU's copy windows.
 * Each CPU owns CPUWINDOWS_MAX consecutive PAGE_SIZE-sized slots above
 * CPUWINDOWS_BASE; the slot is selected by [cpu_num] and [index].
 *
 * @param cpu_num The CPU whose copy window is being looked up.
 * @param index Index of the window within that CPU's set. Panics if it is
 *              not below CPUWINDOWS_MAX.
 *
 * @return The virtual address of the selected copy window.
 */
vm_offset_t
pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
{
	if (__improbable(index >= CPUWINDOWS_MAX)) {
		panic("%s: invalid index %u", __func__, index);
	}
	/* Slots are laid out CPU-major: slot number = (CPUWINDOWS_MAX * cpu_num) + index. */
	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
}
8935
/**
 * Maps a physical page into one of the current CPU's copy windows.
 *
 * Scans the current CPU's copy windows for the first one whose PTE is not
 * valid, builds a non-executable (NX/PNX) kernel PTE for [pn], and installs it
 * through sptm_map_page(). Panics if every window is already in use or if the
 * SPTM call does not return SPTM_SUCCESS.
 *
 * @note Asserts that the preemption level is non-zero on entry.
 *
 * @param pn Physical page number to map.
 * @param prot Requested protection; the PTE uses AP_RWNA if VM_PROT_WRITE is
 *             set and AP_RONA otherwise.
 * @param wimg_bits Cacheability attributes, converted to PTE bits by wimg_to_pte().
 *
 * @return The index of the copy window that was used; pass it to
 *         pmap_unmap_cpu_windows_copy() to tear the mapping down.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int cpu_window_index;
	vm_offset_t cpu_copywindow_vaddr = 0;
	/* Never set to true in this function, so the stored per-window flag is always cleared below. */
	bool need_strong_sync = false;

	assert(get_preemption_level() > 0);
	cpu_num = pmap_cpu_data->cpu_number;

	/* Pick the first window on this CPU whose PTE is not currently valid. */
	for (cpu_window_index = 0; cpu_window_index < CPUWINDOWS_MAX; cpu_window_index++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, cpu_window_index);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!pte_is_compressed(*ptep, ptep));
		if (!pte_is_valid(*ptep)) {
			break;
		}
	}
	/* The loop ran to completion without finding a free window. */
	if (__improbable(cpu_window_index == CPUWINDOWS_MAX)) {
		panic("%s: out of windows", __func__);
	}

	/* Build the PTE template: valid, accessed, never executable. */
	const pmap_paddr_t paddr = ptoa(pn);
	pte = pa_to_pte(paddr) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= wimg_to_pte(wimg_bits, paddr);

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}

	/*
	 * It's expected to be safe for an interrupt handler to nest copy-window usage with the
	 * active thread on a CPU, as long as a sufficient number of copy windows are available.
	 * --If the interrupt handler executes before the active thread creates the per-CPU mapping,
	 *   or after the active thread completely removes the mapping, it may use the same mapping
	 *   but will finish execution and tear down the mapping without the thread needing to know.
	 * --If the interrupt handler executes after the active thread creates the per-CPU mapping,
	 *   it will observe the valid mapping and use a different copy window.
	 * --If the interrupt handler executes after the active thread clears the PTE in
	 *   pmap_unmap_cpu_windows_copy() but before the active thread flushes the TLB, the code
	 *   for computing cpu_window_index above will observe the PTE_INVALID_IN_FLIGHT token set
	 *   by the SPTM, and will select a different index.
	 */
	const sptm_return_t sptm_status = sptm_map_page(kernel_pmap->ttep, cpu_copywindow_vaddr, pte);
	if (__improbable(sptm_status != SPTM_SUCCESS)) {
		panic("%s: failed to map CPU copy-window VA 0x%llx with SPTM status %d",
		    __func__, (unsigned long long)cpu_copywindow_vaddr, sptm_status);
	}


	/*
	 * Clean up any pending strong TLB flush for the same window in a thread we may have
	 * interrupted.
	 */
	if (__improbable(pmap_cpu_data->copywindow_strong_sync[cpu_window_index])) {
		arm64_sync_tlb(true);
	}
	pmap_cpu_data->copywindow_strong_sync[cpu_window_index] = need_strong_sync;

	return cpu_window_index;
}
9008
9009 unsigned int
9010 pmap_map_cpu_windows_copy(
9011 ppnum_t pn,
9012 vm_prot_t prot,
9013 unsigned int wimg_bits)
9014 {
9015 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
9016 }
9017
9018 MARK_AS_PMAP_TEXT void
9019 pmap_unmap_cpu_windows_copy_internal(
9020 unsigned int index)
9021 {
9022 unsigned int cpu_num;
9023 vm_offset_t cpu_copywindow_vaddr = 0;
9024 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
9025
9026 assert(index < CPUWINDOWS_MAX);
9027 assert(get_preemption_level() > 0);
9028
9029 cpu_num = pmap_cpu_data->cpu_number;
9030
9031 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
9032 /* Issue full-system DSB to ensure prior operations on the per-CPU window
9033 * (which are likely to have been on I/O memory) are complete before
9034 * tearing down the mapping. */
9035 __builtin_arm_dsb(DSB_SY);
9036 sptm_unmap_region(kernel_pmap->ttep, cpu_copywindow_vaddr, 1, 0);
9037 if (__improbable(pmap_cpu_data->copywindow_strong_sync[index])) {
9038 arm64_sync_tlb(true);
9039 pmap_cpu_data->copywindow_strong_sync[index] = false;
9040 }
9041 }
9042
/**
 * Public wrapper around pmap_unmap_cpu_windows_copy_internal().
 *
 * @param index Index of the copy window to unmap on the current CPU, as
 *              returned by pmap_map_cpu_windows_copy().
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
	/*
	 * Call the helper as a plain statement: ISO C does not permit a return
	 * statement with an expression in a function returning void.
	 */
	pmap_unmap_cpu_windows_copy_internal(index);
}
9049
9050 /*
9051 * Indicate that a pmap is intended to be used as a nested pmap
9052 * within one or more larger address spaces. This must be set
9053 * before pmap_nest() is called with this pmap as the 'subordinate'.
9054 */
9055 MARK_AS_PMAP_TEXT void
9056 pmap_set_nested_internal(
9057 pmap_t pmap)
9058 {
9059 validate_pmap_mutable(pmap);
9060 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9061 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
9062 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
9063 __func__, pmap, pmap->type);
9064 }
9065 pmap->type = PMAP_TYPE_NESTED;
9066 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
9067 retype_params.attr_idx = (pt_attr_page_size(pt_attr) == 4096) ? SPTM_PT_GEOMETRY_4K : SPTM_PT_GEOMETRY_16K;
9068 pmap_txm_acquire_exclusive_lock(pmap);
9069 sptm_retype(pmap->ttep, XNU_USER_ROOT_TABLE, XNU_SHARED_ROOT_TABLE, retype_params);
9070 pmap_txm_release_exclusive_lock(pmap);
9071 pmap_get_pt_ops(pmap)->free_id(pmap);
9072 }
9073
9074 void
9075 pmap_set_nested(
9076 pmap_t pmap)
9077 {
9078 pmap_set_nested_internal(pmap);
9079 }
9080
9081 bool
9082 pmap_is_nested(
9083 pmap_t pmap)
9084 {
9085 return pmap->type == PMAP_TYPE_NESTED;
9086 }
9087
/*
 *	pmap_trim_range(pmap, start, end)
 *
 *	pmap  = pmap to operate on
 *	start = start of the range
 *	end   = end of the range
 *
 *	Attempts to deallocate TTEs for the given range in the nested range.
 *
 *	Panics if end < start or if [start, end) is not contained in the pmap's
 *	nested region.  The range is contracted inward to table boundaries before
 *	any TTE is touched, so partially covered tables at either edge are left
 *	alone.  Only pmaps of type PMAP_TYPE_NESTED and PMAP_TYPE_USER are
 *	supported; a valid table TTE found in any other pmap type panics.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);

	/*
	 * NOTE(review): scaling an offset mask by page_ratio only yields a
	 * contiguous low-bit mask when page_ratio is 1 -- confirm whether
	 * page_ratio > 1 is expected to reach this code.
	 */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
	/* Round the start up and the end down so only fully covered tables are visited. */
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/*
	 * Iterate over the range, trying to remove TTEs.  The (cur >= adjusted_start)
	 * term stops the loop if cur wraps around the top of the address space.
	 */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) {
		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
			if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr), false);
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_trim(pmap, cur, tte_p);
			} else {
				/*
				 * Also reached for a PMAP_TYPE_NESTED pmap whose table still
				 * has a non-zero SPTM refcount, since the first branch
				 * requires both conditions.
				 */
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		}
	}
}
9159
9160 /*
9161 * pmap_trim_internal(grand, subord, vstart, size)
9162 *
9163 * grand = pmap subord is nested in
9164 * subord = nested pmap
9165 * vstart = start of the used range in grand
9166 * size = size of the used range
9167 *
9168 * Attempts to trim the shared region page tables down to only cover the given
9169 * range in subord and grand.
9170 *
9171 * This function assumes that trimming of [subord] happens exactly once, against
9172 * a temporary [grand] pmap, and that it happens before [subord] is ever actually
9173 * nested in a real task pmap. Unlike its PPL predecessor (which can't trust its
9174 * callers), the SPTM implementation therefore does not do any refcounting to
9175 * track top-level pmaps that may have nested tables outside the trimmed range.
9176 */
9177 MARK_AS_PMAP_TEXT void
9178 pmap_trim_internal(
9179 pmap_t grand,
9180 pmap_t subord,
9181 addr64_t vstart,
9182 uint64_t size)
9183 {
9184 addr64_t vend;
9185 addr64_t adjust_offmask;
9186
9187 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9188 panic("%s: grand addr wraps around, "
9189 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9190 __func__, grand, subord, (void*)vstart, size);
9191 }
9192
9193 validate_pmap_mutable(grand);
9194 validate_pmap(subord);
9195
9196 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9197
9198 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9199 panic("%s: subord is of non-nestable type 0x%hhx, "
9200 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9201 __func__, subord->type, grand, subord, (void*)vstart, size);
9202 }
9203
9204 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9205 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9206 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9207 __func__, grand->type, grand, subord, (void*)vstart, size);
9208 }
9209
9210 if (__improbable(grand->nested_pmap != subord)) {
9211 panic("%s: grand->nested != subord, "
9212 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9213 __func__, grand, subord, (void*)vstart, size);
9214 }
9215
9216 if (__improbable((vstart < grand->nested_region_addr) ||
9217 (vend > (grand->nested_region_addr + grand->nested_region_size)))) {
9218 panic("%s: grand range not in nested region, "
9219 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9220 __func__, grand, subord, (void*)vstart, size);
9221 }
9222
9223 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
9224 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
9225 vm_map_offset_t true_end = vend;
9226
9227 os_atomic_store(&subord->nested_region_true_start, vstart & ~adjust_offmask, relaxed);
9228
9229 if (__improbable(os_add_overflow(true_end, adjust_offmask, &true_end))) {
9230 panic("%s: padded true end wraps around, "
9231 "grand=%p, subord=%p, vstart=%p, size=%#llx",
9232 __func__, grand, subord, (void*)vstart, size);
9233 }
9234
9235 os_atomic_store(&subord->nested_region_true_end, true_end & ~adjust_offmask, relaxed);
9236
9237 os_atomic_store(&grand->nested_region_true_start, subord->nested_region_true_start, relaxed);
9238 os_atomic_store(&grand->nested_region_true_end, subord->nested_region_true_end, relaxed);
9239 /* Trim grand to only cover the given range. */
9240 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9241 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9242 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9243 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9244 }
9245
9246 void
9247 pmap_trim(
9248 pmap_t grand,
9249 pmap_t subord,
9250 addr64_t vstart,
9251 uint64_t size)
9252 {
9253 pmap_trim_internal(grand, subord, vstart, size);
9254 }
9255
9256 #if HAS_APPLE_PAC
9257
/**
 * Signs a user pointer by calling into the SPTM with the given user JOP key
 * enabled.
 *
 * Interrupts are disabled for the duration of the operation and restored to
 * their previous state before returning.
 *
 * @param value The pointer to be signed.
 * @param key The ptrauth key used to sign the pointer.
 * @param discriminator The discriminator used to sign the pointer.
 * @param jop_key The user JOP key to enable while signing.
 *
 * @return The value returned by sptm_sign_user_pointer().
 */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	void *res = NULL;
	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the compiler from moving the signing call outside the JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = sptm_sign_user_pointer(value, key, discriminator, jop_key);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	ml_set_interrupts_enabled(current_intr_state);

	return res;
}
9274
/**
 * Per-CPU accumulator for batched user-pointer signing requests, used by
 * pmap_batch_sign_user_ptr().
 */
typedef struct {
	/* Destination for each pending op's signed pointer, in submission order. */
	void *locations[SPTM_BATCHED_OPS_LIMIT];
	/* Number of ops currently stashed; also the next free slot in locations[]. */
	unsigned int index;
	/* JOP key recorded from the first op of the current batch. */
	uint64_t jop_key;
} pmap_batch_sign_user_ptr_state_t;

/* One accumulator per CPU; accessed through PERCPU_GET(). */
static pmap_batch_sign_user_ptr_state_t PERCPU_DATA(percpu_pmap_batch_sign_user_ptr_state);
9282
/**
 * Accumulates a user pointer signing request, and calls into SPTM to sign
 * the accumulated requests once the batch is full or the caller asks for a
 * flush. If an SPTM call is made, this function copies the signed pointers
 * to their respective locations.
 *
 * @note This function will disable preemption when called for the first
 *       time or for the first time after a submission to SPTM. It enables
 *       preemption after a submission is made. Preemption therefore stays
 *       disabled between calls while a batch is partially filled.
 *
 * @note The caller can force the submission of accumulated ops so far by
 *       passing a NULL location pointer.
 *
 * @note The jop_key argument is expected to be consistent throughout a
 *       batch. The consistency check is an assert(), so it is presumably only
 *       enforced on assert-enabled builds.
 *
 * @param location The destination where the signed pointer will be copied
 *                 to. The caller can pass a NULL pointer to force an SPTM
 *                 submission of the accumulated signing ops so far. In
 *                 such case, the rest of the argument list is ignored.
 * @param value The pointer to be signed.
 * @param key The key used to sign the pointer.
 * @param discriminator The discriminator used to sign the pointer.
 * @param jop_key The JOP key used to sign the pointer.
 *
 * @return true if an SPTM call was made. Otherwise false.
 */
bool
pmap_batch_sign_user_ptr(void *location, void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	bool submitted_to_sptm = false;

	/* Disable preemption to access percpu data. */
	disable_preemption();

	pmap_batch_sign_user_ptr_state_t *state = PERCPU_GET(percpu_pmap_batch_sign_user_ptr_state);
	void **locations = state->locations;
	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
	sptm_user_pointer_op_t *sptm_user_pointer_ops = (sptm_user_pointer_op_t *) sptm_pcpu->sptm_user_pointer_ops;
	/* The signed results are read back from the per-CPU sptm_prev_ptes buffer after submission. */
	uintptr_t *sptm_values = (uintptr_t *) sptm_pcpu->sptm_prev_ptes;

	if (state->index != 0) {
		/* Avoid leaking preemption counts by offsetting the disable at the beginning of this function. */
		enable_preemption();

		/* Disabled preemption is still expected. */
		assert(!preemption_enabled());
	}

	assert(state->index < SPTM_BATCHED_OPS_LIMIT);

	/* Stash a pointer signing op if a copy location is supplied. */
	if (location != NULL) {
		locations[state->index] = location;
		sptm_user_pointer_ops[state->index].value = (uintptr_t)value;
		sptm_user_pointer_ops[state->index].key = key;
		sptm_user_pointer_ops[state->index].discriminator = discriminator;

		/* The first op of a batch fixes the JOP key; later ops must match it. */
		if (state->index == 0) {
			state->jop_key = jop_key;
		} else {
			assert(state->jop_key == jop_key);
		}

		state->index = state->index + 1;
	}

	/**
	 * Submit the stashed ops on this cpu to SPTM when:
	 * 1. there are SPTM_BATCHED_OPS_LIMIT ops accumulated on the cpu, or
	 * 2. the caller asks us to submit whatever we have accumulated by
	 *    passing in a NULL location argument.
	 */
	if (state->index == SPTM_BATCHED_OPS_LIMIT || location == NULL) {
		if (__probable(state->index > 0)) {
			const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);

			uint64_t saved_jop_state = ml_enable_user_jop_key(state->jop_key);
			sptm_batch_sign_user_pointer(sptm_pcpu->sptm_user_pointer_ops_pa, state->index, state->jop_key);
			ml_disable_user_jop_key(state->jop_key, saved_jop_state);

			ml_set_interrupts_enabled(current_intr_state);

			/* Deliver each signed pointer to the destination recorded when its op was stashed. */
			for (unsigned int i = 0; i < state->index; i++) {
				memcpy(locations[i], &(sptm_values[i]), sizeof(sptm_values[i]));
			}

			/* Reset the accumulator for the next batch. */
			state->index = 0;
			state->jop_key = 0;
			submitted_to_sptm = true;
		}
	}

	/**
	 * There is a slight difference between using submitted_to_sptm and
	 * state->index here. We need to take care of the case when there is
	 * no op accumulated but a NULL location passed in, where submitted_to_sptm
	 * will be false and leak a preemption count.
	 */
	if (state->index == 0) {
		assert(submitted_to_sptm || (location == NULL));
		enable_preemption();
	}

	return submitted_to_sptm;
}
9389
/**
 * Authenticates a signed user pointer by calling into the SPTM with the given
 * user JOP key enabled.
 *
 * Interrupts are disabled for the duration of the operation and restored to
 * their previous state before returning.
 *
 * @param value The signed pointer to authenticate.
 * @param key The ptrauth key the pointer was signed with.
 * @param discriminator The discriminator the pointer was signed with.
 * @param jop_key The user JOP key to enable while authenticating.
 *
 * @return The value returned by sptm_auth_user_pointer(), or the result of
 *         ml_poison_ptr(value, key) if the SPTM reports SPTM_AUTH_FAILURE.
 */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	void *res = NULL;
	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the compiler from moving the authentication call outside the JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = sptm_auth_user_pointer(value, key, discriminator, jop_key);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	/* On authentication failure, hand back a poisoned pointer instead of the SPTM's failure token. */
	if (res == SPTM_AUTH_FAILURE) {
		res = ml_poison_ptr(value, key);
	}

	ml_set_interrupts_enabled(current_intr_state);

	return res;
}
9410 #endif /* HAS_APPLE_PAC */
9411
/**
 * Establishes the pmap associated with a shared region as the nested pmap
 * for a top-level user pmap.
 *
 * Takes a reference on [subord]. The first time a given [subord] is used
 * (its nested_region_size is still 0), this also configures the shared region
 * in the SPTM, expands [subord]'s page tables to the leaf level across the
 * whole region, and allocates the unnested-table bitmap.
 *
 * Panics if vstart + size wraps, if [grand] and [subord] have different page
 * table attributes, if [vstart] or [size] is not aligned to the leaf table
 * size, if [subord] is not PMAP_TYPE_NESTED, if [grand] is not PMAP_TYPE_USER,
 * or if [grand] already has a nested pmap.
 *
 * @param grand The top-level user pmap
 * @param subord The pmap to be set as [grand]'s nested pmap
 * @param vstart The base VA of the region to be nested.
 * @param size The size (in bytes) of the region to be nested.
 */
void
pmap_set_shared_region(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	addr64_t vend;

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);

	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}

	validate_pmap_mutable(grand);
	validate_pmap(subord);
	/* grand will hold a pointer to subord, so take a reference on it. */
	os_ref_retain_raw(&subord->ref_count, &pmap_refgrp);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

	if (__improbable(((size | vstart) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
		    __func__, grand, vstart, size);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	if (subord->nested_region_size == 0) {
		/**
		 * Since subord->nested_region_size is 0, this is the first time subord is being
		 * associated with a top-level pmap.  We therefore need to take a few extra steps to
		 * ensure the shared region is properly configured.  This initial setup step is expected
		 * to be issued by the VM layer against a temporary grand pmap before any other pmap
		 * is allowed to associate with subord, so synchronization is not needed here to prevent
		 * concurrent initialization.
		 */
		sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift);

		/**
		 * Since this is the first time subord is being associated with a top-level pmap, ensure
		 * its nested region is fully expanded to L3 so that all relevant L3 tables can later be
		 * inserted into top-level pmaps via pmap_nest().  Note that pmap_remove() will never
		 * dynamically free L3 tables from nested pmaps.  However, some of these tables may be
		 * freed by a later call to pmap_trim().
		 */
		vm_map_offset_t vaddr = vstart;
		while (vaddr < vend) {
			const tt_entry_t *const stte_p = pmap_tte(subord, vaddr);
			if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
				/* The expansion result is only checked by the assertion below. */
				__assert_only kern_return_t kr;
				kr = pmap_expand(subord, vaddr, 0, pt_attr_leaf_level(pt_attr));
				assert3u(kr, ==, KERN_SUCCESS);
			}
			vaddr += pt_attr_twig_size(pt_attr);
		}

		/* Shifting by (twig_shift - 1) yields two bitmap bits per twig-sized chunk of the region. */
		const uint64_t nested_region_unnested_table_bits = (size >> (pt_attr_twig_shift(pt_attr) - 1));
		if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) {
			panic("%s: bitmap allocation size %llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bits,
			    grand, subord, vstart, size);
		}

		subord->nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits);
		subord->nested_region_addr = vstart;
		subord->nested_region_size = (mach_vm_offset_t)size;
	}

	/* Only succeeds if grand does not have a nested pmap yet. */
	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t)size;
		/* The requested region must match the one subord was first configured with. */
		assert3u(grand->nested_region_addr, ==, subord->nested_region_addr);
		assert3u(grand->nested_region_size, ==, subord->nested_region_size);
		pmap_txm_acquire_exclusive_lock(grand);
		pmap_txm_acquire_shared_lock(subord);
		sptm_set_shared_region(grand->ttep, subord->ttep);
		pmap_txm_release_shared_lock(subord);
		pmap_txm_release_exclusive_lock(grand);
	} else {
		panic("%s: pmap %p already has a nested pmap %p", __func__, grand, grand->nested_pmap);
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
}
9518
/**
 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
 * This function operates in 2 main phases:
 * 1. Expands grand to ensure the required twig-level page table pages for
 *    the mapping range are present in grand.
 * 2. Invokes sptm_nest_region() to copy the relevant TTEs from subord to grand.
 *
 * The requested range is clamped to subord's true (trimmed) bounds before
 * either phase runs.
 *
 * @note This function requires that pmap_set_shared_region() has already been
 *       called for the [grand, subord] pair; it panics if [subord] is not
 *       [grand]'s nested pmap.
 *
 * @note The VA region defined by vstart and size must lie entirely within the
 *       VA region established by the previous call to pmap_set_shared_region().
 *
 * @param grand pmap to insert the TTEs into. Must be a user pmap.
 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
 * @param vstart twig-aligned virtual address for the beginning of the nesting range
 * @param size twig-aligned size of the nesting range
 *
 * @return KERN_SUCCESS on success; otherwise the error returned by
 *         pmap_expand() (e.g. KERN_RESOURCE_SHORTAGE on allocation failure),
 *         in which case no TTEs have been nested.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr;
	tt_entry_t *gtte_p;

	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}

	validate_pmap_mutable(grand);
	validate_pmap(subord);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	if (__improbable(((size | vstart) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
		    __func__, grand, vstart, size);
	}

	if (__improbable(subord != grand->nested_pmap)) {
		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}

	/* Clamp the requested range to subord's true bounds. */
	addr64_t true_start = vstart;
	if (true_start < subord->nested_region_true_start) {
		true_start = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}

	/* Ensure grand is expanded to L2 so that sptm_nest_region() can copy L3 entries from subord. */
	vaddr = (vm_map_offset_t) true_start;

	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			kr = pmap_expand(grand, vaddr, 0, pt_attr_twig_level(pt_attr));

			/* Bail out before nesting anything if the expansion failed. */
			if (kr != KERN_SUCCESS) {
				goto done;
			}
		}

		vaddr += pt_attr_twig_size(pt_attr);
	}

	vaddr = (vm_map_offset_t) true_start;

	while (vaddr < true_end) {
		/*
		 * The SPTM requires the run of TTE updates to all reside within the same L2 page, so the region
		 * we supply to the SPTM can't span multiple L1 TTEs.
		 */
		vm_map_offset_t vlim = ((vaddr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
		if (vlim > true_end) {
			vlim = true_end;
		}
		/* The region length is passed to the SPTM as a page count. */
		sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift);
		vaddr = vlim;
	}

done:
	return kr;
}
9616
9617 kern_return_t
9618 pmap_nest(
9619 pmap_t grand,
9620 pmap_t subord,
9621 addr64_t vstart,
9622 uint64_t size)
9623 {
9624 kern_return_t kr = KERN_SUCCESS;
9625
9626 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9627 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9628 VM_KERNEL_ADDRHIDE(vstart));
9629
9630 pmap_verify_preemptible();
9631 kr = pmap_nest_internal(grand, subord, vstart, size);
9632
9633 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9634
9635 return kr;
9636 }
9637
9638 /*
9639 * kern_return_t pmap_unnest(grand, vaddr)
9640 *
9641 * grand = the pmap that will have the virtual range unnested
9642 * vaddr = start of range in pmap to be unnested
9643 * size = size of range in pmap to be unnested
9644 *
9645 */
9646
9647 kern_return_t
9648 pmap_unnest(
9649 pmap_t grand,
9650 addr64_t vaddr,
9651 uint64_t size)
9652 {
9653 return pmap_unnest_options(grand, vaddr, size, 0);
9654 }
9655
9656 /**
9657 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9658 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9659 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9660 * still have the region nested. The mappings in 'grand' will be left empty
9661 * with the assumption that they will be demand-filled by subsequent access faults.
9662 *
9663 * This function operates in 2 main phases:
9664 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9665 * them non-global.
9666 * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand.
9667 *
9668 * @param grand pmap from which to unnest mappings
9669 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9670 * @param size twig-aligned size of the nested range
9671 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9672 * grand is being torn down and step 1) above is not needed.
9673 */
9674 MARK_AS_PMAP_TEXT void
9675 pmap_unnest_options_internal(
9676 pmap_t grand,
9677 addr64_t vaddr,
9678 uint64_t size,
9679 unsigned int option)
9680 {
9681 vm_map_offset_t start;
9682 vm_map_offset_t addr;
9683 unsigned int current_index;
9684 unsigned int start_index;
9685 unsigned int max_index;
9686
9687 addr64_t vend;
9688 addr64_t true_end;
9689 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9690 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9691 }
9692
9693 validate_pmap_mutable(grand);
9694
9695 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9696
9697 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9698 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9699 (unsigned long long)vaddr, (unsigned long long)size);
9700 }
9701
9702 struct pmap * const subord = grand->nested_pmap;
9703 if (__improbable(subord == NULL)) {
9704 panic("%s: %p has no nested pmap", __func__, grand);
9705 }
9706
9707 true_end = vend;
9708 if (true_end > subord->nested_region_true_end) {
9709 true_end = subord->nested_region_true_end;
9710 }
9711
9712 if ((option & PMAP_UNNEST_CLEAN) == 0) {
9713 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9714 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9715 }
9716
9717 start = vaddr;
9718 if (start < subord->nested_region_true_start) {
9719 start = subord->nested_region_true_start;
9720 }
9721 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9722 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9723
9724 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9725 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9726
9727 bool unnested = bitmap_test(subord->nested_region_unnested_table_bitmap, UNNEST_BIT(current_index));
9728 os_atomic_thread_fence(acquire);
9729 if (!unnested) {
9730 atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9731 UNNEST_IN_PROGRESS_BIT(current_index), memory_order_relaxed);
9732 /*
9733 * Issue a store-load barrier to ensure the UNNEST_IN_PROGRESS bit is visible to any pmap_enter()
9734 * operation that enters the epoch after this point.
9735 */
9736 os_atomic_thread_fence(seq_cst);
9737 pmap_epoch_prepare_drain();
9738 pmap_epoch_drain();
9739
9740 unsigned int num_mappings = 0;
9741 disable_preemption();
9742 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9743 /*
9744 * We've marked the 'twig' region as being unnested. Every mapping entered within
9745 * the nested pmap in this region will now be marked non-global.
9746 */
9747 while (addr < vlim) {
9748 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9749
9750 sptm_pcpu->sptm_templates[num_mappings] = ARM_PTE_NG;
9751 ++num_mappings;
9752
9753 if (num_mappings == SPTM_MAPPING_LIMIT) {
9754 pmap_epoch_enter();
9755 /**
9756 * It's technically possible (though highly unlikely) for subord to
9757 * be concurrently trimmed, so re-check the bounds within the epoch to
9758 * avoid potentially issuing an SPTM operation against a deleted leaf
9759 * page table. This assumes the following:
9760 * 1) The pmap_trim() code path always issues a barrier and an epoch
9761 * drain in between updating subord's true bounds and actually
9762 * trimming subord, effectively purging any operation here which
9763 * may be using stale bounds.
9764 * 2) The true bounds, if set, will always be twig-aligned, thus
9765 * the region we operate on here can never span the starting or
9766 * ending bounds.
9767 */
9768 if ((start >= subord->nested_region_true_start) &&
9769 (start < subord->nested_region_true_end)) {
9770 sptm_update_region(subord->ttep, start, num_mappings,
9771 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9772 }
9773 pmap_epoch_exit();
9774 enable_preemption();
9775 num_mappings = 0;
9776 start = addr;
9777 disable_preemption();
9778 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9779 }
9780 }
9781 /**
9782 * The SPTM does not allow region updates to span multiple leaf page tables, so request
9783 * any remaining updates up to vlim before moving to the next page table page.
9784 */
9785 if (num_mappings != 0) {
9786 pmap_epoch_enter();
9787 if ((start >= subord->nested_region_true_start) &&
9788 (start < subord->nested_region_true_end)) {
9789 sptm_update_region(subord->ttep, start, num_mappings,
9790 sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9791 }
9792 pmap_epoch_exit();
9793 }
9794 enable_preemption();
9795 atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9796 UNNEST_BIT(current_index), memory_order_release);
9797 }
9798 addr = start = vlim;
9799 }
9800 }
9801
9802 /*
9803 * invalidate all pdes for segment at vaddr in pmap grand
9804 */
9805 addr = vaddr;
9806
9807 if (addr < subord->nested_region_true_start) {
9808 addr = subord->nested_region_true_start;
9809 }
9810
9811 if (true_end > subord->nested_region_true_end) {
9812 true_end = subord->nested_region_true_end;
9813 }
9814
9815 while (addr < true_end) {
9816 vm_map_offset_t vlim = ((addr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9817 if (vlim > true_end) {
9818 vlim = true_end;
9819 }
9820 sptm_unnest_region(grand->ttep, subord->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift);
9821 addr = vlim;
9822 }
9823 }
9824
9825 kern_return_t
9826 pmap_unnest_options(
9827 pmap_t grand,
9828 addr64_t vaddr,
9829 uint64_t size,
9830 unsigned int option)
9831 {
9832 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9833 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9834
9835 pmap_verify_preemptible();
9836 pmap_unnest_options_internal(grand, vaddr, size, option);
9837
9838 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9839
9840 return KERN_SUCCESS;
9841 }
9842
9843 boolean_t
9844 pmap_adjust_unnest_parameters(
9845 __unused pmap_t p,
9846 __unused vm_map_offset_t *s,
9847 __unused vm_map_offset_t *e)
9848 {
9849 return TRUE; /* to get to log_unnest_badness()... */
9850 }
9851
9852 /**
9853 * Perform any necessary pre-nesting of the parent's shared region at fork()
9854 * time.
9855 *
9856 * @note This should only be called from vm_map_fork().
9857 *
9858 * @param old_pmap The pmap of the parent task.
9859 * @param new_pmap The pmap of the child task.
9860 *
9861 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
9862 * KERN_INVALID_ARGUMENT if the arguments were not valid.
9863 */
9864 kern_return_t
9865 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
9866 {
9867 if (old_pmap == NULL || new_pmap == NULL) {
9868 return KERN_INVALID_ARGUMENT;
9869 }
9870 if (old_pmap->nested_pmap == NULL) {
9871 return KERN_SUCCESS;
9872 }
9873 pmap_set_shared_region(new_pmap,
9874 old_pmap->nested_pmap,
9875 old_pmap->nested_region_addr,
9876 old_pmap->nested_region_size);
9877 return KERN_SUCCESS;
9878 }
9879
9880 /*
9881 * disable no-execute capability on
9882 * the specified pmap
9883 */
9884 #if DEVELOPMENT || DEBUG
9885 void
9886 pmap_disable_NX(
9887 pmap_t pmap)
9888 {
9889 pmap->nx_enabled = FALSE;
9890 }
9891 #else
9892 void
9893 pmap_disable_NX(
9894 __unused pmap_t pmap)
9895 {
9896 }
9897 #endif
9898
9899 /*
9900 * flush a range of hardware TLB entries.
9901 * NOTE: assumes the smallest TLB entry in use will be for
9902 * an ARM small page (4K).
9903 */
9904
/*
 * Page-count thresholds consulted by flush_mmu_tlb_region_asid_async():
 *
 * - More than ARM64_FULL_TLB_FLUSH_THRESHOLD pages: the per-address flush is
 *   abandoned in favor of flushing the whole ASID (or the whole TLB).
 * - More than ARM64_RANGE_TLB_FLUSH_THRESHOLD pages (only defined when range
 *   TLBI is available): a range-based TLB invalidate is issued instead of
 *   per-page invalidates.
 */
#if __ARM_RANGE_TLBI__
#define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
#else
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
#endif // __ARM_RANGE_TLBI__
9911
9912 static void
9913 flush_mmu_tlb_region_asid_async(
9914 vm_offset_t va,
9915 size_t length,
9916 pmap_t pmap,
9917 bool last_level_only __unused)
9918 {
9919 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
9920 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
9921 ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
9922 const uint16_t asid = PMAP_HWASID(pmap);
9923
9924 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
9925 boolean_t flush_all = FALSE;
9926
9927 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
9928 flush_all = TRUE;
9929 }
9930 if (flush_all) {
9931 flush_mmu_tlb_async();
9932 } else {
9933 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, false);
9934 }
9935 return;
9936 }
9937 #if __ARM_RANGE_TLBI__
9938 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
9939 va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
9940 if (pmap->type == PMAP_TYPE_NESTED) {
9941 flush_mmu_tlb_allrange_async(va, last_level_only, false);
9942 } else {
9943 flush_mmu_tlb_range_async(va, last_level_only, false);
9944 }
9945 return;
9946 }
9947 #endif
9948 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
9949 va = tlbi_asid(asid) | tlbi_addr(va);
9950
9951 if (pmap->type == PMAP_TYPE_NESTED) {
9952 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, false);
9953 } else {
9954 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, false);
9955 }
9956 }
9957
9958 void
9959 flush_mmu_tlb_region(
9960 vm_offset_t va,
9961 unsigned length)
9962 {
9963 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
9964 sync_tlb_flush();
9965 }
9966
9967 unsigned int
9968 pmap_cache_attributes(
9969 ppnum_t pn)
9970 {
9971 pmap_paddr_t paddr;
9972 unsigned int pai;
9973 unsigned int result;
9974 pp_attr_t pp_attr_current;
9975
9976 paddr = ptoa(pn);
9977
9978 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
9979
9980 if (!pa_valid(paddr)) {
9981 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
9982 return (io_rgn == NULL || io_rgn->signature == 'SKIO') ? VM_WIMG_IO : io_rgn->wimg;
9983 }
9984
9985 result = VM_WIMG_DEFAULT;
9986
9987 pai = pa_index(paddr);
9988
9989 pp_attr_current = pp_attr_table[pai];
9990 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9991 result = pp_attr_current & PP_ATTR_WIMG_MASK;
9992 }
9993 return result;
9994 }
9995
9996 MARK_AS_PMAP_TEXT static void
9997 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
9998 {
9999 if ((wimg_bits_prev != wimg_bits_new)
10000 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10001 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10002 && (wimg_bits_new != VM_WIMG_COPYBACK))
10003 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10004 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10005 pmap_sync_page_attributes_phys(pn);
10006 }
10007
10008 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10009 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10010 }
10011 }
10012
10013 MARK_AS_PMAP_TEXT __unused void
10014 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10015 {
10016 pmap_paddr_t paddr = ptoa(pn);
10017
10018 if (__improbable(!pa_valid(paddr))) {
10019 panic("%s called on non-managed page 0x%08x", __func__, pn);
10020 }
10021
10022 pmap_set_cache_attributes_internal(pn, new_cacheattr, false);
10023
10024 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10025 }
10026
10027 static inline bool
10028 cacheattr_supports_compressor(unsigned int cacheattr)
10029 {
10030 switch (cacheattr) {
10031 case VM_WIMG_DEFAULT:
10032 return true;
10033 #if HAS_MTE
10034 case VM_WIMG_MTE:
10035 return true;
10036 #endif /* HAS_MTE */
10037 default:
10038 return false;
10039 }
10040 }
10041
10042 void *
10043 pmap_map_compressor_page(ppnum_t pn)
10044 {
10045 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10046 if (!cacheattr_supports_compressor(cacheattr)) {
10047 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10048 }
10049
10050 return (void*)phystokv(ptoa(pn));
10051 }
10052
10053 void
10054 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10055 {
10056 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10057 if (!cacheattr_supports_compressor(cacheattr)) {
10058 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10059 }
10060 }
10061
10062 /**
10063 * Flushes TLB entries associated with the page specified by paddr, but do not
10064 * issue barriers yet.
10065 *
10066 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10067 */
10068 static void
10069 pmap_flush_tlb_for_paddr_async(pmap_paddr_t paddr)
10070 {
10071 /* Flush the physical aperture mappings. */
10072 const vm_offset_t kva = phystokv(paddr);
10073 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10074
10075 /* Flush the mappings tracked in the ptes. */
10076 const unsigned int pai = pa_index(paddr);
10077 locked_pvh_t locked_pvh = pvh_lock(pai);
10078
10079 pt_entry_t *pte_p = PT_ENTRY_NULL;
10080 pv_entry_t *pve_p = PV_ENTRY_NULL;
10081
10082 if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
10083 pte_p = pvh_ptep(locked_pvh.pvh);
10084 } else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
10085 pve_p = pvh_pve_list(locked_pvh.pvh);
10086 pte_p = PT_ENTRY_NULL;
10087 }
10088
10089 unsigned int nptes = 0;
10090 int pve_ptep_idx = 0;
10091 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10092 if (pve_p != PV_ENTRY_NULL) {
10093 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10094 if (pte_p == PT_ENTRY_NULL) {
10095 goto flush_tlb_skip_pte;
10096 }
10097 }
10098
10099 if (__improbable(nptes == SPTM_MAPPING_LIMIT)) {
10100 pvh_lock_enter_sleep_mode(&locked_pvh);
10101 }
10102 ++nptes;
10103 #ifdef PVH_FLAG_IOMMU
10104 if (pvh_ptep_is_iommu(pte_p)) {
10105 goto flush_tlb_skip_pte;
10106 }
10107 #endif /* PVH_FLAG_IOMMU */
10108 const pmap_t pmap = ptep_get_pmap(pte_p);
10109 const vm_map_address_t va = ptep_get_va(pte_p);
10110
10111 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10112
10113 flush_tlb_skip_pte:
10114 pte_p = PT_ENTRY_NULL;
10115 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10116 pve_ptep_idx = 0;
10117 pve_p = pve_next(pve_p);
10118 }
10119 }
10120 pvh_unlock(&locked_pvh);
10121 }
10122
10123 /**
10124 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10125 *
10126 * @param pai The Physical Address Index of the entry.
10127 * @param cacheattr The new cache attribute.
10128 */
10129 MARK_AS_PMAP_TEXT static void
10130 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10131 {
10132 pvh_assert_locked(pai);
10133
10134 pp_attr_t pp_attr_current, pp_attr_template;
10135 do {
10136 pp_attr_current = pp_attr_table[pai];
10137 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10138
10139 /**
10140 * WIMG bits should only be updated under the PVH lock, but we should do
10141 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10142 */
10143 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10144 }
10145
/**
 * Progress record for collecting mappings into SPTM ops tables during batched
 * cache attribute updates. It lets the collection routine be suspended and
 * resumed across calls.
 *
 * @note The position in the per-CPU ops table must be tracked because the
 *       collection routine can return with a partially filled table once it
 *       exhausts the PV list of a page. The remaining slots are then used for
 *       the mappings of the next page.
 *
 * @note The position in the PV list must be tracked as well, because the
 *       collection routine can also return when the ops table fills up in
 *       the middle of a PV list. The remaining PV list items are then
 *       handled by the next batch operation, in a fresh ops table.
 */
typedef struct {
	/* Index of the next free slot in the SPTM ops table. */
	unsigned int sptm_ops_index;

	/**
	 * The last physical address collected into the previous full ops table
	 * (and in turn, the previous SPTM call). If a new table starts with this
	 * same page, the SPTM call for it can skip updating the PAPT mapping,
	 * since the previous call already handled it.
	 */
	pmap_paddr_t last_table_last_papt_pa;

	/**
	 * Position in the PV list of the page being collected.
	 *
	 * - ptep non-null: the page has exactly one mapping and ptep points to it.
	 * - pvep non-null: the page has more than one mapping, tracked by the PV
	 *   list; pvep is the current PV entry and pve_ptep_idx the current slot
	 *   within it.
	 * - both null: collection is starting on a new page; the collection
	 *   routine initializes them to one of the two states above.
	 *
	 * Both being non-null at the same time is undefined.
	 */
	pt_entry_t *ptep;
	pv_entry_t *pvep;
	unsigned int pve_ptep_idx;
} pmap_sptm_update_cache_attr_ops_collect_state_t;
10189
10190 /**
10191 * Reports whether there is any pending ops in an sptm cache attr ops table.
10192 *
10193 * @param state A pmap_sptm_update_cache_attr_ops_collect_state_t structure.
10194 *
10195 * @return True if there's any outstanding cache attr op.
10196 * False otherwise.
10197 */
10198 static inline bool
10199 pmap_is_sptm_update_cache_attr_ops_pending(pmap_sptm_update_cache_attr_ops_collect_state_t state)
10200 {
10201 return state.sptm_ops_index > 0;
10202 }
10203
/**
 * Bit flags encoding the collection status returned by
 * pmap_sptm_update_cache_attr_ops_collect(), telling the caller what kind of
 * attention is needed. The two RETURN flags may be combined in a single
 * return value.
 */
typedef enum {
	/* No flag set. */
	OPS_COLLECT_NOTHING = 0x0,

	/* The ops table is full, and the caller should commit the table to SPTM. */
	OPS_COLLECT_RETURN_FULL_TABLE = 0x1,

	/**
	 * The page has had all of its mappings collected, and the caller should
	 * pass in a new page next time.
	 */
	OPS_COLLECT_RETURN_COMPLETED_PAGE = 0x2,
} pmap_sptm_update_cache_attr_ops_collect_return_t;
10220
/**
 * Collects mappings of a physical page into an SPTM ops table for cache attribute updates.
 *
 * Each call first writes a PAPT header entry for pa into the next free slot,
 * then appends one entry per (non-IOMMU) mapping of the page until either the
 * table is full or the page's PV list is exhausted.
 *
 * @note This routine returns either when the ops table is full or when the page represented
 *       by pa has no more mappings to collect. The caller should call this routine again
 *       with a fresh ops table, or a new page, or both, depending on the return code.
 *
 * @note The PVH lock needs to be held for pa.
 *
 * @param state Tracks the state of PV list traversal and SPTM ops table filling. It is used
 *              by this routine to save the progress of the collection. Must not be NULL.
 * @param sptm_ops Pointer to the SPTM ops table. Must not be NULL.
 * @param pa The physical address whose mappings are to be collected.
 * @param attributes The new cache attributes.
 *
 * @return A pmap_sptm_update_cache_attr_ops_collect_return_t that encodes what the caller
 *         should do before calling this routine again. See the inline comments around
 *         pmap_sptm_update_cache_attr_ops_collect_return_t for details.
 */
static pmap_sptm_update_cache_attr_ops_collect_return_t
pmap_sptm_update_cache_attr_ops_collect(
	pmap_sptm_update_cache_attr_ops_collect_state_t *state,
	sptm_update_disjoint_multipage_op_t *sptm_ops,
	pmap_paddr_t pa,
	unsigned int attributes)
{
	if (state == NULL || sptm_ops == NULL) {
		panic("%s: unexpected null arguments - state: %p, sptm_ops: %p", __func__, state, sptm_ops);
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_START, pa, attributes, state->sptm_ops_index);

	/* Copy the state into local variables; it is written back before returning. */
	unsigned int sptm_ops_index = state->sptm_ops_index;
	pmap_paddr_t last_table_last_papt_pa = state->last_table_last_papt_pa;
	pv_entry_t *pvep = state->pvep;
	pt_entry_t *ptep = state->ptep;
	unsigned int pve_ptep_idx = state->pve_ptep_idx;

	unsigned int pai = pa_index(pa);

	/* We should have at least one free slot in the ops table (for the PAPT header). */
	assert(sptm_ops_index < SPTM_MAPPING_LIMIT);

	/* The PVH lock for pa has to be held. */
	pvh_assert_locked(pai);

	/* If pvep and ptep are both null in the state, it's a new page. Initialize the state. */
	if (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL) {
		const uintptr_t pvh = pai_to_pvh(pai);
		if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
			ptep = PT_ENTRY_NULL;
			pvep = pvh_pve_list(pvh);
			pve_ptep_idx = 0;
		} else if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
			ptep = pvh_ptep(pvh);
			pvep = PV_ENTRY_NULL;
			pve_ptep_idx = 0;
		}
		/* For any other PVH type both stay null, i.e. there is nothing to walk. */
	}

	/**
	 * The first entry filled in is always the PAPT header entry:
	 *
	 * 1) In the case of a fresh ops table, the first entry has to be a PAPT header.
	 * 2) In the case of a fresh page, we need to insert a new PAPT header to request
	 *    SPTM to operate on a new page.
	 *
	 * Remember the index of the PAPT header here so that we can update the number
	 * of mappings field later when we finish collecting.
	 */
	const unsigned int papt_sptm_ops_index = sptm_ops_index;
	unsigned int num_mappings = 0;

	/* Assemble the PTE template for the PAPT mapping. */
	const vm_address_t kva = phystokv(pa);
	const pt_entry_t *papt_ptep = pmap_pte(kernel_pmap, kva);

	/* Start from the current PTE and replace only its attribute index and shareability fields. */
	pt_entry_t template = os_atomic_load(papt_ptep, relaxed);
	template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
	template |= wimg_to_pte(attributes, pa);

	/* Fill in the PAPT header entry. */
	sptm_ops[papt_sptm_ops_index].per_paddr_header.paddr = pa;
	sptm_ops[papt_sptm_ops_index].per_paddr_header.papt_pte_template = template;
	sptm_ops[papt_sptm_ops_index].per_paddr_header.options = SPTM_UPDATE_SH | SPTM_UPDATE_MAIR | SPTM_UPDATE_DEFER_TLBI;

	if ((papt_sptm_ops_index == 0) && (pa == last_table_last_papt_pa)) {
		/**
		 * If the previous SPTM call was made with an ops table that already included
		 * updating the PA of the page that this table starts with, then we can assume
		 * that call already updated the PAPT and we can safely skip it in this
		 * upcoming one.
		 */
		sptm_ops[0].per_paddr_header.options |= SPTM_UPDATE_SKIP_PAPT;
	}

	sptm_ops_index++;

	/**
	 * Main loop for collecting the mappings into the ops table. It terminates either
	 * when the ops table is full or the PV list is exhausted.
	 */
	while ((sptm_ops_index < SPTM_MAPPING_LIMIT) && (pvep != PV_ENTRY_NULL || ptep != PT_ENTRY_NULL)) {
		/**
		 * Update ptep. There are really two cases here:
		 *
		 * 1) pvep is PV_ENTRY_NULL. In this case, ptep holds the pointer to
		 *    the only mapping to the page.
		 * 2) pvep is not PV_ENTRY_NULL. In such case, ptep is updated according to
		 *    pvep and pve_ptep_idx.
		 */
		if (pvep != PV_ENTRY_NULL) {
			ptep = pve_get_ptep(pvep, pve_ptep_idx);

			/* This slot of the pve is empty, so skip to the next one. */
			if (ptep == PT_ENTRY_NULL) {
				goto sucaoc_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* Skip IOMMU pteps. */
		if (pvh_ptep_is_iommu(ptep)) {
			goto sucaoc_skip_pte;
		}
#endif
		/* Assemble the PTE template for the mapping. */
		const vm_address_t va = ptep_get_va(ptep);
		const pmap_t pmap = ptep_get_pmap(ptep);

		template = os_atomic_load(ptep, relaxed);
		template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		template |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, pa);

		/* Fill into the ops table. */
		sptm_ops[sptm_ops_index].disjoint_op.root_pt_paddr = pmap->ttep;
		sptm_ops[sptm_ops_index].disjoint_op.vaddr = va;
		sptm_ops[sptm_ops_index].disjoint_op.pte_template = template;

		/* Move the sptm ops table cursor. */
		sptm_ops_index++;

		/* Increment the mappings counter. */
		num_mappings++;

sucaoc_skip_pte:
		/**
		 * Reset ptep to PT_ENTRY_NULL to keep the loop precondition true: either
		 * ptep or pvep is non-null (not both, not neither).
		 */
		ptep = PT_ENTRY_NULL;

		/* Advance to the next pvep if we have exhausted the pteps in this one. */
		if ((pvep != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pvep = pve_next(pvep);
		}
	}

	/* Update the PAPT header with the number of mappings collected in this call. */
	sptm_ops[papt_sptm_ops_index].per_paddr_header.num_mappings = num_mappings;

	const bool full_table = (sptm_ops_index >= SPTM_MAPPING_LIMIT);
	const bool collection_done_for_page = (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL);

	/**
	 * The ops table is full, so the caller should now invoke the SPTM before calling
	 * into this function again.
	 */
	if (full_table) {
		/* Update last_table_last_papt_pa to be the pa collected in this call. */
		last_table_last_papt_pa = pa;

		/* Reset sptm_ops_index so that the next call starts a fresh table. */
		sptm_ops_index = 0;
	}

	/* Copy the updated collection state back to the parameter structure. */
	state->sptm_ops_index = sptm_ops_index;
	state->last_table_last_papt_pa = last_table_last_papt_pa;
	state->pvep = pvep;
	state->ptep = ptep;
	state->pve_ptep_idx = pve_ptep_idx;

	/* Assemble the return value. */
	pmap_sptm_update_cache_attr_ops_collect_return_t retval = OPS_COLLECT_NOTHING;

	if (full_table) {
		retval |= OPS_COLLECT_RETURN_FULL_TABLE;
	}

	if (collection_done_for_page) {
		retval |= OPS_COLLECT_RETURN_COMPLETED_PAGE;
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_END, pa, attributes, sptm_ops_index);

	return retval;
}
10421
/**
 * pmap_sptm_update_cache_attr_ops_collect() writes a PAPT header entry before
 * any mapping entry, so an ops table must have room for at least one PAPT
 * header plus one mapping.
 */
static_assert(SPTM_MAPPING_LIMIT >= 2);
10424
10425 /**
10426 * Returns if a cache attribute is allowed (on managed pages).
10427 *
10428 * @param attributes A 32-bit value whose VM_WIMG_MASK bits represent the
10429 * cache attribute.
10430 *
10431 * @return True if the cache attribute is allowed on managed pages.
10432 * False otherwise.
10433 */
10434 static bool
10435 pmap_is_cache_attribute_allowed(unsigned int attributes)
10436 {
10437 if (pmap_panic_dev_wimg_on_managed) {
10438 switch (attributes & VM_WIMG_MASK) {
10439 /* supported on DRAM, but slow, so we disallow */
10440 case VM_WIMG_IO: // nGnRnE
10441 case VM_WIMG_POSTED: // nGnRE
10442
10443 /* unsupported on DRAM */
10444 case VM_WIMG_POSTED_REORDERED: // nGRE
10445 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10446 return false;
10447
10448 default:
10449 return true;
10450 }
10451 }
10452
10453 return true;
10454 }
10455
10456 /**
10457 * Batch updates the cache attributes of a list of pages in three passes.
10458 *
10459 * In pass one, the pp_attr_table and the pte are updated (by SPTM) for the pages in the list.
10460 * In pass two, TLB entries are flushed for each page in the list if necessary.
10461 * In pass three, caches are cleaned for each page in the list if necessary.
10462 *
10463 * @param page_list List of pages to be updated.
10464 * @param cacheattr The new cache attributes.
10465 * @param update_attr_table Whether the pp_attr_table should be updated. This is useful for compressor
10466 * pages where it's desired to keep the old WIMG bits.
10467 */
10468 void
10469 pmap_batch_set_cache_attributes_internal(
10470 const unified_page_list_t *page_list,
10471 unsigned int cacheattr,
10472 bool update_attr_table)
10473 {
10474 bool tlb_flush_pass_needed = false;
10475 bool rt_cache_flush_pass_needed = false;
10476 bool preemption_disabled = false;
10477
10478 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE1);
10479
10480 pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
10481 sptm_update_disjoint_multipage_op_t *sptm_ops = NULL;
10482
10483 pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
10484
10485 unified_page_list_iterator_t iter;
10486
10487 for (unified_page_list_iterator_init(page_list, &iter);
10488 !unified_page_list_iterator_end(&iter);
10489 unified_page_list_iterator_next(&iter)) {
10490 bool is_fictitious = false;
10491 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10492 const pmap_paddr_t paddr = ptoa(pn);
10493
10494 /**
10495 * Skip if the page is not managed.
10496 *
10497 * We don't panic here because sometimes the user just blindly pass in
10498 * pages that are not managed. We need to handle that gracefully.
10499 */
10500 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10501 continue;
10502 }
10503
10504 const unsigned int pai = pa_index(paddr);
10505 locked_pvh_t locked_pvh = {.pvh = 0};
10506
10507 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10508 /**
10509 * If we're partway through processing a multi-page batched call,
10510 * preemption will already be disabled so we can't simply call
10511 * pvh_lock() which may block. Instead, we first try to acquire
10512 * the lock without waiting, which in most cases should succeed.
10513 * If it fails, we submit the pending batched operations to re-
10514 * enable preemption and then acquire the lock normally.
10515 */
10516 locked_pvh = pvh_try_lock(pai);
10517 if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
10518 assert(preemption_disabled);
10519 const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10520 pmap_epoch_exit();
10521 enable_preemption();
10522 preemption_disabled = false;
10523 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10524 tlb_flush_pass_needed = true;
10525 }
10526 state.sptm_ops_index = 0;
10527 locked_pvh = pvh_lock(pai);
10528 }
10529 } else {
10530 locked_pvh = pvh_lock(pai);
10531 }
10532 assert(locked_pvh.pvh != 0);
10533
10534 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10535
10536 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10537 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10538 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10539 }
10540
10541 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10542
10543 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10544 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10545 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10546 }
10547
10548 /**
10549 * When update_attr_table is false, we know that wimg_bits_prev read from pp_attr_table is not to be trusted,
10550 * and we should force update the cache attribute.
10551 */
10552 const bool force_update = !update_attr_table;
10553 /* Update the cache attributes in PTE and PP_ATTR table. */
10554 if ((wimg_bits_new != wimg_bits_prev) || force_update) {
10555 if (!pmap_is_cache_attribute_allowed(cacheattr)) {
10556 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, pn=%#x",
10557 __func__, cacheattr & VM_WIMG_MASK, pn);
10558 }
10559
10560 /* Update PP_ATTR_TABLE */
10561 if (update_attr_table) {
10562 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10563 }
10564
10565 bool mapping_collection_done = false;
10566 bool pvh_lock_sleep_mode_needed = false;
10567 do {
10568 if (__improbable(pvh_lock_sleep_mode_needed)) {
10569 assert(!preemption_disabled);
10570 pvh_lock_enter_sleep_mode(&locked_pvh);
10571 pvh_lock_sleep_mode_needed = false;
10572 }
10573
10574 /* Disable preemption to use the per-CPU structure safely. */
10575 if (!preemption_disabled) {
10576 preemption_disabled = true;
10577 disable_preemption();
10578 /**
10579 * Enter the pmap epoch while we gather the disjoint update arguments
10580 * and issue the SPTM call. Since this operation may cover multiple physical
10581 * pages, we may construct the argument array and invoke the SPTM without holding
10582 * all relevant PVH locks, we need to record that we are collecting and modifying
10583 * mapping state so that e.g. pmap_page_protect() does not attempt to retype the
10584 * underlying pages and pmap_remove() does not attempt to free the page tables
10585 * used for these mappings without first draining our epoch.
10586 */
10587 pmap_epoch_enter();
10588
10589 sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
10590 sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops;
10591 }
10592
10593 /* The return value indicates if we should call into SPTM in this iteration. */
10594 pmap_sptm_update_cache_attr_ops_collect_return_t retval =
10595 pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
10596
10597 /* The collection routine should only return if it needs attention. */
10598 assert(retval != OPS_COLLECT_NOTHING);
10599
10600 /* Gather information for next step from the return value. */
10601 mapping_collection_done = retval & OPS_COLLECT_RETURN_COMPLETED_PAGE;
10602 const bool call_sptm = retval & OPS_COLLECT_RETURN_FULL_TABLE;
10603
10604 if (call_sptm) {
10605 /* Call into SPTM with this SPTM ops table. */
10606 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
10607 /**
10608 * We may be submitting the batch and exiting the epoch partway through
10609 * processing the PV list for a page. That's fine, because in that case we'll
10610 * hold the PV lock for that page, which will prevent mappings of that page from
10611 * being disconnected and will prevent the completion of pmap_remove() against
10612 * any of those mappings, thus also guaranteeing the relevant page table pages
10613 * can't be freed. The epoch still protects mappings for any prior page in
10614 * the batch, whose PV locks are no longer held.
10615 */
10616 pmap_epoch_exit();
10617 /**
10618 * Balance out the explicit disable_preemption() made either at the beginning of
10619 * the function or on a prior iteration of the loop that placed the PVH lock in
10620 * sleep mode. Note that enable_preemption() decrements a per-thread counter,
10621 * so if we still happen to hold the PVH lock in spin mode preemption won't
10622 * actually be re-enabled until we switch the lock over to sleep mode on
10623 * the next iteration.
10624 */
10625 enable_preemption();
10626 preemption_disabled = false;
10627 pvh_lock_sleep_mode_needed = true;
10628
10629 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10630 tlb_flush_pass_needed = true;
10631 }
10632 }
10633
10634 /* We cannot be in a situation where we didn't call into SPTM while also having not finished walking the pv list. */
10635 assert(call_sptm || mapping_collection_done);
10636 } while (!mapping_collection_done);
10637
10638 /**
10639 * We could technically force the cache flush pass here when force_update is true, but
10640 * since the compressor mapping/unmapping path handles cache flushing itself, it's fine
10641 * leaving this as is.
10642 */
10643 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10644 rt_cache_flush_pass_needed = true;
10645 }
10646 }
10647
10648 pvh_unlock(&locked_pvh);
10649 }
10650
10651 if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10652 assert(preemption_disabled);
10653 sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10654 pmap_epoch_exit();
10655 if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10656 tlb_flush_pass_needed = true;
10657 }
10658
10659 /**
10660 * This is the last sptm_update_cache_attr() call whatsoever, so it's
10661 * okay not to update the state variables.
10662 */
10663
10664 enable_preemption();
10665 } else if (preemption_disabled) {
10666 pmap_epoch_exit();
10667 enable_preemption();
10668 }
10669
10670 if (tlb_flush_pass_needed) {
10671 /* Sync the PTE writes before potential TLB/Cache flushes. */
10672 FLUSH_PTE_STRONG();
10673
10674 /**
10675 * Pass 2: for each physical page and for each mapping, we need to flush
10676 * the TLB for it.
10677 */
10678 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE2);
10679 for (unified_page_list_iterator_init(page_list, &iter);
10680 !unified_page_list_iterator_end(&iter);
10681 unified_page_list_iterator_next(&iter)) {
10682 bool is_fictitious = false;
10683 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10684 const pmap_paddr_t paddr = ptoa(pn);
10685
10686 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10687 continue;
10688 }
10689
10690 pmap_flush_tlb_for_paddr_async(paddr);
10691 }
10692
10693 #if HAS_FEAT_XS
10694 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10695 arm64_sync_tlb(false);
10696 #else
10697 /**
10698 * For targets that distinguish between mild and strong DSB, mild DSB
10699 * will not drain the prefetcher. This can lead to prefetch-driven
10700 * cache fills that defeat the uncacheable requirement of the RT memory type.
10701 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10702 */
10703 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10704 #endif
10705 }
10706
10707 if (rt_cache_flush_pass_needed) {
10708 /* Pass 3: Flush the cache if the page is recently set to RT */
10709 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE3);
10710 /**
10711 * We disable preemption to ensure we are not preempted
10712 * in the state where DC by VA instructions remain enabled.
10713 */
10714 disable_preemption();
10715
10716 assert(get_preemption_level() > 0);
10717
10718 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10719 /**
10720 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10721 * and the host will handle cache maintenance for it. So we don't need to
10722 * worry about enabling the ops here for AVP.
10723 */
10724 enable_dc_mva_ops();
10725 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10726 /**
10727 * DMB should be sufficient to ensure prior accesses to the memory in question are
10728 * correctly ordered relative to the upcoming cache maintenance operations.
10729 */
10730 __builtin_arm_dmb(DMB_SY);
10731
10732 for (unified_page_list_iterator_init(page_list, &iter);
10733 !unified_page_list_iterator_end(&iter);) {
10734 bool is_fictitious = false;
10735 const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10736 const pmap_paddr_t paddr = ptoa(pn);
10737
10738 if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10739 unified_page_list_iterator_next(&iter);
10740 continue;
10741 }
10742
10743 CleanPoC_DcacheRegion_Force_nopreempt_nohid_nobarrier(phystokv(paddr), PAGE_SIZE);
10744
10745 unified_page_list_iterator_next(&iter);
10746 if (__improbable(pmap_pending_preemption() && !unified_page_list_iterator_end(&iter))) {
10747 __builtin_arm_dsb(DSB_SY);
10748 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10749 disable_dc_mva_ops();
10750 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10751 enable_preemption();
10752 assert(preemption_enabled());
10753 disable_preemption();
10754 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10755 enable_dc_mva_ops();
10756 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10757 }
10758 }
10759
10760 /* Issue DSB to ensure cache maintenance is fully complete before subsequent accesses. */
10761 __builtin_arm_dsb(DSB_SY);
10762 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10763 disable_dc_mva_ops();
10764 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10765
10766 enable_preemption();
10767 }
10768
10769 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE4);
10770 }
10771
10772 /**
10773 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10774 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10775 *
10776 * @param page_list List of pages to be updated.
10777 * @param cacheattr The new cache attribute.
10778 */
10779 void
10780 pmap_batch_set_cache_attributes(
10781 const unified_page_list_t *page_list,
10782 unsigned int cacheattr)
10783 {
10784 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10785
10786 /* Verify we are being called from a preemptible context. */
10787 pmap_verify_preemptible();
10788
10789 pmap_batch_set_cache_attributes_internal(page_list, cacheattr, true);
10790
10791 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10792 }
10793
10794 MARK_AS_PMAP_TEXT void
10795 pmap_set_cache_attributes_internal(
10796 ppnum_t pn,
10797 unsigned int cacheattr,
10798 bool update_attr_table)
10799 {
10800 upl_page_info_t single_page_upl = { .phys_addr = pn };
10801 const unified_page_list_t page_list = {
10802 .upl = {.upl_info = &single_page_upl, .upl_size = 1},
10803 .type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
10804 };
10805
10806 pmap_batch_set_cache_attributes_internal(&page_list, cacheattr, update_attr_table);
10807 }
10808
10809 void
10810 pmap_set_cache_attributes(
10811 ppnum_t pn,
10812 unsigned int cacheattr)
10813 {
10814 pmap_set_cache_attributes_internal(pn, cacheattr, true);
10815 }
10816
10817 void
10818 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10819 vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10820 {
10821 pmap_paddr_t data_pa = 0; // data address
10822 pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10823 pmap_paddr_t text_pa = 0; // text address
10824
10825 *kernel_data_addr = 0;
10826 *kernel_text_addr = 0;
10827 *user_text_addr = 0;
10828
10829 kern_return_t kr = pmap_page_alloc(&data_pa, PMAP_PAGE_ALLOCATE_NONE);
10830 assert(kr == KERN_SUCCESS);
10831
10832 kr = pmap_page_alloc(&ro_data_pa, PMAP_PAGE_ALLOCATE_NONE);
10833 assert(kr == KERN_SUCCESS);
10834
10835 #if CONFIG_ARM_PFZ
10836 kr = pmap_page_alloc(&text_pa, PMAP_PAGE_ALLOCATE_NONE);
10837 assert(kr == KERN_SUCCESS);
10838
10839 /**
10840 * User mapping of comm page text section for 64 bit mapping only
10841 *
10842 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10843 * user processes to get this page mapped in, they should never call into
10844 * this page.
10845 *
10846 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10847 * is slid in the same L3 as the data commpage. It is either outside the
10848 * max of user VA or is pre-reserved in vm_map_exec(). This means that
10849 * it is reserved and unavailable to mach VM for future mappings.
10850 */
10851 const int num_ptes = pt_attr_leaf_size(native_pt_attr) >> PTE_SHIFT;
10852
10853 do {
10854 const int text_leaf_index = random() % num_ptes;
10855
10856 /**
10857 * Generate a VA for the commpage text with the same root and twig index as data
10858 * comm page, but with new leaf index we've just generated.
10859 */
10860 commpage_text_user_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr));
10861 commpage_text_user_va |= (text_leaf_index << pt_attr_leaf_shift(native_pt_attr));
10862 } while ((commpage_text_user_va == _COMM_PAGE64_BASE_ADDRESS) ||
10863 (commpage_text_user_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10864
10865 *user_text_addr = commpage_text_user_va;
10866 *kernel_text_addr = phystokv(text_pa);
10867 #endif
10868
10869 /* For manipulation in kernel, go straight to physical page */
10870 commpage_data_pa = data_pa;
10871 *kernel_data_addr = phystokv(data_pa);
10872 assert(commpage_ro_data_pa == 0);
10873 commpage_ro_data_pa = ro_data_pa;
10874 *kernel_ro_data_addr = phystokv(ro_data_pa);
10875 assert(commpage_text_pa == 0);
10876 commpage_text_pa = text_pa;
10877 }
10878
10879
/*
 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
 * with user controlled TTEs for regions that aren't explicitly reserved by the
 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
 */
#if (ARM_PGSHIFT == 14)
/**
 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
 * commpage completely above the maximum 32-bit userspace VA.
 */
static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
/* The 64-bit commpage nesting region must agree with the region constants exported by the SPTM. */
static_assert(_COMM_PAGE64_NESTING_START == SPTM_ARM64_COMMPAGE_REGION_START);
static_assert(_COMM_PAGE64_NESTING_SIZE == SPTM_ARM64_COMMPAGE_REGION_SIZE);

/**
 * Normally there'd be an assert to check that 64-bit devices with 64-bit
 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
 * commpage lives within the userspace VA range, but is protected by the VM as
 * a reserved region (see vm_reserved_regions[] definition for more info).
 */

#elif (ARM_PGSHIFT == 12)
/**
 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
 * above the maximum userspace VA.
 */
static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
#else
/* Only 16K (ARM_PGSHIFT == 14) and 4K (ARM_PGSHIFT == 12) page configurations are handled. */
#error Nested shared page mapping is unsupported on this config
#endif
10911
10912 MARK_AS_PMAP_TEXT kern_return_t
10913 pmap_insert_commpage_internal(
10914 pmap_t pmap)
10915 {
10916 kern_return_t kr = KERN_SUCCESS;
10917 vm_offset_t commpage_vaddr;
10918 pt_entry_t *ttep;
10919 pmap_paddr_t commpage_table = commpage_default_table;
10920
10921 /* Validate the pmap input before accessing its data. */
10922 validate_pmap_mutable(pmap);
10923
10924 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10925 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10926
10927 #if __ARM_MIXED_PAGE_SIZE__
10928 #if !__ARM_16K_PG__
10929 /* The following code assumes that commpage_pmap_default is a 16KB pmap. */
10930 #error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10931 #endif /* !__ARM_16K_PG__ */
10932
10933 /* Choose the correct shared page pmap to use. */
10934 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
10935 if (pmap_page_size == 4096) {
10936 if (pmap_is_64bit(pmap)) {
10937 commpage_table = commpage_4k_table;
10938 } else {
10939 panic("32-bit 4k commpage not currently supported for SPTM configurations");
10940 //commpage_table = commpage32_4k_table;
10941 }
10942 } else if (pmap_page_size != 16384) {
10943 panic("No commpage table exists for the wanted page size: %llu", pmap_page_size);
10944 } else
10945 #endif /* __ARM_MIXED_PAGE_SIZE__ */
10946 {
10947 if (pmap_is_64bit(pmap)) {
10948 commpage_table = commpage_default_table;
10949 } else {
10950 commpage_table = commpage32_default_table;
10951 }
10952 }
10953
10954 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
10955 #error We assume a single page.
10956 #endif
10957
10958 if (pmap_is_64bit(pmap)) {
10959 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10960 } else {
10961 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10962 }
10963
10964
10965 pmap_lock(pmap, PMAP_LOCK_SHARED);
10966
10967 /*
10968 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
10969 * two (2MB) depending on the address space layout. For 16KB pages, each level
10970 * one entry is 64GB, so we must go to the second level entry (32MB) in order
10971 * to "nest".
10972 *
10973 * Note: This is not "nesting" in the shared cache sense. This definition of
10974 * nesting just means inserting pointers to pre-allocated tables inside of
10975 * the passed in pmap to allow us to share page tables (which map the shared
10976 * page) for every task. This saves at least one page of memory per process
10977 * compared to creating new page tables in every process for mapping the
10978 * shared page.
10979 */
10980
10981 /**
10982 * Allocate the twig page tables if needed, and slam a pointer to the shared
10983 * page's tables into place.
10984 */
10985 while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
10986 pmap_unlock(pmap, PMAP_LOCK_SHARED);
10987
10988 kr = pmap_expand(pmap, commpage_vaddr, 0, commpage_level);
10989
10990 if (kr != KERN_SUCCESS) {
10991 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
10992 }
10993
10994 pmap_lock(pmap, PMAP_LOCK_SHARED);
10995 }
10996
10997 if (*ttep != ARM_PTE_EMPTY) {
10998 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
10999 }
11000
11001 sptm_map_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level,
11002 (commpage_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID);
11003
11004 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11005
11006 return kr;
11007 }
11008
11009 static void
11010 pmap_unmap_commpage(
11011 pmap_t pmap)
11012 {
11013 pt_entry_t *ptep;
11014 vm_offset_t commpage_vaddr;
11015
11016 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11017 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
11018 __assert_only pmap_paddr_t commpage_pa = commpage_data_pa;
11019
11020 if (pmap_is_64bit(pmap)) {
11021 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11022 } else {
11023 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11024 }
11025
11026
11027 ptep = pmap_pte(pmap, commpage_vaddr);
11028
11029 if (ptep == NULL) {
11030 return;
11031 }
11032
11033 /* It had better be mapped to the shared page. */
11034 if (pte_to_pa(*ptep) != commpage_pa) {
11035 panic("%s: non-commpage PA 0x%llx mapped at VA 0x%llx in pmap %p; expected 0x%llx",
11036 __func__, (unsigned long long)pte_to_pa(*ptep), (unsigned long long)commpage_vaddr,
11037 pmap, (unsigned long long)commpage_pa);
11038 }
11039
11040 sptm_unmap_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level);
11041 }
11042
11043 void
11044 pmap_insert_commpage(
11045 pmap_t pmap)
11046 {
11047 pmap_insert_commpage_internal(pmap);
11048 }
11049
11050 static boolean_t
11051 pmap_is_64bit(
11052 pmap_t pmap)
11053 {
11054 return pmap->is_64bit;
11055 }
11056
11057 bool
11058 pmap_is_exotic(
11059 pmap_t pmap __unused)
11060 {
11061 return false;
11062 }
11063
11064
11065 /* ARMTODO -- an implementation that accounts for
11066 * holes in the physical map, if any.
11067 */
11068 boolean_t
11069 pmap_valid_page(
11070 ppnum_t pn)
11071 {
11072 return pa_valid(ptoa(pn));
11073 }
11074
11075 boolean_t
11076 pmap_bootloader_page(
11077 ppnum_t pn)
11078 {
11079 pmap_paddr_t paddr = ptoa(pn);
11080
11081 if (pa_valid(paddr)) {
11082 return FALSE;
11083 }
11084 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11085 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11086 }
11087
11088 MARK_AS_PMAP_TEXT boolean_t
11089 pmap_is_empty_internal(
11090 pmap_t pmap,
11091 vm_map_offset_t va_start,
11092 vm_map_offset_t va_end)
11093 {
11094 vm_map_offset_t block_start, block_end;
11095 tt_entry_t *tte_p;
11096
11097 if (pmap == NULL) {
11098 return TRUE;
11099 }
11100
11101 validate_pmap(pmap);
11102
11103 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11104 unsigned int initial_not_in_kdp = not_in_kdp;
11105
11106 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11107 pmap_lock(pmap, PMAP_LOCK_SHARED);
11108 }
11109
11110
11111 /* TODO: This will be faster if we increment ttep at each level. */
11112 block_start = va_start;
11113
11114 while (block_start < va_end) {
11115 pt_entry_t *bpte_p, *epte_p;
11116 pt_entry_t *pte_p;
11117
11118 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11119 if (block_end > va_end) {
11120 block_end = va_end;
11121 }
11122
11123 tte_p = pmap_tte(pmap, block_start);
11124 if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
11125 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11126 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11127 epte_p = &pte_p[pte_index(pt_attr, block_end)];
11128
11129 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11130 if (*pte_p != ARM_PTE_EMPTY) {
11131 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11132 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11133 }
11134 return FALSE;
11135 }
11136 }
11137 }
11138 block_start = block_end;
11139 }
11140
11141 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11142 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11143 }
11144
11145 return TRUE;
11146 }
11147
11148 boolean_t
11149 pmap_is_empty(
11150 pmap_t pmap,
11151 vm_map_offset_t va_start,
11152 vm_map_offset_t va_end)
11153 {
11154 return pmap_is_empty_internal(pmap, va_start, va_end);
11155 }
11156
11157 vm_map_offset_t
11158 pmap_max_offset(
11159 boolean_t is64,
11160 unsigned int option)
11161 {
11162 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11163 }
11164
11165 vm_map_offset_t
11166 pmap_max_64bit_offset(
11167 __unused unsigned int option)
11168 {
11169 vm_map_offset_t max_offset_ret = 0;
11170
11171 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11172 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11173 max_offset_ret = arm64_pmap_max_offset_default;
11174 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11175 max_offset_ret = min_max_offset;
11176 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11177 max_offset_ret = MACH_VM_MAX_ADDRESS;
11178 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11179 if (arm64_pmap_max_offset_default) {
11180 max_offset_ret = arm64_pmap_max_offset_default;
11181 } else if (max_mem > 0xC0000000) {
11182 // devices with > 3GB of memory
11183 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11184 } else if (max_mem > 0x40000000) {
11185 // devices with > 1GB and <= 3GB of memory
11186 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11187 } else {
11188 // devices with <= 1 GB of memory
11189 max_offset_ret = min_max_offset;
11190 }
11191 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11192 if (arm64_pmap_max_offset_default) {
11193 // Allow the boot-arg to override jumbo size
11194 max_offset_ret = arm64_pmap_max_offset_default;
11195 } else {
11196 max_offset_ret = MACH_VM_JUMBO_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11197 }
11198 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
11199 } else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
11200 max_offset_ret = MACH_VM_MAX_ADDRESS;
11201 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
11202 } else {
11203 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11204 }
11205
11206 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11207 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11208 assert(max_offset_ret >= min_max_offset);
11209 }
11210
11211 return max_offset_ret;
11212 }
11213
11214 vm_map_offset_t
11215 pmap_max_32bit_offset(
11216 unsigned int option)
11217 {
11218 vm_map_offset_t max_offset_ret = 0;
11219
11220 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11221 max_offset_ret = arm_pmap_max_offset_default;
11222 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11223 max_offset_ret = VM_MAX_ADDRESS;
11224 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11225 max_offset_ret = VM_MAX_ADDRESS;
11226 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11227 if (arm_pmap_max_offset_default) {
11228 max_offset_ret = arm_pmap_max_offset_default;
11229 } else if (max_mem > 0x20000000) {
11230 max_offset_ret = VM_MAX_ADDRESS;
11231 } else {
11232 max_offset_ret = VM_MAX_ADDRESS;
11233 }
11234 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11235 max_offset_ret = VM_MAX_ADDRESS;
11236 } else {
11237 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11238 }
11239
11240 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11241 return max_offset_ret;
11242 }
11243
#if CONFIG_DTRACE
/*
 * Constrain DTrace copyin/copyout actions
 */
extern kern_return_t dtrace_copyio_preflight(addr64_t);
extern kern_return_t dtrace_copyio_postflight(addr64_t);

/**
 * Check performed before a DTrace copyin/copyout action.
 *
 * @param va Ignored.
 *
 * @return KERN_FAILURE when the current map is the kernel map,
 *         KERN_SUCCESS otherwise.
 */
kern_return_t
dtrace_copyio_preflight(
	__unused addr64_t va)
{
	const bool in_kernel_map = (current_map() == kernel_map);

	return in_kernel_map ? KERN_FAILURE : KERN_SUCCESS;
}

/**
 * Hook invoked after a DTrace copyin/copyout action; nothing to do here.
 *
 * @param va Ignored.
 *
 * @return Always KERN_SUCCESS.
 */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	const kern_return_t result = KERN_SUCCESS;

	return result;
}
#endif /* CONFIG_DTRACE */
11269
11270
11271 void
11272 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11273 {
11274 }
11275
11276
11277 void
11278 pmap_flush(
11279 __unused pmap_flush_context *cpus_to_flush)
11280 {
11281 /* not implemented yet */
11282 return;
11283 }
11284
11285 /**
11286 * Perform basic validation checks on the destination only and
11287 * corresponding offset/sizes prior to writing to a read only allocation.
11288 *
11289 * @note Should be called before writing to an allocation from the read
11290 * only allocator.
11291 *
11292 * @param zid The ID of the zone the allocation belongs to.
11293 * @param va VA of element being modified (destination).
11294 * @param offset Offset being written to, in the element.
11295 * @param new_data_size Size of modification.
11296 *
11297 */
11298
11299 MARK_AS_PMAP_TEXT static void
11300 pmap_ro_zone_validate_element_dst(
11301 zone_id_t zid,
11302 vm_offset_t va,
11303 vm_offset_t offset,
11304 vm_size_t new_data_size)
11305 {
11306 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11307 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11308 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11309 }
11310
11311 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
11312
11313 /* Check element is from correct zone and properly aligned */
11314 zone_require_ro(zid, elem_size, (void*)va);
11315
11316 if (__improbable(new_data_size > (elem_size - offset))) {
11317 panic("%s: New data size %lu too large for elem size %lu at addr %p",
11318 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11319 }
11320 if (__improbable(offset >= elem_size)) {
11321 panic("%s: Offset %lu too large for elem size %lu at addr %p",
11322 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11323 }
11324 }
11325
11326
11327 /**
11328 * Perform basic validation checks on the source, destination and
11329 * corresponding offset/sizes prior to writing to a read only allocation.
11330 *
11331 * @note Should be called before writing to an allocation from the read
11332 * only allocator.
11333 *
11334 * @param zid The ID of the zone the allocation belongs to.
11335 * @param va VA of element being modified (destination).
11336 * @param offset Offset being written to, in the element.
11337 * @param new_data Pointer to new data (source).
11338 * @param new_data_size Size of modification.
11339 *
11340 */
11341
11342 MARK_AS_PMAP_TEXT static void
11343 pmap_ro_zone_validate_element(
11344 zone_id_t zid,
11345 vm_offset_t va,
11346 vm_offset_t offset,
11347 const vm_offset_t new_data,
11348 vm_size_t new_data_size)
11349 {
11350 vm_offset_t sum = 0;
11351
11352 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11353 panic("%s: Integer addition overflow %p + %lu = %lu",
11354 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11355 }
11356
11357 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11358 }
11359
/**
 * Hook called right before a write to a read-only zone element, meant to
 * configure RO zone access permissions for that write.
 *
 * The body is empty in this configuration.
 */
static void
pmap_ro_zone_prepare_write(void)
{
	/* Nothing to configure. */
}
11367
/**
 * Hook called right after a write to a read-only zone element to signal that
 * the write is complete.
 *
 * The body is empty in this configuration.
 */
static void
pmap_ro_zone_complete_write(void)
{
	/* Nothing to undo. */
}
11375
11376 /**
11377 * Function to align an address or size to the required RO zone mapping alignment.
11378 *
11379 * For the SPTM the RO zone region must be aligned on a twig boundary so that at least
11380 * the last-level kernel pagetable can be of the appropriate SPTM RO zone table type,
11381 * which allows the SPTM to enforce RO zone mapping permission restrictions.
11382 *
11383 * @param value the address or size to be aligned.
11384 *
11385 * @return the aligned value
11386 */
11387 vm_offset_t
11388 pmap_ro_zone_align(vm_offset_t value)
11389 {
11390 const pt_attr_t * const pt_attr = pmap_get_pt_attr(kernel_pmap);
11391 return PMAP_ALIGN(value, pt_attr_twig_size(pt_attr));
11392 }
11393
11394 /**
11395 * Function to copy kauth_cred from new_data to kv.
11396 * Function defined in "kern_prot.c"
11397 *
11398 * @note Will be removed upon completion of
11399 * <rdar://problem/72635194> Compiler PAC support for memcpy.
11400 *
11401 * @param kv Address to copy new data to.
11402 * @param new_data Pointer to new data.
11403 *
11404 */
11405
11406 extern void
11407 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11408
11409 /**
11410 * Zalloc-specific memcpy that writes through the physical aperture
11411 * and ensures the element being modified is from a read-only zone.
11412 *
11413 * @note Designed to work only with the zone allocator's read-only submap.
11414 *
11415 * @param zid The ID of the zone to allocate from.
11416 * @param va VA of element to be modified.
11417 * @param offset Offset from element.
11418 * @param new_data Pointer to new data.
11419 * @param new_data_size Size of modification.
11420 *
11421 */
11422
11423 void
11424 pmap_ro_zone_memcpy(
11425 zone_id_t zid,
11426 vm_offset_t va,
11427 vm_offset_t offset,
11428 const vm_offset_t new_data,
11429 vm_size_t new_data_size)
11430 {
11431 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11432 }
11433
11434 MARK_AS_PMAP_TEXT void
11435 pmap_ro_zone_memcpy_internal(
11436 zone_id_t zid,
11437 vm_offset_t va,
11438 vm_offset_t offset,
11439 const vm_offset_t new_data,
11440 vm_size_t new_data_size)
11441 {
11442 if (!new_data || new_data_size == 0) {
11443 return;
11444 }
11445
11446 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11447 const bool istate = ml_set_interrupts_enabled(FALSE);
11448 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11449 pmap_ro_zone_prepare_write();
11450 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11451 pmap_ro_zone_complete_write();
11452 ml_set_interrupts_enabled(istate);
11453 }
11454
11455 /**
11456 * Zalloc-specific function to atomically mutate fields of an element that
11457 * belongs to a read-only zone, via the physcial aperture.
11458 *
11459 * @note Designed to work only with the zone allocator's read-only submap.
11460 *
11461 * @param zid The ID of the zone the element belongs to.
11462 * @param va VA of element to be modified.
11463 * @param offset Offset in element.
11464 * @param op Atomic operation to perform.
11465 * @param value Mutation value.
11466 *
11467 */
11468
11469 uint64_t
11470 pmap_ro_zone_atomic_op(
11471 zone_id_t zid,
11472 vm_offset_t va,
11473 vm_offset_t offset,
11474 zro_atomic_op_t op,
11475 uint64_t value)
11476 {
11477 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11478 }
11479
11480 MARK_AS_PMAP_TEXT uint64_t
11481 pmap_ro_zone_atomic_op_internal(
11482 zone_id_t zid,
11483 vm_offset_t va,
11484 vm_offset_t offset,
11485 zro_atomic_op_t op,
11486 uint64_t value)
11487 {
11488 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11489 vm_size_t value_size = op & 0xf;
11490 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11491
11492 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11493 pmap_ro_zone_prepare_write();
11494 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11495 pmap_ro_zone_complete_write();
11496 ml_set_interrupts_enabled(istate);
11497
11498 return value;
11499 }
11500
11501 /**
11502 * bzero for allocations from read only zones, that writes through the
11503 * physical aperture.
11504 *
11505 * @note This is called by the zfree path of all allocations from read
11506 * only zones.
11507 *
11508 * @param zid The ID of the zone the allocation belongs to.
11509 * @param va VA of element to be zeroed.
11510 * @param offset Offset in the element.
11511 * @param size Size of allocation.
11512 *
11513 */
11514
11515 void
11516 pmap_ro_zone_bzero(
11517 zone_id_t zid,
11518 vm_offset_t va,
11519 vm_offset_t offset,
11520 vm_size_t size)
11521 {
11522 pmap_ro_zone_bzero_internal(zid, va, offset, size);
11523 }
11524
11525 MARK_AS_PMAP_TEXT void
11526 pmap_ro_zone_bzero_internal(
11527 zone_id_t zid,
11528 vm_offset_t va,
11529 vm_offset_t offset,
11530 vm_size_t size)
11531 {
11532 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11533 const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11534 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11535 pmap_ro_zone_prepare_write();
11536 bzero((void*)phystokv(pa), size);
11537 pmap_ro_zone_complete_write();
11538 ml_set_interrupts_enabled(istate);
11539 }
11540
11541 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
11542
11543 MARK_AS_PMAP_TEXT mach_vm_size_t
11544 pmap_query_resident_internal(
11545 pmap_t pmap,
11546 vm_map_address_t start,
11547 vm_map_address_t end,
11548 mach_vm_size_t *compressed_bytes_p)
11549 {
11550 mach_vm_size_t resident_bytes = 0;
11551 mach_vm_size_t compressed_bytes = 0;
11552
11553 pt_entry_t *bpte, *epte;
11554 pt_entry_t *pte_p;
11555 tt_entry_t *tte_p;
11556
11557 if (pmap == NULL) {
11558 return PMAP_RESIDENT_INVALID;
11559 }
11560
11561 validate_pmap(pmap);
11562
11563 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11564
11565 /* Ensure that this request is valid, and addresses exactly one TTE. */
11566 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11567 (end % pt_attr_page_size(pt_attr)))) {
11568 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11569 }
11570
11571 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11572 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11573 }
11574
11575 pmap_lock(pmap, PMAP_LOCK_SHARED);
11576 tte_p = pmap_tte(pmap, start);
11577 if (tte_p == (tt_entry_t *) NULL) {
11578 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11579 return PMAP_RESIDENT_INVALID;
11580 }
11581 if (tte_is_valid_table(*tte_p)) {
11582 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11583 bpte = &pte_p[pte_index(pt_attr, start)];
11584 epte = &pte_p[pte_index(pt_attr, end)];
11585
11586 for (; bpte < epte; bpte++) {
11587 if (pte_is_compressed(*bpte, bpte)) {
11588 compressed_bytes += pt_attr_page_size(pt_attr);
11589 } else if (pa_valid(pte_to_pa(*bpte))) {
11590 resident_bytes += pt_attr_page_size(pt_attr);
11591 }
11592 }
11593 }
11594 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11595
11596 if (compressed_bytes_p) {
11597 *compressed_bytes_p += compressed_bytes;
11598 }
11599
11600 return resident_bytes;
11601 }
11602
11603 mach_vm_size_t
11604 pmap_query_resident(
11605 pmap_t pmap,
11606 vm_map_address_t start,
11607 vm_map_address_t end,
11608 mach_vm_size_t *compressed_bytes_p)
11609 {
11610 mach_vm_size_t total_resident_bytes;
11611 mach_vm_size_t compressed_bytes;
11612 vm_map_address_t va;
11613
11614
11615 if (pmap == PMAP_NULL) {
11616 if (compressed_bytes_p) {
11617 *compressed_bytes_p = 0;
11618 }
11619 return 0;
11620 }
11621
11622 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11623
11624 total_resident_bytes = 0;
11625 compressed_bytes = 0;
11626
11627 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
11628 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
11629 VM_KERNEL_ADDRHIDE(end));
11630
11631 va = start;
11632 while (va < end) {
11633 vm_map_address_t l;
11634 mach_vm_size_t resident_bytes;
11635
11636 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
11637
11638 if (l > end) {
11639 l = end;
11640 }
11641 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
11642 if (resident_bytes == PMAP_RESIDENT_INVALID) {
11643 break;
11644 }
11645
11646 total_resident_bytes += resident_bytes;
11647
11648 va = l;
11649 }
11650
11651 if (compressed_bytes_p) {
11652 *compressed_bytes_p = compressed_bytes;
11653 }
11654
11655 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
11656 total_resident_bytes);
11657
11658 return total_resident_bytes;
11659 }
11660
#if MACH_ASSERT
/**
 * Hand the pmap's ledger to vm_map_pmap_check_ledgers() for verification,
 * unless the pmap's recorded pid is 0 or -1.
 *
 * @param pmap The pmap whose ledger should be checked.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	const int pid = pmap->pmap_pid;

	if ((pid == 0) || (pid == -1)) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()).  Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, pmap->pmap_procname);
}
#endif /* MACH_ASSERT */
11691
/* Intentionally a no-op in this pmap implementation; both arguments are ignored. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
11696
11697 /**
11698 * The minimum shared region nesting size is used by the VM to determine when to
11699 * break up large mappings to nested regions. The smallest size that these
11700 * mappings can be broken into is determined by what page table level those
11701 * regions are being nested in at and the size of the page tables.
11702 *
11703 * For instance, if a nested region is nesting at L2 for a process utilizing
11704 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
11705 * block entry).
11706 *
11707 * @param pmap The target pmap to determine the block size based on whether it's
11708 * using 16KB or 4KB page tables.
11709 */
11710 uint64_t
11711 pmap_shared_region_size_min(__unused pmap_t pmap)
11712 {
11713 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11714
11715 /**
11716 * We always nest the shared region at L2 (32MB for 16KB pages, 8MB for
11717 * 4KB pages). This means that a target pmap will contain L2 entries that
11718 * point to shared L3 page tables in the shared region pmap.
11719 */
11720 const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
11721 return pt_attr_twig_size(pt_attr) * page_ratio;
11722 }
11723
11724 boolean_t
11725 pmap_enforces_execute_only(
11726 pmap_t pmap)
11727 {
11728 return pmap != kernel_pmap;
11729 }
11730
/**
 * Record the per-pmap code-signing enforcement flag.
 *
 * @param pmap The pmap to update; checked with validate_pmap_mutable() first.
 * @param new_value Value stored into pmap->pmap_vm_map_cs_enforced.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
11739
/* Public wrapper around pmap_set_vm_map_cs_enforced_internal(); arguments are forwarded unchanged. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
}
11747
11748 extern int cs_process_enforcement_enable;
11749 bool
11750 pmap_get_vm_map_cs_enforced(
11751 pmap_t pmap)
11752 {
11753 if (cs_process_enforcement_enable) {
11754 return true;
11755 }
11756 return pmap->pmap_vm_map_cs_enforced;
11757 }
11758
/* No-op in this pmap implementation; the pmap argument is ignored. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
}
11764
/* Public wrapper around pmap_set_jit_entitled_internal(), which currently does nothing. */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
	pmap_set_jit_entitled_internal(pmap);
}
11771
/* Always reports false in this pmap implementation; the pmap argument is ignored. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
11778
11779 MARK_AS_PMAP_TEXT void
11780 pmap_set_tpro_internal(
11781 __unused pmap_t pmap)
11782 {
11783 return;
11784 }
11785
/* Public wrapper around pmap_set_tpro_internal(), which currently does nothing. */
void
pmap_set_tpro(
	pmap_t pmap)
{
	pmap_set_tpro_internal(pmap);
}
11792
/* Always reports false in this pmap implementation; the pmap argument is ignored. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
11799
11800 #if HAS_MTE
11801 void
11802 pmap_set_tag_check_enabled(
11803 pmap_t pmap)
11804 {
11805 validate_pmap_mutable(pmap);
11806
11807 if (pmap->type == PMAP_TYPE_USER) {
11808 sptm_configure_root(pmap->ttep, SPTM_ROOT_PT_FLAG_MTE, SPTM_ROOT_PT_FLAG_MTE);
11809 }
11810 }
11811
11812 void
11813 pmap_set_user_tag_check_faults_disabled(
11814 pmap_t pmap)
11815 {
11816 validate_pmap_mutable(pmap);
11817
11818 if (pmap->type != PMAP_TYPE_USER) {
11819 return;
11820 }
11821
11822 sptm_configure_root(pmap->ttep, SPTM_ROOT_PT_FLAG_NO_TAG_FAULT, SPTM_ROOT_PT_FLAG_NO_TAG_FAULT);
11823 if (pmap == current_pmap()) {
11824 /* SPTM defers reconfiguring TCF0 until the next sptm_switch_root() call */
11825 sptm_return_t __assert_only ret = sptm_switch_root(pmap->ttep, 0, 0);
11826 assert3u(ret & SPTM_SUCCESS, ==, SPTM_SUCCESS);
11827 }
11828 }
11829 #endif /* HAS_MTE */
11830
/* Statistics counter: incremented each time pmap_query_page_info_internal() restarts its lookup. */
uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;

/**
 * Report the disposition (PMAP_QUERY_PAGE_* flags) of the mapping at `va`.
 *
 * @param pmap The pmap to query.  PMAP_NULL and kernel_pmap are rejected.
 * @param va The virtual address whose mapping is inspected.
 * @param disp_p Output: receives the disposition flags (0 if nothing is
 *               mapped or the pmap is rejected).
 *
 * @return KERN_INVALID_ARGUMENT for a NULL or kernel pmap, KERN_SUCCESS
 *         otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p;
	pv_entry_t      *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		*disp_p = 0;
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	/* Start from a clean slate on every attempt; the pmap lock stays held across retries. */
	disp = 0;

	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}

	/* Snapshot the PTE once; it is re-checked below after the PVH lock is taken. */
	const pt_entry_t pte = os_atomic_load(pte_p, relaxed);
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical page: the only thing left to report is a compressed marker. */
		if (pte_is_compressed(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* pa_valid() rejected this address: report it as present and nothing more. */
			goto done;
		}
		locked_pvh_t locked_pvh = pvh_lock(pai);
		if (__improbable(pte != os_atomic_load(pte_p, relaxed))) {
			/* something changed: try again */
			pvh_unlock(&locked_pvh);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
			/* Walk the PVE list until an entry containing this PTE pointer is found. */
			unsigned int npves = 0;
			pve_p = pvh_pve_list(locked_pvh.pvh);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				/* Once the walk reaches this many entries, switch the PVH lock to sleep mode. */
				if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
					pvh_lock_enter_sleep_mode(&locked_pvh);
				}
				pve_p = pve_next(pve_p);
				npves++;
			}
		}

		/* At most one of these is reported, checked in the order alt-acct, reusable, internal. */
		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(&locked_pvh);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	*disp_p = disp;
	return KERN_SUCCESS;
}
11913
/* Public wrapper around pmap_query_page_info_internal(); arguments and result pass through unchanged. */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	return pmap_query_page_info_internal(pmap, va, disp_p);
}
11922
11923
11924
11925 uint32_t
11926 pmap_user_va_bits(pmap_t pmap __unused)
11927 {
11928 #if __ARM_MIXED_PAGE_SIZE__
11929 uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
11930 return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
11931 #else
11932 return 64 - T0SZ_BOOT;
11933 #endif
11934 }
11935
/* Number of virtual address bits for the kernel: 64 minus the T1SZ_BOOT constant. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
11941
11942 static vm_map_size_t
11943 pmap_user_va_size(pmap_t pmap)
11944 {
11945 return 1ULL << pmap_user_va_bits(pmap);
11946 }
11947
11948 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
11949 static vm_map_address_t
11950 pmap_strip_user_addr(pmap_t pmap, vm_map_address_t ptr)
11951 {
11952 assert(pmap && pmap != kernel_pmap);
11953
11954 /*
11955 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR0 address.
11956 * Ignore the strip request.
11957 */
11958 if ((ptr & TTBR_SELECTOR) != 0) {
11959 return ptr;
11960 }
11961
11962 /* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
11963 return ptr & (pmap->max - 1);
11964 }
11965
11966 static vm_map_address_t
11967 pmap_strip_kernel_addr(pmap_t pmap, vm_map_address_t ptr)
11968 {
11969 assert(pmap && pmap == kernel_pmap);
11970
11971 /*
11972 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR1 address.
11973 * Ignore the strip request.
11974 */
11975 if ((ptr & TTBR_SELECTOR) == 0) {
11976 return ptr;
11977 }
11978
11979 /* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
11980 return ptr | pmap->min;
11981 }
11982
11983 vm_map_address_t
11984 pmap_strip_addr(pmap_t pmap, vm_map_address_t ptr)
11985 {
11986 assert(pmap);
11987
11988 return pmap == kernel_pmap ? pmap_strip_kernel_addr(pmap, ptr) :
11989 pmap_strip_user_addr(pmap, ptr);
11990 }
11991 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
11992
11993
/* Always reports false in this pmap implementation. */
bool
pmap_in_ppl(void)
{
	return false;
}
11999
12000 MARK_AS_PMAP_TEXT void
12001 pmap_footprint_suspend_internal(
12002 vm_map_t map,
12003 boolean_t suspend)
12004 {
12005 #if DEVELOPMENT || DEBUG
12006 if (suspend) {
12007 current_thread()->pmap_footprint_suspended = TRUE;
12008 map->pmap->footprint_was_suspended = TRUE;
12009 } else {
12010 current_thread()->pmap_footprint_suspended = FALSE;
12011 }
12012 #else /* DEVELOPMENT || DEBUG */
12013 (void) map;
12014 (void) suspend;
12015 #endif /* DEVELOPMENT || DEBUG */
12016 }
12017
/* Public wrapper around pmap_footprint_suspend_internal(); arguments are forwarded unchanged. */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
	pmap_footprint_suspend_internal(map, suspend);
}
12025
/* Runs validate_pmap_mutable() on the pmap and does nothing else. */
void
pmap_nop(pmap_t pmap)
{
	validate_pmap_mutable(pmap);
}
12031
/* Returns the global kernel_pmap. */
pmap_t
pmap_txm_kernel_pmap(void)
{
	return kernel_pmap;
}
12037
12038 TXMAddressSpace_t*
12039 pmap_txm_addr_space(const pmap_t pmap)
12040 {
12041 if (pmap) {
12042 return pmap->txm_addr_space;
12043 }
12044
12045 /*
12046 * When the passed in PMAP is NULL, it means the caller wishes to operate
12047 * on the current_pmap(). We could resolve and return that, but it is actually
12048 * safer to return NULL since these TXM interfaces also accept NULL inputs
12049 * which causes TXM to resolve to the current_pmap() equivalent internally.
12050 */
12051 return NULL;
12052 }
12053
12054 void
12055 pmap_txm_set_addr_space(
12056 pmap_t pmap,
12057 TXMAddressSpace_t *txm_addr_space)
12058 {
12059 assert(pmap != NULL);
12060
12061 if (pmap->txm_addr_space && txm_addr_space) {
12062 /* Attempted to overwrite the address space in the PMAP */
12063 panic("attempted ovewrite of TXM address space: %p | %p | %p",
12064 pmap, pmap->txm_addr_space, txm_addr_space);
12065 } else if (!pmap->txm_addr_space && !txm_addr_space) {
12066 /* This should never happen */
12067 panic("attempted NULL overwrite of TXM address space: %p", pmap);
12068 }
12069
12070 pmap->txm_addr_space = txm_addr_space;
12071 }
12072
12073 void
12074 pmap_txm_set_trust_level(
12075 pmap_t pmap,
12076 CSTrust_t trust_level)
12077 {
12078 assert(pmap != NULL);
12079
12080 CSTrust_t current_trust = pmap->txm_trust_level;
12081 if (current_trust != kCSTrustUntrusted) {
12082 panic("attempted to overwrite TXM trust on the pmap: %p", pmap);
12083 }
12084
12085 pmap->txm_trust_level = trust_level;
12086 }
12087
12088 kern_return_t
12089 pmap_txm_get_trust_level_kdp(
12090 pmap_t pmap,
12091 CSTrust_t *trust_level)
12092 {
12093 if (pmap == NULL) {
12094 return KERN_INVALID_ARGUMENT;
12095 } else if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
12096 return KERN_INVALID_ARGUMENT;
12097 }
12098
12099 if (trust_level != NULL) {
12100 *trust_level = pmap->txm_trust_level;
12101 }
12102 return KERN_SUCCESS;
12103 }
12104
12105 kern_return_t
12106 pmap_txm_get_jit_address_range_kdp(
12107 pmap_t pmap,
12108 uintptr_t *jit_region_start,
12109 uintptr_t *jit_region_end)
12110 {
12111 if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
12112 return KERN_INVALID_ARGUMENT;
12113 }
12114 TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
12115 if (NULL == txm_addr_space) {
12116 return KERN_INVALID_ARGUMENT;
12117 }
12118 if (ml_validate_nofault((vm_offset_t)txm_addr_space, sizeof(*txm_addr_space)) == false) {
12119 return KERN_INVALID_ARGUMENT;
12120 }
12121 /**
12122 * It's a bit gross that we're dereferencing what is supposed to be an abstract type.
12123 * If we were running in the TXM, we would always perform additional checks on txm_addr_space,
12124 * but this isn't necessary here, since we are running in the kernel and only using the results for
12125 * diagnostic purposes, rather than any policy enforcement.
12126 */
12127 if (txm_addr_space->jitRegion) {
12128 if (ml_validate_nofault((vm_offset_t)txm_addr_space->jitRegion, sizeof(txm_addr_space->jitRegion)) == false) {
12129 return KERN_INVALID_ARGUMENT;
12130 }
12131 if (txm_addr_space->jitRegion->addr && txm_addr_space->jitRegion->addrEnd) {
12132 *jit_region_start = txm_addr_space->jitRegion->addr;
12133 *jit_region_end = txm_addr_space->jitRegion->addrEnd;
12134 return KERN_SUCCESS;
12135 }
12136 }
12137 return KERN_NOT_FOUND;
12138 }
12139
12140 static pmap_t
12141 _pmap_txm_resolve_pmap(pmap_t pmap)
12142 {
12143 if (pmap == NULL) {
12144 pmap = current_pmap();
12145 if (pmap == kernel_pmap) {
12146 return NULL;
12147 }
12148 }
12149
12150 return pmap;
12151 }
12152
12153 void
12154 pmap_txm_acquire_shared_lock(pmap_t pmap)
12155 {
12156 pmap = _pmap_txm_resolve_pmap(pmap);
12157 if (!pmap) {
12158 return;
12159 }
12160
12161 lck_rw_lock_shared(&pmap->txm_lck);
12162 }
12163
12164 void
12165 pmap_txm_release_shared_lock(pmap_t pmap)
12166 {
12167 pmap = _pmap_txm_resolve_pmap(pmap);
12168 if (!pmap) {
12169 return;
12170 }
12171
12172 lck_rw_unlock_shared(&pmap->txm_lck);
12173 }
12174
12175 void
12176 pmap_txm_acquire_exclusive_lock(pmap_t pmap)
12177 {
12178 pmap = _pmap_txm_resolve_pmap(pmap);
12179 if (!pmap) {
12180 return;
12181 }
12182
12183 lck_rw_lock_exclusive(&pmap->txm_lck);
12184 }
12185
12186 void
12187 pmap_txm_release_exclusive_lock(pmap_t pmap)
12188 {
12189 pmap = _pmap_txm_resolve_pmap(pmap);
12190 if (!pmap) {
12191 return;
12192 }
12193
12194 lck_rw_unlock_exclusive(&pmap->txm_lck);
12195 }
12196
12197 static void
12198 _pmap_txm_transfer_page(const pmap_paddr_t addr)
12199 {
12200 sptm_retype_params_t retype_params = {
12201 .raw = SPTM_RETYPE_PARAMS_NULL
12202 };
12203
12204 /* Retype through the SPTM */
12205 sptm_retype(addr, XNU_DEFAULT, TXM_DEFAULT, retype_params);
12206 }
12207
12208 /**
12209 * Prepare a page for retyping to TXM_DEFAULT by clearing its
12210 * internal flags.
12211 *
12212 * @param pa Physical address of the page.
12213 */
12214 static inline void
12215 _pmap_txm_retype_prepare(const pmap_paddr_t pa)
12216 {
12217 const sptm_retype_params_t retype_params = {
12218 .raw = SPTM_RETYPE_PARAMS_NULL
12219 };
12220
12221 /**
12222 * SPTM allows XNU_DEFAULT pages to request deferral of TLB flushing
12223 * when their PTE is updated, which is an important performance
12224 * optimization. However, this also allows an attacker controlled
12225 * XNU to exploit a read reference with a stale write-enabled PTE in
12226 * TLB. This is fine as long as the page is not retyped and the damage
12227 * will be contained within XNU domain. However, when such a page needs
12228 * to be retyped, SPTM has to make sure there's no outstanding
12229 * reference, or there's no history of deferring TLBIs. Internally,
12230 * SPTM maintains a flag tracking past deferred TLBIs that only gets
12231 * cleared on retyping with no outstanding reference. Therefore, we
12232 * do a dummy retype to XNU_DEFAULT itself to clear the internal flag,
12233 * before we actually transfer this page to TXM domain. To make sure
12234 * SPTM won't throw a violation, all the mappings to the page have to
12235 * be removed before calling this.
12236 */
12237 sptm_retype(pa, XNU_DEFAULT, XNU_DEFAULT, retype_params);
12238 }
12239
/**
 * Transfer an XNU owned page to TXM domain.
 *
 * The sequence is: remove existing mappings under the PVH lock, self-retype
 * the page to clear SPTM's deferred-TLBI tracking, re-enter a read-only
 * kernel mapping at `addr`, and finally retype the page to TXM_DEFAULT.
 *
 * @param addr Kernel virtual address of the page. It has to be page size
 *             aligned.
 */
void
pmap_txm_transfer_page(const vm_address_t addr)
{
	assert((addr & PAGE_MASK) == 0);

	const pmap_paddr_t pa = kvtophys_nofail(addr);
	const unsigned int pai = pa_index(pa);

	/* Lock the PVH lock to prevent concurrent updates to the mappings during the self retype below. */
	locked_pvh_t locked_pvh = pvh_lock(pai);

	/* Disconnect the mapping to assure SPTM of no pending TLBI. */
	pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
	    PMAP_OPTIONS_PPO_PENDING_RETYPE, &locked_pvh, NULL);

	/* Self retype to clear the SPTM internal flags tracking delayed TLBIs for revoked writes. */
	_pmap_txm_retype_prepare(pa);

	pvh_unlock(&locked_pvh);

	/* XNU needs to hold an RO reference to the page despite the ownership being transferred to TXM. */
	pmap_enter_addr(kernel_pmap, addr, pa, VM_PROT_READ, VM_PROT_NONE, 0, true, PMAP_MAPPING_TYPE_INFER);

	/* Finally, retype the page to TXM_DEFAULT. */
	_pmap_txm_transfer_page(pa);
}
12272
/* Backing storage for the VM object into which pages allocated for TXM are inserted. */
struct vm_object txm_vm_object_storage VM_PAGE_PACKED_ALIGNED;
SECURITY_READ_ONLY_LATE(vm_object_t) txm_vm_object = &txm_vm_object_storage;

/* pmap_txm_allocate_page() returns a pmap_paddr_t through a vm_map_address_t; the two must be the same size. */
_Static_assert(sizeof(vm_map_address_t) == sizeof(pmap_paddr_t),
    "sizeof(vm_map_address_t) != sizeof(pmap_paddr_t)");
12278
12279 vm_map_address_t
12280 pmap_txm_allocate_page(void)
12281 {
12282 pmap_paddr_t phys_addr = 0;
12283 vm_page_t page = VM_PAGE_NULL;
12284 boolean_t thread_vm_privileged = false;
12285
12286 /* We are allowed to allocate privileged memory */
12287 thread_vm_privileged = set_vm_privilege(true);
12288
12289 /* Allocate a page from the VM free list */
12290 vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
12291 while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
12292 VM_PAGE_WAIT();
12293 }
12294
12295 /* Wire all of the pages allocated for TXM */
12296 vm_page_lock_queues();
12297 vm_page_wire(page, VM_KERN_MEMORY_SECURITY, TRUE);
12298 vm_page_unlock_queues();
12299
12300 phys_addr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page));
12301 if (phys_addr == 0) {
12302 panic("invalid VM page allocated for TXM: %llu", phys_addr);
12303 }
12304
12305 /* Add the physical page to the TXM VM object */
12306 vm_object_lock(txm_vm_object);
12307 vm_page_insert_wired(
12308 page,
12309 txm_vm_object,
12310 phys_addr - gPhysBase,
12311 VM_KERN_MEMORY_SECURITY);
12312 vm_object_unlock(txm_vm_object);
12313
12314 /* Reset thread privilege */
12315 set_vm_privilege(thread_vm_privileged);
12316
12317 /* Retype the page */
12318 _pmap_txm_transfer_page(phys_addr);
12319
12320 return phys_addr;
12321 }
12322
12323 int
12324 pmap_cs_configuration(void)
12325 {
12326 code_signing_config_t config = 0;
12327
12328 /* Compute the code signing configuration */
12329 code_signing_configuration(NULL, &config);
12330
12331 return (int)config;
12332 }
12333
/* Always reports false in this pmap implementation; the pmap argument is ignored. */
bool
pmap_performs_stage2_translations(
	__unused pmap_t pmap)
{
	return false;
}
12340
/* Compile-time capability query: true exactly when HAS_GUARDED_IO_FILTER is set. */
bool
pmap_has_iofilter_protected_write(void)
{
#if HAS_GUARDED_IO_FILTER
	return true;
#else
	return false;
#endif
}
12350
12351 #if HAS_GUARDED_IO_FILTER
12352
/**
 * Write `value` to the kernel virtual address `addr`.
 *
 * If the backing frame is typed XNU_PROTECTED_IO, the write is issued through
 * the SPTM I/O filter interface; otherwise it is performed directly with a
 * volatile store of the requested width.
 *
 * @param addr Kernel virtual address to write.  Panics if it has no valid
 *             kernel mapping.
 * @param value The value to write.
 * @param width Width of the write in bytes.  For the direct-store path only
 *              1, 2, 4 and 8 are accepted; anything else panics.
 */
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	/**
	 * Even though this is done from EL1/2 for an address potentially owned by Guarded
	 * Mode, we should be fine as mmu_kvtop uses "at s1e1r" checking for read access
	 * only.
	 */
	const pmap_paddr_t pa = mmu_kvtop(addr);

	if (!pa) {
		panic("%s: addr 0x%016llx doesn't have a valid kernel mapping", __func__, (uint64_t) addr);
	}

	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
	if (frame_type == XNU_PROTECTED_IO) {
		/* NOTE(review): is_hibernating is never set to true here, so the branch below is unreachable as written. */
		bool is_hibernating = false;
		if (__improbable(is_hibernating)) {
			/**
			 * Default set to NO_PANICKING_DOMAIN and not to INVALID_DOMAIN since
			 * INVALID_DOMAIN is set for panic in dispatch logic itself.
			 */
			sptm_domain_t panic_source = NO_PANICKING_DOMAIN;
			(void)sptm_panic_source(&panic_source);

			/**
			 * If panic_source is invalid (NO_PANICKING_DOMAIN: sptm_panic_source() failed
			 * or no panic occurred) OR if the panic_source is XNU_DOMAIN, then use the
			 * hibernation-specific write.
			 */
			if (panic_source == NO_PANICKING_DOMAIN || panic_source == XNU_DOMAIN) {
				sptm_hib_iofilter_protected_write(pa, value, width);
			} else {
				/* Panic source is valid (panic occurred) and not XNU_DOMAIN */
				sptm_iofilter_protected_write(pa, value, width);
			}
		} else {
			sptm_iofilter_protected_write(pa, value, width);
		}
	} else {
		/* Mappings is valid but not specified by I/O filter. However, we still try
		 * accessing the address from kernel mode. This allows addresses that are not
		 * owned by SPTM to be accessed by this interface.
		 */
		switch (width) {
		case 1:
			*(volatile uint8_t *)addr = (uint8_t) value;
			break;
		case 2:
			*(volatile uint16_t *)addr = (uint16_t) value;
			break;
		case 4:
			*(volatile uint32_t *)addr = (uint32_t) value;
			break;
		case 8:
			*(volatile uint64_t *)addr = (uint64_t) value;
			break;
		default:
			panic("%s: width %llu not supported", __func__, width);
		}
	}
}
12415
12416 #else /* HAS_GUARDED_IO_FILTER */
12417
/* Variant built when HAS_GUARDED_IO_FILTER is not set: always panics and never returns. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12424
12425 #endif /* HAS_GUARDED_IO_FILTER */
12426
/* Not supported in this pmap implementation: always panics and never returns. */
void * __attribute__((noreturn))
pmap_claim_reserved_ppl_page(void)
{
	panic("%s: function not supported in this environment", __FUNCTION__);
}
12432
/* Not supported in this pmap implementation: always panics and never returns; `kva` is ignored. */
void __attribute__((noreturn))
pmap_free_reserved_ppl_page(void __unused *kva)
{
	panic("%s: function not supported in this environment", __FUNCTION__);
}
12438
12439 bool
12440 pmap_lookup_in_loaded_trust_caches(__unused const uint8_t cdhash[CS_CDHASH_LEN])
12441 {
12442 kern_return_t kr = query_trust_cache(
12443 kTCQueryTypeLoadable,
12444 cdhash,
12445 NULL);
12446
12447 if (kr == KERN_SUCCESS) {
12448 return true;
12449 }
12450 return false;
12451 }
12452
12453 uint32_t
12454 pmap_lookup_in_static_trust_cache(__unused const uint8_t cdhash[CS_CDHASH_LEN])
12455 {
12456 TrustCacheQueryToken_t query_token = {0};
12457 kern_return_t kr = KERN_NOT_FOUND;
12458 uint64_t flags = 0;
12459 uint8_t hash_type = 0;
12460
12461 kr = query_trust_cache(
12462 kTCQueryTypeStatic,
12463 cdhash,
12464 &query_token);
12465
12466 if (kr == KERN_SUCCESS) {
12467 amfi->TrustCache.queryGetFlags(&query_token, &flags);
12468 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
12469
12470 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
12471 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
12472 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
12473 }
12474
12475 return 0;
12476 }
12477
12478 #if DEVELOPMENT || DEBUG
12479
/* Header written in front of each page table copied out by pmap_dump_page_tables_recurse(). */
struct page_table_dump_header {
	uint64_t pa;            /* Physical address of the dumped table. */
	uint64_t num_entries;   /* Number of entries in the dumped table. */
	uint64_t start_va;      /* First VA covered by the table. */
	uint64_t end_va;        /* start_va + num_entries * (per-entry size at this level). */
};
12486
/**
 * Copy a page table, and recursively every table it points to, into a buffer.
 *
 * Each table selected by `level_mask` is emitted as a
 * struct page_table_dump_header followed by the raw table contents.
 *
 * @param pmap The pmap that owns the tables.
 * @param ttp Kernel pointer to the table to dump.
 * @param cur_level Level of the table pointed to by `ttp`.
 * @param level_mask Bitmask of levels to copy out (bit N selects level N).
 *                   Unselected levels are still traversed.
 * @param start_va First VA covered by `ttp`.
 * @param buf_start Start of the output buffer.
 * @param buf_end End of the output buffer.
 * @param bytes_copied In/out: bytes already written to the buffer; advanced
 *                     as tables are copied.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the remaining
 *         buffer cannot hold this table plus its header.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Next free byte in the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table is sized by pmap_root_alloc_size() rather than by the page size. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* The space check is made whether or not this level is selected for output. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = kvtophys_nofail((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Descend into every valid, non-block entry. */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			continue;
		} else {
			/* A table-type entry at (or below) the leaf level cannot be followed. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
12561
12562 kern_return_t
12563 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12564 {
12565 if (not_in_kdp) {
12566 panic("pmap_dump_page_tables must only be called from kernel debugger context");
12567 }
12568 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12569 level_mask, pmap->min, bufp, buf_end, bytes_copied);
12570 }
12571
12572 #else /* DEVELOPMENT || DEBUG */
12573
/* Variant for builds without DEVELOPMENT or DEBUG: page table dumping is not supported. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
12580 #endif /* !(DEVELOPMENT || DEBUG) */
12581
12582
12583 #ifdef CONFIG_XNUPOST
12584 static volatile bool pmap_test_took_fault = false;
12585
12586 static bool
12587 pmap_test_fault_handler(arm_saved_state_t * state)
12588 {
12589 bool retval = false;
12590 uint64_t esr = get_saved_state_esr(state);
12591 esr_exception_class_t class = ESR_EC(esr);
12592 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
12593
12594 if ((class == ESR_EC_DABORT_EL1) &&
12595 ((fsc == FSC_PERMISSION_FAULT_L3)
12596 || (fsc == FSC_ACCESS_FLAG_FAULT_L3)
12597 || (fsc == FSC_TRANSLATION_FAULT_L0))) {
12598 pmap_test_took_fault = true;
12599 /* return to the instruction immediately after the call to NX page */
12600 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12601 retval = true;
12602 }
12603
12604 return retval;
12605 }
12606
/**
 * Perform a single 64-bit read or write at `va`, optionally after switching
 * to `pmap`, and report whether the fault outcome matched expectations.
 *
 * @param pmap The pmap to switch to for the access, or NULL to access `va`
 *             without switching.
 * @param va The address to access.
 * @param should_fault Whether the access is expected to fault.
 * @param is_write true to store 0xdec0de, false to load.
 *
 * @return true if (fault taken) == should_fault, false otherwise.
 */
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;
	thread_t thread = current_thread();

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	const boolean_t old_int_state = ml_set_interrupts_enabled(FALSE);
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap, thread);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Route any fault taken on `va` to pmap_test_fault_handler() until ml_expect_fault_end(). */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap, thread);
	}

	mp_enable_preemption();
	ml_set_interrupts_enabled(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
12660
12661 static bool
12662 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12663 {
12664 bool retval = pmap_test_access(pmap, va, should_fault, false);
12665
12666 if (!retval) {
12667 T_FAIL("%s: %s, "
12668 "pmap=%p, va=%p, should_fault=%u",
12669 __func__, should_fault ? "did not fault" : "faulted",
12670 pmap, (void*)va, (unsigned)should_fault);
12671 }
12672
12673 return retval;
12674 }
12675
12676 static bool
12677 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12678 {
12679 bool retval = pmap_test_access(pmap, va, should_fault, true);
12680
12681 if (!retval) {
12682 T_FAIL("%s: %s, "
12683 "pmap=%p, va=%p, should_fault=%u",
12684 __func__, should_fault ? "did not fault" : "faulted",
12685 pmap, (void*)va, (unsigned)should_fault);
12686 }
12687
12688 return retval;
12689 }
12690
12691 static bool
12692 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12693 {
12694 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12695 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12696
12697 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12698
12699 if (!retval) {
12700 T_FAIL("%s: bits=%u, "
12701 "pa=%p, should_be_set=%u",
12702 __func__, bits,
12703 (void*)pa, should_be_set);
12704 }
12705
12706 return retval;
12707 }
12708
12709 static __attribute__((noinline)) bool
12710 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12711 {
12712 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12713 return retval;
12714 }
12715
12716 static int
12717 pmap_test_test_config(unsigned int flags)
12718 {
12719 T_LOG("running pmap_test_test_config flags=0x%X", flags);
12720 unsigned int map_count = 0;
12721 unsigned long page_ratio = 0;
12722 pmap_t pmap = pmap_create_options(NULL, 0, flags);
12723
12724 if (!pmap) {
12725 panic("Failed to allocate pmap");
12726 }
12727
12728 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12729 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
12730 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
12731 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
12732
12733 if (pmap_page_size <= native_page_size) {
12734 page_ratio = native_page_size / pmap_page_size;
12735 } else {
12736 /*
12737 * We claim to support a page_ratio of less than 1, which is
12738 * not currently supported by the pmap layer; panic.
12739 */
12740 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
12741 "flags=%u",
12742 __func__, native_page_size, pmap_page_size,
12743 flags);
12744 }
12745
12746 if (PAGE_RATIO > 1) {
12747 /*
12748 * The kernel is deliberately pretending to have 16KB pages.
12749 * The pmap layer has code that supports this, so pretend the
12750 * page size is larger than it is.
12751 */
12752 pmap_page_size = PAGE_SIZE;
12753 native_page_size = PAGE_SIZE;
12754 }
12755
12756 /*
12757 * Get two pages from the VM; one to be mapped wired, and one to be
12758 * mapped nonwired.
12759 */
12760 vm_page_t unwired_vm_page = vm_page_grab();
12761 vm_page_t wired_vm_page = vm_page_grab();
12762
12763 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
12764 panic("Failed to grab VM pages");
12765 }
12766
12767 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
12768 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
12769
12770 pmap_paddr_t pa = ptoa(pn);
12771 pmap_paddr_t wired_pa = ptoa(wired_pn);
12772
12773 /*
12774 * We'll start mappings at the second twig TT. This keeps us from only
12775 * using the first entry in each TT, which would trivially be address
12776 * 0; one of the things we will need to test is retrieving the VA for
12777 * a given PTE.
12778 */
12779 vm_map_address_t va_base = pmap_twig_size;
12780 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
12781
12782 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
12783 /*
12784 * Not exactly a functional failure, but this test relies on
12785 * there being a spare PTE slot we can use to pin the TT.
12786 */
12787 panic("Cannot pin translation table");
12788 }
12789
12790 /*
12791 * Create the wired mapping; this will prevent the pmap layer from
12792 * reclaiming our test TTs, which would interfere with this test
12793 * ("interfere" -> "make it panic").
12794 */
12795 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true, PMAP_MAPPING_TYPE_INFER);
12796
12797 T_LOG("Validate that kernel cannot write to SPTM memory.");
12798 pt_entry_t * ptep = pmap_pte(pmap, va_base);
12799 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
12800
12801 /*
12802 * Create read-only mappings of the nonwired page; if the pmap does
12803 * not use the same page size as the kernel, create multiple mappings
12804 * so that the kernel page is fully mapped.
12805 */
12806 for (map_count = 0; map_count < page_ratio; map_count++) {
12807 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)),
12808 VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12809 }
12810
12811 /* Validate that all the PTEs have the expected PA and VA. */
12812 for (map_count = 0; map_count < page_ratio; map_count++) {
12813 ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
12814
12815 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
12816 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
12817 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
12818 }
12819
12820 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
12821 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
12822 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
12823 }
12824 }
12825
12826 T_LOG("Validate that reads to our mapping do not fault.");
12827 pmap_test_read(pmap, va_base, false);
12828
12829 T_LOG("Validate that writes to our mapping fault.");
12830 pmap_test_write(pmap, va_base, true);
12831
12832 T_LOG("Make the first mapping writable.");
12833 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12834
12835 T_LOG("Validate that writes to our mapping do not fault.");
12836 pmap_test_write(pmap, va_base, false);
12837
12838 /*
12839 * For page ratios of greater than 1: validate that writes to the other
12840 * mappings still fault. Remove the mappings afterwards (we're done
12841 * with page ratio testing).
12842 */
12843 for (map_count = 1; map_count < page_ratio; map_count++) {
12844 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
12845 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
12846 }
12847
12848 /* Remove remaining mapping */
12849 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12850
12851 T_LOG("Test XO mapping");
12852 kern_return_t kr = pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12853 if (pmap_allows_xo(pmap)) {
12854 if (kr != KERN_SUCCESS) {
12855 T_FAIL("XO mapping returned 0x%x instead of KERN_SUCCESS", (unsigned int)kr);
12856 }
12857 } else if (kr != KERN_PROTECTION_FAILURE) {
12858 T_FAIL("XO mapping returned 0x%x instead of KERN_PROTECTION_FAILURE", (unsigned int)kr);
12859 }
12860
12861 T_LOG("Make the first mapping RX");
12862 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE | VM_PROT_READ, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12863
12864 T_LOG("Validate that reads to our mapping do not fault.");
12865 pmap_test_read(pmap, va_base, false);
12866
12867 T_LOG("Validate that writes to our mapping fault.");
12868 pmap_test_write(pmap, va_base, true);
12869
12870 pmap_remove(pmap, va_base, va_base + pmap_page_size);
12871
12872 T_LOG("Mark the page unreferenced and unmodified.");
12873 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12874 pmap_test_check_refmod(pa, 0);
12875 pmap_recycle_page(atop(pa));
12876
12877 /*
12878 * Begin testing the ref/mod state machine. Re-enter the mapping with
12879 * different protection/fault_type settings, and confirm that the
12880 * ref/mod state matches our expectations at each step.
12881 */
12882 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
12883 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12884 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12885
12886 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
12887 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12888 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12889 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12890
12891 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
12892 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12893 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12894 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12895
12896 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
12897 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12898 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12899
12900 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
12901 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12902 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12903 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12904
12905 /*
12906 * Shared memory testing; we'll have two mappings; one read-only,
12907 * one read-write.
12908 */
12909 vm_map_address_t rw_base = va_base;
12910 vm_map_address_t ro_base = va_base + pmap_page_size;
12911
12912 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12913 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12914
12915 /*
12916 * Test that we take faults as expected for unreferenced/unmodified
12917 * pages. Also test the arm_fast_fault interface, to ensure that
12918 * mapping permissions change as expected.
12919 */
12920 T_LOG("!ref/!mod: expect no access");
12921 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12922 pmap_test_read_write(pmap, ro_base, false, false);
12923 pmap_test_read_write(pmap, rw_base, false, false);
12924
12925 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
12926 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
12927 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12928 pmap_test_read_write(pmap, ro_base, true, false);
12929 pmap_test_read_write(pmap, rw_base, true, false);
12930
12931 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
12932 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12933 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12934 pmap_test_read_write(pmap, ro_base, true, false);
12935 pmap_test_read_write(pmap, rw_base, true, true);
12936
12937 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
12938 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12939 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12940 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12941 pmap_test_read_write(pmap, ro_base, true, false);
12942 pmap_test_read_write(pmap, rw_base, true, true);
12943
12944 T_LOG("RW protect both mappings; should not change protections.");
12945 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12946 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12947 pmap_test_read_write(pmap, ro_base, true, false);
12948 pmap_test_read_write(pmap, rw_base, true, true);
12949
12950 T_LOG("Read protect both mappings; RW mapping should become RO.");
12951 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
12952 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
12953 pmap_test_read_write(pmap, ro_base, true, false);
12954 pmap_test_read_write(pmap, rw_base, true, false);
12955
12956 T_LOG("RW protect the page; mappings should not change protections.");
12957 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12958 pmap_page_protect(pn, VM_PROT_ALL);
12959 pmap_test_read_write(pmap, ro_base, true, false);
12960 pmap_test_read_write(pmap, rw_base, true, true);
12961
12962 T_LOG("Read protect the page; RW mapping should become RO.");
12963 pmap_page_protect(pn, VM_PROT_READ);
12964 pmap_test_read_write(pmap, ro_base, true, false);
12965 pmap_test_read_write(pmap, rw_base, true, false);
12966
12967 T_LOG("Validate that disconnect removes all known mappings of the page.");
12968 pmap_disconnect(pn);
12969 if (!pmap_verify_free(pn)) {
12970 T_FAIL("Page still has mappings");
12971 }
12972
12973 #if defined(ARM_LARGE_MEMORY)
12974 #define PMAP_TEST_LARGE_MEMORY_VA 64 * (1ULL << 40) /* 64 TB */
12975 #if !defined(ARM_LARGE_MEMORY_KERNONLY)
12976
12977 T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY.");
12978 pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12979 pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true);
12980 pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size);
12981 #else /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12982 /* Using kernel-only large memory. Make sure user pmap will fail. */
12983 T_LOG("Expect wired mapping to fault in ARM_LARGE_MEMORY when using KERNONLY.");
12984
12985 /* The mapping should be rejected, it's outside of T0SZ */
12986 kr = pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa,
12987 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12988 T_QUIET; T_ASSERT_NE_INT(kr, KERN_SUCCESS, NULL);
12989
12990 /* Addressing outside of T0SZ should result in a L0 xlate fault */
12991 const bool did_fault = pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, false, false);
12992 T_QUIET; T_ASSERT(did_fault, NULL);
12993 #endif /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12994 #endif /* ARM_LARGE_MEMORY */
12995
12996 T_LOG("Remove the wired mapping, so we can tear down the test map.");
12997 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
12998 pmap_destroy(pmap);
12999
13000 T_LOG("Release the pages back to the VM.");
13001 vm_page_lock_queues();
13002 vm_page_free(unwired_vm_page);
13003 vm_page_free(wired_vm_page);
13004 vm_page_unlock_queues();
13005
13006 T_LOG("Testing successful!");
13007 return 0;
13008 }
13009
13010 kern_return_t
13011 pmap_test(void)
13012 {
13013 T_LOG("Starting pmap_tests");
13014 const int flags = PMAP_CREATE_TEST | PMAP_CREATE_64BIT;
13015
13016 #if __ARM_MIXED_PAGE_SIZE__
13017 T_LOG("Testing VM_PAGE_SIZE_4KB");
13018 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
13019 T_LOG("Testing VM_PAGE_SIZE_16KB");
13020 pmap_test_test_config(flags);
13021 #else /* __ARM_MIXED_PAGE_SIZE__ */
13022 pmap_test_test_config(flags);
13023 #endif /* __ARM_MIXED_PAGE_SIZE__ */
13024
13025 T_PASS("completed pmap_test successfully");
13026 return KERN_SUCCESS;
13027 }
13028 #endif /* CONFIG_XNUPOST */
13029
13030 /*
13031 * The following function should never make it to RELEASE code, since
13032 * it provides a way to get the PPL to modify text pages.
13033 */
13034 #if DEVELOPMENT || DEBUG
13035
13036 /**
13037 * Forcibly overwrite executable text with an illegal instruction.
13038 *
13039 * @note Only used for xnu unit testing.
13040 *
13041 * @param pa The physical address to corrupt.
13042 *
13043 * @return KERN_SUCCESS on success.
13044 */
13045 kern_return_t
13046 pmap_test_text_corruption(pmap_paddr_t pa __unused)
13047 {
13048 /*
13049 * SPTM TODO: implement an SPTM version of this.
13050 * The physical apertue is owned by the SPTM and text
13051 * pages have RO physical aperture mappings.
13052 */
13053 return KERN_SUCCESS;
13054 }
13055
13056 #endif /* DEVELOPMENT || DEBUG */
13057
13058