1 /*
2 * Copyright (c) 2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /**
29 * This header file is used to store the types, prototypes, and inline functions
30 * that define some of the most important data structures used in the pmap. This
31 * header is only meant for sharing types within the pmap; if a type is meant to
32 * be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
33 */
34 #pragma once
35
36 #include <stdint.h>
37
38 #include <kern/ledger.h>
39 #include <mach/vm_types.h>
40 #include <mach_assert.h>
41 #include <vm/vm_page.h>
42
43 #include <arm/cpu_data.h>
44 #include <arm/machine_routines.h>
45 #include <arm64/proc_reg.h>
46
47 #if HIBERNATION
48 #include <arm64/hibernate_secure_hmac.h>
49 #endif /* HIBERNATION */
50
51 /* Temporary include before moving all ledger functions into pmap_data.c */
52 #include <os/refcnt.h>
53
54 /**
55 * These headers are safe to be included in this file since they shouldn't rely
56 * on any of the internal pmap header files (so no circular dependencies).
57 */
58 #include <arm64/sptm/pmap/pmap.h>
59 #include <arm64/sptm/pmap/pmap_pt_geometry.h>
60
61 #include <arm64/sptm/sptm.h>
62
63 /**
64 * These values represent the first and last kernel-managed physical addresses.
65 * We keep track of extra metadata on kernel-managed pages compared to other
66 * pages (usually iBoot carved out memory or I/O).
67 */
68 extern pmap_paddr_t vm_first_phys, vm_last_phys;
69
/*
 * NOTE(review): these appear to be no-op stubs that compile out hibernation
 * state tracking on this configuration (they expand to constants/nothing),
 * letting callers avoid #if HIBERNATION guards — confirm against the
 * HIBERNATION build variant.
 */
#define PMAP_HIB_STATE_REACHED(states) false
#define PMAP_ASSERT_NOT_WRITING_HIB()
#define PMAP_IS_HIBERNATING() false
73
74 /**
75 * Return whether the given address represents a kernel-managed physical page.
76 *
77 * Whether a page is considered "kernel-managed" is determined by the BootArgs
78 * passed by the bootloader. Typically memory carved out by the bootloader as
79 * well as I/O memory should return false.
80 *
81 * @param pa The physical address to check.
82 */
83 static inline bool
pa_valid(pmap_paddr_t pa)84 pa_valid(pmap_paddr_t pa)
85 {
86 return (pa >= vm_first_phys) && (pa < vm_last_phys);
87 }
88
/* Sentinel value indicating an invalid physical address index. */
90 #define INVALID_PAI UINT_MAX
91
92 /**
93 * The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
94 * contain an entry for every kernel-managed page in the system. These systems
95 * are indexed with physical address indices ("pai") generated by this function.
96 *
97 * The logic is simple since there should be one entry in each of these data
98 * structures for each kernel-managed physical page in the system. These data
99 * structures are allocated on boot based on the amount of memory available.
100 *
101 * @note PAIs are defined using the VM page size, which might not be identical
102 * to the underlying hardware page size for an arbitrary address space.
103 * This means that the data structures relying on PAIs will contain one
104 * entry for each VM page, not hardware page.
105 *
106 * @note This function is only valid for physical addresses that are
107 * kernel-managed.
108 */
109 static inline unsigned int
pa_index(pmap_paddr_t pa)110 pa_index(pmap_paddr_t pa)
111 {
112 return (unsigned int)atop(pa - vm_first_phys);
113 }
114
115 /**
116 * Convert from a physical address index (pai) back to a raw physical address.
117 *
118 * @param pai The physical address index to convert to a PA.
119 *
120 * @return The page-aligned physical address corresponding to [pai].
121 */
122 static inline pmap_paddr_t
pai_to_pa(unsigned int pai)123 pai_to_pa(unsigned int pai)
124 {
125 return ptoa((pmap_paddr_t)pai) + vm_first_phys;
126 }
127
128 /* See the definition of pv_head_table for more information. */
129 extern uintptr_t *pv_head_table;
130
131 /* Represents a NULL entry in the pv_head_table. */
132 #define PV_ENTRY_NULL ((pv_entry_t *) 0)
133
134 /**
135 * Given a physical address index, return the corresponding pv_head_table entry.
136 *
137 * @note The returned entry might be invalid, or a pointer to a pt_entry_t,
138 * pv_entry_t, or pt_desc_t depending on the type for this entry.
139 * Determine the type using pvh_test_type().
140 *
141 * @param pai The index returned by pa_index() for the page whose pv_head_table
142 * entry should be retrieved.
143 */
144 static inline uintptr_t
pai_to_pvh(unsigned int pai)145 pai_to_pvh(unsigned int pai)
146 {
147 return pv_head_table[pai];
148 }
149
150 /**
151 * Each pv_head_table entry can be one of four different types:
152 *
153 * - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
154 * physical aperture. Physical aperture mappings are not
155 * tracked in the pv_head_table.
156 *
157 * - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
158 * These entries are linked lists of pv_entry_t objects (which
159 * each contain a pointer to the associated PTE and a pointer
160 * to the next entry in the list).
161 *
162 * - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
163 * mappings are created, this entry will get upgraded to an
164 * entry of type PVH_TYPE_PVEP. These entries are pointers
165 * directly to the page table entry that contain the mapping
166 * (pt_entry_t*).
167 *
168 * - PVH_TYPE_PTDP: The physical page is being used as a page table. These
169 * entries are pointers to page table descriptor structures
170 * (pt_desc_t) which contain metadata related to each page
171 * table.
172 *
173 * The type is stored in the bottom two bits of each pv_head_table entry. That
174 * type needs to be checked before dereferencing the pointer to determine which
175 * pointer type to dereference as.
176 */
177 #define PVH_TYPE_NULL 0x0UL
178 #define PVH_TYPE_PVEP 0x1UL
179 #define PVH_TYPE_PTEP 0x2UL
180 #define PVH_TYPE_PTDP 0x3UL
181
182 #define PVH_TYPE_MASK (0x3UL)
183
184
185 /**
186 * PV_HEAD_TABLE Flags.
187 *
188 * All flags listed below are stored in the pv_head_table entry/pointer
189 * (per-physical-page) unless otherwise noted.
190 *
191 * Please update the pv_walk LLDB macro if these flags are changed or added to.
192 */
193
194 /**
195 * This flag is set for every mapping created by an IOMMU.
196 *
197 * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
198 * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
199 */
200 #define PVH_FLAG_IOMMU 0x4UL
201
202 /**
203 * This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
204 * this bit is set, then the PTE pointer points directly into the IOMMU page
205 * table for this mapping. If this bit is cleared, then the "PTE pointer" is
206 * actually a pointer to the IOMMU descriptor object that owns this mapping.
207 *
208 * There are cases where it's not easy to tie an IOMMU mapping directly to a
209 * specific page table, so this allows us to at least get a pointer to which
210 * IOMMU created this mapping which is useful for debugging purposes.
211 *
212 * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
213 * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
214 */
215 #define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
216
217 /**
218 * This flag is set when the first CPU (non-IOMMU) mapping is created. This is
219 * important to keep track of because various accounting statistics are based on
220 * the options specified for the first CPU mapping. This flag, and thus the
221 * accounting statistics, will persist as long as there *any* mappings of the
222 * page (including IOMMU mappings). This works because the accounting for a page
223 * should not need to change until the page is recycled by the VM layer, and we
224 * double-check that there are no mappings (CPU or IOMMU) when a page is
225 * recycled (see: pmap_verify_free()).
226 */
227 #define PVH_FLAG_CPU (1ULL << 62)
228
229 /* This bit is used as a lock when modifying a pv_head_table entry. */
230 #define PVH_LOCK_BIT 61
231 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
232
233 /**
234 * This flag is set when there are any executable mappings to this physical
235 * page. This is used to prevent any writable mappings from being created at
236 * the same time an executable mapping exists.
237 */
238 #define PVH_FLAG_EXEC (1ULL << 60)
239
240 /**
241 * This flag is used to mark that a page has been hashed into the hibernation
242 * image.
243 *
244 * The hibernation driver will use this to ensure that all PPL-owned memory is
245 * correctly included into the hibernation image (a missing PPL page could be
246 * a security concern when coming out of hibernation).
247 */
248 #define PVH_FLAG_HASHED (1ULL << 58)
249
/*
 * Retired-page tracking is defined to zero here, so the flag compiles out of
 * PVH_MUTABLE_FLAGS on this configuration.
 */
#define PVH_FLAG_RETIRED 0


/*
 * Page-tagging support is defined to zero here, so the flag compiles out of
 * PVH_MUTABLE_FLAGS on this configuration.
 */
#define PVH_FLAG_TAGGED 0
254
255
256 /**
257 * This flag is used to mark that a PV head entry has been placed into
258 * "sleep mode", which typically happens when the lock owner needs to
259 * process a long PV list. If this bit is set, threads which contend
260 * on the PVH lock must call thread_block() to wait until they are awakened
261 * by the current lock owner releasing the lock.
262 */
263 #define PVH_FLAG_SLEEP (1ULL << 54)
264
265 /**
266 * These bits need to be set to safely dereference a pv_head_table
267 * entry/pointer.
268 *
269 * Any change to this #define should also update the copy located in the pmap.py
270 * LLDB macros file.
271 */
272 #define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED | PVH_FLAG_RETIRED | PVH_FLAG_TAGGED)
273
274 #define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
275
276 #define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
277
278 /* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
279 #define PVH_LIST_MASK (~PVH_TYPE_MASK)
280
281 /* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
282 #define PVH_LOCK_WORD 1 /* Assumes little-endian */
283
284 /**
285 * Assert that a pv_head_table entry is locked. Will panic if the lock isn't
286 * acquired.
287 *
288 * @param index The physical address index to check.
289 */
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
	/*
	 * Either PVH_FLAG_LOCK (bit lock held) or PVH_FLAG_SLEEP (sleep mode)
	 * counts as "locked" here, hence the combined PVH_LOCK_FLAGS mask.
	 */
	assertf(os_atomic_load(&pv_head_table[index], relaxed) & PVH_LOCK_FLAGS,
	    "%s: PVH %p (=%p) for pai 0x%x not locked or in sleep mode", __func__,
	    &pv_head_table[index], (void*)(os_atomic_load(&pv_head_table[index], relaxed)), index);
}
297
298 /**
299 * Helper function for returning the 32-bit PVH lock word corresponding
300 * to a physical address index.
301 *
302 * @param index The physical address index of the pv_head_table entry
303 *
304 * @return A pointer to the 32-bit word containing the lock bit
305 */
306 static inline uint32_t*
pvh_lock_word(unsigned int index)307 pvh_lock_word(unsigned int index)
308 {
309 return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
310 }
311
312 /**
313 * Helper macro for computing the lock bit offset within the 32-bit
314 * lock word for each PV head entry.
315 *
316 * @return A 32-bit integer containing the lock bit offset.
317 */
318 #define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
319
320 /**
321 * Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
322 *
323 * @param index The physical address index of the pv_head_table entry to lock.
324 *
325 * @return A wrapper object with the contents of the locked pv_head_table entry.
326 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
	extern unsigned int not_in_kdp;
	/*
	 * This path may block (spinning on the bit lock, or in thread_block()
	 * when the entry is in sleep mode), so the caller must be preemptible
	 * unless we're in early boot, hibernating, or in the kernel debugger.
	 */
	const bool was_preemptible = preemption_enabled();
	assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) ||
	    PMAP_IS_HIBERNATING() || !not_in_kdp);

	/* Lets the bit-lock spin loop bail out when a preemption is pending. */
	bool (^check_preemption)(void) = ^bool (void) {
		return was_preemptible && pmap_pending_preemption();
	};

	hw_lock_status_t ret;
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	do {
		ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
		    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);

		if (ret == HW_LOCK_ACQUIRED) {
			locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
			if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
				/*
				 * Entry is in sleep mode: drop the bit lock, sleep until
				 * the owner wakes us from pvh_unlock(), then retry the
				 * acquisition from scratch.
				 */
				wait_result_t wres;
				wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
				hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
				assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
				thread_block(THREAD_CONTINUE_NULL);
				ret = HW_LOCK_CONTENDED;
			}
		}
	} while (ret != HW_LOCK_ACQUIRED);

	return locked_pvh;
}
360
361 /**
362 * Lock a pvh_head_table entry, possibly in a preemption-disabled context.
363 *
364 * @note This function is only meant for special use cases in which pmap
365 * functions must be invoked with preemption disabled. These cases
366 * are expected to be rare and limited. If you think you need to
367 * use this in more places, you're probably wrong.
368 *
369 * @param index The physical address index of the pv_head_table entry to lock.
370 *
371 * @return A wrapper object with the contents of the locked pv_head_table entry.
372 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
	/*
	 * If preemption is actually enabled, use the normal path, which can
	 * handle sleep-mode entries and preemption-pending bailout.
	 */
	if (__improbable(preemption_enabled())) {
		return pvh_lock(index);
	}
	/* Plain spin acquisition: we cannot block with preemption disabled. */
	hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
	const locked_pvh_t locked_pvh = {.pvh = os_atomic_load(&pv_head_table[index], relaxed), .pai = index};

	/*
	 * Sleep mode would require thread_block() to resolve, which is not
	 * possible here, so it is a fatal condition.
	 */
	if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
		panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
	}

	return locked_pvh;
}
388
389 /**
390 * Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
391 *
392 * @param index The physical address index of the pv_head_table entry to lock.
393 *
394 * @return A wrapper object with the contents of the locked pv_head_table entry if successful,
395 * 0 otherwise.
396 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)
{
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	/* Single non-blocking attempt on the lock bit. */
	bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);

	if (locked) {
		locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
		assert(locked_pvh.pvh != 0);
		if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
			/*
			 * Sleep mode would require blocking to resolve, which a
			 * try-lock must not do: release the bit lock and report
			 * failure (pvh == 0).
			 */
			hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
			locked_pvh.pvh = 0;
		}
	}

	return locked_pvh;
}
414
415 /**
416 * Helper for determining whether a preceding pvh_try_lock() call succeeded.
417 *
418 * @param locked_pvh A wrapper representing a possibly-locked PV head table entry
419 * returned by pvh_try_lock().
420 *
421 * @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
422 */
423 static inline bool
pvh_try_lock_success(const locked_pvh_t * locked_pvh)424 pvh_try_lock_success(const locked_pvh_t *locked_pvh)
425 {
426 assert(locked_pvh != NULL);
427 return locked_pvh->pvh != 0;
428 }
429
430 /**
431 * Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
432 * lock will sleep until this thread calls pvh_unlock().
433 *
434 * @note It is legal to call this function if the lock is already in sleep mode.
435 * In that case, the call will have no effect.
436 * @note This function must not be called with preemption disabled by any other agent
437 * but [locked_pvh] itself. Preemption must be fully re-enabled by the time
438 * this function returns, either because it was already enabled (because the
439 * lock was already in sleep mode), or because this function enabled it by placing
440 * the lock in sleep mode.
441 *
442 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
443 */
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	/* Already in sleep mode: nothing to do (explicitly legal, see header doc). */
	if (!(old_pvh & PVH_FLAG_SLEEP)) {
		assert(old_pvh & PVH_FLAG_LOCK);
		/* Publish the SLEEP flag before dropping the bit lock below. */
		os_atomic_store(&pv_head_table[index], old_pvh | PVH_FLAG_SLEEP, relaxed);
		/**
		 * Tell the scheduler that this thread may need a priority boost if it needs to go
		 * off-core, to reduce the likelihood of priority inversion.
		 */
		locked_pvh->pri_token = thread_priority_floor_start();
		hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
	}

	/* Hibernation runs single-core so we can skip this check. */
	assert(preemption_enabled() || PMAP_IS_HIBERNATING());
}
466
467 /**
468 * Check that a pv_head_table entry/pointer is a specific type.
469 *
470 * @param pvh The pv_head_table entry/pointer to check.
471 * @param type The type to check for.
472 *
473 * @return True if the pv_head_table entry is of the passed in type, false
474 * otherwise.
475 */
476 static inline bool
pvh_test_type(uintptr_t pvh,uintptr_t type)477 pvh_test_type(uintptr_t pvh, uintptr_t type)
478 {
479 return (pvh & PVH_TYPE_MASK) == type;
480 }
481
482 /**
483 * Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
484 *
485 * @note Only the non-lock flags, pointer, and type fields of the entry will be updated
486 * according to the passed-in value. PVH_LOCK_FLAGS will be ignored as they are
487 * directly manipulated by this function.
488 *
489 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
490 * The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
491 * in pv_head_table to reflect any updates that may have been performed on the PV list
492 * while the lock was held.
493 */
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	bool pri_floor_end = false;

	if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
		/*
		 * pvh_lock_enter_sleep_mode() dropped the bit lock when it set
		 * SLEEP, so re-acquire it before updating the entry, then wake
		 * the threads sleeping on this entry's address.
		 */
		pri_floor_end = true;
		const bool was_preemptible = preemption_enabled();
		bool (^check_preemption)(void) = ^bool (void) {
			return was_preemptible && pmap_pending_preemption();
		};

		hw_lock_status_t ret;
		do {
			ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
			    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
		} while (ret != HW_LOCK_ACQUIRED);

		/* Clear SLEEP but keep LOCK set; the bit lock is released just below. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
		thread_wakeup(&pv_head_table[index]);
	} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
		/* Write back only if the caller actually changed the entry. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
	}
	hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);

	if (__improbable(pri_floor_end)) {
		/* Drop the priority floor taken in pvh_lock_enter_sleep_mode(). */
		thread_priority_floor_end(&locked_pvh->pri_token);
	}

	/* Invalidate the wrapper so reuse without re-locking trips the asserts above. */
	locked_pvh->pvh = 0;
}
532
533 /**
534 * Convert a pv_head_table entry/pointer into a page table entry pointer. This
535 * should only be done if the type of this entry is PVH_TYPE_PTEP.
536 *
537 * @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
538 *
 * @return Return back a safe-to-dereference pointer to the single mapping of this
540 * physical page by masking off the TYPE bits and adding any missing
541 * flags to the upper portion of the pointer.
542 */
543 static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)544 pvh_ptep(uintptr_t pvh)
545 {
546 assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
547 return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
548 }
549
550 /**
551 * Convert a pv_head_table entry/pointer into a PVE list pointer. This
552 * should only be done if the type of this entry is PVH_TYPE_PVEP.
553 *
554 * @param pvh The pv_head_table entry/pointer to convert into a safe to
555 * dereference pv_entry_t*.
556 *
 * @return Return back a safe-to-dereference pointer to the first mapping of this
558 * physical page by masking off the TYPE bits and adding any missing
559 * flags to the upper portion of the pointer.
560 */
561 static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)562 pvh_pve_list(uintptr_t pvh)
563 {
564 assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
565 return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
566 }
567
568 /**
569 * Return the mutable flags associated with a pv_head_table entry/pointer.
570 *
571 * @param pvh The pv_head_table entry whose flags to get.
572 *
573 * @return The mutable flags encoded in [pvh].
574 */
575 static inline uintptr_t
pvh_get_flags(uintptr_t pvh)576 pvh_get_flags(uintptr_t pvh)
577 {
578 return pvh & PVH_MUTABLE_FLAGS;
579 }
580
581 /**
582 * Update the flags associated with a pv_head_table entry/pointer.
583 *
584 * @note This function does not actually modify the pv_head_table,
585 * it only installs an updated pv_head_table entry in [locked_pvh]
586 * that can later be passed to pvh_unlock() to update the actual array
587 * entry.
588 *
589 * @param locked_pvh A wrapper struct containing the pv_head_table
590 * entry/pointer to update.
591 *
592 */
593 static inline void
pvh_set_flags(locked_pvh_t * locked_pvh,uintptr_t flags)594 pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
595 {
596 locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
597 }
598
599 /**
600 * Update a pv_head_table entry/pointer to be a different type and/or point to
601 * a different object.
602 *
603 * @note This function does not actually modify the pv_head_table,
604 * it only installs an updated pv_head_table entry in [locked_pvh]
605 * that can later be passed to pvh_unlock() to update the actual array
606 * entry.
607 *
608 * @param locked_pvh A wrapper struct containing the pv_head_table
609 * entry/pointer to update.
610 * @param pvep The new entry to use. This could be either a pt_entry_t*,
611 * pv_entry_t*, or pt_desc_t* depending on the type.
612 * @param type The type of the new entry.
613 */
614 static inline void
pvh_update_head(locked_pvh_t * locked_pvh,void * pvep,unsigned int type)615 pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
616 {
617 assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
618 const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
619 locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
620 }
621
622 /**
623 * Given a page table entry pointer retrieved from the pv_head_table (from an
624 * entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
625 * an IOMMU mapping.
626 *
627 * @note The way this function determines whether the passed in pointer is
628 * pointing to an IOMMU PTE, is by checking for a special flag stored in
629 * the lower bits of the pointer. This flag is only set on pointers stored
630 * in the pv_head_table, and as such, this function will only work on
631 * pointers retrieved from the pv_head_table. If a pointer to a PTE was
632 * directly retrieved from an IOMMU's page tables, this function would
633 * always return false despite actually being an IOMMU PTE.
634 *
635 * @param ptep A PTE pointer obtained from the pv_head_table to check.
636 *
637 * @return True if the entry is an IOMMU mapping, false otherwise.
638 */
639 static inline bool
pvh_ptep_is_iommu(const pt_entry_t * ptep)640 pvh_ptep_is_iommu(const pt_entry_t *ptep)
641 {
642 #ifdef PVH_FLAG_IOMMU
643 return (uintptr_t)ptep & PVH_FLAG_IOMMU;
644 #else /* PVH_FLAG_IOMMU */
645 #pragma unused(ptep)
646 return false;
647 #endif /* PVH_FLAG_IOMMU */
648 }
649
650 /**
651 * Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
652 * type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
653 * strips out those flags and returns back a dereferencable pointer.
654 *
655 * @param ptep The PTE pointer to strip out the unwanted flags.
656 *
657 * @return A valid dereferencable pointer to the page table entry.
658 */
659 static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t * ptep)660 pvh_strip_ptep(const pt_entry_t *ptep)
661 {
662 #ifdef PVH_FLAG_IOMMU
663 const uintptr_t pte_va = (uintptr_t)ptep;
664 return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
665 #else /* PVH_FLAG_IOMMU */
666 return ptep;
667 #endif /* PVH_FLAG_IOMMU */
668 }
669
670 /**
671 * PVH_TYPE_PVEP Helper Functions.
672 *
673 * The following are methods used to manipulate PVE lists. This is the type of
674 * pv_head_table entry used when there are multiple mappings to a single
675 * physical page.
676 */
677
678 /**
679 * Whether a physical page is using "alternate accounting" (ALTACCT) for its
680 * ledger statistics is something that needs to be tracked on a per-mapping
681 * basis, not on a per-physical-page basis. Because of that, it's tracked
682 * differently depending on whether there's a single mapping to a page
683 * (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
684 * tracked in the pp_attr_table. But when there are multiple mappings, the least
685 * significant bit of the corresponding "pve_pte" pointer in each pv_entry object
686 * is used as a marker for pages using alternate accounting.
687 *
688 * @note See the definition for PP_ATTR_ALTACCT for a more detailed description
689 * of what "alternate accounting" actually means in respect to the
690 * footprint ledger.
691 *
 * Since some code (KernelDiskImages, e.g.) might map a physical page as
693 * "device" memory (i.e. external) while it's also being used as regular
694 * "anonymous" memory (i.e. internal) in user space, we have to manage the
695 * "internal" attribute per mapping rather than per physical page.
696 * When there are multiple mappings, we use the next least significant bit of
697 * the corresponding "pve_pte" pointer for that.
698 */
699 #define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
700 #define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
701 #define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
702
703 /**
704 * Set the ALTACCT bit for a specific PTE pointer.
705 *
706 * @param pvep A pointer to the current pv_entry mapping in the linked list of
707 * mappings.
708 * @param idx Index of the chosen PTE pointer inside the PVE.
709 */
710 static inline void
pve_set_altacct(pv_entry_t * pvep,unsigned idx)711 pve_set_altacct(pv_entry_t *pvep, unsigned idx)
712 {
713 assert(idx < PTE_PER_PVE);
714 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
715 }
716
717 /**
718 * Set the INTERNAL bit for a specific PTE pointer.
719 *
720 * @param pvep A pointer to the current pv_entry mapping in the linked list of
721 * mappings.
722 * @param idx Index of the chosen PTE pointer inside the PVE.
723 */
724 static inline void
pve_set_internal(pv_entry_t * pvep,unsigned idx)725 pve_set_internal(pv_entry_t *pvep, unsigned idx)
726 {
727 assert(idx < PTE_PER_PVE);
728 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
729 }
730
731 /**
732 * Clear the ALTACCT bit for a specific PTE pointer.
733 *
734 * @param pvep A pointer to the current pv_entry mapping in the linked list of
735 * mappings.
736 * @param idx Index of the chosen PTE pointer inside the PVE.
737 */
738 static inline void
pve_clr_altacct(pv_entry_t * pvep,unsigned idx)739 pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
740 {
741 assert(idx < PTE_PER_PVE);
742 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
743 }
744
745 /**
746 * Clear the INTERNAL bit for a specific PTE pointer.
747 *
748 * @param pvep A pointer to the current pv_entry mapping in the linked list of
749 * mappings.
750 * @param idx Index of the chosen PTE pointer inside the PVE.
751 */
752 static inline void
pve_clr_internal(pv_entry_t * pvep,unsigned idx)753 pve_clr_internal(pv_entry_t *pvep, unsigned idx)
754 {
755 assert(idx < PTE_PER_PVE);
756 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
757 }
758
759 /**
760 * Return the ALTACCT bit for a specific PTE pointer.
761 *
762 * @param pvep A pointer to the current pv_entry mapping in the linked list of
763 * mappings.
764 * @param idx Index of the chosen PTE pointer inside the PVE.
765 */
766 static inline bool
pve_get_altacct(pv_entry_t * pvep,unsigned idx)767 pve_get_altacct(pv_entry_t *pvep, unsigned idx)
768 {
769 assert(idx < PTE_PER_PVE);
770 return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
771 }
772
773 /**
774 * Return the INTERNAL bit for a specific PTE pointer.
775 *
776 * @param pvep A pointer to the current pv_entry mapping in the linked list of
777 * mappings.
778 * @param idx Index of the chosen PTE pointer inside the PVE.
779 */
780 static inline bool
pve_get_internal(pv_entry_t * pvep,unsigned idx)781 pve_get_internal(pv_entry_t *pvep, unsigned idx)
782 {
783 assert(idx < PTE_PER_PVE);
784 return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
785 }
786
787 /**
788 * Return the next mapping (pv_entry) in a linked list of mappings. This applies
789 * to pv_head_table entries of type PVH_TYPE_PVEP.
790 *
791 * @param pvep A pointer to the current pv_entry mapping in the linked list of
792 * mappings.
793 *
794 * @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
795 * end of the list has been reached.
796 */
797 static inline pv_entry_t *
pve_next(pv_entry_t * pvep)798 pve_next(pv_entry_t *pvep)
799 {
800 return pvep->pve_next;
801 }
802
803 /**
804 * Return a pointer to the pve_next field in a pv_entry. This value is used
805 * when adding and removing entries to a PVE list.
806 *
807 * @param pvep The pv_entry whose pve_next field is being accessed.
808 *
809 * @return Pointer to the pve_next field.
810 */
811 static inline pv_entry_t **
pve_next_ptr(pv_entry_t * pvep)812 pve_next_ptr(pv_entry_t *pvep)
813 {
814 return &pvep->pve_next;
815 }
816
817 /**
818 * Return a pointer to the page table entry for this mapping.
819 *
820 * @param pvep The pv_entry whose pve_ptep field is to be returned.
821 * @param idx Index of the chosen PTE pointer inside the PVE.
822 *
823 * @return Pointer to the page table entry.
824 */
825 static inline pt_entry_t *
pve_get_ptep(pv_entry_t * pvep,unsigned idx)826 pve_get_ptep(pv_entry_t *pvep, unsigned idx)
827 {
828 assert(idx < PTE_PER_PVE);
829 return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
830 }
831
832 /**
833 * Update the page table entry for a specific physical to virtual mapping.
834 *
835 * @param pvep The pv_entry to update.
836 * @param idx Index of the chosen PTE pointer inside the PVE.
837 * @param ptep_new The new page table entry.
838 */
839 static inline void
pve_set_ptep(pv_entry_t * pvep,unsigned idx,pt_entry_t * ptep_new)840 pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
841 {
842 assert(idx < PTE_PER_PVE);
843 pvep->pve_ptep[idx] = ptep_new;
844 }
845
846 /**
847 * Initialize all fields in a PVE to NULL.
848 *
849 * @param pvep The pv_entry to initialize.
850 */
851 static inline void
pve_init(pv_entry_t * pvep)852 pve_init(pv_entry_t *pvep)
853 {
854 pvep->pve_next = PV_ENTRY_NULL;
855 for (int i = 0; i < PTE_PER_PVE; i++) {
856 pvep->pve_ptep[i] = PT_ENTRY_NULL;
857 }
858 }
859
860 /**
861 * Find PTE pointer in PVE and return its index.
862 *
863 * @param pvep The PVE to search.
864 * @param ptep PTE to search for.
865 *
866 * @return Index of the found entry, or -1 if no entry exists.
867 */
868 static inline int
pve_find_ptep_index(pv_entry_t * pvep,pt_entry_t * ptep)869 pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
870 {
871 for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
872 if (pve_get_ptep(pvep, i) == ptep) {
873 return (int)i;
874 }
875 }
876
877 return -1;
878 }
879
880 /**
881 * Checks if no PTEs are currently associated with this PVE.
882 *
883 * @param pvep The PVE to search.
884 *
885 * @return True if no PTEs are currently associated with this PVE, or false.
886 */
887 static inline bool
pve_is_empty(pv_entry_t * pvep)888 pve_is_empty(pv_entry_t *pvep)
889 {
890 for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
891 if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
892 return false;
893 }
894 }
895
896 return true;
897 }
898
899 /**
900 * Prepend a new pv_entry node to a PVE list.
901 *
902 * @note This function does not actually modify the pv_head_table,
903 * it only installs an updated pv_head_table entry in [locked_pvh]
904 * that can later be passed to pvh_unlock() to update the actual array
905 * entry.
906 *
907 * @param locked_pvh A wrapper struct containing the pv_head_table
908 * entry/pointer to update. This entry represents
909 * the linked list of mappings to update.
910 * @param pvep The new mapping to add to the linked list.
911 */
912 static inline void
pve_add(locked_pvh_t * locked_pvh,pv_entry_t * pvep)913 pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
914 {
915 assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
916
917 pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
918 pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
919 }
920
921 /**
922 * Remove an entry from a PVE list of mappings.
923 *
924 * @note This function does not actually modify the pv_head_table,
925 * it only installs an updated pv_head_table entry in [locked_pvh]
926 * that can later be passed to pvh_unlock() to update the actual array
927 * entry.
928 *
929 * @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
930 * to update. This entry represents the linked list of mappings
931 * from which to remove an entry.
932 * @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
933 * is the first in the linked list of mappings, then NULL should be
934 * passed here and the removal will be reflected in the returned
935 * pv_head_table entry.
936 * @param pvep The entry that should be removed. Should be identical to a
937 * dereference of the pvepp parameter (unless it's the pv_head_table
938 * entry).
939 */
940 static inline void
pve_remove(locked_pvh_t * locked_pvh,pv_entry_t ** pvepp,pv_entry_t * pvep)941 pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
942 {
943 assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
944
945 if (pvepp == NULL) {
946 assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
947 __func__, (void*)locked_pvh->pvh, pvep);
948 if (pve_next(pvep) == PV_ENTRY_NULL) {
949 /* The last mapping to this page is being removed. */
950 pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
951 } else {
952 /**
953 * There are still mappings left, make the next one the new head of
954 * the list. This effectively removes the first entry from the list.
955 */
956 pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
957 }
958 } else {
959 /**
960 * Move the previous entry's next field to the entry after the one being
961 * removed. This will clobber the ALTACCT and INTERNAL bits.
962 */
963 *pvepp = pve_next(pvep);
964 }
965 }
966
967 /**
968 * PVH_TYPE_PTDP Types and Helper Functions.
969 *
970 * The following are types and methods used to manipulate page table descriptor
971 * (PTD) objects. This is the type of pv_head_table entry used when a page is
972 * being used as a page table.
973 */
974
975 /**
976 * Page table descriptor (PTD) info structure.
977 *
978 * Contains information about a page table. These pieces of data are separate
979 * from the PTD itself because in address spaces where the VM page size doesn't
980 * match the underlying hardware page size, one PTD could represent multiple
981 * page tables (and so will need multiple PTD info structures).
982 *
983 * These fields are also in their own struct so that they can be allocated
984 * separately from the associated pt_desc_t object. This allows us to allocate
985 * the counts in this structure in a way that ensures they don't fall within the
986 * same cache line as the main pt_desc_t object. This is important because the
987 * fields in this structure are atomically updated which could cause false
988 * sharing cache performance issues with the "va" field in pt_desc_t if all of
989 * the fields were within the same structure.
990 */
991 typedef struct {
992 /*
993 * For non-leaf pagetables, should be 0.
994 * For leaf pagetables, should reflect the number of wired entries.
995 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
996 * operations are implicitly wired).
997 */
998 unsigned short wiredcnt;
999 } ptd_info_t;
1000
1001 /**
1002 * This type is used to identify a specific IOMMU driver and an instance of
1003 * that driver which owns a specific page or page table. This type will be used
1004 * within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
1005 * respectively.
1006 *
1007 * Despite the fact this value is not a pointer, we need to make this value sort
1008 * of look like a kernel pointer: the bottom 3-bits must be zero and the upper
1009 * bits must all be ones by default. This is due to the fact that this type can
1010 * be embedded into the PVH table to represent an IOMMU mapping. The PVH table
1011 * code expects "kernel-pointer-like" properties so it can store flags in those
1012 * areas of the 64-bit value.
1013 */
1014 typedef uint64_t iommu_instance_t;
1015
1016 /* 8-bit ID of the IOMMU driver which the instance derives from. */
1017 #define IOMMU_ID_SHIFT 8U
1018 #define IOMMU_ID_MASK 0x000000000000FF00ULL
1019
1020 #define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
1021 #define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)
1022
1023 /**
1024 * An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
1025 * This is strictly used to help with debugging and provides a mechanism to
1026 * trace a mapping or page table back to the exact IOMMU instance that owns it.
1027 * Typically, this would be the instance ID, but for drivers that use only a
1028 * single global instance, this could be something else like a root page table
1029 * ppnum_t.
1030 */
1031 #define IOMMU_TOKEN_SHIFT 16U
1032 #define IOMMU_TOKEN_MASK 0x0000FFFFFFFF0000ULL
1033
1034 #define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
1035 #define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)
1036
1037 /**
1038 * The default value for iommu_instance_t. See the type definition for more
1039 * details on why the upper bits need to initially be all ones.
1040 */
1041 #define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL
1042
1043 /**
1044 * Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
1045 * instance sets the ID and token to all ones as a sentinel invalid value.
1046 */
1047 #define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
1048
1049 /**
1050 * Page Table Descriptor (PTD).
1051 *
1052 * Provides a per-table data structure and a way of keeping track of all page
1053 * tables in the system.
1054 *
1055 * This structure is also used as a convenient way of keeping track of IOMMU
1056 * pages (which may or may not be used as page tables). In that case the SPTM
1057 * frame type for the page will be XNU_IOMMU, the "iommu" field will describe
1058 * the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
1059 * refcnt controlled by the IOMMU driver.
1060 */
1061 typedef struct pt_desc {
1062 /* Each page table is either owned by a pmap or a specific IOMMU. */
1063 union {
1064 struct pmap *pmap;
1065 };
1066
1067 /**
1068 * The following fields contain per-page-table properties, and as such,
1069 * might have multiple elements each. This is due to a single PTD
1070 * potentially representing multiple page tables (in address spaces where
1071 * the VM page size differs from the hardware page size). Use the
1072 * ptd_get_index() function to get the correct index for a specific page
1073 * table.
1074 */
1075
1076 /**
1077 * The first address of the virtual address space this page table is
1078 * translating for, or a value set by an IOMMU driver if this PTD is being
1079 * used to track an IOMMU page.
1080 */
1081 vm_offset_t va;
1082
1083 /**
1084 * ptd_info_t's are allocated separately so as to reduce false sharing
1085 * with the va field. This is desirable because ptd_info_t's are updated
1086 * atomically from all CPUs.
1087 */
1088 ptd_info_t *ptd_info;
1089 } pt_desc_t;
1090
1091 /**
1092 * Per-CPU structure for tracking in-flight SPTM retype operations.
1093 *
1094 * This structure is intended to be embedded in the pmap per-CPU data object,
1095 * and is meant to be used for situations in which the caller needs to ensure
1096 * that potentially sensitive concurrent SPTM operations have completed on other
1097 * CPUs prior to retyping a page. If these sensitive operations haven't completed
1098 * when the retype occurs, and they happen to involve the page being retyped
1099 * (either directly or through mappings thereof), an SPTM violation panic may
1100 * result.
1101 */
1102 typedef struct {
1103 /**
1104 * Critical section sequence number of the local CPU. A value of zero
1105 * indicates that no retype epoch critical section is currently active on
1106 * the CPU.
1107 */
1108 uint64_t local_seq;
1109
1110 /**
1111 * The sequence number to use the next time a retype epoch critical section
1112 * is entered on the local CPU. This should monotonically increase.
1113 */
1114 uint64_t next_seq;
1115
1116 /**
1117 * This array stores the retype sequence numbers observed on remote CPUs.
1118 * When the local CPU needs to wait for critical sections to complete on
1119 * other CPUs, this is intended to provide an initial sample of those other
1120 * CPUs' critical section state. The caller can then wait for each remote
1121 * CPU's sequence number to return to zero or advance beyond the value
1122 * stored in its entry in this array.
1123 */
1124 uint64_t remote_seq[MAX_CPUS];
1125
1126 /**
1127 * Flags used to track the state of an active retype epoch drain operation
1128 * on the local CPU.
1129 */
1130
1131 /**
1132 * This flag indicates that a drain operation has been prepared on the
1133 * local CPU by sampling remote CPU epoch states into the remote_seq array.
1134 * This must be set before the drain operation can be performed.
1135 */
1136 #define PMAP_RETYPE_EPOCH_PREPARED (1 << 0)
1137
1138 /**
1139 * This flag indicates that one or more remote CPUs had a non-zero retype
1140 * epoch value when the remote_seq array was most recently sampled.
1141 * If this flag is not set, then we already know that no remote CPUs can
1142 * be in a critical section in which prior mapping state for the page to
1143 * be retyped may have been observed, so we can skip the drain operation.
1144 */
1145 #define PMAP_RETYPE_EPOCH_DRAIN_REQUIRED (1 << 1)
1146 uint8_t flags;
1147 } pmap_retype_epoch_t;
1148
/*
 * Alignment (in bytes) of the per-CPU SPTM data block. The static asserts
 * below require that this evenly divides PAGE_SIZE and that the structure
 * fits within one alignment unit.
 */
#define PMAP_SPTM_PCPU_ALIGN (8192)

typedef struct {
	/**
	 * Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
	 * during bootstrap.
	 */
	sptm_pte_t *sptm_prev_ptes;

	/**
	 * A piece of per-cpu scratch memory used by IOMMU drivers when passing data
	 * into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
	 */
	void *sptm_iommu_scratch;

	/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
	sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];

	/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
	sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];

	/* Base PA of ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_ops_pa;

	/* Base PA of templates array, for passing templates into the SPTM. */
	pmap_paddr_t sptm_templates_pa;

	/* PMAP pagetable descriptors associated with each element of sptm_ops. */
	pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];

	/* PTD info objects associated with each pmap PTE pointer. */
	ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];

	/* Accounting-related flags for each element of sptm_ops. */
	#define PMAP_SPTM_FLAG_INTERNAL (0x1)
	#define PMAP_SPTM_FLAG_ALTACCT (0x2)
	uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];

	/* Retype epoch tracking structure. */
	pmap_retype_epoch_t retype_epoch;

	/* Guest virtual machine dispatch structure. */
	sptm_guest_dispatch_t sptm_guest_dispatch;

	/* Guest virtual machine dispatch structure physical address. */
	pmap_paddr_t sptm_guest_dispatch_paddr;

	/* SPTM Logical CPU ID */
	uint16_t sptm_cpu_id;

	/* Read index associated with this CPU's SPTM trace buffer */
	uint64_t sptm_trace_buffer_read_index;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;

_Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
    "SPTM per-CPU data alignment does not fit evenly within a page");
_Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
    "sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");

PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
1209
1210 /**
1211 * Convert a pv_head_table entry/pointer into a page table descriptor pointer.
1212 * This should only be done if the type of this entry is PVH_TYPE_PTDP.
1213 *
1214 * @param pvh The pv_head_table entry/pointer to convert into a safe to
1215 * dereference pt_desc_t*.
1216 *
1217 * @return Return back a safe to derefence pointer to the page table descriptor
1218 * for this physical page by masking off the TYPE bits and adding any
1219 * missing flags to the upper portion of the pointer.
1220 */
1221 static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)1222 pvh_ptd(uintptr_t pvh)
1223 {
1224 return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
1225 }
1226
1227 /**
1228 * Given an arbitrary page table entry, return back the page table descriptor
1229 * (PTD) object for the page table that contains that entry.
1230 *
1231 * @param ptep Pointer to a PTE whose page table descriptor object to return.
1232 *
1233 * @return The PTD object for the passed in page table.
1234 */
1235 static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t * ptep)1236 ptep_get_ptd(const pt_entry_t *ptep)
1237 {
1238 assert(ptep != NULL);
1239
1240 const vm_offset_t pt_base_va = (vm_offset_t)ptep;
1241 uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
1242
1243 if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1244 panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
1245 }
1246
1247 return pvh_ptd(pvh);
1248 }
1249
1250 /**
1251 * Given an arbitrary page table entry, return back the pmap that owns that
1252 * page table.
1253 *
1254 * @note This won't work correctly for page tables owned by IOMMUs, because
1255 * those table aren't owned by any specific pmap.
1256 *
1257 * @param ptep Pointer to a page table entry whose owner we're trying to return.
1258 *
1259 * @return The pmap that owns the given page table entry.
1260 */
1261 static inline struct pmap *
ptep_get_pmap(const pt_entry_t * ptep)1262 ptep_get_pmap(const pt_entry_t *ptep)
1263 {
1264 return ptep_get_ptd(ptep)->pmap;
1265 }
1266
1267
1268 /**
1269 * Given an arbitrary translation table entry, get the page table descriptor
1270 * (PTD) object for the page table pointed to by the TTE.
1271 *
1272 * @param tte The translation table entry to parse. For instance, if this is an
1273 * L2 TTE, then the PTD for the L3 table this entry points to will be
1274 * returned.
1275 *
1276 * @return The page table descriptor (PTD) for the page table pointed to by this
1277 * TTE.
1278 */
1279 static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)1280 tte_get_ptd(const tt_entry_t tte)
1281 {
1282 const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
1283 uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
1284
1285 if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1286 panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
1287 }
1288
1289 return pvh_ptd(pvh);
1290 }
1291
1292 /**
1293 * This function returns the ptd_info_t structure associated with a given
1294 * page table descriptor.
1295 *
1296 * @param ptd The page table descriptor that's being accessed.
1297 *
1298 * @return ptd_info_t structure associated with [ptd].
1299 */
1300 static inline ptd_info_t *
ptd_get_info(pt_desc_t * ptd)1301 ptd_get_info(pt_desc_t *ptd)
1302 {
1303 assert(ptd != NULL);
1304 return ptd->ptd_info;
1305 }
1306
1307 /**
1308 * Given a pointer to a page table entry, return back the ptd_info structure
1309 * for the page table that contains that entry.
1310 *
1311 * @param ptep Pointer to a PTE whose ptd_info object to return.
1312 *
1313 * @return The ptd_info object for the page table that contains the passed in
1314 * page table entry.
1315 */
1316 static inline ptd_info_t *
ptep_get_info(const pt_entry_t * ptep)1317 ptep_get_info(const pt_entry_t *ptep)
1318 {
1319 return ptd_get_info(ptep_get_ptd(ptep));
1320 }
1321
1322 /**
1323 * Return the virtual address mapped by the passed in leaf page table entry,
1324 * using an already-retrieved pagetable descriptor.
1325 *
1326 * @param ptdp pointer to the descriptor for the pagetable containing ptep
1327 * @param ptep Pointer to a PTE to parse
1328 */
1329 static inline vm_map_address_t
ptd_get_va(const pt_desc_t * ptdp,const pt_entry_t * ptep)1330 ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
1331 {
1332 const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
1333
1334 vm_map_address_t va = ptdp->va;
1335
1336 const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptdp->pmap));
1337 const vm_offset_t ptep_page = (vm_offset_t)ptep >> pmap_page_shift;
1338
1339 /**
1340 * Use the difference between the VM page shift and the hardware page shift
1341 * to get the index of the correct page table. In practice, this equates to
1342 * masking out the bottom two bits of the L3 table index in address spaces
1343 * where the VM page size is greater than the hardware page size. In address
1344 * spaces where they're identical, the index will always be zero.
1345 */
1346 const unsigned int ttep_index = ptep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
1347 va += ttep_index * pt_attr_twig_size(pt_attr);
1348
1349 /* Increment VA now to target the VA space covered by this specific PTE */
1350 const vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
1351 va += (ptep_index << pt_attr_leaf_shift(pt_attr));
1352
1353 return va;
1354 }
1355
1356 /**
1357 * Return the virtual address that is being mapped by the passed in leaf page
1358 * table entry.
1359 *
1360 * @param ptep Pointer to a PTE to parse.
1361 */
1362 static inline vm_map_address_t
ptep_get_va(const pt_entry_t * ptep)1363 ptep_get_va(const pt_entry_t *ptep)
1364 {
1365 return ptd_get_va(ptep_get_ptd(ptep), ptep);
1366 }
1367
1368 /**
1369 * Physical Page Attribute Table (pp_attr_table) defines and helper functions.
1370 */
1371
1372 /* How many bits to use for flags on a per-VM-page basis. */
1373 typedef uint16_t pp_attr_t;
1374
1375 /* See the definition of pp_attr_table for more information. */
1376 extern volatile pp_attr_t* pp_attr_table;
1377
1378 /**
1379 * Flags stored in the pp_attr_table on a per-physical-page basis.
1380 *
1381 * Please update the pv_walk LLDB macro if these flags are changed or added to.
1382 */
1383
1384 /**
1385 * The bottom 6-bits are used to store the default WIMG (cacheability and memory
1386 * type) setting for this physical page. This can be changed by calling
1387 * pmap_set_cache_attributes().
1388 *
1389 * If a default WIMG setting isn't set for a page, then the default is Normal,
1390 * Cached memory (VM_WIMG_DEFAULT).
1391 */
1392 #define PP_ATTR_WIMG_MASK 0x003F
1393 #define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)
1394
1395 /**
1396 * The reference and modify bits keep track of whether a page has been accessed
1397 * or modified since the last time the bits were cleared. These bits are used to
1398 * enforce policy decisions in the VM layer.
1399 */
1400 #define PP_ATTR_REFERENCED 0x0040
1401 #define PP_ATTR_MODIFIED 0x0080
1402
1403 /**
1404 * This physical page is being used as anonymous memory that's internally
1405 * managed by the VM and is not connected to an external pager. This flag is
1406 * only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
1407 * subsequent mappings won't set/clear this flag until all mappings are removed
1408 * and a new CPU mapping is added.
1409 */
1410 #define PP_ATTR_INTERNAL 0x0100
1411
1412 /**
1413 * This flag is used to keep track of pages that are still resident but are not
1414 * considered dirty and can be reclaimed under memory pressure. These pages do
1415 * not count as a part of the memory footprint, so the footprint ledger does not
1416 * need to be updated for these pages. This is hinted to the VM by the
1417 * `madvise(MADV_FREE_REUSABLE)` system call.
1418 */
1419 #define PP_ATTR_REUSABLE 0x0200
1420
1421 /**
1422 * This flag denotes that a page is utilizing "alternate accounting". This means
1423 * that the pmap doesn't need to keep track of these pages with regards to the
1424 * footprint ledger because the VM is already accounting for them in a different
1425 * way. These include IOKit mappings (VM adds their entire virtual size to the
1426 * footprint), and purgeable pages (VM counts them only when non-volatile and
1427 * only for one "owner"), among others.
1428 *
1429 * Note that alternate accounting status is tracked on a per-mapping basis (not
1430 * per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
1431 * when there's a single mapping to a page. When there are multiple mappings,
1432 * the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
1433 * above).
1434 */
1435 #define PP_ATTR_ALTACCT 0x0400
1436
1437 /**
1438 * This bit was originally used on x86 to keep track of what pages to not
1439 * encrypt during the hibernation process as a performance optimization when
1440 * encryption was done in software. This doesn't apply to the ARM
1441 * hibernation process because all pages are automatically encrypted using
1442 * hardware acceleration. Despite that, the pmap still keeps track of this flag
1443 * as a debugging aid on internal builds.
1444 *
1445 * TODO: This bit can probably be reclaimed:
1446 * rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
1447 */
1448 #define PP_ATTR_NOENCRYPT 0x0800
1449
1450 /**
1451 * These bits denote that a physical page is expecting the next access or
1452 * modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
1453 * respectively.
1454 */
1455 #define PP_ATTR_REFFAULT 0x1000
1456 #define PP_ATTR_MODFAULT 0x2000
1457
1458 /**
1459 * Atomically set some flags in a pp_attr_table entry.
1460 *
1461 * @param pai The physical address index for the entry to update.
1462 * @param bits The flags to set in the entry.
1463 */
1464 static inline void
ppattr_set_bits(unsigned int pai,pp_attr_t bits)1465 ppattr_set_bits(unsigned int pai, pp_attr_t bits)
1466 {
1467 volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1468 os_atomic_or(ppattr, bits, relaxed);
1469 }
1470
1471 /**
1472 * Atomically clear some flags in a pp_attr_table entry.
1473 *
1474 * @param pai The physical address index for the entry to update.
1475 * @param bits The flags to clear in the entry.
1476 */
1477 static inline void
ppattr_clear_bits(unsigned int pai,pp_attr_t bits)1478 ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
1479 {
1480 volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1481 os_atomic_andnot(ppattr, bits, relaxed);
1482 }
1483
1484 /**
1485 * General-purpose function for atomically modifying flags in a pp_attr_table entry.
1486 *
1487 * @param pai The physical address index for the entry to update.
1488 * @param bits_to_clear Mask of bits to atomically clear from the entry.
1489 * @param bits_to_set Mask of bits to atomically set in the entry.
1490 *
1491 * @note [bits_to_clear] and [bits_to_set] must not overlap.
1492 */
1493 static inline void
ppattr_modify_bits(unsigned int pai,pp_attr_t bits_to_clear,pp_attr_t bits_to_set)1494 ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
1495 {
1496 assert((bits_to_set & bits_to_clear) == 0);
1497 pp_attr_t prev_ppattr, new_ppattr;
1498 os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
1499 new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
1500 });
1501 }
1502
1503 /**
1504 * Return true if the pp_attr_table entry contains the passed in bits.
1505 *
1506 * @param pai The physical address index for the entry to test.
1507 * @param bits The flags to check for.
1508 */
1509 static inline bool
ppattr_test_bits(unsigned int pai,pp_attr_t bits)1510 ppattr_test_bits(unsigned int pai, pp_attr_t bits)
1511 {
1512 const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1513 return (*ppattr & bits) == bits;
1514 }
1515
1516 /**
1517 * Only set some flags in a pp_attr_table entry if the passed in physical
1518 * address is a kernel-managed address.
1519 *
1520 * @param pa The physical address for the entry to update.
1521 * @param bits The flags to set in the entry.
1522 */
1523 static inline void
ppattr_pa_set_bits(pmap_paddr_t pa,pp_attr_t bits)1524 ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
1525 {
1526 if (pa_valid(pa)) {
1527 ppattr_set_bits(pa_index(pa), bits);
1528 }
1529 }
1530
1531 /**
1532 * Only clear some flags in a pp_attr_table entry if the passed in physical
1533 * address is a kernel-managed address.
1534 *
1535 * @param pa The physical address for the entry to update.
1536 * @param bits The flags to clear in the entry.
1537 */
1538 static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa,pp_attr_t bits)1539 ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
1540 {
1541 if (pa_valid(pa)) {
1542 ppattr_clear_bits(pa_index(pa), bits);
1543 }
1544 }
1545
1546 /**
1547 * Only test flags in a pp_attr_table entry if the passed in physical address
1548 * is a kernel-managed page.
1549 *
1550 * @param pa The physical address for the entry to test.
1551 * @param bits The flags to check for.
1552 *
1553 * @return False if the PA isn't a kernel-managed page, otherwise true/false
1554 * depending on whether the bits are set.
1555 */
1556 static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa,pp_attr_t bits)1557 ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
1558 {
1559 return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
1560 }
1561
1562 /**
1563 * Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
1564 * in physical address is a kernel-managed page.
1565 *
1566 * @param pa The physical address for the entry to update.
1567 */
1568 static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)1569 ppattr_pa_set_modify(pmap_paddr_t pa)
1570 {
1571 ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
1572 }
1573
1574 /**
1575 * Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
1576 * passed in physical address is a kernel-managed page.
1577 *
1578 * @param pa The physical address for the entry to update.
1579 */
1580 static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)1581 ppattr_pa_clear_modify(pmap_paddr_t pa)
1582 {
1583 ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
1584 }
1585
1586 /**
1587 * Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1588 * passed in physical address is a kernel-managed page.
1589 *
1590 * @param pa The physical address for the entry to update.
1591 */
1592 static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)1593 ppattr_pa_set_reference(pmap_paddr_t pa)
1594 {
1595 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
1596 }
1597
1598 /**
1599 * Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1600 * passed in physical address is a kernel-managed page.
1601 *
1602 * @param pa The physical address for the entry to update.
1603 */
1604 static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)1605 ppattr_pa_clear_reference(pmap_paddr_t pa)
1606 {
1607 ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
1608 }
1609
1610 /**
1611 * Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1612 *
1613 * @param pai The physical address index for the entry to update.
1614 */
1615 static inline void
ppattr_set_internal(unsigned int pai)1616 ppattr_set_internal(unsigned int pai)
1617 {
1618 ppattr_set_bits(pai, PP_ATTR_INTERNAL);
1619 }
1620
1621 /**
1622 * Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1623 *
1624 * @param pai The physical address index for the entry to update.
1625 */
1626 static inline void
ppattr_clear_internal(unsigned int pai)1627 ppattr_clear_internal(unsigned int pai)
1628 {
1629 ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
1630 }
1631
1632 /**
1633 * Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
1634 *
1635 * @param pai The physical address index for the entry to test.
1636 */
1637 static inline bool
ppattr_test_internal(unsigned int pai)1638 ppattr_test_internal(unsigned int pai)
1639 {
1640 return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
1641 }
1642
1643 /**
1644 * Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
1645 *
1646 * @param pai The physical address index for the entry to update.
1647 */
1648 static inline void
ppattr_set_reusable(unsigned int pai)1649 ppattr_set_reusable(unsigned int pai)
1650 {
1651 ppattr_set_bits(pai, PP_ATTR_REUSABLE);
1652 }
1653
1654 /**
1655 * Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
1656 *
1657 * @param pai The physical address index for the entry to update.
1658 */
1659 static inline void
ppattr_clear_reusable(unsigned int pai)1660 ppattr_clear_reusable(unsigned int pai)
1661 {
1662 ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
1663 }
1664
1665 /**
1666 * Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
1667 *
1668 * @param pai The physical address index for the entry to test.
1669 */
1670 static inline bool
ppattr_test_reusable(unsigned int pai)1671 ppattr_test_reusable(unsigned int pai)
1672 {
1673 return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
1674 }
1675
1676 /**
1677 * Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1678 *
1679 * @note This is only valid when the ALTACCT flag is being tracked using the
1680 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1681 * PP_ATTR_ALTACCT definitions for more information.
1682 *
1683 * @param pai The physical address index for the entry to update.
1684 */
1685 static inline void
ppattr_set_altacct(unsigned int pai)1686 ppattr_set_altacct(unsigned int pai)
1687 {
1688 ppattr_set_bits(pai, PP_ATTR_ALTACCT);
1689 }
1690
1691 /**
1692 * Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1693 *
1694 * @note This is only valid when the ALTACCT flag is being tracked using the
1695 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1696 * PP_ATTR_ALTACCT definitions for more information.
1697 *
1698 * @param pai The physical address index for the entry to update.
1699 */
1700 static inline void
ppattr_clear_altacct(unsigned int pai)1701 ppattr_clear_altacct(unsigned int pai)
1702 {
1703 ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
1704 }
1705
1706 /**
1707 * Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1708 *
1709 * @note This is only valid when the ALTACCT flag is being tracked using the
1710 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1711 * PP_ATTR_ALTACCT definitions for more information.
1712 *
1713 * @param pai The physical address index for the entry to test.
1714 *
1715 * @return True if the passed in page uses alternate accounting, false
1716 * otherwise.
1717 */
1718 static inline bool
ppattr_is_altacct(unsigned int pai)1719 ppattr_is_altacct(unsigned int pai)
1720 {
1721 return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
1722 }
1723
1724 /**
1725 * Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1726 *
1727 * @note This is only valid when the INTERNAL flag is being tracked using the
1728 * pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
1729 * PP_ATTR_INTERNAL definitions for more information.
1730 *
1731 * @param pai The physical address index for the entry to test.
1732 *
1733 * @return True if the passed in page is accounted for as "internal", false
1734 * otherwise.
1735 */
1736 static inline bool
ppattr_is_internal(unsigned int pai)1737 ppattr_is_internal(unsigned int pai)
1738 {
1739 return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
1740 }
1741
1742 /**
1743 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1744 * depending on whether there are one or multiple mappings to a page. This
1745 * function abstracts out the difference between single and multiple mappings to
1746 * a page and provides a single function for determining whether alternate
1747 * accounting is set for a mapping.
1748 *
1749 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1750 * definitions for more information.
1751 *
1752 * @param pai The physical address index for the entry to test.
1753 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1754 * @param idx Index of the chosen PTE pointer inside the PVE.
1755 *
1756 * @return True if the passed in page uses alternate accounting, false
1757 * otherwise.
1758 */
1759 static inline bool
ppattr_pve_is_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1760 ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1761 {
1762 return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
1763 }
1764
1765 /**
1766 * The "internal" (INTERNAL) status for a page is tracked differently
1767 * depending on whether there are one or multiple mappings to a page. This
1768 * function abstracts out the difference between single and multiple mappings to
1769 * a page and provides a single function for determining whether "internal"
1770 * is set for a mapping.
1771 *
1772 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1773 * definitions for more information.
1774 *
1775 * @param pai The physical address index for the entry to test.
1776 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1777 * @param idx Index of the chosen PTE pointer inside the PVE.
1778 *
1779 * @return True if the passed in page is "internal", false otherwise.
1780 */
1781 static inline bool
ppattr_pve_is_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1782 ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1783 {
1784 return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
1785 }
1786
1787 /**
1788 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1789 * depending on whether there are one or multiple mappings to a page. This
1790 * function abstracts out the difference between single and multiple mappings to
1791 * a page and provides a single function for setting the alternate accounting status
1792 * for a mapping.
1793 *
1794 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1795 * definitions for more information.
1796 *
1797 * @param pai The physical address index for the entry to update.
1798 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1799 * @param idx Index of the chosen PTE pointer inside the PVE.
1800 */
1801 static inline void
ppattr_pve_set_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1802 ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1803 {
1804 if (pvep == PV_ENTRY_NULL) {
1805 ppattr_set_altacct(pai);
1806 } else {
1807 pve_set_altacct(pvep, idx);
1808 }
1809 }
1810
1811 /**
1812 * The "internal" (INTERNAL) status for a page is tracked differently
1813 * depending on whether there are one or multiple mappings to a page. This
1814 * function abstracts out the difference between single and multiple mappings to
1815 * a page and provides a single function for setting the "internal" status
1816 * for a mapping.
1817 *
1818 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1819 * definitions for more information.
1820 *
1821 * @param pai The physical address index for the entry to update.
1822 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1823 * @param idx Index of the chosen PTE pointer inside the PVE.
1824 */
1825 static inline void
ppattr_pve_set_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1826 ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1827 {
1828 if (pvep == PV_ENTRY_NULL) {
1829 ppattr_set_internal(pai);
1830 } else {
1831 pve_set_internal(pvep, idx);
1832 }
1833 }
1834
1835 /**
1836 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1837 * depending on whether there are one or multiple mappings to a page. This
1838 * function abstracts out the difference between single and multiple mappings to
1839 * a page and provides a single function for clearing the alternate accounting status
1840 * for a mapping.
1841 *
1842 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1843 * definitions for more information.
1844 *
1845 * @param pai The physical address index for the entry to update.
1846 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1847 * @param idx Index of the chosen PTE pointer inside the PVE.
1848 */
1849 static inline void
ppattr_pve_clr_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1850 ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1851 {
1852 if (pvep == PV_ENTRY_NULL) {
1853 ppattr_clear_altacct(pai);
1854 } else {
1855 pve_clr_altacct(pvep, idx);
1856 }
1857 }
1858
1859 /**
1860 * The "internal" (INTERNAL) status for a page is tracked differently
1861 * depending on whether there are one or multiple mappings to a page. This
1862 * function abstracts out the difference between single and multiple mappings to
1863 * a page and provides a single function for clearing the "internal" status
1864 * for a mapping.
1865 *
1866 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1867 * definitions for more information.
1868 *
1869 * @param pai The physical address index for the entry to update.
1870 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1871 * @param idx Index of the chosen PTE pointer inside the PVE.
1872 */
1873 static inline void
ppattr_pve_clr_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1874 ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1875 {
1876 if (pvep == PV_ENTRY_NULL) {
1877 ppattr_clear_internal(pai);
1878 } else {
1879 pve_clr_internal(pvep, idx);
1880 }
1881 }
1882
1883 /**
1884 * Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
1885 *
1886 * @param pai The physical address index for the entry to update.
1887 */
1888 static inline void
ppattr_set_reffault(unsigned int pai)1889 ppattr_set_reffault(unsigned int pai)
1890 {
1891 ppattr_set_bits(pai, PP_ATTR_REFFAULT);
1892 }
1893
1894 /**
1895 * Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
1896 *
1897 * @param pai The physical address index for the entry to update.
1898 */
1899 static inline void
ppattr_clear_reffault(unsigned int pai)1900 ppattr_clear_reffault(unsigned int pai)
1901 {
1902 ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
1903 }
1904
1905 /**
1906 * Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
1907 *
1908 * @param pai The physical address index for the entry to test.
1909 */
1910 static inline bool
ppattr_test_reffault(unsigned int pai)1911 ppattr_test_reffault(unsigned int pai)
1912 {
1913 return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
1914 }
1915
1916 /**
1917 * Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
1918 *
1919 * @param pai The physical address index for the entry to update.
1920 */
1921 static inline void
ppattr_set_modfault(unsigned int pai)1922 ppattr_set_modfault(unsigned int pai)
1923 {
1924 ppattr_set_bits(pai, PP_ATTR_MODFAULT);
1925 }
1926
1927 /**
1928 * Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
1929 *
1930 * @param pai The physical address index for the entry to update.
1931 */
1932 static inline void
ppattr_clear_modfault(unsigned int pai)1933 ppattr_clear_modfault(unsigned int pai)
1934 {
1935 ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
1936 }
1937
1938 /**
1939 * Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
1940 *
1941 * @param pai The physical address index for the entry to test.
1942 */
1943 static inline bool
ppattr_test_modfault(unsigned int pai)1944 ppattr_test_modfault(unsigned int pai)
1945 {
1946 return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
1947 }
1948
1949 /**
1950 * Retype epoch operations:
1951 *
1952 * The retype epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
1953 * can ensure all CPUs have observed updated mapping state before retyping a physical page.
1954 *
1955 * There are certain cases in which the pmap, while issuing an SPTM call that modifies
1956 * mappings, cannot hold locks such as the PVH lock which would prevent the page from
1957 * being concurrently retyped. This is particularly true for batched operations such
1958 * as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
1959 * In these cases, the pmap may call pmap_retype_epoch_enter() to note that it is
1960 * performing such a sensitive operation on the local CPU. It must then call
1961 * pmap_retype_epoch_exit() upon completion of the sensitive operation.
1962 *
1963 * Then, for any instance in which the pmap needs to retype a page without being
1964 * otherwise guaranteed (e.g. by VM layer locking or the existing page type) that such
1965 * a sensitive operation is not in progress on some other CPU, it must drain these
1966 * sensitive operations from other CPUs. Specifically, it must ensure that any
1967 * sensitive operation which may have observed prior mapping state of the page that
1968 * is to be retyped has completed. This is accomplished by first calling
1969 * pmap_retype_epoch_prepare_drain() to record the initial retype epoch state of
1970 * all CPUs, followed by pmap_retype_epoch_drain() to ensure all remote CPUs are
1971 * either not in an epoch or have advanced beyond the initially recorded epoch.
1972 * These are exposed as two separate functions in order to allow the calling CPU
1973 * to do other work between calling pmap_retype_epoch_prepare_drain() and
1974 * pmap_retype_epoch_drain(), as a best-effort attempt to minimize time wasted
1975 * spinning in pmap_retype_epoch_drain().
1976 *
1977 * When draining the retype epoch, the following assumptions must hold true:
1978 *
1979 * 1) The calling thread must guarantee that prior updates needed to bring the page
1980 * into the correct mapping state for retyping have already been performed and made
1981 * globally visible using the appropriate barriers. In most cases this means that
1982 * all existing mappings of the page must have been removed. For any alterations
1983 * of mapping state, global visibility is conveniently already guaranteed by the
1984 * DSBs that are architecturally required to synchronize PTE updates and the TLBIs
1985 * that follow them.
1986 *
 * 2) The calling thread must have some means of ensuring that new mappings, which
 * would bring the page out of the correct state for retyping, cannot be added.
1989 * This is typically done by holding the PVH lock and/or the exclusive pmap lock
1990 * such that pmap_enter() cannot concurrently execute against the page.
1991 *
1992 * 3) The calling thread must not perform any operation which requires preemptibility
1993 * between calling pmap_retype_epoch_prepare_drain() and pmap_retype_epoch_drain().
1994 */
1995
1996 /**
1997 * Enter the retype epoch on the local CPU to indicate an in-progress SPTM operation
1998 * that may be sensitive to a concurrent retype operation on another CPU.
1999 *
2000 * @note This function increments the thread's preemption disable count and returns
2001 * with preemption disabled.
2002 *
2003 * @note This function issues all required barriers to ensure correct ordering of
2004 * the epoch update relative to ensuing SPTM accesses.
2005 */
2006 static inline void
pmap_retype_epoch_enter(void)2007 pmap_retype_epoch_enter(void)
2008 {
2009 mp_disable_preemption();
2010 pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
2011 assert(!preemption_enabled());
2012
2013 /* Must not already been in a retype epoch on this CPU. */
2014 assert(retype_epoch->local_seq == 0);
2015 retype_epoch->local_seq = ++retype_epoch->next_seq;
2016 /* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
2017 assert(retype_epoch->local_seq != 0);
2018
2019 /**
2020 * Issue a store-load barrier to ensure that remote observers of any ensuing
2021 * SPTM accesses will also observe the epoch update.
2022 */
2023 os_atomic_thread_fence(seq_cst);
2024 }
2025
2026 /**
2027 * Exit the retype epoch on the local CPU to indicate completion of an SPTM operation
2028 * that may be sensitive to a concurrent retype operation on another CPU.
2029 *
2030 * @note This function must be called with preemption disabled and will decrement
2031 * the current thread's preemption disable count.
2032 */
2033 static inline void
pmap_retype_epoch_exit(void)2034 pmap_retype_epoch_exit(void)
2035 {
2036 pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
2037 assert(!preemption_enabled());
2038 assert(retype_epoch->local_seq == retype_epoch->next_seq);
2039
2040 /**
2041 * Clear the sequence using a store-release operation to ensure that prior
2042 * SPTM modifications will be visible to remote observers before the absence
2043 * of an epoch is visible.
2044 */
2045 os_atomic_store(&retype_epoch->local_seq, 0, release);
2046 mp_enable_preemption();
2047 }
2048
2049 /**
2050 * Prepare the local CPU to perform an epoch drain operation by recording the retype
2051 * epoch state of other CPUs.
2052 *
2053 * @note This function increments the current thread's preemption disable count and
2054 * returns with preemption disabled.
2055 *
2056 * @note This function issues all necessary barriers to ensure that the subsequent
2057 * retype operation is not speculated ahead of the epoch sampling.
2058 *
2059 * @note This function does NOT issue any barriers to ensure that prior updates of
2060 * mapping state are globally visible and have proper store-load ordering with
2061 * respect to the scan performed here. In the cases where this function is
2062 * intended to be used, this ordering should be guaranteed automatically by
2063 * the DSBs used to synchronize prior mapping updates issued by the caller.
2064 * If this function is ever used in a situation where that cannot be guaranteed,
2065 * the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
2066 * thread_fence) before calling this function.
2067 */
2068 static inline void
pmap_retype_epoch_prepare_drain(void)2069 pmap_retype_epoch_prepare_drain(void)
2070 {
2071 mp_disable_preemption();
2072 pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
2073 assert(retype_epoch->flags == 0);
2074 unsigned int i = 0;
2075 uint8_t flags = PMAP_RETYPE_EPOCH_PREPARED;
2076
2077 /* Sample each CPU's epoch state. */
2078 percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
2079 const uint64_t remote_epoch =
2080 os_atomic_load(&pmap_pcpu->retype_epoch.local_seq, relaxed);
2081 retype_epoch->remote_seq[i] = remote_epoch;
2082
2083 /**
2084 * If the remote CPU has an active epoch, make a note to ourselves that
2085 * we'll need to drain it.
2086 */
2087 if (remote_epoch != 0) {
2088 flags |= PMAP_RETYPE_EPOCH_DRAIN_REQUIRED;
2089 }
2090 ++i;
2091 }
2092 retype_epoch->flags = flags;
2093
2094 /**
2095 * Issue a load-load barrier to ensure subsequent drain or retype operations will
2096 * not be speculated ahead of the sampling we just did.
2097 */
2098 os_atomic_thread_fence(acquire);
2099 }
2100
2101 /**
2102 * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
2103 * most recent call to pmap_retype_epoch_prepare_drain().
2104 *
2105 * @note This function expects to be called with preemption disabled and will decrement
2106 * the current thread's preemption disable count.
2107 *
2108 * @note pmap_retype_epoch_prepare_drain() must have been called on the local CPU
2109 * prior to calling this function. This function will return immediately if
2110 * this prior call did not observe any active epochs on remote CPUs.
2111 *
2112 * @note This function issues all necessary barriers to ensure that the subsequent
2113 * retype operation is not speculated ahead of the epoch sampling.
2114 */
2115 static inline void
pmap_retype_epoch_drain(void)2116 pmap_retype_epoch_drain(void)
2117 {
2118 assert(!preemption_enabled());
2119 pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
2120 const uint8_t flags = retype_epoch->flags;
2121 assert(flags & PMAP_RETYPE_EPOCH_PREPARED);
2122 retype_epoch->flags = 0;
2123 if (!(flags & PMAP_RETYPE_EPOCH_DRAIN_REQUIRED)) {
2124 mp_enable_preemption();
2125 return;
2126 }
2127 unsigned int i = 0;
2128 percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
2129 if (retype_epoch->remote_seq[i] != 0) {
2130 assert((pmap_pcpu->retype_epoch.local_seq == 0) ||
2131 (pmap_pcpu->retype_epoch.local_seq >= retype_epoch->remote_seq[i]));
2132 /**
2133 * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
2134 * or advances to a new epoch.
2135 */
2136 while ((os_atomic_load_exclusive(&pmap_pcpu->retype_epoch.local_seq, relaxed) ==
2137 retype_epoch->remote_seq[i])) {
2138 __builtin_arm_wfe();
2139 }
2140 /* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
2141 os_atomic_clear_exclusive();
2142 }
2143 ++i;
2144 }
2145 mp_enable_preemption();
2146 /**
2147 * Issue a load-load barrier to ensure subsequent retype operations will
2148 * not be speculated ahead of the sampling we just did.
2149 */
2150 os_atomic_thread_fence(acquire);
2151 }
2152
2153 /**
2154 * Helper to determine whether a frame type is one that requires automatic
2155 * retyping (by the pmap layer) back to XNU_DEFAULT when all mappings of the
2156 * page are gone.
2157 *
2158 * @return true if the type requires auto-retyping, false otherwise.
2159 */
2160 static inline bool
pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)2161 pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)
2162 {
2163 return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) ||
2164 (frame_type == XNU_USER_JIT) || (frame_type == XNU_ROZONE) ||
2165 (frame_type == XNU_KERNEL_RESTRICTED);
2166 }
2167
2168
2169 /**
2170 * If necessary, prepare a physical page for being retyped back to XNU_DEFAULT
2171 * after the last CPU mapping has been removed. This is only needed for pages of
2172 * certain special types such as the various executable types and the kernel RO
2173 * zone type.
2174 *
2175 * @note The PVH lock for the physical page that is getting a new mapping
2176 * registered must already be held.
2177 *
2178 * @param pa The physical address of the recently-unmapped page.
2179 *
2180 * @return true if the page will need to be retyped, false otherwise.
2181 */
2182 static inline bool
pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)2183 pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)
2184 {
2185 pvh_assert_locked(pa_index(pa));
2186 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2187 if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2188 pmap_retype_epoch_prepare_drain();
2189 return true;
2190 }
2191 return false;
2192 }
2193
2194 /**
2195 * If necessary, retype a physical page back to XNU_DEFAULT after the last CPU
2196 * mapping has been removed. This is only needed for pages of certain special
2197 * types such as the various executable types, the kernel RO zone type,
2198 * and XNU_KERNEL_RESTRICTED.
2199 *
2200 * @note The PVH lock for the physical page that is getting a new mapping
2201 * registered must already be held.
2202 *
2203 * @param pa The physical address of the recently-unmapped page.
2204 *
2205 * @return true if the page needed to be retyped, false otherwise.
2206 */
2207 static inline bool
pmap_retype_unmapped_page(pmap_paddr_t pa)2208 pmap_retype_unmapped_page(pmap_paddr_t pa)
2209 {
2210 pvh_assert_locked(pa_index(pa));
2211 const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2212 if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2213 sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2214 pmap_retype_epoch_drain();
2215 sptm_retype(pa & ~PAGE_MASK, frame_type, XNU_DEFAULT, retype_params);
2216 return true;
2217 }
2218 return false;
2219 }
2220
2221 static inline boolean_t
pmap_is_preemptible(void)2222 pmap_is_preemptible(void)
2223 {
2224 return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT) || PMAP_IS_HIBERNATING();
2225 }
2226
2227 /**
2228 * This helper function ensures that potentially-long-running batched operations are
2229 * called in preemptible context before entering the SPTM, so that the SPTM call may
2230 * periodically exit to allow pending urgent ASTs to be taken.
2231 */
2232 static inline void
pmap_verify_preemptible(void)2233 pmap_verify_preemptible(void)
2234 {
2235 assert(pmap_is_preemptible());
2236 }
2237
2238 /**
2239 * The minimum number of pages to keep in the PPL page free list.
2240 *
2241 * We define our target as 8 pages: enough for 2 page table pages, a PTD page,
2242 * and a PV page; in essence, twice as many pages as may be necessary to satisfy
2243 * a single pmap_enter request.
2244 */
2245 #define PMAP_MIN_FREE_PPL_PAGES 8
2246
2247 /**
2248 * Flags passed to various page allocation functions, usually accessed through
2249 * the pmap_page_alloc() API. Each function that can take these flags as
2250 * a part of its option field, will describe these flags in its function header.
2251 */
2252
2253 /* Can be used when no allocation flags are wanted. */
2254 #define PMAP_PAGE_ALLOCATE_NONE 0x0
2255
2256 /**
2257 * Instruct the allocation function to return immediately if no pages are
2258 * current available. Without this flag, the function will spin and wait for a
2259 * page to become available. This flag can be required in some circumstances
2260 * (for instance, when allocating pages from within the PPL).
2261 */
2262 #define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
2263
2264 /**
2265 * Instructs an allocation function to fallback to reclaiming a userspace page
2266 * table if it failed to allocate a page from the free lists. This can be useful
2267 * when allocating from within the PPL because refilling the free lists requires
2268 * exiting and re-entering the PPL (which incurs extra latency).
2269 *
2270 * This is a quick way of allocating a page at the expense of having to
2271 * reallocate the table the next time one of its mappings is accessed.
2272 */
2273 #define PMAP_PAGE_RECLAIM_NOWAIT 0x2
2274
2275 /**
2276 * Instructs an allocation function to avoid zero-filling the newly-allocated
2277 * page. This should be used only if you know the page will be fully initialized
2278 * by some other means on the relevant allocation path.
2279 */
2280 #define PMAP_PAGE_NOZEROFILL 0x4
2281
2282 /**
2283 * Global variables exported to the rest of the internal pmap implementation.
2284 */
2285 extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
2286 extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
2287 extern unsigned int inuse_pmap_pages_count;
2288 extern vm_object_t pmap_object;
2289 extern uint32_t pv_alloc_initial_target;
2290 extern uint32_t pv_kern_alloc_initial_target;
2291
2292 /**
2293 * Functions exported to the rest of the internal pmap implementation.
2294 */
2295 extern void pmap_data_bootstrap(void);
2296 extern void pmap_enqueue_pages(vm_page_t);
2297 extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
2298 extern void pmap_page_free(pmap_paddr_t);
2299
2300 /**
2301 * The modes in which a pmap lock can be acquired. Note that shared access
2302 * doesn't necessarily mean "read-only". As long as data is atomically updated
2303 * correctly (to account for multi-cpu accesses) data can still get written with
2304 * a shared lock held. Care just needs to be taken so as to not introduce any
2305 * race conditions when there are multiple writers.
2306 *
2307 * This is here in pmap_data.h because it's a needed parameter for pv_alloc()
2308 * and pmap_enter_pv(). This header is always included in pmap_internal.h before
2309 * the rest of the pmap locking code is defined so there shouldn't be any issues
2310 * with missing types.
2311 */
2312 OS_ENUM(pmap_lock_mode, uint8_t,
2313 PMAP_LOCK_SHARED,
2314 PMAP_LOCK_EXCLUSIVE,
2315 PMAP_LOCK_HELD);
2316
2317 /**
2318 * Possible return values for pv_alloc(). See the pv_alloc() function header for
2319 * a description of each of these values.
2320 */
2321 typedef enum {
2322 PV_ALLOC_SUCCESS,
2323 PV_ALLOC_RETRY,
2324 PV_ALLOC_FAIL
2325 } pv_alloc_return_t;
2326
2327 extern pv_alloc_return_t pv_alloc(
2328 pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);
2329 extern void pv_free(pv_entry_t *);
2330 extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);
2331 extern void pmap_compute_pv_targets(void);
2332 extern pv_alloc_return_t pmap_enter_pv(
2333 pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
2334
2335 typedef enum {
2336 PV_REMOVE_SUCCESS, /* found a mapping */
2337 PV_REMOVE_FAIL /* no mapping found */
2338 } pv_remove_return_t;
2339
2340 extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
2341
2342 extern void ptd_bootstrap(pt_desc_t *, unsigned int);
2343 extern pt_desc_t *ptd_alloc_unlinked(unsigned int);
2344 extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);
2345 extern void ptd_deallocate(pt_desc_t *);
2346 extern void ptd_info_init(
2347 pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);
2348
2349 extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
2350 extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);
2351
2352 extern void validate_pmap_internal(const volatile struct pmap *, const char *);
2353 extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);
2354
2355 /**
2356 * Macro function wrappers around pmap validation so that the calling function
2357 * can be printed in the panic strings for easier validation failure debugging.
2358 */
2359 #define validate_pmap(x) validate_pmap_internal(x, __func__)
2360 #define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
2361
2362 /**
2363 * This structure describes a SPTM-owned I/O range.
2364 *
2365 * @note This doesn't necessarily have to represent "I/O" only, this can also
2366 * represent non-kernel-managed DRAM (e.g., iBoot carveouts). Any physical
2367 * address region that isn't considered "kernel-managed" is fair game.
2368 *
2369 * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
2370 * device tree nodes. Astris (through the LowGlobals) also depends on the
2371 * consistency of this structure.
2372 *
2373 * @note These definitions are copied to SPTM and they need to be in sync.
2374 */
2375 typedef struct pmap_io_range {
2376 /* Physical address of the PPL-owned I/O range. */
2377 uint64_t addr;
2378
2379 /* Length (in bytes) of the PPL-owned I/O range. */
2380 uint64_t len;
2381
2382 /* Strong DSB required for pages in this range. */
2383 #define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31)
2384
2385 /* Corresponds to memory carved out by bootloader. */
2386 #define PMAP_IO_RANGE_CARVEOUT (1UL << 30)
2387
2388 /* Pages in this range need to be included in the hibernation image */
2389 #define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29)
2390
2391 /* Mark the range as 'owned' by a given subsystem */
2392 #define PMAP_IO_RANGE_OWNED (1UL << 28)
2393
2394 /**
2395 * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
2396 * mapping flags (defined above).
2397 */
2398 uint32_t wimg;
2399
2400 /* 4 Character Code (4CC) describing what this range is. */
2401 uint32_t signature;
2402 } pmap_io_range_t;
2403
2404 /* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
2405 _Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
2406
2407 extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);
2408
2409 /**
2410 * This structure describes a sub-page-size I/O region owned by SPTM but the kernel can write to.
2411 *
2412 * @note I/O filter software will use a collection of such data structures to determine access
2413 * permissions to a page owned by SPTM.
2414 *
2415 * @note The {signature, offset} key is used to index a collection of such data structures to
2416 * optimize for space in the case where one page layout is repeated for many devices, such
2417 * as the memory controller channels.
2418 */
2419 typedef struct pmap_io_filter_entry {
2420 /* 4 Character Code (4CC) describing what this range (page) is. */
2421 uint32_t signature;
2422
2423 /* Offset within the page. It has to be within [0, PAGE_SIZE). */
2424 uint16_t offset;
2425
2426 /* Length of the range, and (offset + length) has to be within [0, PAGE_SIZE). */
2427 uint16_t length;
2428 } pmap_io_filter_entry_t;
2429
2430 _Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t");
2431
2432 extern void pmap_cpu_data_init_internal(unsigned int);
2433