xref: /xnu-12377.81.4/osfmk/arm64/sptm/pmap/pmap_data.h (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /**
29  * This header file is used to store the types, prototypes, and inline functions
30  * that define some of the most important data structures used in the pmap. This
31  * header is only meant for sharing types within the pmap; if a type is meant to
32  * be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
33  */
34 #pragma once
35 
36 #include <stdint.h>
37 
38 #include <kern/ledger.h>
39 #include <mach/vm_types.h>
40 #include <mach_assert.h>
41 #include <vm/vm_page.h>
42 
43 #include <arm/cpu_data.h>
44 #include <arm/machine_routines.h>
45 #include <arm64/proc_reg.h>
46 
47 #if HIBERNATION
48 #include <arm64/hibernate_secure_hmac.h>
49 #endif /* HIBERNATION */
50 
51 /* Temporary include before moving all ledger functions into pmap_data.c */
52 #include <os/refcnt.h>
53 
54 /**
55  * These headers are safe to be included in this file since they shouldn't rely
56  * on any of the internal pmap header files (so no circular dependencies).
57  */
58 #include <arm64/sptm/pmap/pmap.h>
59 #include <arm64/sptm/pmap/pmap_pt_geometry.h>
60 
61 #include <arm64/sptm/sptm.h>
62 
63 /**
64  * These values represent the first and last kernel-managed physical addresses.
65  * We keep track of extra metadata on kernel-managed pages compared to other
66  * pages (usually iBoot carved out memory or I/O).
67  */
68 extern pmap_paddr_t vm_first_phys, vm_last_phys;
69 
70 #define PMAP_HIB_STATE_REACHED(states) false
71 #define PMAP_ASSERT_NOT_WRITING_HIB()
72 #define PMAP_IS_HIBERNATING() false
73 
74 /**
75  * Return whether the given address represents a kernel-managed physical page.
76  *
77  * Whether a page is considered "kernel-managed" is determined by the BootArgs
78  * passed by the bootloader. Typically memory carved out by the bootloader as
79  * well as I/O memory should return false.
80  *
81  * @param pa The physical address to check.
82  */
83 static inline bool
pa_valid(pmap_paddr_t pa)84 pa_valid(pmap_paddr_t pa)
85 {
86 	return (pa >= vm_first_phys) && (pa < vm_last_phys);
87 }
88 
/* Sentinel value indicating an invalid physical address index. */
90 #define INVALID_PAI UINT_MAX
91 
92 /**
93  * The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
94  * contain an entry for every kernel-managed page in the system. These systems
95  * are indexed with physical address indices ("pai") generated by this function.
96  *
97  * The logic is simple since there should be one entry in each of these data
98  * structures for each kernel-managed physical page in the system. These data
99  * structures are allocated on boot based on the amount of memory available.
100  *
101  * @note PAIs are defined using the VM page size, which might not be identical
102  *       to the underlying hardware page size for an arbitrary address space.
103  *       This means that the data structures relying on PAIs will contain one
104  *       entry for each VM page, not hardware page.
105  *
106  * @note This function is only valid for physical addresses that are
107  *       kernel-managed.
108  */
109 static inline unsigned int
pa_index(pmap_paddr_t pa)110 pa_index(pmap_paddr_t pa)
111 {
112 	return (unsigned int)atop(pa - vm_first_phys);
113 }
114 
115 /**
116  * Convert from a physical address index (pai) back to a raw physical address.
117  *
118  * @param pai The physical address index to convert to a PA.
119  *
120  * @return The page-aligned physical address corresponding to [pai].
121  */
122 static inline pmap_paddr_t
pai_to_pa(unsigned int pai)123 pai_to_pa(unsigned int pai)
124 {
125 	return ptoa((pmap_paddr_t)pai) + vm_first_phys;
126 }
127 
128 /* See the definition of pv_head_table for more information. */
129 extern uintptr_t *pv_head_table;
130 
131 /* Represents a NULL entry in the pv_head_table. */
132 #define PV_ENTRY_NULL ((pv_entry_t *) 0)
133 
134 /**
135  * Given a physical address index, return the corresponding pv_head_table entry.
136  *
137  * @note The returned entry might be invalid, or a pointer to a pt_entry_t,
138  *       pv_entry_t, or pt_desc_t depending on the type for this entry.
139  *       Determine the type using pvh_test_type().
140  *
141  * @param pai The index returned by pa_index() for the page whose pv_head_table
142  *            entry should be retrieved.
143  */
144 static inline uintptr_t
pai_to_pvh(unsigned int pai)145 pai_to_pvh(unsigned int pai)
146 {
147 	return pv_head_table[pai];
148 }
149 
150 /**
151  * Each pv_head_table entry can be one of four different types:
152  *
153  * - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
154  *                  physical aperture. Physical aperture mappings are not
155  *                  tracked in the pv_head_table.
156  *
157  * - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
158  *                  These entries are linked lists of pv_entry_t objects (which
159  *                  each contain a pointer to the associated PTE and a pointer
160  *                  to the next entry in the list).
161  *
162  * - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
163  *                  mappings are created, this entry will get upgraded to an
164  *                  entry of type PVH_TYPE_PVEP. These entries are pointers
165  *                  directly to the page table entry that contain the mapping
166  *                  (pt_entry_t*).
167  *
168  * - PVH_TYPE_PTDP: The physical page is being used as a page table. These
169  *                  entries are pointers to page table descriptor structures
170  *                  (pt_desc_t) which contain metadata related to each page
171  *                  table.
172  *
173  * The type is stored in the bottom two bits of each pv_head_table entry. That
174  * type needs to be checked before dereferencing the pointer to determine which
175  * pointer type to dereference as.
176  */
177 __enum_closed_decl(pvh_type_t, uint8_t, {
178 	PVH_TYPE_NULL = 0b00,
179 	PVH_TYPE_PVEP = 0b01,
180 	PVH_TYPE_PTEP = 0b10,
181 	PVH_TYPE_PTDP = 0b11,
182 });
183 
184 #define PVH_TYPE_MASK (0x3UL)
185 
186 
187 /**
188  * PV_HEAD_TABLE Flags.
189  *
190  * All flags listed below are stored in the pv_head_table entry/pointer
191  * (per-physical-page) unless otherwise noted.
192  *
193  * Please update the pv_walk LLDB macro if these flags are changed or added to.
194  */
195 
196 /**
197  * This flag is set for every mapping created by an IOMMU.
198  *
199  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
200  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
201  */
202 #define PVH_FLAG_IOMMU 0x4UL
203 
204 /**
205  * This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
206  * this bit is set, then the PTE pointer points directly into the IOMMU page
207  * table for this mapping. If this bit is cleared, then the "PTE pointer" is
208  * actually a pointer to the IOMMU descriptor object that owns this mapping.
209  *
210  * There are cases where it's not easy to tie an IOMMU mapping directly to a
211  * specific page table, so this allows us to at least get a pointer to which
212  * IOMMU created this mapping which is useful for debugging purposes.
213  *
214  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
215  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
216  */
217 #define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
218 
219 /**
220  * This flag is set when the first CPU (non-IOMMU) mapping is created. This is
221  * important to keep track of because various accounting statistics are based on
222  * the options specified for the first CPU mapping. This flag, and thus the
223  * accounting statistics, will persist as long as there *any* mappings of the
224  * page (including IOMMU mappings). This works because the accounting for a page
225  * should not need to change until the page is recycled by the VM layer, and we
226  * double-check that there are no mappings (CPU or IOMMU) when a page is
227  * recycled (see: pmap_verify_free()).
228  */
229 #define PVH_FLAG_CPU (1ULL << 62)
230 
231 /* This bit is used as a lock when modifying a pv_head_table entry. */
232 #define PVH_LOCK_BIT 61
233 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
234 
235 /**
236  * This flag is set when there are any executable mappings to this physical
237  * page. This is used to prevent any writable mappings from being created at
238  * the same time an executable mapping exists.
239  */
240 #define PVH_FLAG_EXEC (1ULL << 60)
241 
242 /**
243  * This flag is used to mark that a page has been hashed into the hibernation
244  * image.
245  *
246  * The hibernation driver will use this to ensure that all PPL-owned memory is
247  * correctly included into the hibernation image (a missing PPL page could be
248  * a security concern when coming out of hibernation).
249  */
250 #define PVH_FLAG_HASHED (1ULL << 58)
251 
252 /**
253  * Marking a pv_head_table entry with this flag denotes that this page is
254  * retired without any mappings and never should be mapped again.
255  */
256 #define PVH_FLAG_RETIRED (1ULL << 55)
257 
258 /**
259  * This flag is used to mark that a PV head entry has been placed into
260  * "sleep mode", which typically happens when the lock owner needs to
261  * process a long PV list.  If this bit is set, threads which contend
262  * on the PVH lock must call thread_block() to wait until they are awakened
263  * by the current lock owner releasing the lock.
264  */
265 #define PVH_FLAG_SLEEP (1ULL << 54)
266 
267 /**
268  * These bits need to be set to safely dereference a pv_head_table
269  * entry/pointer.
270  *
271  * Any change to this #define should also update the copy located in the pmap.py
272  * LLDB macros file.
273  */
274 #define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED | PVH_FLAG_RETIRED)
275 
276 #define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
277 
278 #define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
279 
280 /* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
281 #define PVH_LIST_MASK (~PVH_TYPE_MASK)
282 
283 /* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
284 #define PVH_LOCK_WORD 1 /* Assumes little-endian */
285 
/**
 * Assert that a pv_head_table entry is locked. Will panic if the lock isn't
 * acquired.
 *
 * @note An entry in sleep mode (PVH_FLAG_SLEEP) also satisfies this
 *       assertion: PVH_LOCK_FLAGS covers both the LOCK and SLEEP bits, and a
 *       sleep-mode entry is logically owned even though the hardware lock bit
 *       may have been dropped.
 *
 * @param index The physical address index to check.
 */
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
	/* The entry is loaded twice (once for the check, once for the message); both are relaxed loads. */
	assertf(os_atomic_load(&pv_head_table[index], relaxed) & PVH_LOCK_FLAGS,
	    "%s: PVH %p (=%p) for pai 0x%x not locked or in sleep mode", __func__,
	    &pv_head_table[index], (void*)(os_atomic_load(&pv_head_table[index], relaxed)), index);
}
299 
300 /**
301  * Helper function for returning the 32-bit PVH lock word corresponding
302  * to a physical address index.
303  *
304  * @param index The physical address index of the pv_head_table entry
305  *
306  * @return A pointer to the 32-bit word containing the lock bit
307  */
308 static inline uint32_t*
pvh_lock_word(unsigned int index)309 pvh_lock_word(unsigned int index)
310 {
311 	return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
312 }
313 
314 /**
315  * Helper macro for computing the lock bit offset within the 32-bit
316  * lock word for each PV head entry.
317  *
318  * @return A 32-bit integer containing the lock bit offset.
319  */
320 #define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
321 
/**
 * Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
 *
 * @note If the entry is found in sleep mode, this function drops the bit lock
 *       and blocks until woken by the sleep-mode owner, then retries.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
	extern unsigned int not_in_kdp;
	/*
	 * A preemptible context is normally required; the exceptions are early
	 * boot, hibernation, and the kernel debugger (not_in_kdp == 0).
	 */
	const bool was_preemptible = preemption_enabled();
	assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) ||
	    PMAP_IS_HIBERNATING() || !not_in_kdp);

	/*
	 * Allow the bit-lock spin to bail out when a preemption is pending, but
	 * only if this thread was preemptible to begin with.
	 */
	bool (^check_preemption)(void) = ^bool (void) {
		return was_preemptible && pmap_pending_preemption();
	};

	hw_lock_status_t ret;
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	do {
		ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
		    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);

		if (ret == HW_LOCK_ACQUIRED) {
			locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
			if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
				/*
				 * Sleep mode: register for a wakeup on the entry's address,
				 * drop the bit lock, and block. Mark the attempt as contended
				 * so the loop reacquires from scratch after waking.
				 */
				wait_result_t wres;
				wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
				hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
				assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
				thread_block(THREAD_CONTINUE_NULL);
				ret = HW_LOCK_CONTENDED;
			}
		}
	} while (ret != HW_LOCK_ACQUIRED);

	return locked_pvh;
}
362 
/**
 * Lock a pvh_head_table entry, possibly in a preemption-disabled context.
 *
 * @note This function is only meant for special use cases in which pmap
 *       functions must be invoked with preemption disabled.  These cases
 *       are expected to be rare and limited.  If you think you need to
 *       use this in more places, you're probably wrong.
 *
 * @note Panics if the entry is in sleep mode, since blocking is not possible
 *       with preemption disabled.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
	if (__improbable(preemption_enabled())) {
		/* Preemption turns out to be enabled; just take the normal path. */
		return pvh_lock(index);
	}
	/* Spin for the lock bit; we cannot block here. */
	hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
	const locked_pvh_t locked_pvh = {.pvh = os_atomic_load(&pv_head_table[index], relaxed), .pai = index};

	if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
		/* Sleep mode would require thread_block(), which is illegal with preemption disabled. */
		panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
	}

	return locked_pvh;
}
390 
/**
 * Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
 *
 * @note An entry in sleep mode is treated as a failed acquisition: the bit
 *       lock is released again and a zero (failure) wrapper is returned.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry if successful,
 *         0 otherwise.  Use pvh_try_lock_success() to check the result.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)
{
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);

	if (locked) {
		locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
		/* A locked entry always has at least the LOCK bit set, so it can never be 0. */
		assert(locked_pvh.pvh != 0);
		if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
			/* Can't "try-lock" a sleep-mode entry without blocking; back off. */
			hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
			locked_pvh.pvh = 0;
		}
	}

	return locked_pvh;
}
416 
417 /**
418  * Helper for determining whether a preceding pvh_try_lock() call succeeded.
419  *
420  * @param locked_pvh A wrapper representing a possibly-locked PV head table entry
421  *        returned by pvh_try_lock().
422  *
423  * @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
424  */
425 static inline bool
pvh_try_lock_success(const locked_pvh_t * locked_pvh)426 pvh_try_lock_success(const locked_pvh_t *locked_pvh)
427 {
428 	assert(locked_pvh != NULL);
429 	return locked_pvh->pvh != 0;
430 }
431 
/**
 * Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
 * lock will sleep until this thread calls pvh_unlock().
 *
 * @note It is legal to call this function if the lock is already in sleep mode.
 *       In that case, the call will have no effect.
 * @note This function must not be called with preemption disabled by any other agent
 *       but [locked_pvh] itself.  Preemption must be fully re-enabled by the time
 *       this function returns, either because it was already enabled (because the
 *       lock was already in sleep mode), or because this function enabled it by placing
 *       the lock in sleep mode.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 */
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	if (!(old_pvh & PVH_FLAG_SLEEP)) {
		assert(old_pvh & PVH_FLAG_LOCK);
		/* Publish the SLEEP bit before dropping the bit lock so contenders see it. */
		os_atomic_store(&pv_head_table[index], old_pvh | PVH_FLAG_SLEEP, relaxed);
		/**
		 * Tell the scheduler that this thread may need a priority boost if it needs to go
		 * off-core, to reduce the likelihood of priority inversion.
		 */
		locked_pvh->pri_token = thread_priority_floor_start();
		hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
	}

	/* Hibernation runs single-core so we can skip this check. */
	assert(preemption_enabled() || PMAP_IS_HIBERNATING());
}
468 
469 /**
470  * Check that a pv_head_table entry/pointer is a specific type.
471  *
472  * @param pvh The pv_head_table entry/pointer to check.
473  * @param type The type to check for.
474  *
475  * @return True if the pv_head_table entry is of the passed in type, false
476  *         otherwise.
477  */
478 static inline bool
pvh_test_type(uintptr_t pvh,pvh_type_t type)479 pvh_test_type(uintptr_t pvh, pvh_type_t type)
480 {
481 	return (pvh & PVH_TYPE_MASK) == type;
482 }
483 
/**
 * Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
 *
 * @note Only the non-lock flags, pointer, and type fields of the entry will be updated
 *       according to the passed-in value.  PVH_LOCK_FLAGS will be ignored as they are
 *       directly manipulated by this function.
 *
 * @note If the entry was placed in sleep mode, this function reacquires the bit
 *       lock (dropped by pvh_lock_enter_sleep_mode()), clears the SLEEP bit,
 *       wakes any waiters, and ends the priority floor started when entering
 *       sleep mode.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 *        The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
 *        in pv_head_table to reflect any updates that may have been performed on the PV list
 *        while the lock was held.
 */
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	bool pri_floor_end = false;

	if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
		/* Sleep mode: the bit lock was dropped when entering sleep mode, so reacquire it. */
		pri_floor_end = true;
		const bool was_preemptible = preemption_enabled();
		bool (^check_preemption)(void) = ^bool (void) {
			return was_preemptible && pmap_pending_preemption();
		};

		hw_lock_status_t ret;
		do {
			ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
			    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
		} while (ret != HW_LOCK_ACQUIRED);

		/* Store the caller's updated entry with SLEEP cleared, then wake all waiters. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
		thread_wakeup(&pv_head_table[index]);
	} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
		/* Only write back if the caller actually changed the entry. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
	}
	hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);

	if (__improbable(pri_floor_end)) {
		/* Drop the priority boost taken by pvh_lock_enter_sleep_mode(). */
		thread_priority_floor_end(&locked_pvh->pri_token);
	}

	/* Invalidate the wrapper so a stale double-unlock trips the asserts above. */
	locked_pvh->pvh = 0;
}
534 
535 /**
536  * Convert a pv_head_table entry/pointer into a page table entry pointer. This
537  * should only be done if the type of this entry is PVH_TYPE_PTEP.
538  *
539  * @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
540  *
541  * @return Return back a safe to derefence pointer to the single mapping of this
542  *         physical page by masking off the TYPE bits and adding any missing
543  *         flags to the upper portion of the pointer.
544  */
545 static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)546 pvh_ptep(uintptr_t pvh)
547 {
548 	assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
549 	return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
550 }
551 
552 /**
553  * Convert a pv_head_table entry/pointer into a PVE list pointer. This
554  * should only be done if the type of this entry is PVH_TYPE_PVEP.
555  *
556  * @param pvh The pv_head_table entry/pointer to convert into a safe to
557  *            dereference pv_entry_t*.
558  *
559  * @return Return back a safe to derefence pointer to the first mapping of this
560  *         physical page by masking off the TYPE bits and adding any missing
561  *         flags to the upper portion of the pointer.
562  */
563 static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)564 pvh_pve_list(uintptr_t pvh)
565 {
566 	assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
567 	return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
568 }
569 
570 /**
571  * Return the mutable flags associated with a pv_head_table entry/pointer.
572  *
573  * @param pvh The pv_head_table entry whose flags to get.
574  *
575  * @return The mutable flags encoded in [pvh].
576  */
577 static inline uintptr_t
pvh_get_flags(uintptr_t pvh)578 pvh_get_flags(uintptr_t pvh)
579 {
580 	return pvh & PVH_MUTABLE_FLAGS;
581 }
582 
583 /**
584  * Update the flags associated with a pv_head_table entry/pointer.
585  *
586  * @note This function does not actually modify the pv_head_table,
587  *       it only installs an updated pv_head_table entry in [locked_pvh]
588  *       that can later be passed to pvh_unlock() to update the actual array
589  *       entry.
590  *
591  * @param locked_pvh A wrapper struct containing the pv_head_table
592  *                   entry/pointer to update.
593  *
594  */
595 static inline void
pvh_set_flags(locked_pvh_t * locked_pvh,uintptr_t flags)596 pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
597 {
598 	locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
599 }
600 
601 /**
602  * Update a pv_head_table entry/pointer to be a different type and/or point to
603  * a different object.
604  *
605  * @note This function does not actually modify the pv_head_table,
606  *       it only installs an updated pv_head_table entry in [locked_pvh]
607  *       that can later be passed to pvh_unlock() to update the actual array
608  *       entry.
609  *
610  * @param locked_pvh A wrapper struct containing the pv_head_table
611  *                   entry/pointer to update.
612  * @param pvep The new entry to use. This could be either a pt_entry_t*,
613  *             pv_entry_t*, or pt_desc_t* depending on the type.
614  * @param type The type of the new entry.
615  */
616 static inline void
pvh_update_head(locked_pvh_t * locked_pvh,void * pvep,unsigned int type)617 pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
618 {
619 	assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
620 	const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
621 	locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
622 }
623 
624 /**
625  * Given a page table entry pointer retrieved from the pv_head_table (from an
626  * entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
627  * an IOMMU mapping.
628  *
629  * @note The way this function determines whether the passed in pointer is
630  *       pointing to an IOMMU PTE, is by checking for a special flag stored in
631  *       the lower bits of the pointer. This flag is only set on pointers stored
632  *       in the pv_head_table, and as such, this function will only work on
633  *       pointers retrieved from the pv_head_table. If a pointer to a PTE was
634  *       directly retrieved from an IOMMU's page tables, this function would
635  *       always return false despite actually being an IOMMU PTE.
636  *
637  * @param ptep A PTE pointer obtained from the pv_head_table to check.
638  *
639  * @return True if the entry is an IOMMU mapping, false otherwise.
640  */
641 static inline bool
pvh_ptep_is_iommu(const pt_entry_t * ptep)642 pvh_ptep_is_iommu(const pt_entry_t *ptep)
643 {
644 #ifdef PVH_FLAG_IOMMU
645 	return (uintptr_t)ptep & PVH_FLAG_IOMMU;
646 #else /* PVH_FLAG_IOMMU */
647 	#pragma unused(ptep)
648 	return false;
649 #endif /* PVH_FLAG_IOMMU */
650 }
651 
652 /**
653  * Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
654  * type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
655  * strips out those flags and returns back a dereferencable pointer.
656  *
657  * @param ptep The PTE pointer to strip out the unwanted flags.
658  *
659  * @return A valid dereferencable pointer to the page table entry.
660  */
661 static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t * ptep)662 pvh_strip_ptep(const pt_entry_t *ptep)
663 {
664 #ifdef PVH_FLAG_IOMMU
665 	const uintptr_t pte_va = (uintptr_t)ptep;
666 	return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
667 #else /* PVH_FLAG_IOMMU */
668 	return ptep;
669 #endif /* PVH_FLAG_IOMMU */
670 }
671 
672 /**
673  * PVH_TYPE_PVEP Helper Functions.
674  *
675  * The following are methods used to manipulate PVE lists. This is the type of
676  * pv_head_table entry used when there are multiple mappings to a single
677  * physical page.
678  */
679 
680 /**
681  * Whether a physical page is using "alternate accounting" (ALTACCT) for its
682  * ledger statistics is something that needs to be tracked on a per-mapping
683  * basis, not on a per-physical-page basis. Because of that, it's tracked
684  * differently depending on whether there's a single mapping to a page
685  * (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
686  * tracked in the pp_attr_table. But when there are multiple mappings, the least
687  * significant bit of the corresponding "pve_pte" pointer in each pv_entry object
688  * is used as a marker for pages using alternate accounting.
689  *
690  * @note See the definition for PP_ATTR_ALTACCT for a more detailed description
691  *       of what "alternate accounting" actually means in respect to the
692  *       footprint ledger.
693  *
694  * Since some code (KernelDiskImages, e.g.) might map a phsyical page as
695  * "device" memory (i.e. external) while it's also being used as regular
696  * "anonymous" memory (i.e. internal) in user space, we have to manage the
697  * "internal" attribute per mapping rather than per physical page.
698  * When there are multiple mappings, we use the next least significant bit of
699  * the corresponding "pve_pte" pointer for that.
700  */
701 #define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
702 #define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
703 #define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
704 
705 /**
706  * Set the ALTACCT bit for a specific PTE pointer.
707  *
708  * @param pvep A pointer to the current pv_entry mapping in the linked list of
709  *             mappings.
710  * @param idx Index of the chosen PTE pointer inside the PVE.
711  */
712 static inline void
pve_set_altacct(pv_entry_t * pvep,unsigned idx)713 pve_set_altacct(pv_entry_t *pvep, unsigned idx)
714 {
715 	assert(idx < PTE_PER_PVE);
716 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
717 }
718 
719 /**
720  * Set the INTERNAL bit for a specific PTE pointer.
721  *
722  * @param pvep A pointer to the current pv_entry mapping in the linked list of
723  *             mappings.
724  * @param idx Index of the chosen PTE pointer inside the PVE.
725  */
726 static inline void
pve_set_internal(pv_entry_t * pvep,unsigned idx)727 pve_set_internal(pv_entry_t *pvep, unsigned idx)
728 {
729 	assert(idx < PTE_PER_PVE);
730 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
731 }
732 
733 /**
734  * Clear the ALTACCT bit for a specific PTE pointer.
735  *
736  * @param pvep A pointer to the current pv_entry mapping in the linked list of
737  *             mappings.
738  * @param idx Index of the chosen PTE pointer inside the PVE.
739  */
740 static inline void
pve_clr_altacct(pv_entry_t * pvep,unsigned idx)741 pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
742 {
743 	assert(idx < PTE_PER_PVE);
744 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
745 }
746 
747 /**
748  * Clear the INTERNAL bit for a specific PTE pointer.
749  *
750  * @param pvep A pointer to the current pv_entry mapping in the linked list of
751  *             mappings.
752  * @param idx Index of the chosen PTE pointer inside the PVE.
753  */
754 static inline void
pve_clr_internal(pv_entry_t * pvep,unsigned idx)755 pve_clr_internal(pv_entry_t *pvep, unsigned idx)
756 {
757 	assert(idx < PTE_PER_PVE);
758 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
759 }
760 
761 /**
762  * Return the ALTACCT bit for a specific PTE pointer.
763  *
764  * @param pvep A pointer to the current pv_entry mapping in the linked list of
765  *             mappings.
766  * @param idx Index of the chosen PTE pointer inside the PVE.
767  */
768 static inline bool
pve_get_altacct(pv_entry_t * pvep,unsigned idx)769 pve_get_altacct(pv_entry_t *pvep, unsigned idx)
770 {
771 	assert(idx < PTE_PER_PVE);
772 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
773 }
774 
775 /**
776  * Return the INTERNAL bit for a specific PTE pointer.
777  *
778  * @param pvep A pointer to the current pv_entry mapping in the linked list of
779  *             mappings.
780  * @param idx Index of the chosen PTE pointer inside the PVE.
781  */
782 static inline bool
pve_get_internal(pv_entry_t * pvep,unsigned idx)783 pve_get_internal(pv_entry_t *pvep, unsigned idx)
784 {
785 	assert(idx < PTE_PER_PVE);
786 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
787 }
788 
789 /**
790  * Return the next mapping (pv_entry) in a linked list of mappings. This applies
791  * to pv_head_table entries of type PVH_TYPE_PVEP.
792  *
793  * @param pvep A pointer to the current pv_entry mapping in the linked list of
794  *             mappings.
795  *
796  * @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
797  *         end of the list has been reached.
798  */
799 static inline pv_entry_t *
pve_next(pv_entry_t * pvep)800 pve_next(pv_entry_t *pvep)
801 {
802 	return pvep->pve_next;
803 }
804 
805 /**
806  * Return a pointer to the pve_next field in a pv_entry. This value is used
807  * when adding and removing entries to a PVE list.
808  *
809  * @param pvep The pv_entry whose pve_next field is being accessed.
810  *
811  * @return Pointer to the pve_next field.
812  */
813 static inline pv_entry_t **
pve_next_ptr(pv_entry_t * pvep)814 pve_next_ptr(pv_entry_t *pvep)
815 {
816 	return &pvep->pve_next;
817 }
818 
819 /**
820  * Return a pointer to the page table entry for this mapping.
821  *
822  * @param pvep The pv_entry whose pve_ptep field is to be returned.
823  * @param idx Index of the chosen PTE pointer inside the PVE.
824  *
825  * @return Pointer to the page table entry.
826  */
827 static inline pt_entry_t *
pve_get_ptep(pv_entry_t * pvep,unsigned idx)828 pve_get_ptep(pv_entry_t *pvep, unsigned idx)
829 {
830 	assert(idx < PTE_PER_PVE);
831 	return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
832 }
833 
834 /**
835  * Update the page table entry for a specific physical to virtual mapping.
836  *
837  * @param pvep The pv_entry to update.
838  * @param idx Index of the chosen PTE pointer inside the PVE.
839  * @param ptep_new The new page table entry.
840  */
841 static inline void
pve_set_ptep(pv_entry_t * pvep,unsigned idx,pt_entry_t * ptep_new)842 pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
843 {
844 	assert(idx < PTE_PER_PVE);
845 	pvep->pve_ptep[idx] = ptep_new;
846 }
847 
848 /**
849  * Initialize all fields in a PVE to NULL.
850  *
851  * @param pvep The pv_entry to initialize.
852  */
853 static inline void
pve_init(pv_entry_t * pvep)854 pve_init(pv_entry_t *pvep)
855 {
856 	pvep->pve_next = PV_ENTRY_NULL;
857 	for (int i = 0; i < PTE_PER_PVE; i++) {
858 		pvep->pve_ptep[i] = PT_ENTRY_NULL;
859 	}
860 }
861 
862 /**
863  * Find PTE pointer in PVE and return its index.
864  *
865  * @param pvep The PVE to search.
866  * @param ptep PTE to search for.
867  *
868  * @return Index of the found entry, or -1 if no entry exists.
869  */
870 static inline int
pve_find_ptep_index(pv_entry_t * pvep,pt_entry_t * ptep)871 pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
872 {
873 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
874 		if (pve_get_ptep(pvep, i) == ptep) {
875 			return (int)i;
876 		}
877 	}
878 
879 	return -1;
880 }
881 
882 /**
883  * Checks if no PTEs are currently associated with this PVE.
884  *
885  * @param pvep The PVE to search.
886  *
887  * @return True if no PTEs are currently associated with this PVE, or false.
888  */
889 static inline bool
pve_is_empty(pv_entry_t * pvep)890 pve_is_empty(pv_entry_t *pvep)
891 {
892 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
893 		if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
894 			return false;
895 		}
896 	}
897 
898 	return true;
899 }
900 
901 /**
902  * Prepend a new pv_entry node to a PVE list.
903  *
904  * @note This function does not actually modify the pv_head_table,
905  *       it only installs an updated pv_head_table entry in [locked_pvh]
906  *       that can later be passed to pvh_unlock() to update the actual array
907  *       entry.
908  *
909  * @param locked_pvh A wrapper struct containing the pv_head_table
910  *                   entry/pointer to update.  This entry represents
911  *                   the linked list of mappings to update.
912  * @param pvep The new mapping to add to the linked list.
913  */
914 static inline void
pve_add(locked_pvh_t * locked_pvh,pv_entry_t * pvep)915 pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
916 {
917 	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
918 
919 	pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
920 	pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
921 }
922 
923 /**
924  * Remove an entry from a PVE list of mappings.
925  *
926  * @note This function does not actually modify the pv_head_table,
927  *       it only installs an updated pv_head_table entry in [locked_pvh]
928  *       that can later be passed to pvh_unlock() to update the actual array
929  *       entry.
930  *
931  * @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
932  *                   to update.  This entry represents the linked list of mappings
933  *                   from which to remove an entry.
934  * @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
935  *              is the first in the linked list of mappings, then NULL should be
936  *              passed here and the removal will be reflected in the returned
937  *              pv_head_table entry.
938  * @param pvep The entry that should be removed. Should be identical to a
939  *             dereference of the pvepp parameter (unless it's the pv_head_table
940  *             entry).
941  */
942 static inline void
pve_remove(locked_pvh_t * locked_pvh,pv_entry_t ** pvepp,pv_entry_t * pvep)943 pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
944 {
945 	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
946 
947 	if (pvepp == NULL) {
948 		assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
949 		    __func__, (void*)locked_pvh->pvh, pvep);
950 		if (pve_next(pvep) == PV_ENTRY_NULL) {
951 			/* The last mapping to this page is being removed. */
952 			pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
953 		} else {
954 			/**
955 			 * There are still mappings left, make the next one the new head of
956 			 * the list. This effectively removes the first entry from the list.
957 			 */
958 			pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
959 		}
960 	} else {
961 		/**
962 		 * Move the previous entry's next field to the entry after the one being
963 		 * removed. This will clobber the ALTACCT and INTERNAL bits.
964 		 */
965 		*pvepp = pve_next(pvep);
966 	}
967 }
968 
969 /**
970  * PVH_TYPE_PTDP Types and Helper Functions.
971  *
972  * The following are types and methods used to manipulate page table descriptor
973  * (PTD) objects. This is the type of pv_head_table entry used when a page is
974  * being used as a page table.
975  */
976 
977 /**
978  * Page table descriptor (PTD) info structure.
979  *
980  * Contains information about a page table. These pieces of data are separate
981  * from the PTD itself because in address spaces where the VM page size doesn't
982  * match the underlying hardware page size, one PTD could represent multiple
983  * page tables (and so will need multiple PTD info structures).
984  *
985  * These fields are also in their own struct so that they can be allocated
986  * separately from the associated pt_desc_t object. This allows us to allocate
987  * the counts in this structure in a way that ensures they don't fall within the
988  * same cache line as the main pt_desc_t object. This is important because the
989  * fields in this structure are atomically updated which could cause false
990  * sharing cache performance issues with the "va" field in pt_desc_t if all of
991  * the fields were within the same structure.
992  */
typedef struct {
	/**
	 * For non-leaf pagetables, should be 0.
	 * For leaf pagetables, should reflect the number of wired entries.
	 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
	 * operations are implicitly wired).
	 *
	 * Updated atomically from multiple CPUs; kept out of the pt_desc_t cache
	 * line for that reason (see the struct-level comment above).
	 */
	unsigned short wiredcnt;
} ptd_info_t;
1002 
1003 /**
1004  * This type is used to identify a specific IOMMU driver and an instance of
1005  * that driver which owns a specific page or page table. This type will be used
1006  * within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
1007  * respectively.
1008  *
1009  * Despite the fact this value is not a pointer, we need to make this value sort
1010  * of look like a kernel pointer: the bottom 3-bits must be zero and the upper
1011  * bits must all be ones by default. This is due to the fact that this type can
1012  * be embedded into the PVH table to represent an IOMMU mapping. The PVH table
1013  * code expects "kernel-pointer-like" properties so it can store flags in those
1014  * areas of the 64-bit value.
1015  */
typedef uint64_t iommu_instance_t;

/*
 * 8-bit ID of the IOMMU driver which the instance derives from.
 * Bits [7:0] of the instance value are kept zero so that the value remains
 * "kernel-pointer-like" (see the iommu_instance_t comment above).
 */
#define IOMMU_ID_SHIFT 8U
#define IOMMU_ID_MASK  0x000000000000FF00ULL

#define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
#define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)

/**
 * An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
 * This is strictly used to help with debugging and provides a mechanism to
 * trace a mapping or page table back to the exact IOMMU instance that owns it.
 * Typically, this would be the instance ID, but for drivers that use only a
 * single global instance, this could be something else like a root page table
 * ppnum_t.
 */
#define IOMMU_TOKEN_SHIFT 16U
#define IOMMU_TOKEN_MASK  0x0000FFFFFFFF0000ULL

#define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
#define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)

/**
 * The default value for iommu_instance_t. See the type definition for more
 * details on why the upper bits need to initially be all ones.
 */
#define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL

/**
 * Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
 * instance sets the ID and token to all ones as a sentinel invalid value.
 */
#define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
1050 
1051 /**
1052  * Page Table Descriptor (PTD).
1053  *
1054  * Provides a per-table data structure and a way of keeping track of all page
1055  * tables in the system.
1056  *
1057  * This structure is also used as a convenient way of keeping track of IOMMU
1058  * pages (which may or may not be used as page tables). In that case the SPTM
1059  * frame type for the page will be XNU_IOMMU, the "iommu" field will describe
1060  * the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
1061  * refcnt controlled by the IOMMU driver.
1062  */
typedef struct pt_desc {
	/*
	 * Each page table is either owned by a pmap or a specific IOMMU.
	 *
	 * NOTE(review): only the pmap member is declared in this view; the
	 * struct-level comment above also mentions an "iommu" owner field for
	 * XNU_IOMMU pages -- confirm whether that member is conditionally
	 * compiled elsewhere.
	 */
	union {
		struct pmap *pmap;
	};

	/**
	 * The following fields contain per-page-table properties, and as such,
	 * might have multiple elements each. This is due to a single PTD
	 * potentially representing multiple page tables (in address spaces where
	 * the VM page size differs from the hardware page size). Use the
	 * ptd_get_index() function to get the correct index for a specific page
	 * table.
	 */

	/**
	 * The first address of the virtual address space this page table is
	 * translating for, or a value set by an IOMMU driver if this PTD is being
	 * used to track an IOMMU page.
	 */
	vm_offset_t va;

	/**
	 * ptd_info_t's are allocated separately so as to reduce false sharing
	 * with the va field. This is desirable because ptd_info_t's are updated
	 * atomically from all CPUs.
	 */
	ptd_info_t *ptd_info;
} pt_desc_t;
1092 
1093 /**
1094  * Per-CPU structure for tracking in-flight SPTM retype operations.
1095  *
1096  * This structure is intended to be embedded in the pmap per-CPU data object,
1097  * and is meant to be used for situations in which the caller needs to ensure
1098  * that potentially sensitive concurrent SPTM operations have completed on other
1099  * CPUs prior to an operation (such as a retype) that requires page or mapping
1100  * state to be stable.  When draining these concurrent operations, the caller
1101  * is also expected to have already taken steps to ensure the page/mapping
1102  * state requirements will be visible to any concurrent pmap operation initiated
1103  * after the drain operation is begun, so that only previously-initiated
1104  * operations will need to be purged.
1105  */
typedef struct {
	/**
	 * Critical section sequence number of the local CPU.  A value of zero
	 * indicates that no pmap epoch critical section is currently active on
	 * the CPU.
	 */
	uint64_t local_seq;

	/**
	 * The sequence number to use the next time a pmap epoch critical section
	 * is entered on the local CPU.  This should monotonically increase.
	 */
	uint64_t next_seq;

	/**
	 * This array stores the retype sequence numbers observed on remote CPUs.
	 * When the local CPU needs to wait for critical sections to complete on
	 * other CPUs, this is intended to provide an initial sample of those other
	 * CPUs' critical section state.  The caller can then wait for each remote
	 * CPU's sequence number to return to zero or advance beyond the value
	 * stored in its entry in this array.
	 */
	uint64_t remote_seq[MAX_CPUS];

	/*
	 * Flags used to track the state of an active pmap epoch drain operation
	 * on the local CPU.  Stored in the "flags" field below.
	 */

	/**
	 * This flag indicates that a drain operation has been prepared on the
	 * local CPU by sampling remote CPU epoch states into the remote_seq array.
	 * This must be set before the drain operation can be performed.
	 */
	#define PMAP_EPOCH_PREPARED (1 << 0)

	/**
	 * This flag indicates that one or more remote CPUs had a non-zero retype
	 * epoch value when the remote_seq array was most recently sampled.
	 * If this flag is not set, then we already know that no remote CPUs can
	 * be in a critical section in which prior mapping state for the page to
	 * be retyped may have been observed, so we can skip the drain operation.
	 */
	#define PMAP_EPOCH_DRAIN_REQUIRED (1 << 1)
	uint8_t flags;
} pmap_epoch_t;
1152 
/* Alignment (in bytes) of each per-CPU SPTM data slot; must divide PAGE_SIZE
 * evenly (enforced by the _Static_assert below). */
#define PMAP_SPTM_PCPU_ALIGN (8192)

/* Per-CPU scratch state used when marshalling batched operations into the SPTM. */
typedef struct {
	/**
	 * Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
	 * during bootstrap.
	 */
	sptm_pte_t *sptm_prev_ptes;

	/**
	 * A piece of per-cpu scratch memory used by IOMMU drivers when passing data
	 * into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
	 */
	void *sptm_iommu_scratch;

	/* Accumulator for batched user pointer SPTM ops, to avoid excessive stack usage. */
	sptm_user_pointer_op_t sptm_user_pointer_ops[SPTM_MAPPING_LIMIT];

	/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
	sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];

	/* The two accumulators below are never needed at the same time, hence the union. */
	union {
		/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
		sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];

		/* Accumulator for PA arrays to be passed to the SPTM, to avoid excessive stack usage. */
		sptm_paddr_t sptm_paddrs[SPTM_MAPPING_LIMIT];
	};

	/* Base PA of user pointer ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_user_pointer_ops_pa;

	/* Base PA of ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_ops_pa;

	/* Base PA of templates array, for passing templates into the SPTM. */
	pmap_paddr_t sptm_templates_pa;

	/* Base PA of physical address array, for passing physical address lists into the SPTM. */
	pmap_paddr_t sptm_paddrs_pa;

	/* PMAP pagetable descriptors associated with each element of sptm_ops. */
	pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];

	/* PTD info objects associated with each pmap PTE pointer. */
	ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];

	/* Accounting-related flags for each element of sptm_ops. */
	#define PMAP_SPTM_FLAG_INTERNAL (0x1)
	#define PMAP_SPTM_FLAG_ALTACCT (0x2)
	uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];

	/* pmap epoch tracking structure. */
	pmap_epoch_t pmap_epoch;

	/* Guest virtual machine dispatch structure. */
	sptm_guest_dispatch_t sptm_guest_dispatch;

	/* Guest virtual machine dispatch structure physical address. */
	pmap_paddr_t sptm_guest_dispatch_paddr;

	/* SPTM Logical CPU ID */
	uint16_t sptm_cpu_id;

	/* Read index associated with this CPU's SPTM trace buffer */
	uint64_t sptm_trace_buffer_read_index;

	/* Previous SPTM state for use with sptm_trace_num_new_traces */
	uint64_t sptm_trace_prev_state;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;

_Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
    "SPTM per-CPU data alignment does not fit evenly within a page");
_Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
    "sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");

PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
1230 
1231 /**
1232  * Convert a pv_head_table entry/pointer into a page table descriptor pointer.
1233  * This should only be done if the type of this entry is PVH_TYPE_PTDP.
1234  *
1235  * @param pvh The pv_head_table entry/pointer to convert into a safe to
1236  *            dereference pt_desc_t*.
1237  *
 * @return Return back a safe to dereference pointer to the page table descriptor
1239  *         for this physical page by masking off the TYPE bits and adding any
1240  *         missing flags to the upper portion of the pointer.
1241  */
1242 static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)1243 pvh_ptd(uintptr_t pvh)
1244 {
1245 	return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
1246 }
1247 
1248 /**
1249  * Given an arbitrary page table entry, return back the page table descriptor
1250  * (PTD) object for the page table that contains that entry.
1251  *
1252  * @param ptep Pointer to a PTE whose page table descriptor object to return.
1253  *
1254  * @return The PTD object for the passed in page table.
1255  */
1256 static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t * ptep)1257 ptep_get_ptd(const pt_entry_t *ptep)
1258 {
1259 	assert(ptep != NULL);
1260 
1261 	const vm_offset_t pt_base_va = (vm_offset_t)ptep;
1262 	uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
1263 
1264 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1265 		panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
1266 	}
1267 
1268 	return pvh_ptd(pvh);
1269 }
1270 
1271 /**
1272  * Given an arbitrary page table entry, return back the pmap that owns that
1273  * page table.
1274  *
 * @note This won't work correctly for page tables owned by IOMMUs, because
 *       those tables aren't owned by any specific pmap.
1277  *
1278  * @param ptep Pointer to a page table entry whose owner we're trying to return.
1279  *
1280  * @return The pmap that owns the given page table entry.
1281  */
1282 static inline struct pmap *
ptep_get_pmap(const pt_entry_t * ptep)1283 ptep_get_pmap(const pt_entry_t *ptep)
1284 {
1285 	return ptep_get_ptd(ptep)->pmap;
1286 }
1287 
1288 
1289 /**
1290  * Given an arbitrary translation table entry, get the page table descriptor
1291  * (PTD) object for the page table pointed to by the TTE.
1292  *
1293  * @param tte The translation table entry to parse. For instance, if this is an
1294  *            L2 TTE, then the PTD for the L3 table this entry points to will be
1295  *            returned.
1296  *
1297  * @return The page table descriptor (PTD) for the page table pointed to by this
1298  *         TTE.
1299  */
1300 static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)1301 tte_get_ptd(const tt_entry_t tte)
1302 {
1303 	const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
1304 	uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
1305 
1306 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1307 		panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
1308 	}
1309 
1310 	return pvh_ptd(pvh);
1311 }
1312 
1313 /**
1314  * This function returns the ptd_info_t structure associated with a given
1315  * page table descriptor.
1316  *
1317  * @param ptd The page table descriptor that's being accessed.
1318  *
1319  * @return ptd_info_t structure associated with [ptd].
1320  */
1321 static inline ptd_info_t *
ptd_get_info(pt_desc_t * ptd)1322 ptd_get_info(pt_desc_t *ptd)
1323 {
1324 	assert(ptd != NULL);
1325 	return ptd->ptd_info;
1326 }
1327 
1328 /**
1329  * Given a pointer to a page table entry, return back the ptd_info structure
1330  * for the page table that contains that entry.
1331  *
1332  * @param ptep Pointer to a PTE whose ptd_info object to return.
1333  *
1334  * @return The ptd_info object for the page table that contains the passed in
1335  *         page table entry.
1336  */
1337 static inline ptd_info_t *
ptep_get_info(const pt_entry_t * ptep)1338 ptep_get_info(const pt_entry_t *ptep)
1339 {
1340 	return ptd_get_info(ptep_get_ptd(ptep));
1341 }
1342 
1343 /**
1344  * Return the virtual address mapped by the passed in leaf page table entry,
1345  * using an already-retrieved pagetable descriptor.
1346  *
1347  * @param ptdp pointer to the descriptor for the pagetable containing ptep
1348  * @param ptep Pointer to a PTE to parse
1349  */
1350 static inline vm_map_address_t
ptd_get_va(const pt_desc_t * ptdp,const pt_entry_t * ptep)1351 ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
1352 {
1353 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
1354 
1355 	vm_map_address_t va = ptdp->va;
1356 
1357 	const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptdp->pmap));
1358 	const vm_offset_t ptep_page = (vm_offset_t)ptep >> pmap_page_shift;
1359 
1360 	/**
1361 	 * Use the difference between the VM page shift and the hardware page shift
1362 	 * to get the index of the correct page table. In practice, this equates to
1363 	 * masking out the bottom two bits of the L3 table index in address spaces
1364 	 * where the VM page size is greater than the hardware page size. In address
1365 	 * spaces where they're identical, the index will always be zero.
1366 	 */
1367 	const unsigned int ttep_index = ptep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
1368 	va += ttep_index * pt_attr_twig_size(pt_attr);
1369 
1370 	/* Increment VA now to target the VA space covered by this specific PTE */
1371 	const vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
1372 	va += (ptep_index << pt_attr_leaf_shift(pt_attr));
1373 
1374 	return va;
1375 }
1376 
1377 /**
1378  * Return the virtual address that is being mapped by the passed in leaf page
1379  * table entry.
1380  *
1381  * @param ptep Pointer to a PTE to parse.
1382  */
1383 static inline vm_map_address_t
ptep_get_va(const pt_entry_t * ptep)1384 ptep_get_va(const pt_entry_t *ptep)
1385 {
1386 	return ptd_get_va(ptep_get_ptd(ptep), ptep);
1387 }
1388 
1389 /**
1390  * Physical Page Attribute Table (pp_attr_table) defines and helper functions.
1391  */
1392 
/* Type used to store per-VM-page flags (16 bits of flags per physical page). */
typedef uint16_t pp_attr_t;

/* See the definition of pp_attr_table for more information. */
extern volatile pp_attr_t* pp_attr_table;

/**
 * Flags stored in the pp_attr_table on a per-physical-page basis.
 *
 * Please update the pv_walk LLDB macro if these flags are changed or added to.
 */

/**
 * The bottom 6-bits are used to store the default WIMG (cacheability and memory
 * type) setting for this physical page. This can be changed by calling
 * pmap_set_cache_attributes().
 *
 * If a default WIMG setting isn't set for a page, then the default is Normal,
 * Cached memory (VM_WIMG_DEFAULT).
 */
#define PP_ATTR_WIMG_MASK 0x003F
#define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)

/**
 * The reference and modify bits keep track of whether a page has been accessed
 * or modified since the last time the bits were cleared. These bits are used to
 * enforce policy decisions in the VM layer.
 */
#define PP_ATTR_REFERENCED 0x0040
#define PP_ATTR_MODIFIED   0x0080

/**
 * This physical page is being used as anonymous memory that's internally
 * managed by the VM and is not connected to an external pager. This flag is
 * only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
 * subsequent mappings won't set/clear this flag until all mappings are removed
 * and a new CPU mapping is added.
 */
#define PP_ATTR_INTERNAL 0x0100

/**
 * This flag is used to keep track of pages that are still resident but are not
 * considered dirty and can be reclaimed under memory pressure. These pages do
 * not count as a part of the memory footprint, so the footprint ledger does not
 * need to be updated for these pages. This is hinted to the VM by the
 * `madvise(MADV_FREE_REUSABLE)` system call.
 */
#define PP_ATTR_REUSABLE 0x0200

/**
 * This flag denotes that a page is utilizing "alternate accounting". This means
 * that the pmap doesn't need to keep track of these pages with regards to the
 * footprint ledger because the VM is already accounting for them in a different
 * way. These include IOKit mappings (VM adds their entire virtual size to the
 * footprint), and purgeable pages (VM counts them only when non-volatile and
 * only for one "owner"), among others.
 *
 * Note that alternate accounting status is tracked on a per-mapping basis (not
 * per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
 * when there's a single mapping to a page. When there are multiple mappings,
 * the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
 * above).
 */
#define PP_ATTR_ALTACCT 0x0400

/**
 * This bit was originally used on x86 to keep track of what pages to not
 * encrypt during the hibernation process as a performance optimization when
 * encryption was done in software. This doesn't apply to the ARM
 * hibernation process because all pages are automatically encrypted using
 * hardware acceleration. Despite that, the pmap still keeps track of this flag
 * as a debugging aid on internal builds.
 *
 * TODO: This bit can probably be reclaimed:
 * rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
 */
#define PP_ATTR_NOENCRYPT 0x0800

/**
 * These bits denote that a physical page is expecting the next access or
 * modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
 * respectively.
 */
#define PP_ATTR_REFFAULT 0x1000
#define PP_ATTR_MODFAULT 0x2000
1478 
1479 /**
1480  * Atomically set some flags in a pp_attr_table entry.
1481  *
1482  * @param pai The physical address index for the entry to update.
1483  * @param bits The flags to set in the entry.
1484  */
1485 static inline void
ppattr_set_bits(unsigned int pai,pp_attr_t bits)1486 ppattr_set_bits(unsigned int pai, pp_attr_t bits)
1487 {
1488 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1489 	os_atomic_or(ppattr, bits, relaxed);
1490 }
1491 
1492 /**
1493  * Atomically clear some flags in a pp_attr_table entry.
1494  *
1495  * @param pai The physical address index for the entry to update.
1496  * @param bits The flags to clear in the entry.
1497  */
1498 static inline void
ppattr_clear_bits(unsigned int pai,pp_attr_t bits)1499 ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
1500 {
1501 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1502 	os_atomic_andnot(ppattr, bits, relaxed);
1503 }
1504 
1505 /**
1506  * General-purpose function for atomically modifying flags in a pp_attr_table entry.
1507  *
1508  * @param pai The physical address index for the entry to update.
1509  * @param bits_to_clear Mask of bits to atomically clear from the entry.
1510  * @param bits_to_set Mask of bits to atomically set in the entry.
1511  *
1512  * @note [bits_to_clear] and [bits_to_set] must not overlap.
1513  */
1514 static inline void
ppattr_modify_bits(unsigned int pai,pp_attr_t bits_to_clear,pp_attr_t bits_to_set)1515 ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
1516 {
1517 	assert((bits_to_set & bits_to_clear) == 0);
1518 	pp_attr_t prev_ppattr, new_ppattr;
1519 	os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
1520 		new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
1521 	});
1522 }
1523 
1524 /**
1525  * Return true if the pp_attr_table entry contains the passed in bits.
1526  *
1527  * @param pai The physical address index for the entry to test.
1528  * @param bits The flags to check for.
1529  */
1530 static inline bool
ppattr_test_bits(unsigned int pai,pp_attr_t bits)1531 ppattr_test_bits(unsigned int pai, pp_attr_t bits)
1532 {
1533 	const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1534 	return (*ppattr & bits) == bits;
1535 }
1536 
1537 /**
1538  * Only set some flags in a pp_attr_table entry if the passed in physical
1539  * address is a kernel-managed address.
1540  *
1541  * @param pa The physical address for the entry to update.
1542  * @param bits The flags to set in the entry.
1543  */
1544 static inline void
ppattr_pa_set_bits(pmap_paddr_t pa,pp_attr_t bits)1545 ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
1546 {
1547 	if (pa_valid(pa)) {
1548 		ppattr_set_bits(pa_index(pa), bits);
1549 	}
1550 }
1551 
1552 /**
1553  * Only clear some flags in a pp_attr_table entry if the passed in physical
1554  * address is a kernel-managed address.
1555  *
1556  * @param pa The physical address for the entry to update.
1557  * @param bits The flags to clear in the entry.
1558  */
1559 static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa,pp_attr_t bits)1560 ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
1561 {
1562 	if (pa_valid(pa)) {
1563 		ppattr_clear_bits(pa_index(pa), bits);
1564 	}
1565 }
1566 
1567 /**
1568  * Only test flags in a pp_attr_table entry if the passed in physical address
1569  * is a kernel-managed page.
1570  *
1571  * @param pa The physical address for the entry to test.
1572  * @param bits The flags to check for.
1573  *
1574  * @return False if the PA isn't a kernel-managed page, otherwise true/false
1575  *         depending on whether the bits are set.
1576  */
1577 static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa,pp_attr_t bits)1578 ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
1579 {
1580 	return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
1581 }
1582 
/**
 * Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
 * in physical address is a kernel-managed page.
 *
 * @note Silently does nothing for non-kernel-managed addresses (see
 *       ppattr_pa_set_bits()).
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)
{
	ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
}
1594 
/**
 * Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Silently does nothing for non-kernel-managed addresses (see
 *       ppattr_pa_clear_bits()).
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)
{
	ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
}
1606 
/**
 * Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Silently does nothing for non-kernel-managed addresses (see
 *       ppattr_pa_set_bits()).
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)
{
	ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
}
1618 
/**
 * Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Silently does nothing for non-kernel-managed addresses (see
 *       ppattr_pa_clear_bits()).
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)
{
	ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
}
1630 
/**
 * Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_internal(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_INTERNAL);
}
1641 
/**
 * Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_internal(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
}
1652 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the flag is set, false otherwise.
 */
static inline bool
ppattr_test_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1663 
/**
 * Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reusable(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REUSABLE);
}
1674 
/**
 * Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reusable(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
}
1685 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the flag is set, false otherwise.
 */
static inline bool
ppattr_test_reusable(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
}
1696 
/**
 * Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_altacct(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_ALTACCT);
}
1711 
/**
 * Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_altacct(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
}
1726 
/**
 * Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page uses alternate accounting, false
 *         otherwise.
 */
static inline bool
ppattr_is_altacct(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
}
1744 
/**
 * Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the INTERNAL flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
 *       PP_ATTR_INTERNAL definitions for more information.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page is accounted for as "internal", false
 *         otherwise.
 */
static inline bool
ppattr_is_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1762 
1763 /**
1764  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1765  * depending on whether there are one or multiple mappings to a page. This
1766  * function abstracts out the difference between single and multiple mappings to
1767  * a page and provides a single function for determining whether alternate
1768  * accounting is set for a mapping.
1769  *
1770  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1771  *       definitions for more information.
1772  *
1773  * @param pai The physical address index for the entry to test.
1774  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1775  * @param idx Index of the chosen PTE pointer inside the PVE.
1776  *
1777  * @return True if the passed in page uses alternate accounting, false
1778  *         otherwise.
1779  */
1780 static inline bool
ppattr_pve_is_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1781 ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1782 {
1783 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
1784 }
1785 
1786 /**
1787  * The "internal" (INTERNAL) status for a page is tracked differently
1788  * depending on whether there are one or multiple mappings to a page. This
1789  * function abstracts out the difference between single and multiple mappings to
1790  * a page and provides a single function for determining whether "internal"
1791  * is set for a mapping.
1792  *
1793  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1794  *       definitions for more information.
1795  *
1796  * @param pai The physical address index for the entry to test.
1797  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1798  * @param idx Index of the chosen PTE pointer inside the PVE.
1799  *
1800  * @return True if the passed in page is "internal", false otherwise.
1801  */
1802 static inline bool
ppattr_pve_is_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1803 ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1804 {
1805 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
1806 }
1807 
1808 /**
1809  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1810  * depending on whether there are one or multiple mappings to a page. This
1811  * function abstracts out the difference between single and multiple mappings to
1812  * a page and provides a single function for setting the alternate accounting status
1813  * for a mapping.
1814  *
1815  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1816  *       definitions for more information.
1817  *
1818  * @param pai The physical address index for the entry to update.
1819  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1820  * @param idx Index of the chosen PTE pointer inside the PVE.
1821  */
1822 static inline void
ppattr_pve_set_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1823 ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1824 {
1825 	if (pvep == PV_ENTRY_NULL) {
1826 		ppattr_set_altacct(pai);
1827 	} else {
1828 		pve_set_altacct(pvep, idx);
1829 	}
1830 }
1831 
1832 /**
1833  * The "internal" (INTERNAL) status for a page is tracked differently
1834  * depending on whether there are one or multiple mappings to a page. This
1835  * function abstracts out the difference between single and multiple mappings to
1836  * a page and provides a single function for setting the "internal" status
1837  * for a mapping.
1838  *
1839  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1840  *       definitions for more information.
1841  *
1842  * @param pai The physical address index for the entry to update.
1843  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1844  * @param idx Index of the chosen PTE pointer inside the PVE.
1845  */
1846 static inline void
ppattr_pve_set_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1847 ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1848 {
1849 	if (pvep == PV_ENTRY_NULL) {
1850 		ppattr_set_internal(pai);
1851 	} else {
1852 		pve_set_internal(pvep, idx);
1853 	}
1854 }
1855 
1856 /**
1857  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1858  * depending on whether there are one or multiple mappings to a page. This
1859  * function abstracts out the difference between single and multiple mappings to
1860  * a page and provides a single function for clearing the alternate accounting status
1861  * for a mapping.
1862  *
1863  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1864  *       definitions for more information.
1865  *
1866  * @param pai The physical address index for the entry to update.
1867  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1868  * @param idx Index of the chosen PTE pointer inside the PVE.
1869  */
1870 static inline void
ppattr_pve_clr_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1871 ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1872 {
1873 	if (pvep == PV_ENTRY_NULL) {
1874 		ppattr_clear_altacct(pai);
1875 	} else {
1876 		pve_clr_altacct(pvep, idx);
1877 	}
1878 }
1879 
1880 /**
1881  * The "internal" (INTERNAL) status for a page is tracked differently
1882  * depending on whether there are one or multiple mappings to a page. This
1883  * function abstracts out the difference between single and multiple mappings to
1884  * a page and provides a single function for clearing the "internal" status
1885  * for a mapping.
1886  *
1887  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1888  *       definitions for more information.
1889  *
1890  * @param pai The physical address index for the entry to update.
1891  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1892  * @param idx Index of the chosen PTE pointer inside the PVE.
1893  */
1894 static inline void
ppattr_pve_clr_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1895 ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1896 {
1897 	if (pvep == PV_ENTRY_NULL) {
1898 		ppattr_clear_internal(pai);
1899 	} else {
1900 		pve_clr_internal(pvep, idx);
1901 	}
1902 }
1903 
/**
 * Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reffault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REFFAULT);
}
1914 
/**
 * Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reffault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
}
1925 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the flag is set, false otherwise.
 */
static inline bool
ppattr_test_reffault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
}
1936 
/**
 * Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_modfault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_MODFAULT);
}
1947 
/**
 * Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_modfault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
}
1958 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the flag is set, false otherwise.
 */
static inline bool
ppattr_test_modfault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
}
1969 
1970 /**
1971  * pmap epoch operations:
1972  *
1973  * The pmap epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
1974  * can ensure all CPUs have observed updated mapping state before performing an operation
1975  * such as a retype which requires that no other operations be in-flight against the
1976  * prior mapping state.
1977  *
1978  * There are certain cases in which the pmap, while issuing an SPTM call that modifies
1979  * mappings, cannot hold locks such as the PVH lock which would prevent the mapped page
1980  * from being concurrently retyped.  This is particularly true for batched operations
1981  * such as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
1982  * In these cases, the pmap may call pmap_epoch_enter() to note that it is performing such
1983  * a sensitive operation on the local CPU.  It must then call pmap_epoch_exit() upon
1984  * completion of the sensitive operation.  While retyping is the most common case that
1985  * requires epoch synchronization, there are a few other cases as well, such as marking
1986  * a leaf page table as unnested so that all subsequent mappings in it will be non-global.
1987  *
1988  * For any instance in which the pmap needs to retype a page (or otherwise alter mapping
1989  * policy) without being guaranteed (e.g. by VM layer locking or the existing page type)
1990  * that such a sensitive operation is not in progress on some other CPU, it must drain these
1991  * sensitive operations from other CPUs.  Specifically, it must ensure that any
1992  * sensitive operation which may have observed mapping state under the prior mapping policy
1993  * has completed.  This is accomplished by first calling pmap_epoch_prepare_drain() to
1994  * record the initial pmap epoch state of all CPUs, followed by pmap_epoch_drain() to ensure
1995  * all remote CPUs are either not in an epoch or have advanced beyond the initially recorded
1996  * epoch. These are exposed as two separate functions in order to allow the calling CPU to
1997  * do other work between calling pmap_epoch_prepare_drain() and pmap_epoch_drain(), as a
1998  * best-effort attempt to minimize time wasted spinning in pmap_epoch_drain().
1999  *
2000  * When draining the epoch, the following assumptions must hold true:
2001  *
2002  * 1) The calling thread must guarantee that prior updates needed to apply the new mapping
2003  * policy have already been performed and made globally visible using the appropriate
2004  * barriers.  In the most common (retype) case, this means all existing mappings of the
2005  * page must have been removed.  For any alterations of mapping state, global visibility is
2006  * conveniently already guaranteed by the DSBs that are architecturally required to
2007  * synchronize PTE updates and the TLBIs that follow them.
2008  *
2009  * 2) For operations that require exclusive in-flight page references such as retyping,
2010  * the calling thread must have some means of ensuring that new mappings cannot be added
2011  * for the page that would bring it out of the correct state for the operation, or that
2012  * would cause an SPTM violation due to a shared/exclusive in-flight reference conflict.
2013  * For retyping this is typically done by holding the PVH lock such that pmap_enter()
2014  * cannot concurrently execute against the page.
2015  *
2016  * 3) The calling thread must not perform any operation which requires preemptibility
2017  * between calling pmap_epoch_prepare_drain() and pmap_epoch_drain().
2018  */
2019 
/**
 * Enter the pmap epoch on the local CPU to indicate an in-progress SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function increments the thread's preemption disable count and returns
 *       with preemption disabled. The caller must later invoke pmap_epoch_exit()
 *       on the same CPU.
 *
 * @note This function issues all required barriers to ensure correct ordering of
 *       the epoch update relative to ensuing SPTM accesses.
 */
static inline void
pmap_epoch_enter(void)
{
	/* Disable preemption first so the per-CPU epoch state can't migrate under us. */
	mp_disable_preemption();
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	assert(!preemption_enabled());

	/* Must not already be in a pmap epoch on this CPU (epochs don't nest). */
	assert(pmap_epoch->local_seq == 0);
	/* Advance to a fresh, non-zero sequence; non-zero means "epoch active". */
	pmap_epoch->local_seq = ++pmap_epoch->next_seq;
	/* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
	assert(pmap_epoch->local_seq != 0);

	/**
	 * Issue a store-load barrier to ensure that remote observers of any ensuing
	 * SPTM accesses will also observe the epoch update.
	 */
	os_atomic_thread_fence(seq_cst);
}
2049 
/**
 * Exit the pmap epoch on the local CPU to indicate completion of an SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function must be called with preemption disabled (i.e. after a
 *       matching pmap_epoch_enter()) and will decrement the current thread's
 *       preemption disable count.
 */
static inline void
pmap_epoch_exit(void)
{
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	assert(!preemption_enabled());
	/* local_seq must still hold the value assigned by the matching epoch-enter. */
	assert(pmap_epoch->local_seq == pmap_epoch->next_seq);

	/**
	 * Clear the sequence using a store-release operation to ensure that prior
	 * SPTM modifications will be visible to remote observers before the absence
	 * of an epoch is visible.
	 */
	os_atomic_store(&pmap_epoch->local_seq, 0, release);
	mp_enable_preemption();
}
2072 
2073 /**
2074  * Helper for determining whether the current CPU is within an epoch.
2075  *
2076  * @return true if the current CPU holds the epoch, false otherwise.
2077  */
2078 static inline bool
pmap_in_epoch(void)2079 pmap_in_epoch(void)
2080 {
2081 	return !preemption_enabled() && (PERCPU_GET(pmap_sptm_percpu)->pmap_epoch.local_seq != 0);
2082 }
2083 
/**
 * Prepare the local CPU to perform an epoch drain operation by recording the retype
 * epoch state of other CPUs.
 *
 * @note This function increments the current thread's preemption disable count and
 *       returns with preemption disabled.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 *       retype operation is not speculated ahead of the epoch sampling.
 *
 * @note This function does NOT issue any barriers to ensure that prior updates of
 *       mapping state are globally visible and have proper store-load ordering with
 *       respect to the scan performed here.  In the cases where this function is
 *       intended to be used, this ordering should be guaranteed automatically by
 *       the DSBs used to synchronize prior mapping updates issued by the caller.
 *       If this function is ever used in a situation where that cannot be guaranteed,
 *       the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
 *       thread_fence) before calling this function.
 */
static inline void
pmap_epoch_prepare_drain(void)
{
	/* Keep this thread pinned; the recorded snapshot is per-CPU state. */
	mp_disable_preemption();
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	/* A prior prepare without a matching drain would leave stale flags. */
	assert(pmap_epoch->flags == 0);
	unsigned int i = 0;
	uint8_t flags = PMAP_EPOCH_PREPARED;

	/* Sample each CPU's epoch state. */
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		const uint64_t remote_epoch =
		    os_atomic_load(&pmap_pcpu->pmap_epoch.local_seq, relaxed);
		pmap_epoch->remote_seq[i] = remote_epoch;

		/**
		 * If the remote CPU has an active epoch, make a note to ourselves that
		 * we'll need to drain it.
		 */
		if (remote_epoch != 0) {
			flags |= PMAP_EPOCH_DRAIN_REQUIRED;
		}
		++i;
	}
	pmap_epoch->flags = flags;

	/**
	 * Issue a load-load barrier to ensure subsequent drain or retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2135 
/**
 * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
 * most recent call to pmap_epoch_prepare_drain().
 *
 * @note This function expects to be called with preemption disabled and will decrement
 *       the current thread's preemption disable count.
 *
 * @note pmap_epoch_prepare_drain() must have been called on the local CPU
 *       prior to calling this function.  This function will return immediately if
 *       this prior call did not observe any active epochs on remote CPUs.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 *       retype operation is not speculated ahead of the epoch sampling.
 */
static inline void
pmap_epoch_drain(void)
{
	assert(!preemption_enabled());
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	const uint8_t flags = pmap_epoch->flags;
	/* pmap_epoch_prepare_drain() must have been called first. */
	assert(flags & PMAP_EPOCH_PREPARED);
	/* Consume the prepared state so the next prepare/drain pair starts clean. */
	pmap_epoch->flags = 0;
	if (!(flags & PMAP_EPOCH_DRAIN_REQUIRED)) {
		/* No remote CPU was in an epoch at prepare time; nothing to wait for. */
		mp_enable_preemption();
		return;
	}
	unsigned int i = 0;
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		if (pmap_epoch->remote_seq[i] != 0) {
			/* Epochs only advance, so the remote CPU can't still be in an older one. */
			assert((pmap_pcpu->pmap_epoch.local_seq == 0) ||
			    (pmap_pcpu->pmap_epoch.local_seq >= pmap_epoch->remote_seq[i]));
			/**
			 * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
			 * or advances to a new epoch.
			 */
			while ((os_atomic_load_exclusive(&pmap_pcpu->pmap_epoch.local_seq, relaxed) ==
			    pmap_epoch->remote_seq[i])) {
				__builtin_arm_wfe();
			}
			/* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
			os_atomic_clear_exclusive();
		}
		++i;
	}
	mp_enable_preemption();
	/**
	 * Issue a load-load barrier to ensure subsequent accesses to sensitive state will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2187 
2188 /**
2189  * Helper to determine whether a frame type is one that requires automatic
2190  * retyping (by the pmap layer) back to XNU_DEFAULT when the page is about
2191  * to be recycled by the VM layer.
2192  *
2193  * @return true if the type requires auto-retyping, false otherwise.
2194  */
2195 static inline bool
pmap_type_requires_retype_on_recycle(sptm_frame_type_t frame_type)2196 pmap_type_requires_retype_on_recycle(sptm_frame_type_t frame_type)
2197 {
2198 	return sptm_type_is_user_executable(frame_type) ||
2199 	       (frame_type == XNU_ROZONE) || (frame_type == XNU_KERNEL_RESTRICTED);
2200 }
2201 
2202 static inline boolean_t
pmap_is_preemptible(void)2203 pmap_is_preemptible(void)
2204 {
2205 	return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT) || PMAP_IS_HIBERNATING();
2206 }
2207 
/**
 * This helper function ensures that potentially-long-running batched operations are
 * called in preemptible context before entering the SPTM, so that the SPTM call may
 * periodically exit to allow pending urgent ASTs to be taken.
 *
 * @note Debug/assert builds panic here if the context is not preemptible (per
 *       pmap_is_preemptible()); on release builds this is a no-op.
 */
static inline void
pmap_verify_preemptible(void)
{
	assert(pmap_is_preemptible());
}
2218 
2219 /**
2220  * The minimum number of pages to keep in the PPL page free list.
2221  *
2222  * We define our target as 8 pages: enough for 2 page table pages, a PTD page,
2223  * and a PV page; in essence, twice as many pages as may be necessary to satisfy
2224  * a single pmap_enter request.
2225  */
2226 #define PMAP_MIN_FREE_PPL_PAGES 8
2227 
2228 /**
2229  * Flags passed to various page allocation functions, usually accessed through
2230  * the pmap_page_alloc() API. Each function that can take these flags as
2231  * a part of its option field, will describe these flags in its function header.
2232  */
2233 
2234 /* Can be used when no allocation flags are wanted. */
2235 #define PMAP_PAGE_ALLOCATE_NONE 0x0
2236 
2237 /**
 * Instruct the allocation function to return immediately if no pages are
 * currently available. Without this flag, the function will spin and wait for a
2240  * page to become available. This flag can be required in some circumstances
2241  * (for instance, when allocating pages from within the PPL).
2242  */
2243 #define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
2244 
2245 /**
 * Instructs an allocation function to fall back to reclaiming a userspace page
2247  * table if it failed to allocate a page from the free lists. This can be useful
2248  * when allocating from within the PPL because refilling the free lists requires
2249  * exiting and re-entering the PPL (which incurs extra latency).
2250  *
2251  * This is a quick way of allocating a page at the expense of having to
2252  * reallocate the table the next time one of its mappings is accessed.
2253  */
2254 #define PMAP_PAGE_RECLAIM_NOWAIT 0x2
2255 
2256 /**
2257  * Instructs an allocation function to avoid zero-filling the newly-allocated
2258  * page.  This should be used only if you know the page will be fully initialized
2259  * by some other means on the relevant allocation path.
2260  */
2261 #define PMAP_PAGE_NOZEROFILL 0x4
2262 
2263 /**
2264  * Global variables exported to the rest of the internal pmap implementation.
2265  */
2266 extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
2267 extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
2268 extern unsigned int inuse_pmap_pages_count;
2269 extern vm_object_t pmap_object;
2270 extern uint32_t pv_alloc_initial_target;
2271 extern uint32_t pv_kern_alloc_initial_target;
2272 
2273 /**
2274  * Functions exported to the rest of the internal pmap implementation.
2275  */
2276 extern void pmap_data_bootstrap(void);
2277 extern void pmap_enqueue_pages(vm_page_t);
2278 extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
2279 extern void pmap_page_free(pmap_paddr_t);
2280 
2281 /**
2282  * The modes in which a pmap lock can be acquired. Note that shared access
2283  * doesn't necessarily mean "read-only". As long as data is atomically updated
2284  * correctly (to account for multi-cpu accesses) data can still get written with
2285  * a shared lock held. Care just needs to be taken so as to not introduce any
2286  * race conditions when there are multiple writers.
2287  *
2288  * This is here in pmap_data.h because it's a needed parameter for pv_alloc()
2289  * and pmap_enter_pv(). This header is always included in pmap_internal.h before
2290  * the rest of the pmap locking code is defined so there shouldn't be any issues
2291  * with missing types.
2292  */
2293 OS_ENUM(pmap_lock_mode, uint8_t,
2294     PMAP_LOCK_SHARED,
2295     PMAP_LOCK_EXCLUSIVE,
2296     PMAP_LOCK_HELD);
2297 
2298 /**
2299  * Possible return values for pv_alloc(). See the pv_alloc() function header for
2300  * a description of each of these values.
2301  */
2302 typedef enum {
2303 	PV_ALLOC_SUCCESS,
2304 	PV_ALLOC_RETRY,
2305 	PV_ALLOC_FAIL
2306 } pv_alloc_return_t;
2307 
2308 extern pv_alloc_return_t pv_alloc(
2309 	pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);
2310 extern void pv_free(pv_entry_t *);
2311 extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);
2312 extern void pmap_compute_pv_targets(void);
2313 extern pv_alloc_return_t pmap_enter_pv(
2314 	pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
2315 
2316 typedef enum {
2317 	PV_REMOVE_SUCCESS, /* found a mapping */
2318 	PV_REMOVE_FAIL /* no mapping found */
2319 } pv_remove_return_t;
2320 
2321 extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
2322 
2323 extern void ptd_bootstrap(pt_desc_t *, unsigned int);
2324 extern pt_desc_t *ptd_alloc_unlinked(unsigned int);
2325 extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);
2326 extern void ptd_deallocate(pt_desc_t *);
2327 extern void ptd_info_init(
2328 	pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);
2329 extern void ptd_info_finalize(pt_desc_t *);
2330 
2331 extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
2332 extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);
2333 
2334 extern void validate_pmap_internal(const volatile struct pmap *, const char *);
2335 extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);
2336 
2337 /**
2338  * Macro function wrappers around pmap validation so that the calling function
2339  * can be printed in the panic strings for easier validation failure debugging.
2340  */
2341 #define validate_pmap(x) validate_pmap_internal(x, __func__)
2342 #define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
2343 
/**
 * This structure describes a SPTM-owned physical memory range.
 *
 * @note This doesn't necessarily have to represent "I/O" only, this
 *       can also represent non-kernel-managed DRAM (e.g., iBoot
 *       carveouts). In some special cases, this can also represent
 *       kernel-managed DRAM, when adding flags for special behavior
 *       (e.g. the range being off limits for hibtext). Such ranges
 *       must be marked with the PMAP_IO_RANGE_NOT_IO flag.
 *
 * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
 *       device tree nodes. Astris (through the LowGlobals) also depends on the
 *       consistency of this structure.
 *
 * @note These definitions are copied to SPTM and they need to be in sync.
 */
typedef struct pmap_io_range {
	/* Physical address of the SPTM-owned I/O range. */
	uint64_t addr;

	/* Length (in bytes) of the SPTM-owned I/O range. */
	uint64_t len;

	/* Strong DSB required for pages in this range. */
	#define PMAP_IO_RANGE_STRONG_SYNC (1U << 31)

	/* Corresponds to memory carved out by bootloader. */
	#define PMAP_IO_RANGE_CARVEOUT (1U << 30)

	/* Pages in this range need to be included in the hibernation image. */
	#define PMAP_IO_RANGE_NEEDS_HIBERNATING (1U << 29)

	/* Mark the range as 'owned' by a given subsystem. */
	#define PMAP_IO_RANGE_OWNED (1U << 28)

	/**
	 * Denotes a range that is *not* to be treated as an I/O range that
	 * needs to be mapped, but only to decorate arbitrary physical
	 * memory ranges (including of managed memory) with extra
	 * flags. I.e. this allows tagging of "ordinary" managed memory
	 * pages with flags like `PMAP_IO_RANGE_PROHIBIT_HIB_WRITE`, or
	 * informing the SPTM that some (nominally) managed memory pages are
	 * unavailable for some reason.
	 *
	 * Notably, `pmap_find_io_attr()`, and anything else that uses
	 * `pmap_io_range`s for denoting to-be-mapped I/O ranges, ignores
	 * entries with this flag.
	 */
	#define PMAP_IO_RANGE_NOT_IO (1U << 27)

	/* Pages in this range may never be written during hibernation restore. */
	#define PMAP_IO_RANGE_PROHIBIT_HIB_WRITE (1U << 26)

	/**
	 * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
	 * mapping flags (defined above).
	 */
	uint32_t wimg;

	/* 4 Character Code (4CC) describing what this range is. */
	uint32_t signature;
} pmap_io_range_t;

/* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
_Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
2409 
/*
 * Look up the pmap_io_range_t describing the given physical address.
 * Per the pmap_io_range_t documentation, entries flagged PMAP_IO_RANGE_NOT_IO
 * are ignored by this lookup. Behavior when no range matches (presumably a
 * NULL return) should be confirmed against the definition.
 */
extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);

/* Iterate over the I/O ranges, invoking the block on each; the block's bool return presumably controls continuation — confirm at definition. */
extern void pmap_range_iterate(bool (^step) (pmap_io_range_t const *));
2413 
/**
 * This structure describes a sub-page-size I/O region owned by SPTM but the kernel can write to.
 *
 * @note I/O filter software will use a collection of such data structures to determine access
 *       permissions to a page owned by SPTM.
 *
 * @note The {signature, offset} key is used to index a collection of such data structures to
 *       optimize for space in the case where one page layout is repeated for many devices, such
 *       as the memory controller channels.
 *
 * @note The 8-byte size of this structure is fixed (see the _Static_assert below);
 *       do not reorder or resize the fields.
 */
typedef struct pmap_io_filter_entry {
	/* 4 Character Code (4CC) describing what this range (page) is. */
	uint32_t signature;

	/* Offset within the page. It has to be within [0, PAGE_SIZE). */
	uint16_t offset;

	/* Length of the range, and (offset + length) has to be within [0, PAGE_SIZE). */
	uint16_t length;
} pmap_io_filter_entry_t;

_Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t");
2436 
/* Initialize per-CPU pmap data; the argument is presumably the CPU index — confirm at definition. */
extern void pmap_cpu_data_init_internal(unsigned int);
2438 
2439 /**
2440  * Convert a SURT PA to the containing SURT page's PA.
2441  *
2442  * @param surt_pa The SURT's physical addresss.
2443  *
2444  * @return The containing SURT page's PA.
2445  */
2446 static inline pmap_paddr_t
surt_page_pa_from_surt_pa(pmap_paddr_t surt_pa)2447 surt_page_pa_from_surt_pa(pmap_paddr_t surt_pa)
2448 {
2449 	return surt_pa & ~PAGE_MASK;
2450 }
2451 
2452 /**
2453  * Given a SURT PA, get its index in the containing SURT page.
2454  *
2455  * @param surt_pa The PA of the SURT.
2456  *
2457  * @return The index of the SURT in the containing SURT page.
2458  */
2459 static inline uint8_t
surt_index_from_surt_pa(pmap_paddr_t surt_pa)2460 surt_index_from_surt_pa(pmap_paddr_t surt_pa)
2461 {
2462 	return (uint8_t)((surt_pa & PAGE_MASK) / SUBPAGE_USER_ROOT_TABLE_SIZE);
2463 }
2464 
2465 /**
2466  * Given a SURT page PA and an index, compute the PA of the associated SURT.
2467  *
2468  * @param surt_page_pa The PA of the SURT page.
2469  * @param index THe index of the SURT in the SURT page.
2470  *
2471  * @return The computed PA of the SURT.
2472  */
2473 static inline pmap_paddr_t
surt_pa_from_surt_page_pa_and_index(pmap_paddr_t surt_page_pa,uint8_t index)2474 surt_pa_from_surt_page_pa_and_index(pmap_paddr_t surt_page_pa, uint8_t index)
2475 {
2476 	assert((surt_page_pa & PAGE_MASK) == 0);
2477 	return surt_page_pa + index * SUBPAGE_USER_ROOT_TABLE_SIZE;
2478 }
2479 
#if __ARM64_PMAP_SUBPAGE_L1__
/* Sub-page User Root Table (SURT) allocator interface; see definitions for full contracts. */
extern void surt_init(void);
/* Attempt to allocate a SURT; the failure sentinel value should be confirmed at the definition. */
extern pmap_paddr_t surt_try_alloc(void);
/* Free a SURT by PA; the meaning of the bool return should be confirmed at the definition. */
extern bool surt_free(pmap_paddr_t surt_pa);
/* Add a page to the SURT pool with its first table already allocated. */
extern void surt_feed_page_with_first_table_allocated(pmap_paddr_t surt_page_pa);
/* Return the current length of the SURT free list. */
extern unsigned int surt_list_len(void);
#endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2487 
#if DEBUG || DEVELOPMENT
/* Debug-only counter ("wcrt on non-DRAM" events — confirm exact semantics at the definitions). */
extern unsigned int pmap_wcrt_on_non_dram_count_get(void);
/* Atomically increment the counter read by pmap_wcrt_on_non_dram_count_get(). */
extern void pmap_wcrt_on_non_dram_count_increment_atomic(void);
#endif /* DEBUG || DEVELOPMENT */
2492