xref: /xnu-11417.140.69/osfmk/arm64/sptm/pmap/pmap_data.h (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /**
29  * This header file is used to store the types, prototypes, and inline functions
30  * that define some of the most important data structures used in the pmap. This
31  * header is only meant for sharing types within the pmap; if a type is meant to
32  * be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
33  */
34 #pragma once
35 
36 #include <stdint.h>
37 
38 #include <kern/ledger.h>
39 #include <mach/vm_types.h>
40 #include <mach_assert.h>
41 #include <vm/vm_page.h>
42 
43 #include <arm/cpu_data.h>
44 #include <arm/machine_routines.h>
45 #include <arm64/proc_reg.h>
46 
47 #if HIBERNATION
48 #include <arm64/hibernate_secure_hmac.h>
49 #endif /* HIBERNATION */
50 
51 /* Temporary include before moving all ledger functions into pmap_data.c */
52 #include <os/refcnt.h>
53 
54 /**
55  * These headers are safe to be included in this file since they shouldn't rely
56  * on any of the internal pmap header files (so no circular dependencies).
57  */
58 #include <arm64/sptm/pmap/pmap.h>
59 #include <arm64/sptm/pmap/pmap_pt_geometry.h>
60 
61 #include <arm64/sptm/sptm.h>
62 
63 /**
64  * These values represent the first and last kernel-managed physical addresses.
65  * We keep track of extra metadata on kernel-managed pages compared to other
66  * pages (usually iBoot carved out memory or I/O).
67  */
68 extern pmap_paddr_t vm_first_phys, vm_last_phys;
69 
70 #define PMAP_HIB_STATE_REACHED(states) false
71 #define PMAP_ASSERT_NOT_WRITING_HIB()
72 #define PMAP_IS_HIBERNATING() false
73 
74 /**
75  * Return whether the given address represents a kernel-managed physical page.
76  *
77  * Whether a page is considered "kernel-managed" is determined by the BootArgs
78  * passed by the bootloader. Typically memory carved out by the bootloader as
79  * well as I/O memory should return false.
80  *
81  * @param pa The physical address to check.
82  */
83 static inline bool
pa_valid(pmap_paddr_t pa)84 pa_valid(pmap_paddr_t pa)
85 {
86 	return (pa >= vm_first_phys) && (pa < vm_last_phys);
87 }
88 
/* Sentinel value indicating an invalid physical address index. */
90 #define INVALID_PAI UINT_MAX
91 
92 /**
93  * The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
94  * contain an entry for every kernel-managed page in the system. These systems
95  * are indexed with physical address indices ("pai") generated by this function.
96  *
97  * The logic is simple since there should be one entry in each of these data
98  * structures for each kernel-managed physical page in the system. These data
99  * structures are allocated on boot based on the amount of memory available.
100  *
101  * @note PAIs are defined using the VM page size, which might not be identical
102  *       to the underlying hardware page size for an arbitrary address space.
103  *       This means that the data structures relying on PAIs will contain one
104  *       entry for each VM page, not hardware page.
105  *
106  * @note This function is only valid for physical addresses that are
107  *       kernel-managed.
108  */
109 static inline unsigned int
pa_index(pmap_paddr_t pa)110 pa_index(pmap_paddr_t pa)
111 {
112 	return (unsigned int)atop(pa - vm_first_phys);
113 }
114 
115 /**
116  * Convert from a physical address index (pai) back to a raw physical address.
117  *
118  * @param pai The physical address index to convert to a PA.
119  *
120  * @return The page-aligned physical address corresponding to [pai].
121  */
122 static inline pmap_paddr_t
pai_to_pa(unsigned int pai)123 pai_to_pa(unsigned int pai)
124 {
125 	return ptoa((pmap_paddr_t)pai) + vm_first_phys;
126 }
127 
128 /* See the definition of pv_head_table for more information. */
129 extern uintptr_t *pv_head_table;
130 
131 /* Represents a NULL entry in the pv_head_table. */
132 #define PV_ENTRY_NULL ((pv_entry_t *) 0)
133 
134 /**
135  * Given a physical address index, return the corresponding pv_head_table entry.
136  *
137  * @note The returned entry might be invalid, or a pointer to a pt_entry_t,
138  *       pv_entry_t, or pt_desc_t depending on the type for this entry.
139  *       Determine the type using pvh_test_type().
140  *
141  * @param pai The index returned by pa_index() for the page whose pv_head_table
142  *            entry should be retrieved.
143  */
144 static inline uintptr_t
pai_to_pvh(unsigned int pai)145 pai_to_pvh(unsigned int pai)
146 {
147 	return pv_head_table[pai];
148 }
149 
150 /**
151  * Each pv_head_table entry can be one of four different types:
152  *
153  * - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
154  *                  physical aperture. Physical aperture mappings are not
155  *                  tracked in the pv_head_table.
156  *
157  * - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
158  *                  These entries are linked lists of pv_entry_t objects (which
159  *                  each contain a pointer to the associated PTE and a pointer
160  *                  to the next entry in the list).
161  *
162  * - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
163  *                  mappings are created, this entry will get upgraded to an
164  *                  entry of type PVH_TYPE_PVEP. These entries are pointers
165  *                  directly to the page table entry that contain the mapping
166  *                  (pt_entry_t*).
167  *
168  * - PVH_TYPE_PTDP: The physical page is being used as a page table. These
169  *                  entries are pointers to page table descriptor structures
170  *                  (pt_desc_t) which contain metadata related to each page
171  *                  table.
172  *
173  * The type is stored in the bottom two bits of each pv_head_table entry. That
174  * type needs to be checked before dereferencing the pointer to determine which
175  * pointer type to dereference as.
176  */
177 #define PVH_TYPE_NULL 0x0UL
178 #define PVH_TYPE_PVEP 0x1UL
179 #define PVH_TYPE_PTEP 0x2UL
180 #define PVH_TYPE_PTDP 0x3UL
181 
182 #define PVH_TYPE_MASK (0x3UL)
183 
184 
185 /**
186  * PV_HEAD_TABLE Flags.
187  *
188  * All flags listed below are stored in the pv_head_table entry/pointer
189  * (per-physical-page) unless otherwise noted.
190  *
191  * Please update the pv_walk LLDB macro if these flags are changed or added to.
192  */
193 
194 /**
195  * This flag is set for every mapping created by an IOMMU.
196  *
197  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
198  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
199  */
200 #define PVH_FLAG_IOMMU 0x4UL
201 
202 /**
203  * This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
204  * this bit is set, then the PTE pointer points directly into the IOMMU page
205  * table for this mapping. If this bit is cleared, then the "PTE pointer" is
206  * actually a pointer to the IOMMU descriptor object that owns this mapping.
207  *
208  * There are cases where it's not easy to tie an IOMMU mapping directly to a
209  * specific page table, so this allows us to at least get a pointer to which
210  * IOMMU created this mapping which is useful for debugging purposes.
211  *
212  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
213  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
214  */
215 #define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
216 
217 /**
218  * This flag is set when the first CPU (non-IOMMU) mapping is created. This is
219  * important to keep track of because various accounting statistics are based on
220  * the options specified for the first CPU mapping. This flag, and thus the
 * accounting statistics, will persist as long as there are *any* mappings of the
222  * page (including IOMMU mappings). This works because the accounting for a page
223  * should not need to change until the page is recycled by the VM layer, and we
224  * double-check that there are no mappings (CPU or IOMMU) when a page is
225  * recycled (see: pmap_verify_free()).
226  */
227 #define PVH_FLAG_CPU (1ULL << 62)
228 
229 /* This bit is used as a lock when modifying a pv_head_table entry. */
230 #define PVH_LOCK_BIT 61
231 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
232 
233 /**
234  * This flag is set when there are any executable mappings to this physical
235  * page. This is used to prevent any writable mappings from being created at
236  * the same time an executable mapping exists.
237  */
238 #define PVH_FLAG_EXEC (1ULL << 60)
239 
240 /**
241  * This flag is used to mark that a page has been hashed into the hibernation
242  * image.
243  *
244  * The hibernation driver will use this to ensure that all PPL-owned memory is
245  * correctly included into the hibernation image (a missing PPL page could be
246  * a security concern when coming out of hibernation).
247  */
248 #define PVH_FLAG_HASHED (1ULL << 58)
249 
250 /**
251  * Marking a pv_head_table entry with this flag denotes that this page is
252  * retired without any mappings and never should be mapped again.
253  */
254 #define PVH_FLAG_RETIRED (1ULL << 55)
255 
256 /**
257  * This flag is used to mark that a PV head entry has been placed into
258  * "sleep mode", which typically happens when the lock owner needs to
259  * process a long PV list.  If this bit is set, threads which contend
260  * on the PVH lock must call thread_block() to wait until they are awakened
261  * by the current lock owner releasing the lock.
262  */
263 #define PVH_FLAG_SLEEP (1ULL << 54)
264 
265 /**
266  * These bits need to be set to safely dereference a pv_head_table
267  * entry/pointer.
268  *
269  * Any change to this #define should also update the copy located in the pmap.py
270  * LLDB macros file.
271  */
272 #define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED | PVH_FLAG_RETIRED)
273 
274 #define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
275 
276 #define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
277 
278 /* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
279 #define PVH_LIST_MASK (~PVH_TYPE_MASK)
280 
281 /* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
282 #define PVH_LOCK_WORD 1 /* Assumes little-endian */
283 
/**
 * Assert that a pv_head_table entry is locked. Will panic if the lock isn't
 * acquired.
 *
 * @note Both PVH_FLAG_LOCK and PVH_FLAG_SLEEP satisfy this assertion: a
 *       sleep-mode entry has dropped the hardware lock bit but is still
 *       logically owned by the thread that placed it in sleep mode (see
 *       pvh_lock_enter_sleep_mode()).
 *
 * @param index The physical address index to check.
 */
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
	assertf(os_atomic_load(&pv_head_table[index], relaxed) & PVH_LOCK_FLAGS,
	    "%s: PVH %p (=%p) for pai 0x%x not locked or in sleep mode", __func__,
	    &pv_head_table[index], (void*)(os_atomic_load(&pv_head_table[index], relaxed)), index);
}
297 
298 /**
299  * Helper function for returning the 32-bit PVH lock word corresponding
300  * to a physical address index.
301  *
302  * @param index The physical address index of the pv_head_table entry
303  *
304  * @return A pointer to the 32-bit word containing the lock bit
305  */
306 static inline uint32_t*
pvh_lock_word(unsigned int index)307 pvh_lock_word(unsigned int index)
308 {
309 	return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
310 }
311 
312 /**
313  * Helper macro for computing the lock bit offset within the 32-bit
314  * lock word for each PV head entry.
315  *
316  * @return A 32-bit integer containing the lock bit offset.
317  */
318 #define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
319 
/**
 * Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
 *
 * @note May block: if the entry is found in sleep mode (PVH_FLAG_SLEEP), this
 *       thread drops the hardware lock bit and sleeps until woken by the
 *       sleep-mode owner in pvh_unlock(), then retries the acquisition.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
	extern unsigned int not_in_kdp;
	const bool was_preemptible = preemption_enabled();
	/*
	 * Taking this lock with preemption disabled is only tolerated in special
	 * contexts: early boot, hibernation, or the kernel debugger (!not_in_kdp).
	 */
	assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) ||
	    PMAP_IS_HIBERNATING() || !not_in_kdp);

	/* Allow the spin to be abandoned on pending preemption, but only if we were preemptible. */
	bool (^check_preemption)(void) = ^bool (void) {
		return was_preemptible && pmap_pending_preemption();
	};

	hw_lock_status_t ret;
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	do {
		ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
		    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);

		if (ret == HW_LOCK_ACQUIRED) {
			locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
			if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
				/*
				 * Sleep mode: register for a wakeup on the entry's address
				 * BEFORE releasing the bit lock, so the wakeup in pvh_unlock()
				 * cannot be missed, then block and retry from the top.
				 */
				wait_result_t wres;
				wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
				hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
				assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
				thread_block(THREAD_CONTINUE_NULL);
				ret = HW_LOCK_CONTENDED;
			}
		}
	} while (ret != HW_LOCK_ACQUIRED);

	return locked_pvh;
}
360 
/**
 * Lock a pvh_head_table entry, possibly in a preemption-disabled context.
 *
 * @note This function is only meant for special use cases in which pmap
 *       functions must be invoked with preemption disabled.  These cases
 *       are expected to be rare and limited.  If you think you need to
 *       use this in more places, you're probably wrong.
 *
 * @note Panics if the entry is in sleep mode: with preemption disabled this
 *       thread cannot block waiting for the sleep-mode owner.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
	/* If preemption is actually enabled, the full (blocking-capable) path is safe. */
	if (__improbable(preemption_enabled())) {
		return pvh_lock(index);
	}
	hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
	const locked_pvh_t locked_pvh = {.pvh = os_atomic_load(&pv_head_table[index], relaxed), .pai = index};

	if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
		panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
	}

	return locked_pvh;
}
388 
/**
 * Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
 *
 * @note A sleep-mode entry (PVH_FLAG_SLEEP) is treated as a failed acquisition:
 *       the bit lock is dropped again and 0 is returned in the pvh field.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry if successful,
 *         0 otherwise.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)
{
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);

	if (locked) {
		locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
		/* A locked entry always has at least the LOCK bit set, so it can't be 0. */
		assert(locked_pvh.pvh != 0);
		if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
			/* Back out; pvh == 0 signals failure to pvh_try_lock_success(). */
			hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
			locked_pvh.pvh = 0;
		}
	}

	return locked_pvh;
}
414 
415 /**
416  * Helper for determining whether a preceding pvh_try_lock() call succeeded.
417  *
418  * @param locked_pvh A wrapper representing a possibly-locked PV head table entry
419  *        returned by pvh_try_lock().
420  *
421  * @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
422  */
423 static inline bool
pvh_try_lock_success(const locked_pvh_t * locked_pvh)424 pvh_try_lock_success(const locked_pvh_t *locked_pvh)
425 {
426 	assert(locked_pvh != NULL);
427 	return locked_pvh->pvh != 0;
428 }
429 
/**
 * Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
 * lock will sleep until this thread calls pvh_unlock().
 *
 * @note It is legal to call this function if the lock is already in sleep mode.
 *       In that case, the call will have no effect.
 * @note This function must not be called with preemption disabled by any other agent
 *       but [locked_pvh] itself.  Preemption must be fully re-enabled by the time
 *       this function returns, either because it was already enabled (because the
 *       lock was already in sleep mode), or because this function enabled it by placing
 *       the lock in sleep mode.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 */
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	if (!(old_pvh & PVH_FLAG_SLEEP)) {
		assert(old_pvh & PVH_FLAG_LOCK);
		/* Publish the SLEEP flag before dropping the hardware lock bit below. */
		os_atomic_store(&pv_head_table[index], old_pvh | PVH_FLAG_SLEEP, relaxed);
		/**
		 * Tell the scheduler that this thread may need a priority boost if it needs to go
		 * off-core, to reduce the likelihood of priority inversion.
		 */
		locked_pvh->pri_token = thread_priority_floor_start();
		hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
	}

	/* Hibernation runs single-core so we can skip this check. */
	assert(preemption_enabled() || PMAP_IS_HIBERNATING());
}
466 
467 /**
468  * Check that a pv_head_table entry/pointer is a specific type.
469  *
470  * @param pvh The pv_head_table entry/pointer to check.
471  * @param type The type to check for.
472  *
473  * @return True if the pv_head_table entry is of the passed in type, false
474  *         otherwise.
475  */
476 static inline bool
pvh_test_type(uintptr_t pvh,uintptr_t type)477 pvh_test_type(uintptr_t pvh, uintptr_t type)
478 {
479 	return (pvh & PVH_TYPE_MASK) == type;
480 }
481 
/**
 * Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
 *
 * @note Only the non-lock flags, pointer, and type fields of the entry will be updated
 *       according to the passed-in value.  PVH_LOCK_FLAGS will be ignored as they are
 *       directly manipulated by this function.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 *        The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
 *        in pv_head_table to reflect any updates that may have been performed on the PV list
 *        while the lock was held.
 */
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	bool pri_floor_end = false;

	if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
		/*
		 * This thread placed the entry in sleep mode and dropped the hardware
		 * lock bit, so it must re-acquire the bit before publishing the new
		 * entry value and waking any sleepers.
		 */
		pri_floor_end = true;
		const bool was_preemptible = preemption_enabled();
		bool (^check_preemption)(void) = ^bool (void) {
			return was_preemptible && pmap_pending_preemption();
		};

		hw_lock_status_t ret;
		do {
			ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
			    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
		} while (ret != HW_LOCK_ACQUIRED);

		/* Clear SLEEP (keeping LOCK) and wake threads blocked in pvh_lock(). */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
		thread_wakeup(&pv_head_table[index]);
	} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
		/* Entry contents changed while locked: store the updated value. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
	}
	hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);

	/* Drop the priority floor taken by pvh_lock_enter_sleep_mode(). */
	if (__improbable(pri_floor_end)) {
		thread_priority_floor_end(&locked_pvh->pri_token);
	}

	/* Mark the wrapper as no longer holding the lock. */
	locked_pvh->pvh = 0;
}
532 
533 /**
534  * Convert a pv_head_table entry/pointer into a page table entry pointer. This
535  * should only be done if the type of this entry is PVH_TYPE_PTEP.
536  *
537  * @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
538  *
539  * @return Return back a safe to derefence pointer to the single mapping of this
540  *         physical page by masking off the TYPE bits and adding any missing
541  *         flags to the upper portion of the pointer.
542  */
543 static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)544 pvh_ptep(uintptr_t pvh)
545 {
546 	assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
547 	return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
548 }
549 
550 /**
551  * Convert a pv_head_table entry/pointer into a PVE list pointer. This
552  * should only be done if the type of this entry is PVH_TYPE_PVEP.
553  *
554  * @param pvh The pv_head_table entry/pointer to convert into a safe to
555  *            dereference pv_entry_t*.
556  *
557  * @return Return back a safe to derefence pointer to the first mapping of this
558  *         physical page by masking off the TYPE bits and adding any missing
559  *         flags to the upper portion of the pointer.
560  */
561 static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)562 pvh_pve_list(uintptr_t pvh)
563 {
564 	assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
565 	return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
566 }
567 
568 /**
569  * Return the mutable flags associated with a pv_head_table entry/pointer.
570  *
571  * @param pvh The pv_head_table entry whose flags to get.
572  *
573  * @return The mutable flags encoded in [pvh].
574  */
575 static inline uintptr_t
pvh_get_flags(uintptr_t pvh)576 pvh_get_flags(uintptr_t pvh)
577 {
578 	return pvh & PVH_MUTABLE_FLAGS;
579 }
580 
581 /**
582  * Update the flags associated with a pv_head_table entry/pointer.
583  *
584  * @note This function does not actually modify the pv_head_table,
585  *       it only installs an updated pv_head_table entry in [locked_pvh]
586  *       that can later be passed to pvh_unlock() to update the actual array
587  *       entry.
588  *
589  * @param locked_pvh A wrapper struct containing the pv_head_table
590  *                   entry/pointer to update.
591  *
592  */
593 static inline void
pvh_set_flags(locked_pvh_t * locked_pvh,uintptr_t flags)594 pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
595 {
596 	locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
597 }
598 
599 /**
600  * Update a pv_head_table entry/pointer to be a different type and/or point to
601  * a different object.
602  *
603  * @note This function does not actually modify the pv_head_table,
604  *       it only installs an updated pv_head_table entry in [locked_pvh]
605  *       that can later be passed to pvh_unlock() to update the actual array
606  *       entry.
607  *
608  * @param locked_pvh A wrapper struct containing the pv_head_table
609  *                   entry/pointer to update.
610  * @param pvep The new entry to use. This could be either a pt_entry_t*,
611  *             pv_entry_t*, or pt_desc_t* depending on the type.
612  * @param type The type of the new entry.
613  */
614 static inline void
pvh_update_head(locked_pvh_t * locked_pvh,void * pvep,unsigned int type)615 pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
616 {
617 	assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
618 	const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
619 	locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
620 }
621 
622 /**
623  * Given a page table entry pointer retrieved from the pv_head_table (from an
624  * entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
625  * an IOMMU mapping.
626  *
627  * @note The way this function determines whether the passed in pointer is
628  *       pointing to an IOMMU PTE, is by checking for a special flag stored in
629  *       the lower bits of the pointer. This flag is only set on pointers stored
630  *       in the pv_head_table, and as such, this function will only work on
631  *       pointers retrieved from the pv_head_table. If a pointer to a PTE was
632  *       directly retrieved from an IOMMU's page tables, this function would
633  *       always return false despite actually being an IOMMU PTE.
634  *
635  * @param ptep A PTE pointer obtained from the pv_head_table to check.
636  *
637  * @return True if the entry is an IOMMU mapping, false otherwise.
638  */
639 static inline bool
pvh_ptep_is_iommu(const pt_entry_t * ptep)640 pvh_ptep_is_iommu(const pt_entry_t *ptep)
641 {
642 #ifdef PVH_FLAG_IOMMU
643 	return (uintptr_t)ptep & PVH_FLAG_IOMMU;
644 #else /* PVH_FLAG_IOMMU */
645 	#pragma unused(ptep)
646 	return false;
647 #endif /* PVH_FLAG_IOMMU */
648 }
649 
650 /**
651  * Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
652  * type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
653  * strips out those flags and returns back a dereferencable pointer.
654  *
655  * @param ptep The PTE pointer to strip out the unwanted flags.
656  *
657  * @return A valid dereferencable pointer to the page table entry.
658  */
659 static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t * ptep)660 pvh_strip_ptep(const pt_entry_t *ptep)
661 {
662 #ifdef PVH_FLAG_IOMMU
663 	const uintptr_t pte_va = (uintptr_t)ptep;
664 	return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
665 #else /* PVH_FLAG_IOMMU */
666 	return ptep;
667 #endif /* PVH_FLAG_IOMMU */
668 }
669 
670 /**
671  * PVH_TYPE_PVEP Helper Functions.
672  *
673  * The following are methods used to manipulate PVE lists. This is the type of
674  * pv_head_table entry used when there are multiple mappings to a single
675  * physical page.
676  */
677 
678 /**
679  * Whether a physical page is using "alternate accounting" (ALTACCT) for its
680  * ledger statistics is something that needs to be tracked on a per-mapping
681  * basis, not on a per-physical-page basis. Because of that, it's tracked
682  * differently depending on whether there's a single mapping to a page
683  * (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
684  * tracked in the pp_attr_table. But when there are multiple mappings, the least
685  * significant bit of the corresponding "pve_pte" pointer in each pv_entry object
686  * is used as a marker for pages using alternate accounting.
687  *
688  * @note See the definition for PP_ATTR_ALTACCT for a more detailed description
689  *       of what "alternate accounting" actually means in respect to the
690  *       footprint ledger.
691  *
 * Since some code (KernelDiskImages, e.g.) might map a physical page as
693  * "device" memory (i.e. external) while it's also being used as regular
694  * "anonymous" memory (i.e. internal) in user space, we have to manage the
695  * "internal" attribute per mapping rather than per physical page.
696  * When there are multiple mappings, we use the next least significant bit of
697  * the corresponding "pve_pte" pointer for that.
698  */
699 #define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
700 #define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
701 #define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
702 
703 /**
704  * Set the ALTACCT bit for a specific PTE pointer.
705  *
706  * @param pvep A pointer to the current pv_entry mapping in the linked list of
707  *             mappings.
708  * @param idx Index of the chosen PTE pointer inside the PVE.
709  */
710 static inline void
pve_set_altacct(pv_entry_t * pvep,unsigned idx)711 pve_set_altacct(pv_entry_t *pvep, unsigned idx)
712 {
713 	assert(idx < PTE_PER_PVE);
714 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
715 }
716 
717 /**
718  * Set the INTERNAL bit for a specific PTE pointer.
719  *
720  * @param pvep A pointer to the current pv_entry mapping in the linked list of
721  *             mappings.
722  * @param idx Index of the chosen PTE pointer inside the PVE.
723  */
724 static inline void
pve_set_internal(pv_entry_t * pvep,unsigned idx)725 pve_set_internal(pv_entry_t *pvep, unsigned idx)
726 {
727 	assert(idx < PTE_PER_PVE);
728 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
729 }
730 
731 /**
732  * Clear the ALTACCT bit for a specific PTE pointer.
733  *
734  * @param pvep A pointer to the current pv_entry mapping in the linked list of
735  *             mappings.
736  * @param idx Index of the chosen PTE pointer inside the PVE.
737  */
738 static inline void
pve_clr_altacct(pv_entry_t * pvep,unsigned idx)739 pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
740 {
741 	assert(idx < PTE_PER_PVE);
742 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
743 }
744 
745 /**
746  * Clear the INTERNAL bit for a specific PTE pointer.
747  *
748  * @param pvep A pointer to the current pv_entry mapping in the linked list of
749  *             mappings.
750  * @param idx Index of the chosen PTE pointer inside the PVE.
751  */
752 static inline void
pve_clr_internal(pv_entry_t * pvep,unsigned idx)753 pve_clr_internal(pv_entry_t *pvep, unsigned idx)
754 {
755 	assert(idx < PTE_PER_PVE);
756 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
757 }
758 
759 /**
760  * Return the ALTACCT bit for a specific PTE pointer.
761  *
762  * @param pvep A pointer to the current pv_entry mapping in the linked list of
763  *             mappings.
764  * @param idx Index of the chosen PTE pointer inside the PVE.
765  */
766 static inline bool
pve_get_altacct(pv_entry_t * pvep,unsigned idx)767 pve_get_altacct(pv_entry_t *pvep, unsigned idx)
768 {
769 	assert(idx < PTE_PER_PVE);
770 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
771 }
772 
773 /**
774  * Return the INTERNAL bit for a specific PTE pointer.
775  *
776  * @param pvep A pointer to the current pv_entry mapping in the linked list of
777  *             mappings.
778  * @param idx Index of the chosen PTE pointer inside the PVE.
779  */
780 static inline bool
pve_get_internal(pv_entry_t * pvep,unsigned idx)781 pve_get_internal(pv_entry_t *pvep, unsigned idx)
782 {
783 	assert(idx < PTE_PER_PVE);
784 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
785 }
786 
787 /**
788  * Return the next mapping (pv_entry) in a linked list of mappings. This applies
789  * to pv_head_table entries of type PVH_TYPE_PVEP.
790  *
791  * @param pvep A pointer to the current pv_entry mapping in the linked list of
792  *             mappings.
793  *
794  * @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
795  *         end of the list has been reached.
796  */
static inline pv_entry_t *
pve_next(pv_entry_t *pvep)
{
	/* Plain accessor; no flag bits are stored in pve_next (unlike pve_ptep[]). */
	return pvep->pve_next;
}
802 
803 /**
804  * Return a pointer to the pve_next field in a pv_entry. This value is used
805  * when adding and removing entries to a PVE list.
806  *
807  * @param pvep The pv_entry whose pve_next field is being accessed.
808  *
809  * @return Pointer to the pve_next field.
810  */
static inline pv_entry_t **
pve_next_ptr(pv_entry_t *pvep)
{
	/* Returned pointer lets callers relink the list in place (see pve_remove()). */
	return &pvep->pve_next;
}
816 
817 /**
818  * Return a pointer to the page table entry for this mapping.
819  *
820  * @param pvep The pv_entry whose pve_ptep field is to be returned.
821  * @param idx Index of the chosen PTE pointer inside the PVE.
822  *
823  * @return Pointer to the page table entry.
824  */
825 static inline pt_entry_t *
pve_get_ptep(pv_entry_t * pvep,unsigned idx)826 pve_get_ptep(pv_entry_t *pvep, unsigned idx)
827 {
828 	assert(idx < PTE_PER_PVE);
829 	return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
830 }
831 
832 /**
833  * Update the page table entry for a specific physical to virtual mapping.
834  *
835  * @param pvep The pv_entry to update.
836  * @param idx Index of the chosen PTE pointer inside the PVE.
837  * @param ptep_new The new page table entry.
838  */
static inline void
pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
{
	assert(idx < PTE_PER_PVE);
	/*
	 * Stores [ptep_new] verbatim: any ALTACCT/INTERNAL flag bits previously
	 * held in this slot (see pve_set_internal() above) are discarded.
	 */
	pvep->pve_ptep[idx] = ptep_new;
}
845 
846 /**
847  * Initialize all fields in a PVE to NULL.
848  *
849  * @param pvep The pv_entry to initialize.
850  */
851 static inline void
pve_init(pv_entry_t * pvep)852 pve_init(pv_entry_t *pvep)
853 {
854 	pvep->pve_next = PV_ENTRY_NULL;
855 	for (int i = 0; i < PTE_PER_PVE; i++) {
856 		pvep->pve_ptep[i] = PT_ENTRY_NULL;
857 	}
858 }
859 
860 /**
861  * Find PTE pointer in PVE and return its index.
862  *
863  * @param pvep The PVE to search.
864  * @param ptep PTE to search for.
865  *
866  * @return Index of the found entry, or -1 if no entry exists.
867  */
868 static inline int
pve_find_ptep_index(pv_entry_t * pvep,pt_entry_t * ptep)869 pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
870 {
871 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
872 		if (pve_get_ptep(pvep, i) == ptep) {
873 			return (int)i;
874 		}
875 	}
876 
877 	return -1;
878 }
879 
880 /**
881  * Checks if no PTEs are currently associated with this PVE.
882  *
883  * @param pvep The PVE to search.
884  *
885  * @return True if no PTEs are currently associated with this PVE, or false.
886  */
887 static inline bool
pve_is_empty(pv_entry_t * pvep)888 pve_is_empty(pv_entry_t *pvep)
889 {
890 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
891 		if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
892 			return false;
893 		}
894 	}
895 
896 	return true;
897 }
898 
899 /**
900  * Prepend a new pv_entry node to a PVE list.
901  *
902  * @note This function does not actually modify the pv_head_table,
903  *       it only installs an updated pv_head_table entry in [locked_pvh]
904  *       that can later be passed to pvh_unlock() to update the actual array
905  *       entry.
906  *
907  * @param locked_pvh A wrapper struct containing the pv_head_table
908  *                   entry/pointer to update.  This entry represents
909  *                   the linked list of mappings to update.
910  * @param pvep The new mapping to add to the linked list.
911  */
912 static inline void
pve_add(locked_pvh_t * locked_pvh,pv_entry_t * pvep)913 pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
914 {
915 	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
916 
917 	pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
918 	pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
919 }
920 
921 /**
922  * Remove an entry from a PVE list of mappings.
923  *
924  * @note This function does not actually modify the pv_head_table,
925  *       it only installs an updated pv_head_table entry in [locked_pvh]
926  *       that can later be passed to pvh_unlock() to update the actual array
927  *       entry.
928  *
929  * @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
930  *                   to update.  This entry represents the linked list of mappings
931  *                   from which to remove an entry.
932  * @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
933  *              is the first in the linked list of mappings, then NULL should be
934  *              passed here and the removal will be reflected in the returned
935  *              pv_head_table entry.
936  * @param pvep The entry that should be removed. Should be identical to a
937  *             dereference of the pvepp parameter (unless it's the pv_head_table
938  *             entry).
939  */
static inline void
pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
{
	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));

	/* pvepp == NULL means [pvep] must be the list head (asserted below). */
	if (pvepp == NULL) {
		assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
		    __func__, (void*)locked_pvh->pvh, pvep);
		if (pve_next(pvep) == PV_ENTRY_NULL) {
			/* The last mapping to this page is being removed. */
			pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
		} else {
			/**
			 * There are still mappings left, make the next one the new head of
			 * the list. This effectively removes the first entry from the list.
			 */
			pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
		}
	} else {
		/**
		 * Move the previous entry's next field to the entry after the one being
		 * removed. This will clobber the ALTACCT and INTERNAL bits.
		 */
		*pvepp = pve_next(pvep);
	}
}
966 
967 /**
968  * PVH_TYPE_PTDP Types and Helper Functions.
969  *
970  * The following are types and methods used to manipulate page table descriptor
971  * (PTD) objects. This is the type of pv_head_table entry used when a page is
972  * being used as a page table.
973  */
974 
975 /**
976  * Page table descriptor (PTD) info structure.
977  *
978  * Contains information about a page table. These pieces of data are separate
979  * from the PTD itself because in address spaces where the VM page size doesn't
980  * match the underlying hardware page size, one PTD could represent multiple
981  * page tables (and so will need multiple PTD info structures).
982  *
983  * These fields are also in their own struct so that they can be allocated
984  * separately from the associated pt_desc_t object. This allows us to allocate
985  * the counts in this structure in a way that ensures they don't fall within the
986  * same cache line as the main pt_desc_t object. This is important because the
987  * fields in this structure are atomically updated which could cause false
988  * sharing cache performance issues with the "va" field in pt_desc_t if all of
989  * the fields were within the same structure.
990  */
typedef struct {
	/*
	 * For non-leaf pagetables, should be 0.
	 * For leaf pagetables, should reflect the number of wired entries.
	 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
	 * operations are implicitly wired).
	 *
	 * Updated atomically from multiple CPUs; allocated separately from
	 * pt_desc_t to avoid false sharing with its "va" field (see above).
	 */
	unsigned short wiredcnt;
} ptd_info_t;
1000 
1001 /**
1002  * This type is used to identify a specific IOMMU driver and an instance of
1003  * that driver which owns a specific page or page table. This type will be used
1004  * within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
1005  * respectively.
1006  *
1007  * Despite the fact this value is not a pointer, we need to make this value sort
1008  * of look like a kernel pointer: the bottom 3-bits must be zero and the upper
1009  * bits must all be ones by default. This is due to the fact that this type can
1010  * be embedded into the PVH table to represent an IOMMU mapping. The PVH table
1011  * code expects "kernel-pointer-like" properties so it can store flags in those
1012  * areas of the 64-bit value.
1013  */
1014 typedef uint64_t iommu_instance_t;
1015 
1016 /* 8-bit ID of the IOMMU driver which the instance derives from. */
1017 #define IOMMU_ID_SHIFT 8U
1018 #define IOMMU_ID_MASK  0x000000000000FF00ULL
1019 
1020 #define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
1021 #define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)
1022 
1023 /**
1024  * An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
1025  * This is strictly used to help with debugging and provides a mechanism to
1026  * trace a mapping or page table back to the exact IOMMU instance that owns it.
1027  * Typically, this would be the instance ID, but for drivers that use only a
1028  * single global instance, this could be something else like a root page table
1029  * ppnum_t.
1030  */
1031 #define IOMMU_TOKEN_SHIFT 16U
1032 #define IOMMU_TOKEN_MASK  0x0000FFFFFFFF0000ULL
1033 
1034 #define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
1035 #define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)
1036 
1037 /**
1038  * The default value for iommu_instance_t. See the type definition for more
1039  * details on why the upper bits need to initially be all ones.
1040  */
1041 #define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL
1042 
1043 /**
1044  * Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
1045  * instance sets the ID and token to all ones as a sentinel invalid value.
1046  */
1047 #define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
1048 
1049 /**
1050  * Page Table Descriptor (PTD).
1051  *
1052  * Provides a per-table data structure and a way of keeping track of all page
1053  * tables in the system.
1054  *
1055  * This structure is also used as a convenient way of keeping track of IOMMU
1056  * pages (which may or may not be used as page tables). In that case the SPTM
1057  * frame type for the page will be XNU_IOMMU, the "iommu" field will describe
1058  * the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
1059  * refcnt controlled by the IOMMU driver.
1060  */
typedef struct pt_desc {
	/* Each page table is either owned by a pmap or a specific IOMMU. */
	union {
		/*
		 * NOTE(review): only the pmap member is visible in this
		 * configuration even though the comment above implies an IOMMU
		 * owner member as well — presumably compiled out; confirm.
		 */
		struct pmap *pmap;
	};

	/**
	 * The following fields contain per-page-table properties, and as such,
	 * might have multiple elements each. This is due to a single PTD
	 * potentially representing multiple page tables (in address spaces where
	 * the VM page size differs from the hardware page size). Use the
	 * ptd_get_index() function to get the correct index for a specific page
	 * table.
	 */

	/**
	 * The first address of the virtual address space this page table is
	 * translating for, or a value set by an IOMMU driver if this PTD is being
	 * used to track an IOMMU page.
	 */
	vm_offset_t va;

	/**
	 * ptd_info_t's are allocated separately so as to reduce false sharing
	 * with the va field. This is desirable because ptd_info_t's are updated
	 * atomically from all CPUs.
	 */
	ptd_info_t *ptd_info;
} pt_desc_t;
1090 
1091 /**
1092  * Per-CPU structure for tracking in-flight SPTM retype operations.
1093  *
1094  * This structure is intended to be embedded in the pmap per-CPU data object,
1095  * and is meant to be used for situations in which the caller needs to ensure
1096  * that potentially sensitive concurrent SPTM operations have completed on other
1097  * CPUs prior to retyping a page.  If these sensitive operations haven't completed
1098  * when the retype occurs, and they happen to involve the page being retyped
1099  * (either directly or through mappings thereof), an SPTM violation panic may
1100  * result.
1101  */
typedef struct {
	/**
	 * Critical section sequence number of the local CPU.  A value of zero
	 * indicates that no retype epoch critical section is currently active on
	 * the CPU.
	 */
	uint64_t local_seq;

	/**
	 * The sequence number to use the next time a retype epoch critical section
	 * is entered on the local CPU.  This should monotonically increase.
	 */
	uint64_t next_seq;

	/**
	 * This array stores the retype sequence numbers observed on remote CPUs.
	 * When the local CPU needs to wait for critical sections to complete on
	 * other CPUs, this is intended to provide an initial sample of those other
	 * CPUs' critical section state.  The caller can then wait for each remote
	 * CPU's sequence number to return to zero or advance beyond the value
	 * stored in its entry in this array.
	 */
	uint64_t remote_seq[MAX_CPUS];

	/**
	 * Flags used to track the state of an active retype epoch drain operation
	 * on the local CPU.
	 */

	/**
	 * This flag indicates that a drain operation has been prepared on the
	 * local CPU by sampling remote CPU epoch states into the remote_seq array.
	 * This must be set before the drain operation can be performed.
	 */
	#define PMAP_RETYPE_EPOCH_PREPARED (1 << 0)

	/**
	 * This flag indicates that one or more remote CPUs had a non-zero retype
	 * epoch value when the remote_seq array was most recently sampled.
	 * If this flag is not set, then we already know that no remote CPUs can
	 * be in a critical section in which prior mapping state for the page to
	 * be retyped may have been observed, so we can skip the drain operation.
	 */
	#define PMAP_RETYPE_EPOCH_DRAIN_REQUIRED (1 << 1)
	/* Bitwise OR of the PMAP_RETYPE_EPOCH_* flags defined above. */
	uint8_t flags;
} pmap_retype_epoch_t;
1148 
1149 #define PMAP_SPTM_PCPU_ALIGN (8192)
1150 
typedef struct {
	/**
	 * Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
	 * during bootstrap.
	 */
	sptm_pte_t *sptm_prev_ptes;

	/**
	 * A piece of per-cpu scratch memory used by IOMMU drivers when passing data
	 * into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
	 */
	void *sptm_iommu_scratch;

	/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
	sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];

	/* The two accumulators below are never needed at the same time, hence the union. */
	union {
		/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
		sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];

		/* Accumulator for PA arrays to be passed to the SPTM, to avoid excessive stack usage. */
		sptm_paddr_t sptm_paddrs[SPTM_MAPPING_LIMIT];
	};

	/* Base PA of ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_ops_pa;

	/* Base PA of templates array, for passing templates into the SPTM. */
	pmap_paddr_t sptm_templates_pa;

	/* Base PA of physical address array, for passing physical address lists into the SPTM. */
	pmap_paddr_t sptm_paddrs_pa;

	/* PMAP pagetable descriptors associated with each element of sptm_ops. */
	pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];

	/* PTD info objects associated with each pmap PTE pointer. */
	ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];

	/* Accounting-related flags for each element of sptm_ops. */
	#define PMAP_SPTM_FLAG_INTERNAL (0x1)
	#define PMAP_SPTM_FLAG_ALTACCT (0x2)
	uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];

	/* Retype epoch tracking structure. */
	pmap_retype_epoch_t retype_epoch;

	/* Guest virtual machine dispatch structure. */
	sptm_guest_dispatch_t sptm_guest_dispatch;

	/* Guest virtual machine dispatch structure physical address. */
	pmap_paddr_t sptm_guest_dispatch_paddr;

	/* SPTM logical CPU ID. */
	uint16_t sptm_cpu_id;

	/* Read index associated with this CPU's SPTM trace buffer. */
	uint64_t sptm_trace_buffer_read_index;

	/* Previous SPTM state for use with sptm_trace_num_new_traces. */
	uint64_t sptm_trace_prev_state;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;
1213 
1214 _Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
1215     "SPTM per-CPU data alignment does not fit evenly within a page");
1216 _Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
1217     "sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");
1218 
1219 PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
1220 
1221 /**
1222  * Convert a pv_head_table entry/pointer into a page table descriptor pointer.
1223  * This should only be done if the type of this entry is PVH_TYPE_PTDP.
1224  *
1225  * @param pvh The pv_head_table entry/pointer to convert into a safe to
1226  *            dereference pt_desc_t*.
1227  *
1228  * @return Return back a safe to derefence pointer to the page table descriptor
1229  *         for this physical page by masking off the TYPE bits and adding any
1230  *         missing flags to the upper portion of the pointer.
1231  */
1232 static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)1233 pvh_ptd(uintptr_t pvh)
1234 {
1235 	return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
1236 }
1237 
1238 /**
1239  * Given an arbitrary page table entry, return back the page table descriptor
1240  * (PTD) object for the page table that contains that entry.
1241  *
1242  * @param ptep Pointer to a PTE whose page table descriptor object to return.
1243  *
1244  * @return The PTD object for the passed in page table.
1245  */
1246 static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t * ptep)1247 ptep_get_ptd(const pt_entry_t *ptep)
1248 {
1249 	assert(ptep != NULL);
1250 
1251 	const vm_offset_t pt_base_va = (vm_offset_t)ptep;
1252 	uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
1253 
1254 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1255 		panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
1256 	}
1257 
1258 	return pvh_ptd(pvh);
1259 }
1260 
1261 /**
1262  * Given an arbitrary page table entry, return back the pmap that owns that
1263  * page table.
1264  *
1265  * @note This won't work correctly for page tables owned by IOMMUs, because
1266  *       those table aren't owned by any specific pmap.
1267  *
1268  * @param ptep Pointer to a page table entry whose owner we're trying to return.
1269  *
1270  * @return The pmap that owns the given page table entry.
1271  */
static inline struct pmap *
ptep_get_pmap(const pt_entry_t *ptep)
{
	/* Panics (inside ptep_get_ptd) if [ptep] is not within a page-table page. */
	return ptep_get_ptd(ptep)->pmap;
}
1277 
1278 
1279 /**
1280  * Given an arbitrary translation table entry, get the page table descriptor
1281  * (PTD) object for the page table pointed to by the TTE.
1282  *
1283  * @param tte The translation table entry to parse. For instance, if this is an
1284  *            L2 TTE, then the PTD for the L3 table this entry points to will be
1285  *            returned.
1286  *
1287  * @return The page table descriptor (PTD) for the page table pointed to by this
1288  *         TTE.
1289  */
1290 static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)1291 tte_get_ptd(const tt_entry_t tte)
1292 {
1293 	const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
1294 	uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
1295 
1296 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1297 		panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
1298 	}
1299 
1300 	return pvh_ptd(pvh);
1301 }
1302 
1303 /**
1304  * This function returns the ptd_info_t structure associated with a given
1305  * page table descriptor.
1306  *
1307  * @param ptd The page table descriptor that's being accessed.
1308  *
1309  * @return ptd_info_t structure associated with [ptd].
1310  */
static inline ptd_info_t *
ptd_get_info(pt_desc_t *ptd)
{
	assert(ptd != NULL);
	/* ptd_info is allocated separately from the PTD; see pt_desc_t above. */
	return ptd->ptd_info;
}
1317 
1318 /**
1319  * Given a pointer to a page table entry, return back the ptd_info structure
1320  * for the page table that contains that entry.
1321  *
1322  * @param ptep Pointer to a PTE whose ptd_info object to return.
1323  *
1324  * @return The ptd_info object for the page table that contains the passed in
1325  *         page table entry.
1326  */
static inline ptd_info_t *
ptep_get_info(const pt_entry_t *ptep)
{
	/* Convenience composition of ptep_get_ptd() and ptd_get_info(). */
	return ptd_get_info(ptep_get_ptd(ptep));
}
1332 
1333 /**
1334  * Return the virtual address mapped by the passed in leaf page table entry,
1335  * using an already-retrieved pagetable descriptor.
1336  *
1337  * @param ptdp pointer to the descriptor for the pagetable containing ptep
1338  * @param ptep Pointer to a PTE to parse
1339  */
1340 static inline vm_map_address_t
ptd_get_va(const pt_desc_t * ptdp,const pt_entry_t * ptep)1341 ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
1342 {
1343 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
1344 
1345 	vm_map_address_t va = ptdp->va;
1346 
1347 	const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptdp->pmap));
1348 	const vm_offset_t ptep_page = (vm_offset_t)ptep >> pmap_page_shift;
1349 
1350 	/**
1351 	 * Use the difference between the VM page shift and the hardware page shift
1352 	 * to get the index of the correct page table. In practice, this equates to
1353 	 * masking out the bottom two bits of the L3 table index in address spaces
1354 	 * where the VM page size is greater than the hardware page size. In address
1355 	 * spaces where they're identical, the index will always be zero.
1356 	 */
1357 	const unsigned int ttep_index = ptep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
1358 	va += ttep_index * pt_attr_twig_size(pt_attr);
1359 
1360 	/* Increment VA now to target the VA space covered by this specific PTE */
1361 	const vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
1362 	va += (ptep_index << pt_attr_leaf_shift(pt_attr));
1363 
1364 	return va;
1365 }
1366 
1367 /**
1368  * Return the virtual address that is being mapped by the passed in leaf page
1369  * table entry.
1370  *
1371  * @param ptep Pointer to a PTE to parse.
1372  */
static inline vm_map_address_t
ptep_get_va(const pt_entry_t *ptep)
{
	/* Looks up the PTD for [ptep], then computes the mapped VA from it. */
	return ptd_get_va(ptep_get_ptd(ptep), ptep);
}
1378 
1379 /**
1380  * Physical Page Attribute Table (pp_attr_table) defines and helper functions.
1381  */
1382 
1383 /* Type holding the per-VM-page flags; 16 bits of flag space per physical page. */
1384 typedef uint16_t pp_attr_t;
1385 
1386 /* See the definition of pp_attr_table for more information. */
1387 extern volatile pp_attr_t* pp_attr_table;
1388 
1389 /**
1390  * Flags stored in the pp_attr_table on a per-physical-page basis.
1391  *
1392  * Please update the pv_walk LLDB macro if these flags are changed or added to.
1393  */
1394 
1395 /**
1396  * The bottom 6-bits are used to store the default WIMG (cacheability and memory
1397  * type) setting for this physical page. This can be changed by calling
1398  * pmap_set_cache_attributes().
1399  *
1400  * If a default WIMG setting isn't set for a page, then the default is Normal,
1401  * Cached memory (VM_WIMG_DEFAULT).
1402  */
1403 #define PP_ATTR_WIMG_MASK 0x003F
1404 #define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)
1405 
1406 /**
1407  * The reference and modify bits keep track of whether a page has been accessed
1408  * or modified since the last time the bits were cleared. These bits are used to
1409  * enforce policy decisions in the VM layer.
1410  */
1411 #define PP_ATTR_REFERENCED 0x0040
1412 #define PP_ATTR_MODIFIED   0x0080
1413 
1414 /**
1415  * This physical page is being used as anonymous memory that's internally
1416  * managed by the VM and is not connected to an external pager. This flag is
1417  * only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
1418  * subsequent mappings won't set/clear this flag until all mappings are removed
1419  * and a new CPU mapping is added.
1420  */
1421 #define PP_ATTR_INTERNAL 0x0100
1422 
1423 /**
1424  * This flag is used to keep track of pages that are still resident but are not
1425  * considered dirty and can be reclaimed under memory pressure. These pages do
1426  * not count as a part of the memory footprint, so the footprint ledger does not
1427  * need to be updated for these pages. This is hinted to the VM by the
1428  * `madvise(MADV_FREE_REUSABLE)` system call.
1429  */
1430 #define PP_ATTR_REUSABLE 0x0200
1431 
1432 /**
1433  * This flag denotes that a page is utilizing "alternate accounting". This means
1434  * that the pmap doesn't need to keep track of these pages with regards to the
1435  * footprint ledger because the VM is already accounting for them in a different
1436  * way. These include IOKit mappings (VM adds their entire virtual size to the
1437  * footprint), and purgeable pages (VM counts them only when non-volatile and
1438  * only for one "owner"), among others.
1439  *
1440  * Note that alternate accounting status is tracked on a per-mapping basis (not
1441  * per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
1442  * when there's a single mapping to a page. When there are multiple mappings,
1443  * the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
1444  * above).
1445  */
1446 #define PP_ATTR_ALTACCT 0x0400
1447 
1448 /**
1449  * This bit was originally used on x86 to keep track of what pages to not
1450  * encrypt during the hibernation process as a performance optimization when
1451  * encryption was done in software. This doesn't apply to the ARM
1452  * hibernation process because all pages are automatically encrypted using
1453  * hardware acceleration. Despite that, the pmap still keeps track of this flag
1454  * as a debugging aid on internal builds.
1455  *
1456  * TODO: This bit can probably be reclaimed:
1457  * rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
1458  */
1459 #define PP_ATTR_NOENCRYPT 0x0800
1460 
1461 /**
1462  * These bits denote that a physical page is expecting the next access or
1463  * modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
1464  * respectively.
1465  */
1466 #define PP_ATTR_REFFAULT 0x1000
1467 #define PP_ATTR_MODFAULT 0x2000
1468 
1469 /**
1470  * Atomically set some flags in a pp_attr_table entry.
1471  *
1472  * @param pai The physical address index for the entry to update.
1473  * @param bits The flags to set in the entry.
1474  */
1475 static inline void
ppattr_set_bits(unsigned int pai,pp_attr_t bits)1476 ppattr_set_bits(unsigned int pai, pp_attr_t bits)
1477 {
1478 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1479 	os_atomic_or(ppattr, bits, relaxed);
1480 }
1481 
1482 /**
1483  * Atomically clear some flags in a pp_attr_table entry.
1484  *
1485  * @param pai The physical address index for the entry to update.
1486  * @param bits The flags to clear in the entry.
1487  */
1488 static inline void
ppattr_clear_bits(unsigned int pai,pp_attr_t bits)1489 ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
1490 {
1491 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1492 	os_atomic_andnot(ppattr, bits, relaxed);
1493 }
1494 
1495 /**
1496  * General-purpose function for atomically modifying flags in a pp_attr_table entry.
1497  *
1498  * @param pai The physical address index for the entry to update.
1499  * @param bits_to_clear Mask of bits to atomically clear from the entry.
1500  * @param bits_to_set Mask of bits to atomically set in the entry.
1501  *
1502  * @note [bits_to_clear] and [bits_to_set] must not overlap.
1503  */
static inline void
ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
{
	/* Overlapping masks would make the result depend on operation order. */
	assert((bits_to_set & bits_to_clear) == 0);
	pp_attr_t prev_ppattr, new_ppattr;
	/* Single atomic RMW so the clear and set are applied together. */
	os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
		new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
	});
}
1513 
1514 /**
1515  * Return true if the pp_attr_table entry contains the passed in bits.
1516  *
1517  * @param pai The physical address index for the entry to test.
1518  * @param bits The flags to check for.
1519  */
1520 static inline bool
ppattr_test_bits(unsigned int pai,pp_attr_t bits)1521 ppattr_test_bits(unsigned int pai, pp_attr_t bits)
1522 {
1523 	const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1524 	return (*ppattr & bits) == bits;
1525 }
1526 
1527 /**
1528  * Only set some flags in a pp_attr_table entry if the passed in physical
1529  * address is a kernel-managed address.
1530  *
1531  * @param pa The physical address for the entry to update.
1532  * @param bits The flags to set in the entry.
1533  */
1534 static inline void
ppattr_pa_set_bits(pmap_paddr_t pa,pp_attr_t bits)1535 ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
1536 {
1537 	if (pa_valid(pa)) {
1538 		ppattr_set_bits(pa_index(pa), bits);
1539 	}
1540 }
1541 
1542 /**
1543  * Only clear some flags in a pp_attr_table entry if the passed in physical
1544  * address is a kernel-managed address.
1545  *
1546  * @param pa The physical address for the entry to update.
1547  * @param bits The flags to clear in the entry.
1548  */
1549 static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa,pp_attr_t bits)1550 ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
1551 {
1552 	if (pa_valid(pa)) {
1553 		ppattr_clear_bits(pa_index(pa), bits);
1554 	}
1555 }
1556 
1557 /**
1558  * Only test flags in a pp_attr_table entry if the passed in physical address
1559  * is a kernel-managed page.
1560  *
1561  * @param pa The physical address for the entry to test.
1562  * @param bits The flags to check for.
1563  *
1564  * @return False if the PA isn't a kernel-managed page, otherwise true/false
1565  *         depending on whether the bits are set.
1566  */
1567 static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa,pp_attr_t bits)1568 ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
1569 {
1570 	return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
1571 }
1572 
1573 /**
1574  * Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
1575  * in physical address is a kernel-managed page.
1576  *
1577  * @param pa The physical address for the entry to update.
1578  */
static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)
{
	/* No-op for non-kernel-managed pages (pa_valid() check in callee). */
	ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
}
1584 
/**
 * Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Thin wrapper around ppattr_pa_clear_bits(); silently does nothing for
 *       non-kernel-managed addresses.
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)
{
	ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
}
1596 
/**
 * Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Thin wrapper around ppattr_pa_set_bits(); silently does nothing for
 *       non-kernel-managed addresses.
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)
{
	ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
}
1608 
/**
 * Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
 * passed in physical address is a kernel-managed page.
 *
 * @note Thin wrapper around ppattr_pa_clear_bits(); silently does nothing for
 *       non-kernel-managed addresses.
 *
 * @param pa The physical address for the entry to update.
 */
static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)
{
	ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
}
1620 
/**
 * Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_internal(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_INTERNAL);
}
1631 
/**
 * Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_internal(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
}
1642 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if PP_ATTR_INTERNAL is set, false otherwise.
 */
static inline bool
ppattr_test_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1653 
/**
 * Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reusable(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REUSABLE);
}
1664 
/**
 * Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reusable(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
}
1675 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if PP_ATTR_REUSABLE is set, false otherwise.
 */
static inline bool
ppattr_test_reusable(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
}
1686 
/**
 * Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Callers that may be dealing with multiple mappings should use
 *       ppattr_pve_set_altacct() instead, which dispatches appropriately.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_altacct(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_ALTACCT);
}
1701 
/**
 * Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Callers that may be dealing with multiple mappings should use
 *       ppattr_pve_clr_altacct() instead, which dispatches appropriately.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_altacct(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
}
1716 
/**
 * Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 *       PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Callers that may be dealing with multiple mappings should use
 *       ppattr_pve_is_altacct() instead, which dispatches appropriately.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page uses alternate accounting, false
 *         otherwise.
 */
static inline bool
ppattr_is_altacct(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
}
1734 
/**
 * Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the INTERNAL flag is being tracked using the
 *       pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
 *       PP_ATTR_INTERNAL definitions for more information.
 *
 * @note Callers that may be dealing with multiple mappings should use
 *       ppattr_pve_is_internal() instead, which dispatches appropriately.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page is accounted for as "internal", false
 *         otherwise.
 */
static inline bool
ppattr_is_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1752 
1753 /**
1754  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1755  * depending on whether there are one or multiple mappings to a page. This
1756  * function abstracts out the difference between single and multiple mappings to
1757  * a page and provides a single function for determining whether alternate
1758  * accounting is set for a mapping.
1759  *
1760  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1761  *       definitions for more information.
1762  *
1763  * @param pai The physical address index for the entry to test.
1764  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1765  * @param idx Index of the chosen PTE pointer inside the PVE.
1766  *
1767  * @return True if the passed in page uses alternate accounting, false
1768  *         otherwise.
1769  */
1770 static inline bool
ppattr_pve_is_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1771 ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1772 {
1773 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
1774 }
1775 
1776 /**
1777  * The "internal" (INTERNAL) status for a page is tracked differently
1778  * depending on whether there are one or multiple mappings to a page. This
1779  * function abstracts out the difference between single and multiple mappings to
1780  * a page and provides a single function for determining whether "internal"
1781  * is set for a mapping.
1782  *
1783  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1784  *       definitions for more information.
1785  *
1786  * @param pai The physical address index for the entry to test.
1787  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1788  * @param idx Index of the chosen PTE pointer inside the PVE.
1789  *
1790  * @return True if the passed in page is "internal", false otherwise.
1791  */
1792 static inline bool
ppattr_pve_is_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1793 ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1794 {
1795 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
1796 }
1797 
1798 /**
1799  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1800  * depending on whether there are one or multiple mappings to a page. This
1801  * function abstracts out the difference between single and multiple mappings to
1802  * a page and provides a single function for setting the alternate accounting status
1803  * for a mapping.
1804  *
1805  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1806  *       definitions for more information.
1807  *
1808  * @param pai The physical address index for the entry to update.
1809  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1810  * @param idx Index of the chosen PTE pointer inside the PVE.
1811  */
1812 static inline void
ppattr_pve_set_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1813 ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1814 {
1815 	if (pvep == PV_ENTRY_NULL) {
1816 		ppattr_set_altacct(pai);
1817 	} else {
1818 		pve_set_altacct(pvep, idx);
1819 	}
1820 }
1821 
1822 /**
1823  * The "internal" (INTERNAL) status for a page is tracked differently
1824  * depending on whether there are one or multiple mappings to a page. This
1825  * function abstracts out the difference between single and multiple mappings to
1826  * a page and provides a single function for setting the "internal" status
1827  * for a mapping.
1828  *
1829  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1830  *       definitions for more information.
1831  *
1832  * @param pai The physical address index for the entry to update.
1833  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1834  * @param idx Index of the chosen PTE pointer inside the PVE.
1835  */
1836 static inline void
ppattr_pve_set_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1837 ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1838 {
1839 	if (pvep == PV_ENTRY_NULL) {
1840 		ppattr_set_internal(pai);
1841 	} else {
1842 		pve_set_internal(pvep, idx);
1843 	}
1844 }
1845 
1846 /**
1847  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1848  * depending on whether there are one or multiple mappings to a page. This
1849  * function abstracts out the difference between single and multiple mappings to
1850  * a page and provides a single function for clearing the alternate accounting status
1851  * for a mapping.
1852  *
1853  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1854  *       definitions for more information.
1855  *
1856  * @param pai The physical address index for the entry to update.
1857  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1858  * @param idx Index of the chosen PTE pointer inside the PVE.
1859  */
1860 static inline void
ppattr_pve_clr_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1861 ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1862 {
1863 	if (pvep == PV_ENTRY_NULL) {
1864 		ppattr_clear_altacct(pai);
1865 	} else {
1866 		pve_clr_altacct(pvep, idx);
1867 	}
1868 }
1869 
1870 /**
1871  * The "internal" (INTERNAL) status for a page is tracked differently
1872  * depending on whether there are one or multiple mappings to a page. This
1873  * function abstracts out the difference between single and multiple mappings to
1874  * a page and provides a single function for clearing the "internal" status
1875  * for a mapping.
1876  *
1877  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1878  *       definitions for more information.
1879  *
1880  * @param pai The physical address index for the entry to update.
1881  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1882  * @param idx Index of the chosen PTE pointer inside the PVE.
1883  */
1884 static inline void
ppattr_pve_clr_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1885 ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1886 {
1887 	if (pvep == PV_ENTRY_NULL) {
1888 		ppattr_clear_internal(pai);
1889 	} else {
1890 		pve_clr_internal(pvep, idx);
1891 	}
1892 }
1893 
/**
 * Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reffault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REFFAULT);
}
1904 
/**
 * Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reffault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
}
1915 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if PP_ATTR_REFFAULT is set, false otherwise.
 */
static inline bool
ppattr_test_reffault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
}
1926 
/**
 * Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_modfault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_MODFAULT);
}
1937 
/**
 * Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @note Unlike the ppattr_pa_* wrappers, this takes a physical address index
 *       directly and performs no validity check.
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_modfault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
}
1948 
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if PP_ATTR_MODFAULT is set, false otherwise.
 */
static inline bool
ppattr_test_modfault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
}
1959 
1960 /**
1961  * Retype epoch operations:
1962  *
1963  * The retype epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
1964  * can ensure all CPUs have observed updated mapping state before retyping a physical page.
1965  *
1966  * There are certain cases in which the pmap, while issuing an SPTM call that modifies
1967  * mappings, cannot hold locks such as the PVH lock which would prevent the page from
1968  * being concurrently retyped.  This is particularly true for batched operations such
1969  * as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
1970  * In these cases, the pmap may call pmap_retype_epoch_enter() to note that it is
1971  * performing such a sensitive operation on the local CPU.  It must then call
1972  * pmap_retype_epoch_exit() upon completion of the sensitive operation.
1973  *
1974  * Then, for any instance in which the pmap needs to retype a page without being
1975  * otherwise guaranteed (e.g. by VM layer locking or the existing page type) that such
1976  * a sensitive operation is not in progress on some other CPU, it must drain these
1977  * sensitive operations from other CPUs.  Specifically, it must ensure that any
1978  * sensitive operation which may have observed prior mapping state of the page that
1979  * is to be retyped has completed.  This is accomplished by first calling
1980  * pmap_retype_epoch_prepare_drain() to record the initial retype epoch state of
1981  * all CPUs, followed by pmap_retype_epoch_drain() to ensure all remote CPUs are
1982  * either not in an epoch or have advanced beyond the initially recorded epoch.
1983  * These are exposed as two separate functions in order to allow the calling CPU
1984  * to do other work between calling pmap_retype_epoch_prepare_drain() and
1985  * pmap_retype_epoch_drain(), as a best-effort attempt to minimize time wasted
1986  * spinning in pmap_retype_epoch_drain().
1987  *
1988  * When draining the retype epoch, the following assumptions must hold true:
1989  *
1990  * 1) The calling thread must guarantee that prior updates needed to bring the page
1991  * into the correct mapping state for retyping have already been performed and made
1992  * globally visible using the appropriate barriers.  In most cases this means that
1993  * all existing mappings of the page must have been removed.  For any alterations
1994  * of mapping state, global visibility is conveniently already guaranteed by the
1995  * DSBs that are architecturally required to synchronize PTE updates and the TLBIs
1996  * that follow them.
1997  *
1998  * 2) The calling thread must have some means of ensuring the new mappings cannot
1999  * be added for the page that would bring it out of the correct state for retyping.
2000  * This is typically done by holding the PVH lock and/or the exclusive pmap lock
2001  * such that pmap_enter() cannot concurrently execute against the page.
2002  *
2003  * 3) The calling thread must not perform any operation which requires preemptibility
2004  * between calling pmap_retype_epoch_prepare_drain() and pmap_retype_epoch_drain().
2005  */
2006 
2007 /**
2008  * Enter the retype epoch on the local CPU to indicate an in-progress SPTM operation
2009  * that may be sensitive to a concurrent retype operation on another CPU.
2010  *
2011  * @note This function increments the thread's preemption disable count and returns
2012  *       with preemption disabled.
2013  *
2014  * @note This function issues all required barriers to ensure correct ordering of
2015  *       the epoch update relative to ensuing SPTM accesses.
2016  */
static inline void
pmap_retype_epoch_enter(void)
{
	/* Pin this thread to the current CPU; the epoch state is per-CPU. */
	mp_disable_preemption();
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	assert(!preemption_enabled());

	/* Must not already be in a retype epoch on this CPU (no nesting). */
	assert(retype_epoch->local_seq == 0);
	retype_epoch->local_seq = ++retype_epoch->next_seq;
	/* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
	assert(retype_epoch->local_seq != 0);

	/**
	 * Issue a store-load barrier to ensure that remote observers of any ensuing
	 * SPTM accesses will also observe the epoch update.
	 */
	os_atomic_thread_fence(seq_cst);
}
2036 
2037 /**
2038  * Exit the retype epoch on the local CPU to indicate completion of an SPTM operation
2039  * that may be sensitive to a concurrent retype operation on another CPU.
2040  *
2041  * @note This function must be called with preemption disabled and will decrement
2042  *       the current thread's preemption disable count.
2043  */
static inline void
pmap_retype_epoch_exit(void)
{
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	assert(!preemption_enabled());
	/* We must be exiting the epoch most recently entered on this CPU. */
	assert(retype_epoch->local_seq == retype_epoch->next_seq);

	/**
	 * Clear the sequence using a store-release operation to ensure that prior
	 * SPTM modifications will be visible to remote observers before the absence
	 * of an epoch is visible.
	 */
	os_atomic_store(&retype_epoch->local_seq, 0, release);
	mp_enable_preemption();
}
2059 
2060 /**
2061  * Helper for determining whether the current CPU is within an epoch.
2062  *
2063  * @return true if the current CPU holds the epoch, false otherwise.
2064  */
2065 static inline bool
pmap_in_epoch(void)2066 pmap_in_epoch(void)
2067 {
2068 	return !preemption_enabled() && (PERCPU_GET(pmap_sptm_percpu)->retype_epoch.local_seq != 0);
2069 }
2070 
2071 /**
2072  * Prepare the local CPU to perform an epoch drain operation by recording the retype
2073  * epoch state of other CPUs.
2074  *
2075  * @note This function increments the current thread's preemption disable count and
2076  *       returns with preemption disabled.
2077  *
2078  * @note This function issues all necessary barriers to ensure that the subsequent
2079  *       retype operation is not speculated ahead of the epoch sampling.
2080  *
2081  * @note This function does NOT issue any barriers to ensure that prior updates of
2082  *       mapping state are globally visible and have proper store-load ordering with
2083  *       respect to the scan performed here.  In the cases where this function is
2084  *       intended to be used, this ordering should be guaranteed automatically by
2085  *       the DSBs used to synchronize prior mapping updates issued by the caller.
2086  *       If this function is ever used in a situation where that cannot be guaranteed,
2087  *       the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
2088  *       thread_fence) before calling this function.
2089  */
static inline void
pmap_retype_epoch_prepare_drain(void)
{
	/* Pin this thread to the current CPU for the duration of the prepare+drain sequence. */
	mp_disable_preemption();
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	/* Any prior prepare must have been consumed by a matching drain. */
	assert(retype_epoch->flags == 0);
	unsigned int i = 0;
	uint8_t flags = PMAP_RETYPE_EPOCH_PREPARED;

	/* Sample each CPU's epoch state. */
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		const uint64_t remote_epoch =
		    os_atomic_load(&pmap_pcpu->retype_epoch.local_seq, relaxed);
		retype_epoch->remote_seq[i] = remote_epoch;

		/**
		 * If the remote CPU has an active epoch, make a note to ourselves that
		 * we'll need to drain it.
		 */
		if (remote_epoch != 0) {
			flags |= PMAP_RETYPE_EPOCH_DRAIN_REQUIRED;
		}
		++i;
	}
	retype_epoch->flags = flags;

	/**
	 * Issue a load-load barrier to ensure subsequent drain or retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2122 
2123 /**
2124  * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
2125  * most recent call to pmap_retype_epoch_prepare_drain().
2126  *
2127  * @note This function expects to be called with preemption disabled and will decrement
2128  *       the current thread's preemption disable count.
2129  *
2130  * @note pmap_retype_epoch_prepare_drain() must have been called on the local CPU
2131  *       prior to calling this function.  This function will return immediately if
2132  *       this prior call did not observe any active epochs on remote CPUs.
2133  *
2134  * @note This function issues all necessary barriers to ensure that the subsequent
2135  *       retype operation is not speculated ahead of the epoch sampling.
2136  */
static inline void
pmap_retype_epoch_drain(void)
{
	/* Preemption must have remained disabled since pmap_retype_epoch_prepare_drain(). */
	assert(!preemption_enabled());
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	const uint8_t flags = retype_epoch->flags;
	assert(flags & PMAP_RETYPE_EPOCH_PREPARED);
	/* Consume the prepare state so a subsequent prepare/drain pair starts clean. */
	retype_epoch->flags = 0;
	if (!(flags & PMAP_RETYPE_EPOCH_DRAIN_REQUIRED)) {
		/* No remote CPU was in an epoch at prepare time; nothing to wait for. */
		mp_enable_preemption();
		return;
	}
	unsigned int i = 0;
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		if (retype_epoch->remote_seq[i] != 0) {
			/* Remote sequences only advance or reset to 0; they never go backwards. */
			assert((pmap_pcpu->retype_epoch.local_seq == 0) ||
			    (pmap_pcpu->retype_epoch.local_seq >= retype_epoch->remote_seq[i]));
			/**
			 * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
			 * or advances to a new epoch.
			 */
			while ((os_atomic_load_exclusive(&pmap_pcpu->retype_epoch.local_seq, relaxed) ==
			    retype_epoch->remote_seq[i])) {
				__builtin_arm_wfe();
			}
			/* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
			os_atomic_clear_exclusive();
		}
		++i;
	}
	mp_enable_preemption();
	/**
	 * Issue a load-load barrier to ensure subsequent retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2174 
2175 /**
2176  * Helper to determine whether a frame type is one that requires automatic
2177  * retyping (by the pmap layer) back to XNU_DEFAULT when all mappings of the
2178  * page are gone.
2179  *
2180  * @return true if the type requires auto-retyping, false otherwise.
2181  */
2182 static inline bool
pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)2183 pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)
2184 {
2185 	return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) ||
2186 	       (frame_type == XNU_USER_JIT) || (frame_type == XNU_ROZONE) ||
2187 	       (frame_type == XNU_KERNEL_RESTRICTED);
2188 }
2189 
2190 
2191 /**
2192  * If necessary, prepare a physical page for being retyped back to XNU_DEFAULT
2193  * after the last CPU mapping has been removed.  This is only needed for pages of
2194  * certain special types such as the various executable types and the kernel RO
2195  * zone type.
2196  *
2197  * @note The PVH lock for the physical page that is getting a new mapping
2198  *       registered must already be held.
2199  *
2200  * @param pa The physical address of the recently-unmapped page.
2201  *
2202  * @return true if the page will need to be retyped, false otherwise.
2203  */
2204 static inline bool
pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)2205 pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)
2206 {
2207 	pvh_assert_locked(pa_index(pa));
2208 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2209 	if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2210 		pmap_retype_epoch_prepare_drain();
2211 		return true;
2212 	}
2213 	return false;
2214 }
2215 
2216 /**
2217  * If necessary, retype a physical page back to XNU_DEFAULT after the last CPU
2218  * mapping has been removed.  This is only needed for pages of certain special
2219  * types such as the various executable types, the kernel RO zone type,
2220  * and XNU_KERNEL_RESTRICTED.
2221  *
2222  * @note The PVH lock for the physical page that is getting a new mapping
2223  *       registered must already be held.
2224  *
2225  * @param pa The physical address of the recently-unmapped page.
2226  *
2227  * @return true if the page needed to be retyped, false otherwise.
2228  */
2229 static inline bool
pmap_retype_unmapped_page(pmap_paddr_t pa)2230 pmap_retype_unmapped_page(pmap_paddr_t pa)
2231 {
2232 	pvh_assert_locked(pa_index(pa));
2233 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2234 	if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2235 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2236 		pmap_retype_epoch_drain();
2237 		sptm_retype(pa & ~PAGE_MASK, frame_type, XNU_DEFAULT, retype_params);
2238 		return true;
2239 	}
2240 	return false;
2241 }
2242 
2243 static inline boolean_t
pmap_is_preemptible(void)2244 pmap_is_preemptible(void)
2245 {
2246 	return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT) || PMAP_IS_HIBERNATING();
2247 }
2248 
/**
 * This helper function ensures that potentially-long-running batched operations are
 * called in preemptible context before entering the SPTM, so that the SPTM call may
 * periodically exit to allow pending urgent ASTs to be taken.
 *
 * @note Early boot and hibernation are exempted; see pmap_is_preemptible().
 * @note This is a debug-only check: the assert compiles away in release builds.
 */
static inline void
pmap_verify_preemptible(void)
{
	assert(pmap_is_preemptible());
}
2259 
2260 /**
2261  * The minimum number of pages to keep in the PPL page free list.
2262  *
2263  * We define our target as 8 pages: enough for 2 page table pages, a PTD page,
2264  * and a PV page; in essence, twice as many pages as may be necessary to satisfy
2265  * a single pmap_enter request.
2266  */
2267 #define PMAP_MIN_FREE_PPL_PAGES 8
2268 
2269 /**
2270  * Flags passed to various page allocation functions, usually accessed through
2271  * the pmap_page_alloc() API. Each function that can take these flags as
2272  * a part of its option field, will describe these flags in its function header.
2273  */
2274 
2275 /* Can be used when no allocation flags are wanted. */
2276 #define PMAP_PAGE_ALLOCATE_NONE 0x0
2277 
2278 /**
2279  * Instruct the allocation function to return immediately if no pages are
2280  * current available. Without this flag, the function will spin and wait for a
2281  * page to become available. This flag can be required in some circumstances
2282  * (for instance, when allocating pages from within the PPL).
2283  */
2284 #define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
2285 
2286 /**
2287  * Instructs an allocation function to fallback to reclaiming a userspace page
2288  * table if it failed to allocate a page from the free lists. This can be useful
2289  * when allocating from within the PPL because refilling the free lists requires
2290  * exiting and re-entering the PPL (which incurs extra latency).
2291  *
2292  * This is a quick way of allocating a page at the expense of having to
2293  * reallocate the table the next time one of its mappings is accessed.
2294  */
2295 #define PMAP_PAGE_RECLAIM_NOWAIT 0x2
2296 
2297 /**
2298  * Instructs an allocation function to avoid zero-filling the newly-allocated
2299  * page.  This should be used only if you know the page will be fully initialized
2300  * by some other means on the relevant allocation path.
2301  */
2302 #define PMAP_PAGE_NOZEROFILL 0x4
2303 
2304 /**
2305  * Global variables exported to the rest of the internal pmap implementation.
2306  */
2307 extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
2308 extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
2309 extern unsigned int inuse_pmap_pages_count;
2310 extern vm_object_t pmap_object;
2311 extern uint32_t pv_alloc_initial_target;
2312 extern uint32_t pv_kern_alloc_initial_target;
2313 
2314 /**
2315  * Functions exported to the rest of the internal pmap implementation.
2316  */
2317 extern void pmap_data_bootstrap(void);
2318 extern void pmap_enqueue_pages(vm_page_t);
2319 extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
2320 extern void pmap_page_free(pmap_paddr_t);
2321 
2322 /**
2323  * The modes in which a pmap lock can be acquired. Note that shared access
2324  * doesn't necessarily mean "read-only". As long as data is atomically updated
2325  * correctly (to account for multi-cpu accesses) data can still get written with
2326  * a shared lock held. Care just needs to be taken so as to not introduce any
2327  * race conditions when there are multiple writers.
2328  *
2329  * This is here in pmap_data.h because it's a needed parameter for pv_alloc()
2330  * and pmap_enter_pv(). This header is always included in pmap_internal.h before
2331  * the rest of the pmap locking code is defined so there shouldn't be any issues
2332  * with missing types.
2333  */
2334 OS_ENUM(pmap_lock_mode, uint8_t,
2335     PMAP_LOCK_SHARED,
2336     PMAP_LOCK_EXCLUSIVE,
2337     PMAP_LOCK_HELD);
2338 
2339 /**
2340  * Possible return values for pv_alloc(). See the pv_alloc() function header for
2341  * a description of each of these values.
2342  */
2343 typedef enum {
2344 	PV_ALLOC_SUCCESS,
2345 	PV_ALLOC_RETRY,
2346 	PV_ALLOC_FAIL
2347 } pv_alloc_return_t;
2348 
2349 extern pv_alloc_return_t pv_alloc(
2350 	pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);
2351 extern void pv_free(pv_entry_t *);
2352 extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);
2353 extern void pmap_compute_pv_targets(void);
2354 extern pv_alloc_return_t pmap_enter_pv(
2355 	pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
2356 
/**
 * Possible return values for pmap_remove_pv().
 */
typedef enum {
	PV_REMOVE_SUCCESS, /* found a mapping */
	PV_REMOVE_FAIL /* no mapping found */
} pv_remove_return_t;

/*
 * Remove the PV list entry corresponding to the given PTE from the physical
 * page's (locked) PV head. See the function definition for the meaning of
 * the two bool out-parameters.
 */
extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
2363 
/* Bootstrap the page table descriptor (PTD) subsystem with an initial array of descriptors. */
extern void ptd_bootstrap(pt_desc_t *, unsigned int);

/* Allocate a PTD without linking it into the global/pmap descriptor lists. */
extern pt_desc_t *ptd_alloc_unlinked(unsigned int);

/* Allocate a PTD and associate it with the given pmap. */
extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);

/* Free a PTD previously obtained from ptd_alloc() or ptd_alloc_unlinked(). */
extern void ptd_deallocate(pt_desc_t *);

/*
 * Initialize a PTD's per-page-table info for a table mapping the given VA in
 * the given pmap. See the function definition for parameter details.
 */
extern void ptd_info_init(
	pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);

/*
 * Credit/debit an amount against one of the pmap's ledger entries (second
 * parameter is the ledger entry index).
 */
extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);

/*
 * Validate a pmap pointer before (read-only vs. mutable) use; the string is
 * the calling function's name for panic messages. Call through the
 * validate_pmap()/validate_pmap_mutable() macros rather than directly.
 */
extern void validate_pmap_internal(const volatile struct pmap *, const char *);
extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);
2376 
2377 /**
2378  * Macro function wrappers around pmap validation so that the calling function
2379  * can be printed in the panic strings for easier validation failure debugging.
2380  */
2381 #define validate_pmap(x) validate_pmap_internal(x, __func__)
2382 #define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
2383 
2384 /**
2385  * This structure describes a SPTM-owned I/O range.
2386  *
2387  * @note This doesn't necessarily have to represent "I/O" only, this can also
2388  *       represent non-kernel-managed DRAM (e.g., iBoot carveouts). Any physical
2389  *       address region that isn't considered "kernel-managed" is fair game.
2390  *
2391  * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
2392  *       device tree nodes. Astris (through the LowGlobals) also depends on the
2393  *       consistency of this structure.
2394  *
2395  * @note These definitions are copied to SPTM and they need to be in sync.
2396  */
2397 typedef struct pmap_io_range {
2398 	/* Physical address of the PPL-owned I/O range. */
2399 	uint64_t addr;
2400 
2401 	/* Length (in bytes) of the PPL-owned I/O range. */
2402 	uint64_t len;
2403 
2404 	/* Strong DSB required for pages in this range. */
2405 	#define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31)
2406 
2407 	/* Corresponds to memory carved out by bootloader. */
2408 	#define PMAP_IO_RANGE_CARVEOUT (1UL << 30)
2409 
2410 	/* Pages in this range need to be included in the hibernation image */
2411 	#define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29)
2412 
2413 	/* Mark the range as 'owned' by a given subsystem */
2414 	#define PMAP_IO_RANGE_OWNED (1UL << 28)
2415 
2416 	/**
2417 	 * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
2418 	 * mapping flags (defined above).
2419 	 */
2420 	uint32_t wimg;
2421 
2422 	/* 4 Character Code (4CC) describing what this range is. */
2423 	uint32_t signature;
2424 } pmap_io_range_t;
2425 
2426 /* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
2427 _Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
2428 
2429 extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);
2430 
2431 /**
2432  * This structure describes a sub-page-size I/O region owned by SPTM but the kernel can write to.
2433  *
2434  * @note I/O filter software will use a collection of such data structures to determine access
2435  *       permissions to a page owned by SPTM.
2436  *
2437  * @note The {signature, offset} key is used to index a collection of such data structures to
2438  *       optimize for space in the case where one page layout is repeated for many devices, such
2439  *       as the memory controller channels.
2440  */
2441 typedef struct pmap_io_filter_entry {
2442 	/* 4 Character Code (4CC) describing what this range (page) is. */
2443 	uint32_t signature;
2444 
2445 	/* Offset within the page. It has to be within [0, PAGE_SIZE). */
2446 	uint16_t offset;
2447 
2448 	/* Length of the range, and (offset + length) has to be within [0, PAGE_SIZE). */
2449 	uint16_t length;
2450 } pmap_io_filter_entry_t;
2451 
2452 _Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t");
2453 
2454 extern void pmap_cpu_data_init_internal(unsigned int);
2455