xref: /xnu-10063.121.3/osfmk/arm64/sptm/pmap/pmap_data.h (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa) !
1 /*
2  * Copyright (c) 2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /**
29  * This header file is used to store the types, prototypes, and inline functions
30  * that define some of the most important data structures used in the pmap. This
31  * header is only meant for sharing types within the pmap; if a type is meant to
32  * be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
33  */
34 #pragma once
35 
36 #include <stdint.h>
37 
38 #include <kern/ledger.h>
39 #include <mach/vm_types.h>
40 #include <mach_assert.h>
41 #include <vm/vm_page.h>
42 
43 #include <arm/cpu_data.h>
44 #include <arm/machine_routines.h>
45 #include <arm64/proc_reg.h>
46 
47 /* Temporary include before moving all ledger functions into pmap_data.c */
48 #include <os/refcnt.h>
49 
50 /**
51  * These headers are safe to be included in this file since they shouldn't rely
52  * on any of the internal pmap header files (so no circular dependencies).
53  */
54 #include <arm64/sptm/pmap/pmap.h>
55 #include <arm64/sptm/pmap/pmap_pt_geometry.h>
56 
57 #include <arm64/sptm/sptm.h>
58 
59 /**
60  * These values represent the first and last kernel-managed physical addresses.
61  * We keep track of extra metadata on kernel-managed pages compared to other
62  * pages (usually iBoot carved out memory or I/O).
63  */
64 extern pmap_paddr_t vm_first_phys, vm_last_phys;
65 
66 /**
67  * Return whether the given address represents a kernel-managed physical page.
68  *
69  * Whether a page is considered "kernel-managed" is determined by the BootArgs
70  * passed by the bootloader. Typically memory carved out by the bootloader as
71  * well as I/O memory should return false.
72  *
73  * @param pa The physical address to check.
74  */
75 static inline bool
pa_valid(pmap_paddr_t pa)76 pa_valid(pmap_paddr_t pa)
77 {
78 	return (pa >= vm_first_phys) && (pa < vm_last_phys);
79 }
80 
/* Sentinel value indicating an invalid physical address index. */
82 #define INVALID_PAI UINT_MAX
83 
84 /**
85  * The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
86  * contain an entry for every kernel-managed page in the system. These systems
87  * are indexed with physical address indices ("pai") generated by this function.
88  *
89  * The logic is simple since there should be one entry in each of these data
90  * structures for each kernel-managed physical page in the system. These data
91  * structures are allocated on boot based on the amount of memory available.
92  *
93  * @note PAIs are defined using the VM page size, which might not be identical
94  *       to the underlying hardware page size for an arbitrary address space.
95  *       This means that the data structures relying on PAIs will contain one
96  *       entry for each VM page, not hardware page.
97  *
98  * @note This function is only valid for physical addresses that are
99  *       kernel-managed.
100  */
101 static inline unsigned int
pa_index(pmap_paddr_t pa)102 pa_index(pmap_paddr_t pa)
103 {
104 	return (unsigned int)atop(pa - vm_first_phys);
105 }
106 
107 /**
108  * Convert from a physical address index (pai) back to a raw physical address.
109  *
110  * @param pai The physical address index to convert to a PA.
111  *
112  * @return The page-aligned physical address corresponding to [pai].
113  */
114 static inline pmap_paddr_t
pai_to_pa(unsigned int pai)115 pai_to_pa(unsigned int pai)
116 {
117 	return ptoa((pmap_paddr_t)pai) + vm_first_phys;
118 }
119 
120 /* See the definition of pv_head_table for more information. */
121 extern uintptr_t *pv_head_table;
122 
123 /* Represents a NULL entry in the pv_head_table. */
124 #define PV_ENTRY_NULL ((pv_entry_t *) 0)
125 
126 /**
127  * Given a physical address index, return the corresponding pv_head_table entry.
128  *
129  * @note The returned entry might be invalid, or a pointer to a pt_entry_t,
130  *       pv_entry_t, or pt_desc_t depending on the type for this entry.
131  *       Determine the type using pvh_test_type().
132  *
133  * @param pai The index returned by pa_index() for the page whose pv_head_table
134  *            entry should be retrieved.
135  */
136 static inline uintptr_t
pai_to_pvh(unsigned int pai)137 pai_to_pvh(unsigned int pai)
138 {
139 	return pv_head_table[pai];
140 }
141 
142 /**
143  * Each pv_head_table entry can be one of four different types:
144  *
145  * - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
146  *                  physical aperture. Physical aperture mappings are not
147  *                  tracked in the pv_head_table.
148  *
149  * - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
150  *                  These entries are linked lists of pv_entry_t objects (which
151  *                  each contain a pointer to the associated PTE and a pointer
152  *                  to the next entry in the list).
153  *
154  * - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
155  *                  mappings are created, this entry will get upgraded to an
156  *                  entry of type PVH_TYPE_PVEP. These entries are pointers
157  *                  directly to the page table entry that contain the mapping
158  *                  (pt_entry_t*).
159  *
160  * - PVH_TYPE_PTDP: The physical page is being used as a page table. These
161  *                  entries are pointers to page table descriptor structures
162  *                  (pt_desc_t) which contain metadata related to each page
163  *                  table.
164  *
165  * The type is stored in the bottom two bits of each pv_head_table entry. That
166  * type needs to be checked before dereferencing the pointer to determine which
167  * pointer type to dereference as.
168  */
169 #define PVH_TYPE_NULL 0x0UL
170 #define PVH_TYPE_PVEP 0x1UL
171 #define PVH_TYPE_PTEP 0x2UL
172 #define PVH_TYPE_PTDP 0x3UL
173 
174 #define PVH_TYPE_MASK (0x3UL)
175 
176 
177 /**
178  * PV_HEAD_TABLE Flags.
179  *
180  * All flags listed below are stored in the pv_head_table entry/pointer
181  * (per-physical-page) unless otherwise noted.
182  *
183  * Please update the pv_walk LLDB macro if these flags are changed or added to.
184  */
185 
186 /**
187  * This flag is set for every mapping created by an IOMMU.
188  *
189  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
190  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
191  */
192 #define PVH_FLAG_IOMMU 0x4UL
193 
194 /**
195  * This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
196  * this bit is set, then the PTE pointer points directly into the IOMMU page
197  * table for this mapping. If this bit is cleared, then the "PTE pointer" is
198  * actually a pointer to the IOMMU descriptor object that owns this mapping.
199  *
200  * There are cases where it's not easy to tie an IOMMU mapping directly to a
201  * specific page table, so this allows us to at least get a pointer to which
202  * IOMMU created this mapping which is useful for debugging purposes.
203  *
204  * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
205  * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
206  */
207 #define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
208 
209 /**
210  * This flag is set when the first CPU (non-IOMMU) mapping is created. This is
211  * important to keep track of because various accounting statistics are based on
212  * the options specified for the first CPU mapping. This flag, and thus the
213  * accounting statistics, will persist as long as there *any* mappings of the
214  * page (including IOMMU mappings). This works because the accounting for a page
215  * should not need to change until the page is recycled by the VM layer, and we
216  * double-check that there are no mappings (CPU or IOMMU) when a page is
217  * recycled (see: pmap_verify_free()).
218  */
219 #define PVH_FLAG_CPU (1ULL << 62)
220 
221 /* This bit is used as a lock when modifying a pv_head_table entry. */
222 #define PVH_LOCK_BIT 61
223 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
224 
225 /**
226  * This flag is set when there are any executable mappings to this physical
227  * page. This is used to prevent any writable mappings from being created at
228  * the same time an executable mapping exists.
229  */
230 #define PVH_FLAG_EXEC (1ULL << 60)
231 
232 /**
233  * This flag is used to mark that a page has been hashed into the hibernation
234  * image.
235  *
236  * The hibernation driver will use this to ensure that all PPL-owned memory is
237  * correctly included into the hibernation image (a missing PPL page could be
238  * a security concern when coming out of hibernation).
239  */
240 #define PVH_FLAG_HASHED (1ULL << 58)
241 
242 
243 /**
244  * This flag is used to mark that a PV head entry has been placed into
245  * "sleep mode", which typically happens when the lock owner needs to
246  * process a long PV list.  If this bit is set, threads which contend
247  * on the PVH lock must call thread_block() to wait until they are awakened
248  * by the current lock owner releasing the lock.
249  */
250 #define PVH_FLAG_SLEEP (1ULL << 54)
251 
252 /**
253  * These bits need to be set to safely dereference a pv_head_table
254  * entry/pointer.
255  *
256  * Any change to this #define should also update the copy located in the pmap.py
257  * LLDB macros file.
258  */
259 #define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED)
260 
261 #define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
262 
263 #define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
264 
265 /* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
266 #define PVH_LIST_MASK (~PVH_TYPE_MASK)
267 
268 /* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
269 #define PVH_LOCK_WORD 1 /* Assumes little-endian */
270 
/**
 * Assert that a pv_head_table entry is locked. Will panic if the lock isn't
 * acquired.
 *
 * @note An entry in sleep mode (PVH_FLAG_SLEEP set while the bit lock is
 *       dropped) also satisfies this check, since the sleeping owner still
 *       logically holds the lock — hence the check against PVH_LOCK_FLAGS
 *       (LOCK | SLEEP) rather than PVH_FLAG_LOCK alone.
 *
 * @param index The physical address index to check.
 */
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
	assertf(pv_head_table[index] & PVH_LOCK_FLAGS,
	    "%s: PVH %p for pai 0x%x not locked or in sleep mode", __func__,
	    &pv_head_table[index], index);
}
284 
285 /**
286  * Helper function for returning the 32-bit PVH lock word corresponding
287  * to a physical address index.
288  *
289  * @param index The physical address index of the pv_head_table entry
290  *
291  * @return A pointer to the 32-bit word containing the lock bit
292  */
293 static inline uint32_t*
pvh_lock_word(unsigned int index)294 pvh_lock_word(unsigned int index)
295 {
296 	return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
297 }
298 
299 /**
300  * Helper macro for computing the lock bit offset within the 32-bit
301  * lock word for each PV head entry.
302  *
303  * @return A 32-bit integer containing the lock bit offset.
304  */
305 #define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
306 
/**
 * Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
	extern unsigned int not_in_kdp;
	/*
	 * Taking this lock may require spinning or blocking, which is only legal
	 * if the caller was preemptible to begin with; early boot and the kernel
	 * debugger are exempt.
	 */
	const bool was_preemptible = preemption_enabled();
	assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) || !not_in_kdp);

	/* Let the spin loop bail out early if a preemption becomes pending. */
	bool (^check_preemption)(void) = ^bool (void) {
		return was_preemptible && pmap_pending_preemption();
	};

	hw_lock_status_t ret;
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	do {
		ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
		    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);

		if (ret == HW_LOCK_ACQUIRED) {
			locked_pvh.pvh = pv_head_table[index];
			if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
				/*
				 * The entry is in sleep mode: its owner asked contenders to
				 * block instead of spinning. Register for a wakeup on the
				 * entry, drop the bit lock, block until the owner's
				 * pvh_unlock() issues the wakeup, then retry from scratch.
				 */
				wait_result_t wres;
				wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
				hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
				assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
				thread_block(THREAD_CONTINUE_NULL);
				ret = HW_LOCK_CONTENDED;
			}
		}
	} while (ret != HW_LOCK_ACQUIRED);

	return locked_pvh;
}
346 
/**
 * Lock a pv_head_table entry, possibly in a preemption-disabled context.
 *
 * @note This function is only meant for special use cases in which pmap
 *       functions must be invoked with preemption disabled.  These cases
 *       are expected to be rare and limited.  If you think you need to
 *       use this in more places, you're probably wrong.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
	if (__improbable(preemption_enabled())) {
		/* Preemption is actually enabled, so take the ordinary
		 * (possibly-blocking) path instead. */
		return pvh_lock(index);
	}
	hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
	const locked_pvh_t locked_pvh = {.pvh = pv_head_table[index], .pai = index};

	/*
	 * A sleep-mode entry requires contenders to block, which is impossible
	 * with preemption disabled; reaching one here is a fatal misuse.
	 */
	if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
		panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
	}

	return locked_pvh;
}
374 
/**
 * Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
 *
 * @param index The physical address index of the pv_head_table entry to lock.
 *
 * @return A wrapper object with the contents of the locked pv_head_table entry if successful,
 *         0 otherwise.
 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)
{
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);

	if (locked) {
		locked_pvh.pvh = pv_head_table[index];
		assert(locked_pvh.pvh != 0);
		if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
			/*
			 * The entry is in sleep mode: honoring it would require
			 * blocking, which a "try" operation must not do. Drop the bit
			 * lock and report failure (pvh == 0) instead.
			 */
			hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
			locked_pvh.pvh = 0;
		}
	}

	return locked_pvh;
}
400 
401 /**
402  * Helper for determining whether a preceding pvh_try_lock() call succeeded.
403  *
404  * @param locked_pvh A wrapper representing a possibly-locked PV head table entry
405  *        returned by pvh_try_lock().
406  *
407  * @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
408  */
409 static inline bool
pvh_try_lock_success(const locked_pvh_t * locked_pvh)410 pvh_try_lock_success(const locked_pvh_t *locked_pvh)
411 {
412 	assert(locked_pvh != NULL);
413 	return locked_pvh->pvh != 0;
414 }
415 
/**
 * Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
 * lock will sleep until this thread calls pvh_unlock().
 *
 * @note It is legal to call this function if the lock is already in sleep mode.
 *       In that case, the call will have no effect.
 * @note This function must not be called with preemption disabled by any other agent
 *       but [locked_pvh] itself.  Preemption must be fully re-enabled by the time
 *       this function returns, either because it was already enabled (because the
 *       lock was already in sleep mode), or because this function enabled it by placing
 *       the lock in sleep mode.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 */
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	if (!(pv_head_table[index] & PVH_FLAG_SLEEP)) {
		/*
		 * Publish the SLEEP flag so contenders observing the entry block in
		 * pvh_lock() rather than spinning on the bit lock.
		 */
		os_atomic_store(&pv_head_table[index], pv_head_table[index] | PVH_FLAG_SLEEP, relaxed);
		/**
		 * Tell the scheduler that this thread may need a priority boost if it needs to go
		 * off-core, to reduce the likelihood of priority inversion.
		 */
		locked_pvh->pri_token = thread_priority_floor_start();
		/*
		 * Drop the bit lock (re-enabling preemption); the SLEEP flag now
		 * stands in for lock ownership until pvh_unlock() reacquires it.
		 */
		hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
	}
	assert(preemption_enabled());
}
448 
449 /**
450  * Check that a pv_head_table entry/pointer is a specific type.
451  *
452  * @param pvh The pv_head_table entry/pointer to check.
453  * @param type The type to check for.
454  *
455  * @return True if the pv_head_table entry is of the passed in type, false
456  *         otherwise.
457  */
458 static inline bool
pvh_test_type(uintptr_t pvh,uintptr_t type)459 pvh_test_type(uintptr_t pvh, uintptr_t type)
460 {
461 	return (pvh & PVH_TYPE_MASK) == type;
462 }
463 
/**
 * Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
 *
 * @note Only the non-lock flags, pointer, and type fields of the entry will be updated
 *       according to the passed-in value.  PVH_LOCK_FLAGS will be ignored as they are
 *       directly manipulated by this function.
 *
 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
 *        The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
 *        in pv_head_table to reflect any updates that may have been performed on the PV list
 *        while the lock was held.
 */
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = pv_head_table[index];
	bool pri_floor_end = false;

	if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
		/*
		 * The entry was placed in sleep mode by pvh_lock_enter_sleep_mode(),
		 * which also dropped the bit lock.  Reacquire it so the store below
		 * is properly serialized against other contenders, then wake any
		 * threads that blocked on the entry while it slept.
		 */
		pri_floor_end = true;
		const bool was_preemptible = preemption_enabled();
		bool (^check_preemption)(void) = ^bool (void) {
			return was_preemptible && pmap_pending_preemption();
		};

		hw_lock_status_t ret;
		do {
			ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
			    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
		} while (ret != HW_LOCK_ACQUIRED);

		/* Clear SLEEP and keep LOCK set; the final unlock happens below. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
		thread_wakeup(&pv_head_table[index]);
	} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
		/* Only publish a new entry value if the caller actually changed it. */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
	}
	hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);

	if (__improbable(pri_floor_end)) {
		/* Drop the priority floor taken when the entry entered sleep mode. */
		thread_priority_floor_end(&locked_pvh->pri_token);
	}

	/* Invalidate the wrapper so accidental reuse trips the asserts above. */
	locked_pvh->pvh = 0;
}
514 
515 /**
516  * Convert a pv_head_table entry/pointer into a page table entry pointer. This
517  * should only be done if the type of this entry is PVH_TYPE_PTEP.
518  *
519  * @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
520  *
521  * @return Return back a safe to derefence pointer to the single mapping of this
522  *         physical page by masking off the TYPE bits and adding any missing
523  *         flags to the upper portion of the pointer.
524  */
525 static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)526 pvh_ptep(uintptr_t pvh)
527 {
528 	assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
529 	return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
530 }
531 
532 /**
533  * Convert a pv_head_table entry/pointer into a PVE list pointer. This
534  * should only be done if the type of this entry is PVH_TYPE_PVEP.
535  *
536  * @param pvh The pv_head_table entry/pointer to convert into a safe to
537  *            dereference pv_entry_t*.
538  *
539  * @return Return back a safe to derefence pointer to the first mapping of this
540  *         physical page by masking off the TYPE bits and adding any missing
541  *         flags to the upper portion of the pointer.
542  */
543 static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)544 pvh_pve_list(uintptr_t pvh)
545 {
546 	assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
547 	return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
548 }
549 
550 /**
551  * Return the mutable flags associated with a pv_head_table entry/pointer.
552  *
553  * @param pvh The pv_head_table entry whose flags to get.
554  *
555  * @return The mutable flags encoded in [pvh].
556  */
557 static inline uintptr_t
pvh_get_flags(uintptr_t pvh)558 pvh_get_flags(uintptr_t pvh)
559 {
560 	return pvh & PVH_MUTABLE_FLAGS;
561 }
562 
563 /**
564  * Update the flags associated with a pv_head_table entry/pointer.
565  *
566  * @note This function does not actually modify the pv_head_table,
567  *       it only installs an updated pv_head_table entry in [locked_pvh]
568  *       that can later be passed to pvh_unlock() to update the actual array
569  *       entry.
570  *
571  * @param locked_pvh A wrapper struct containing the pv_head_table
572  *                   entry/pointer to update.
573  *
574  */
575 static inline void
pvh_set_flags(locked_pvh_t * locked_pvh,uintptr_t flags)576 pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
577 {
578 	locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
579 }
580 
581 /**
582  * Update a pv_head_table entry/pointer to be a different type and/or point to
583  * a different object.
584  *
585  * @note This function does not actually modify the pv_head_table,
586  *       it only installs an updated pv_head_table entry in [locked_pvh]
587  *       that can later be passed to pvh_unlock() to update the actual array
588  *       entry.
589  *
590  * @param locked_pvh A wrapper struct containing the pv_head_table
591  *                   entry/pointer to update.
592  * @param pvep The new entry to use. This could be either a pt_entry_t*,
593  *             pv_entry_t*, or pt_desc_t* depending on the type.
594  * @param type The type of the new entry.
595  */
596 static inline void
pvh_update_head(locked_pvh_t * locked_pvh,void * pvep,unsigned int type)597 pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
598 {
599 	assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
600 	const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
601 	locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
602 }
603 
604 /**
605  * Performs an in-place update of a pv_head_table entry/pointer to be a
606  * different type and/or point to a different object.
607  *
608  * @note The pv_head_table entry CAN'T already be locked.
609  *
610  * @note This function will clobber any existing flags stored in the PVH
611  *       pointer. It's up to the caller to preserve flags if that functionality
612  *       is needed (either by ensuring `pvep` contains those flags, or by
613  *       manually setting the flags after this call).
614  *
615  * @param index The array index of the pv_head_table entry to update.
616  * @param pvh The new entry to use. This could be either a pt_entry_t*,
617  *            pv_entry_t*, or pt_desc_t* depending on the type.
618  * @param type The type of the new entry.
619  */
620 static inline void
pvh_store_head_unlocked(unsigned int index,uintptr_t pvh,unsigned int type)621 pvh_store_head_unlocked(unsigned int index, uintptr_t pvh, unsigned int type)
622 {
623 	assert(!(pv_head_table[index] & PVH_LOCK_FLAGS));
624 	pv_head_table[index] = (pvh | type) & ~PVH_LOCK_FLAGS;
625 }
626 
627 /**
628  * Given a page table entry pointer retrieved from the pv_head_table (from an
629  * entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
630  * an IOMMU mapping.
631  *
632  * @note The way this function determines whether the passed in pointer is
633  *       pointing to an IOMMU PTE, is by checking for a special flag stored in
634  *       the lower bits of the pointer. This flag is only set on pointers stored
635  *       in the pv_head_table, and as such, this function will only work on
636  *       pointers retrieved from the pv_head_table. If a pointer to a PTE was
637  *       directly retrieved from an IOMMU's page tables, this function would
638  *       always return false despite actually being an IOMMU PTE.
639  *
640  * @param ptep A PTE pointer obtained from the pv_head_table to check.
641  *
642  * @return True if the entry is an IOMMU mapping, false otherwise.
643  */
644 static inline bool
pvh_ptep_is_iommu(const pt_entry_t * ptep)645 pvh_ptep_is_iommu(const pt_entry_t *ptep)
646 {
647 #ifdef PVH_FLAG_IOMMU
648 	return (uintptr_t)ptep & PVH_FLAG_IOMMU;
649 #else /* PVH_FLAG_IOMMU */
650 	#pragma unused(ptep)
651 	return false;
652 #endif /* PVH_FLAG_IOMMU */
653 }
654 
655 /**
656  * Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
657  * type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
658  * strips out those flags and returns back a dereferencable pointer.
659  *
660  * @param ptep The PTE pointer to strip out the unwanted flags.
661  *
662  * @return A valid dereferencable pointer to the page table entry.
663  */
664 static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t * ptep)665 pvh_strip_ptep(const pt_entry_t *ptep)
666 {
667 #ifdef PVH_FLAG_IOMMU
668 	const uintptr_t pte_va = (uintptr_t)ptep;
669 	return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
670 #else /* PVH_FLAG_IOMMU */
671 	return ptep;
672 #endif /* PVH_FLAG_IOMMU */
673 }
674 
675 /**
676  * PVH_TYPE_PVEP Helper Functions.
677  *
678  * The following are methods used to manipulate PVE lists. This is the type of
679  * pv_head_table entry used when there are multiple mappings to a single
680  * physical page.
681  */
682 
683 /**
684  * Whether a physical page is using "alternate accounting" (ALTACCT) for its
685  * ledger statistics is something that needs to be tracked on a per-mapping
686  * basis, not on a per-physical-page basis. Because of that, it's tracked
687  * differently depending on whether there's a single mapping to a page
688  * (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
689  * tracked in the pp_attr_table. But when there are multiple mappings, the least
690  * significant bit of the corresponding "pve_pte" pointer in each pv_entry object
691  * is used as a marker for pages using alternate accounting.
692  *
693  * @note See the definition for PP_ATTR_ALTACCT for a more detailed description
694  *       of what "alternate accounting" actually means in respect to the
695  *       footprint ledger.
696  *
 * Since some code (KernelDiskImages, e.g.) might map a physical page as
698  * "device" memory (i.e. external) while it's also being used as regular
699  * "anonymous" memory (i.e. internal) in user space, we have to manage the
700  * "internal" attribute per mapping rather than per physical page.
701  * When there are multiple mappings, we use the next least significant bit of
702  * the corresponding "pve_pte" pointer for that.
703  */
704 #define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
705 #define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
706 #define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
707 
708 /**
709  * Set the ALTACCT bit for a specific PTE pointer.
710  *
711  * @param pvep A pointer to the current pv_entry mapping in the linked list of
712  *             mappings.
713  * @param idx Index of the chosen PTE pointer inside the PVE.
714  */
715 static inline void
pve_set_altacct(pv_entry_t * pvep,unsigned idx)716 pve_set_altacct(pv_entry_t *pvep, unsigned idx)
717 {
718 	assert(idx < PTE_PER_PVE);
719 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
720 }
721 
722 /**
723  * Set the INTERNAL bit for a specific PTE pointer.
724  *
725  * @param pvep A pointer to the current pv_entry mapping in the linked list of
726  *             mappings.
727  * @param idx Index of the chosen PTE pointer inside the PVE.
728  */
729 static inline void
pve_set_internal(pv_entry_t * pvep,unsigned idx)730 pve_set_internal(pv_entry_t *pvep, unsigned idx)
731 {
732 	assert(idx < PTE_PER_PVE);
733 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
734 }
735 
736 /**
737  * Clear the ALTACCT bit for a specific PTE pointer.
738  *
739  * @param pvep A pointer to the current pv_entry mapping in the linked list of
740  *             mappings.
741  * @param idx Index of the chosen PTE pointer inside the PVE.
742  */
743 static inline void
pve_clr_altacct(pv_entry_t * pvep,unsigned idx)744 pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
745 {
746 	assert(idx < PTE_PER_PVE);
747 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
748 }
749 
750 /**
751  * Clear the INTERNAL bit for a specific PTE pointer.
752  *
753  * @param pvep A pointer to the current pv_entry mapping in the linked list of
754  *             mappings.
755  * @param idx Index of the chosen PTE pointer inside the PVE.
756  */
757 static inline void
pve_clr_internal(pv_entry_t * pvep,unsigned idx)758 pve_clr_internal(pv_entry_t *pvep, unsigned idx)
759 {
760 	assert(idx < PTE_PER_PVE);
761 	pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
762 }
763 
764 /**
765  * Return the ALTACCT bit for a specific PTE pointer.
766  *
767  * @param pvep A pointer to the current pv_entry mapping in the linked list of
768  *             mappings.
769  * @param idx Index of the chosen PTE pointer inside the PVE.
770  */
771 static inline bool
pve_get_altacct(pv_entry_t * pvep,unsigned idx)772 pve_get_altacct(pv_entry_t *pvep, unsigned idx)
773 {
774 	assert(idx < PTE_PER_PVE);
775 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
776 }
777 
778 /**
779  * Return the INTERNAL bit for a specific PTE pointer.
780  *
781  * @param pvep A pointer to the current pv_entry mapping in the linked list of
782  *             mappings.
783  * @param idx Index of the chosen PTE pointer inside the PVE.
784  */
785 static inline bool
pve_get_internal(pv_entry_t * pvep,unsigned idx)786 pve_get_internal(pv_entry_t *pvep, unsigned idx)
787 {
788 	assert(idx < PTE_PER_PVE);
789 	return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
790 }
791 
792 /**
793  * Return the next mapping (pv_entry) in a linked list of mappings. This applies
794  * to pv_head_table entries of type PVH_TYPE_PVEP.
795  *
796  * @param pvep A pointer to the current pv_entry mapping in the linked list of
797  *             mappings.
798  *
799  * @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
800  *         end of the list has been reached.
801  */
802 static inline pv_entry_t *
pve_next(pv_entry_t * pvep)803 pve_next(pv_entry_t *pvep)
804 {
805 	return pvep->pve_next;
806 }
807 
808 /**
809  * Return a pointer to the pve_next field in a pv_entry. This value is used
810  * when adding and removing entries to a PVE list.
811  *
812  * @param pvep The pv_entry whose pve_next field is being accessed.
813  *
814  * @return Pointer to the pve_next field.
815  */
816 static inline pv_entry_t **
pve_next_ptr(pv_entry_t * pvep)817 pve_next_ptr(pv_entry_t *pvep)
818 {
819 	return &pvep->pve_next;
820 }
821 
822 /**
823  * Return a pointer to the page table entry for this mapping.
824  *
825  * @param pvep The pv_entry whose pve_ptep field is to be returned.
826  * @param idx Index of the chosen PTE pointer inside the PVE.
827  *
828  * @return Pointer to the page table entry.
829  */
830 static inline pt_entry_t *
pve_get_ptep(pv_entry_t * pvep,unsigned idx)831 pve_get_ptep(pv_entry_t *pvep, unsigned idx)
832 {
833 	assert(idx < PTE_PER_PVE);
834 	return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
835 }
836 
837 /**
838  * Update the page table entry for a specific physical to virtual mapping.
839  *
840  * @param pvep The pv_entry to update.
841  * @param idx Index of the chosen PTE pointer inside the PVE.
842  * @param ptep_new The new page table entry.
843  */
844 static inline void
pve_set_ptep(pv_entry_t * pvep,unsigned idx,pt_entry_t * ptep_new)845 pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
846 {
847 	assert(idx < PTE_PER_PVE);
848 	pvep->pve_ptep[idx] = ptep_new;
849 }
850 
851 /**
852  * Initialize all fields in a PVE to NULL.
853  *
854  * @param pvep The pv_entry to initialize.
855  */
856 static inline void
pve_init(pv_entry_t * pvep)857 pve_init(pv_entry_t *pvep)
858 {
859 	pvep->pve_next = PV_ENTRY_NULL;
860 	for (int i = 0; i < PTE_PER_PVE; i++) {
861 		pvep->pve_ptep[i] = PT_ENTRY_NULL;
862 	}
863 }
864 
865 /**
866  * Find PTE pointer in PVE and return its index.
867  *
868  * @param pvep The PVE to search.
869  * @param ptep PTE to search for.
870  *
871  * @return Index of the found entry, or -1 if no entry exists.
872  */
873 static inline int
pve_find_ptep_index(pv_entry_t * pvep,pt_entry_t * ptep)874 pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
875 {
876 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
877 		if (pve_get_ptep(pvep, i) == ptep) {
878 			return (int)i;
879 		}
880 	}
881 
882 	return -1;
883 }
884 
885 /**
886  * Checks if no PTEs are currently associated with this PVE.
887  *
888  * @param pvep The PVE to search.
889  *
890  * @return True if no PTEs are currently associated with this PVE, or false.
891  */
892 static inline bool
pve_is_empty(pv_entry_t * pvep)893 pve_is_empty(pv_entry_t *pvep)
894 {
895 	for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
896 		if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
897 			return false;
898 		}
899 	}
900 
901 	return true;
902 }
903 
904 /**
905  * Prepend a new pv_entry node to a PVE list.
906  *
907  * @note This function does not actually modify the pv_head_table,
908  *       it only installs an updated pv_head_table entry in [locked_pvh]
909  *       that can later be passed to pvh_unlock() to update the actual array
910  *       entry.
911  *
912  * @param locked_pvh A wrapper struct containing the pv_head_table
913  *                   entry/pointer to update.  This entry represents
914  *                   the linked list of mappings to update.
915  * @param pvep The new mapping to add to the linked list.
916  */
917 static inline void
pve_add(locked_pvh_t * locked_pvh,pv_entry_t * pvep)918 pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
919 {
920 	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
921 
922 	pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
923 	pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
924 }
925 
926 /**
927  * Remove an entry from a PVE list of mappings.
928  *
929  * @note This function does not actually modify the pv_head_table,
930  *       it only installs an updated pv_head_table entry in [locked_pvh]
931  *       that can later be passed to pvh_unlock() to update the actual array
932  *       entry.
933  *
934  * @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
935  *                   to update.  This entry represents the linked list of mappings
936  *                   from which to remove an entry.
937  * @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
938  *              is the first in the linked list of mappings, then NULL should be
939  *              passed here and the removal will be reflected in the returned
940  *              pv_head_table entry.
941  * @param pvep The entry that should be removed. Should be identical to a
942  *             dereference of the pvepp parameter (unless it's the pv_head_table
943  *             entry).
944  */
static inline void
pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
{
	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));

	if (pvepp == NULL) {
		/* Removing the list head: the PVH entry itself must point at pvep. */
		assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
		    __func__, (void*)locked_pvh->pvh, pvep);
		if (pve_next(pvep) == PV_ENTRY_NULL) {
			/* The last mapping to this page is being removed. */
			pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
		} else {
			/**
			 * There are still mappings left, make the next one the new head of
			 * the list. This effectively removes the first entry from the list.
			 */
			pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
		}
	} else {
		/**
		 * Move the previous entry's next field to the entry after the one being
		 * removed. This will clobber the ALTACCT and INTERNAL bits.
		 */
		*pvepp = pve_next(pvep);
	}
}
971 
972 /**
973  * PVH_TYPE_PTDP Types and Helper Functions.
974  *
975  * The following are types and methods used to manipulate page table descriptor
976  * (PTD) objects. This is the type of pv_head_table entry used when a page is
977  * being used as a page table.
978  */
979 
980 /**
981  * When the pmap layer allocates memory, it always does so in chunks of the VM
982  * page size (which are represented by the PAGE_SIZE/PAGE_SHIFT macros). The VM
983  * page size might not match up with the hardware page size for a given address
984  * space (this is especially true on systems that support more than one page
985  * size).
986  *
987  * The pv_head_table is allocated to have one entry per VM page, not hardware
988  * page (which can change depending on the address space). Because of that, a
989  * single VM-page-sized region (single pv_head_table entry) can potentially hold
990  * up to four page tables. Only one page table descriptor (PTD) is allocated per
991  * pv_head_table entry (per VM page), so on some systems, one PTD might have to
992  * keep track of up to four different page tables.
993  */
994 
#if __ARM_MIXED_PAGE_SIZE__
/* Worst case: one 4KB page table per 4KB granule of an ARM_PGBYTES-sized page. */
#define PT_INDEX_MAX (ARM_PGBYTES / 4096)
#elif (ARM_PGSHIFT == 14)
/* 16KB hardware pages: a single page table per PTD. */
#define PT_INDEX_MAX 1
#elif (ARM_PGSHIFT == 12)
/* 4KB hardware pages: up to four page tables per VM-page-sized PTD. */
#define PT_INDEX_MAX 4
#else
#error Unsupported ARM_PGSHIFT
#endif /* __ARM_MIXED_PAGE_SIZE__ || ARM_PGSHIFT == 14 || ARM_PGSHIFT == 12 */
1004 
1005 
1006 /**
1007  * Page table descriptor (PTD) info structure.
1008  *
1009  * Contains information about a page table. These pieces of data are separate
1010  * from the PTD itself because in address spaces where the VM page size doesn't
1011  * match the underlying hardware page size, one PTD could represent multiple
1012  * page tables (and so will need multiple PTD info structures).
1013  *
1014  * These fields are also in their own struct so that they can be allocated
1015  * separately from the associated pt_desc_t object. This allows us to allocate
1016  * the counts in this structure in a way that ensures they don't fall within the
1017  * same cache line as the main pt_desc_t object. This is important because the
1018  * fields in this structure are atomically updated which could cause false
1019  * sharing cache performance issues with the "va" field in pt_desc_t if all of
1020  * the fields were within the same structure.
1021  */
/* One ptd_info_t exists per hardware page table tracked by a PTD; the counts
 * here are updated atomically from multiple CPUs (see the comment above). */
typedef struct {
	/*
	 * For non-leaf pagetables, should be 0.
	 * For leaf pagetables, should reflect the number of wired entries.
	 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
	 * operations are implicitly wired).
	 */
	unsigned short wiredcnt;
} ptd_info_t;
1031 
1032 /**
1033  * This type is used to identify a specific IOMMU driver and an instance of
1034  * that driver which owns a specific page or page table. This type will be used
1035  * within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
1036  * respectively.
1037  *
1038  * Despite the fact this value is not a pointer, we need to make this value sort
1039  * of look like a kernel pointer: the bottom 3-bits must be zero and the upper
1040  * bits must all be ones by default. This is due to the fact that this type can
1041  * be embedded into the PVH table to represent an IOMMU mapping. The PVH table
1042  * code expects "kernel-pointer-like" properties so it can store flags in those
1043  * areas of the 64-bit value.
1044  */
/*
 * Bit layout (derived from the masks below and IOMMU_INSTANCE_DEFAULT):
 * [7:0] zero (pointer-alignment bits), [15:8] driver ID, [47:16] token,
 * [63:48] all ones.
 */
typedef uint64_t iommu_instance_t;

/* 8-bit ID of the IOMMU driver which the instance derives from. */
#define IOMMU_ID_SHIFT 8U
#define IOMMU_ID_MASK  0x000000000000FF00ULL

/* Extract/insert the driver ID field of an iommu_instance_t. */
#define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
#define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)

/**
 * An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
 * This is strictly used to help with debugging and provides a mechanism to
 * trace a mapping or page table back to the exact IOMMU instance that owns it.
 * Typically, this would be the instance ID, but for drivers that use only a
 * single global instance, this could be something else like a root page table
 * ppnum_t.
 */
#define IOMMU_TOKEN_SHIFT 16U
#define IOMMU_TOKEN_MASK  0x0000FFFFFFFF0000ULL

/* Extract/insert the token field of an iommu_instance_t. */
#define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
#define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)

/**
 * The default value for iommu_instance_t. See the type definition for more
 * details on why the upper bits need to initially be all ones.
 */
#define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL

/**
 * Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
 * instance sets the ID and token to all ones as a sentinel invalid value.
 */
#define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
1079 
1080 /**
1081  * Page Table Descriptor (PTD).
1082  *
1083  * Provides a per-table data structure and a way of keeping track of all page
1084  * tables in the system.
1085  *
1086  * This structure is also used as a convenient way of keeping track of IOMMU
1087  * pages (which may or may not be used as page tables). In that case the SPTM
1088  * frame type for the page will be XNU_IOMMU, the "iommu" field will describe
1089  * the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
1090  * refcnt controlled by the IOMMU driver.
1091  */
typedef struct pt_desc {
	/* Each page table is either owned by a pmap or a specific IOMMU. */
	union {
		struct pmap *pmap;
		/*
		 * NOTE(review): the comment above this struct mentions an "iommu"
		 * owner field, but it is not visible in this configuration of the
		 * union — confirm whether it is conditionally compiled elsewhere.
		 */
	};

	/**
	 * The following fields contain per-page-table properties, and as such,
	 * might have multiple elements each. This is due to a single PTD
	 * potentially representing multiple page tables (in address spaces where
	 * the VM page size differs from the hardware page size). Use the
	 * ptd_get_index() function to get the correct index for a specific page
	 * table.
	 */

	/**
	 * The first address of the virtual address space this page table is
	 * translating for, or a value set by an IOMMU driver if this PTD is being
	 * used to track an IOMMU page.
	 */
	vm_offset_t va[PT_INDEX_MAX];

	/**
	 * ptd_info_t's are allocated separately so as to reduce false sharing
	 * with the va field. This is desirable because ptd_info_t's are updated
	 * atomically from all CPUs.
	 */
	ptd_info_t *ptd_info;
} pt_desc_t;
1121 
1122 /**
1123  * Per-CPU structure for tracking in-flight SPTM retype operations.
1124  *
1125  * This structure is intended to be embedded in the pmap per-CPU data object,
1126  * and is meant to be used for situations in which the caller needs to ensure
1127  * that potentially sensitive concurrent SPTM operations have completed on other
1128  * CPUs prior to retyping a page.  If these sensitive operations haven't completed
1129  * when the retype occurs, and they happen to involve the page being retyped
1130  * (either directly or through mappings thereof), an SPTM violation panic may
1131  * result.
1132  */
typedef struct {
	/**
	 * Critical section sequence number of the local CPU.  A value of zero
	 * indicates that no retype epoch critical section is currently active on
	 * the CPU.
	 */
	uint64_t local_seq;

	/**
	 * The sequence number to use the next time a retype epoch critical section
	 * is entered on the local CPU.  This should monotonically increase.
	 */
	uint64_t next_seq;

	/**
	 * This array stores the retype sequence numbers observed on remote CPUs.
	 * When the local CPU needs to wait for critical sections to complete on
	 * other CPUs, this is intended to provide an initial sample of those other
	 * CPUs' critical section state.  The caller can then wait for each remote
	 * CPU's sequence number to return to zero or advance beyond the value
	 * stored in its entry in this array.
	 */
	uint64_t remote_seq[MAX_CPUS];

	/**
	 * Flags used to track the state of an active retype epoch drain operation
	 * on the local CPU.
	 */

	/**
	 * This flag indicates that a drain operation has been prepared on the
	 * local CPU by sampling remote CPU epoch states into the remote_seq array.
	 * This must be set before the drain operation can be performed.
	 */
	#define PMAP_RETYPE_EPOCH_PREPARED (1 << 0)

	/**
	 * This flag indicates that one or more remote CPUs had a non-zero retype
	 * epoch value when the remote_seq array was most recently sampled.
	 * If this flag is not set, then we already know that no remote CPUs can
	 * be in a critical section in which prior mapping state for the page to
	 * be retyped may have been observed, so we can skip the drain operation.
	 */
	#define PMAP_RETYPE_EPOCH_DRAIN_REQUIRED (1 << 1)
	/* Bitwise OR of the PMAP_RETYPE_EPOCH_* flags defined above. */
	uint8_t flags;
} pmap_retype_epoch_t;
1179 
/* Alignment of each per-CPU SPTM data block; the static asserts below also
 * bound its size so blocks tile a page evenly. */
#define PMAP_SPTM_PCPU_ALIGN (4096)

typedef struct {
	/**
	 * Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
	 * during bootstrap.
	 */
	sptm_pte_t *sptm_prev_ptes;

	/**
	 * A piece of per-cpu scratch memory used by IOMMU drivers when passing data
	 * into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
	 */
	void *sptm_iommu_scratch;

	/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
	sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];

	/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
	sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];

	/* Base PA of ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_ops_pa;

	/* Base PA of templates array, for passing templates into the SPTM. */
	pmap_paddr_t sptm_templates_pa;

	/* PMAP pagetable descriptors associated with each element of sptm_ops. */
	pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];

	/* PTD info objects associated with each pmap PTE pointer. */
	ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];

	/* Accounting-related flags for each element of sptm_ops. */
	#define PMAP_SPTM_FLAG_INTERNAL (0x1)
	#define PMAP_SPTM_FLAG_ALTACCT (0x2)
	uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];

	/* Retype epoch tracking structure. */
	pmap_retype_epoch_t retype_epoch;

	/* ID of the CPU owning this block. NOTE(review): presumably the same ID
	 * used when communicating with the SPTM — confirm against users. */
	uint16_t sptm_cpu_id;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;

_Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
    "SPTM per-CPU data alignment does not fit evenly within a page");
_Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
    "sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");

PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
1230 
1231 /**
1232  * Convert a pv_head_table entry/pointer into a page table descriptor pointer.
1233  * This should only be done if the type of this entry is PVH_TYPE_PTDP.
1234  *
1235  * @param pvh The pv_head_table entry/pointer to convert into a safe to
1236  *            dereference pt_desc_t*.
1237  *
1238  * @return Return back a safe to derefence pointer to the page table descriptor
1239  *         for this physical page by masking off the TYPE bits and adding any
1240  *         missing flags to the upper portion of the pointer.
1241  */
1242 static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)1243 pvh_ptd(uintptr_t pvh)
1244 {
1245 	return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
1246 }
1247 
1248 /**
1249  * Given an arbitrary page table entry, return back the page table descriptor
1250  * (PTD) object for the page table that contains that entry.
1251  *
1252  * @param ptep Pointer to a PTE whose page table descriptor object to return.
1253  *
1254  * @return The PTD object for the passed in page table.
1255  */
1256 static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t * ptep)1257 ptep_get_ptd(const pt_entry_t *ptep)
1258 {
1259 	assert(ptep != NULL);
1260 
1261 	const vm_offset_t pt_base_va = (vm_offset_t)ptep;
1262 	uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
1263 
1264 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1265 		panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
1266 	}
1267 
1268 	return pvh_ptd(pvh);
1269 }
1270 
1271 /**
1272  * Given an arbitrary page table entry, return back the pmap that owns that
1273  * page table.
1274  *
1275  * @note This won't work correctly for page tables owned by IOMMUs, because
1276  *       those table aren't owned by any specific pmap.
1277  *
1278  * @param ptep Pointer to a page table entry whose owner we're trying to return.
1279  *
1280  * @return The pmap that owns the given page table entry.
1281  */
1282 static inline struct pmap *
ptep_get_pmap(const pt_entry_t * ptep)1283 ptep_get_pmap(const pt_entry_t *ptep)
1284 {
1285 	return ptep_get_ptd(ptep)->pmap;
1286 }
1287 
1288 
1289 /**
1290  * Given an arbitrary translation table entry, get the page table descriptor
1291  * (PTD) object for the page table pointed to by the TTE.
1292  *
1293  * @param tte The translation table entry to parse. For instance, if this is an
1294  *            L2 TTE, then the PTD for the L3 table this entry points to will be
1295  *            returned.
1296  *
1297  * @return The page table descriptor (PTD) for the page table pointed to by this
1298  *         TTE.
1299  */
1300 static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)1301 tte_get_ptd(const tt_entry_t tte)
1302 {
1303 	const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
1304 	uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
1305 
1306 	if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1307 		panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
1308 	}
1309 
1310 	return pvh_ptd(pvh);
1311 }
1312 
1313 /**
1314  * In address spaces where the VM page size doesn't match the underlying
1315  * hardware page size, one PTD could represent multiple page tables. This
1316  * function returns the correct index value depending on which page table is
1317  * being accessed. That index value can then be used to access the
1318  * per-page-table properties stored within a PTD.
1319  *
1320  * @note See the description above the PT_INDEX_MAX definition for a more
1321  *       detailed explanation of why multiple page tables can be represented
1322  *       by a single PTD object in the pv_head_table.
1323  *
1324  * @param ptd The page table descriptor that's being accessed.
1325  * @param ttep Pointer to the translation table entry that's being accessed.
1326  *
1327  * @return The correct index value for a specific, hardware-sized page
1328  *         table.
1329  */
static inline unsigned
ptd_get_index(__unused const pt_desc_t *ptd, __unused const tt_entry_t *ttep)
{
#if PT_INDEX_MAX == 1
	/* Exactly one page table per PTD, so the index is always zero. */
	return 0;
#else
	assert(ptd != NULL);

	/* Leaf-level shift for this pmap's address space (its page size). */
	const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptd->pmap));
	const vm_offset_t ttep_page = (vm_offset_t)ttep >> pmap_page_shift;

	/**
	 * Use the difference between the VM page shift and the hardware page shift
	 * to get the index of the correct page table. In practice, this equates to
	 * masking out the bottom two bits of the L3 table index in address spaces
	 * where the VM page size is greater than the hardware page size. In address
	 * spaces where they're identical, the index will always be zero.
	 */
	const unsigned int ttep_index = ttep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
	assert(ttep_index < PT_INDEX_MAX);

	return ttep_index;
#endif
}
1354 
1355 /**
1356  * In address spaces where the VM page size doesn't match the underlying
1357  * hardware page size, one PTD could represent multiple page tables. This
1358  * function returns the correct ptd_info_t structure depending on which page
1359  * table is being accessed.
1360  *
1361  * @note See the description above the PT_INDEX_MAX definition for a more
1362  *       detailed explanation of why multiple page tables can be represented
1363  *       by a single PTD object in the pv_head_table.
1364  *
1365  * @param ptd The page table descriptor that's being accessed.
1366  * @param ttep Pointer to the translation table entry that's being accessed.
1367  *
1368  * @return The correct ptd_info_t structure for a specific, hardware-sized page
1369  *         table.
1370  */
1371 static inline ptd_info_t *
ptd_get_info(pt_desc_t * ptd,const tt_entry_t * ttep)1372 ptd_get_info(pt_desc_t *ptd, const tt_entry_t *ttep)
1373 {
1374 	assert(ptd != NULL);
1375 	return &ptd->ptd_info[ptd_get_index(ptd, ttep)];
1376 }
1377 
1378 /**
1379  * Given a pointer to a page table entry, return back the ptd_info structure
1380  * for the page table that contains that entry.
1381  *
1382  * @param ptep Pointer to a PTE whose ptd_info object to return.
1383  *
1384  * @return The ptd_info object for the page table that contains the passed in
1385  *         page table entry.
1386  */
1387 static inline ptd_info_t *
ptep_get_info(const pt_entry_t * ptep)1388 ptep_get_info(const pt_entry_t *ptep)
1389 {
1390 	return ptd_get_info(ptep_get_ptd(ptep), ptep);
1391 }
1392 
1393 /**
1394  * Return the virtual address mapped by the passed in leaf page table entry,
1395  * using an already-retrieved pagetable descriptor.
1396  *
1397  * @param ptdp pointer to the descriptor for the pagetable containing ptep
1398  * @param ptep Pointer to a PTE to parse
1399  */
1400 static inline vm_map_address_t
ptd_get_va(const pt_desc_t * ptdp,const pt_entry_t * ptep)1401 ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
1402 {
1403 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
1404 
1405 	vm_map_address_t va = ptdp->va[ptd_get_index(ptdp, ptep)];
1406 	vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
1407 
1408 	va += (ptep_index << pt_attr_leaf_shift(pt_attr));
1409 
1410 	return va;
1411 }
1412 
1413 /**
1414  * Return the virtual address that is being mapped by the passed in leaf page
1415  * table entry.
1416  *
1417  * @param ptep Pointer to a PTE to parse.
1418  */
1419 static inline vm_map_address_t
ptep_get_va(const pt_entry_t * ptep)1420 ptep_get_va(const pt_entry_t *ptep)
1421 {
1422 	return ptd_get_va(ptep_get_ptd(ptep), ptep);
1423 }
1424 
1425 /**
1426  * Physical Page Attribute Table (pp_attr_table) defines and helper functions.
1427  */
1428 
/* Per-VM-page attribute flags: 16 bits of state per physical page. */
typedef uint16_t pp_attr_t;

/* See the definition of pp_attr_table for more information. Indexed by
 * physical address index (pai), as computed by pa_index(). */
extern volatile pp_attr_t* pp_attr_table;
1434 
1435 /**
1436  * Flags stored in the pp_attr_table on a per-physical-page basis.
1437  *
1438  * Please update the pv_walk LLDB macro if these flags are changed or added to.
1439  */
1440 
/**
 * The bottom 6-bits are used to store the default WIMG (cacheability and memory
 * type) setting for this physical page. This can be changed by calling
 * pmap_set_cache_attributes().
 *
 * If a default WIMG setting isn't set for a page, then the default is Normal,
 * Cached memory (VM_WIMG_DEFAULT).
 */
#define PP_ATTR_WIMG_MASK 0x003F
#define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)

/**
 * The reference and modify bits keep track of whether a page has been accessed
 * or modified since the last time the bits were cleared. These bits are used to
 * enforce policy decisions in the VM layer.
 */
#define PP_ATTR_REFERENCED 0x0040
#define PP_ATTR_MODIFIED   0x0080

/**
 * This physical page is being used as anonymous memory that's internally
 * managed by the VM and is not connected to an external pager. This flag is
 * only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
 * subsequent mappings won't set/clear this flag until all mappings are removed
 * and a new CPU mapping is added.
 */
#define PP_ATTR_INTERNAL 0x0100

/**
 * This flag is used to keep track of pages that are still resident but are not
 * considered dirty and can be reclaimed under memory pressure. These pages do
 * not count as a part of the memory footprint, so the footprint ledger does not
 * need to be updated for these pages. This is hinted to the VM by the
 * `madvise(MADV_FREE_REUSABLE)` system call.
 */
#define PP_ATTR_REUSABLE 0x0200

/**
 * This flag denotes that a page is utilizing "alternate accounting". This means
 * that the pmap doesn't need to keep track of these pages with regards to the
 * footprint ledger because the VM is already accounting for them in a different
 * way. These include IOKit mappings (VM adds their entire virtual size to the
 * footprint), and purgeable pages (VM counts them only when non-volatile and
 * only for one "owner"), among others.
 *
 * Note that alternate accounting status is tracked on a per-mapping basis (not
 * per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
 * when there's a single mapping to a page. When there are multiple mappings,
 * the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
 * above).
 */
#define PP_ATTR_ALTACCT 0x0400

/**
 * This bit was originally used on x86 to keep track of what pages to not
 * encrypt during the hibernation process as a performance optimization when
 * encryption was done in software. This doesn't apply to the ARM
 * hibernation process because all pages are automatically encrypted using
 * hardware acceleration. Despite that, the pmap still keeps track of this flag
 * as a debugging aid on internal builds.
 *
 * TODO: This bit can probably be reclaimed:
 * rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
 */
#define PP_ATTR_NOENCRYPT 0x0800

/**
 * These bits denote that a physical page is expecting the next access or
 * modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
 * respectively.
 */
#define PP_ATTR_REFFAULT 0x1000
#define PP_ATTR_MODFAULT 0x2000
1514 
1515 /**
1516  * Atomically set some flags in a pp_attr_table entry.
1517  *
1518  * @param pai The physical address index for the entry to update.
1519  * @param bits The flags to set in the entry.
1520  */
1521 static inline void
ppattr_set_bits(unsigned int pai,pp_attr_t bits)1522 ppattr_set_bits(unsigned int pai, pp_attr_t bits)
1523 {
1524 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1525 	os_atomic_or(ppattr, bits, relaxed);
1526 }
1527 
1528 /**
1529  * Atomically clear some flags in a pp_attr_table entry.
1530  *
1531  * @param pai The physical address index for the entry to update.
1532  * @param bits The flags to clear in the entry.
1533  */
1534 static inline void
ppattr_clear_bits(unsigned int pai,pp_attr_t bits)1535 ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
1536 {
1537 	volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1538 	os_atomic_andnot(ppattr, bits, relaxed);
1539 }
1540 
1541 /**
1542  * General-purpose function for atomically modifying flags in a pp_attr_table entry.
1543  *
1544  * @param pai The physical address index for the entry to update.
1545  * @param bits_to_clear Mask of bits to atomically clear from the entry.
1546  * @param bits_to_set Mask of bits to atomically set in the entry.
1547  *
1548  * @note [bits_to_clear] and [bits_to_set] must not overlap.
1549  */
static inline void
ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
{
	/* Overlapping masks would make the result order-dependent. */
	assert((bits_to_set & bits_to_clear) == 0);
	pp_attr_t prev_ppattr, new_ppattr;
	/* Atomic read-modify-write loop: retries until clear+set applies cleanly. */
	os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
		new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
	});
}
1559 
1560 /**
1561  * Return true if the pp_attr_table entry contains the passed in bits.
1562  *
1563  * @param pai The physical address index for the entry to test.
1564  * @param bits The flags to check for.
1565  */
1566 static inline bool
ppattr_test_bits(unsigned int pai,pp_attr_t bits)1567 ppattr_test_bits(unsigned int pai, pp_attr_t bits)
1568 {
1569 	const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1570 	return (*ppattr & bits) == bits;
1571 }
1572 
1573 /**
1574  * Only set some flags in a pp_attr_table entry if the passed in physical
1575  * address is a kernel-managed address.
1576  *
1577  * @param pa The physical address for the entry to update.
1578  * @param bits The flags to set in the entry.
1579  */
1580 static inline void
ppattr_pa_set_bits(pmap_paddr_t pa,pp_attr_t bits)1581 ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
1582 {
1583 	if (pa_valid(pa)) {
1584 		ppattr_set_bits(pa_index(pa), bits);
1585 	}
1586 }
1587 
1588 /**
1589  * Only clear some flags in a pp_attr_table entry if the passed in physical
1590  * address is a kernel-managed address.
1591  *
1592  * @param pa The physical address for the entry to update.
1593  * @param bits The flags to clear in the entry.
1594  */
1595 static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa,pp_attr_t bits)1596 ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
1597 {
1598 	if (pa_valid(pa)) {
1599 		ppattr_clear_bits(pa_index(pa), bits);
1600 	}
1601 }
1602 
1603 /**
1604  * Only test flags in a pp_attr_table entry if the passed in physical address
1605  * is a kernel-managed page.
1606  *
1607  * @param pa The physical address for the entry to test.
1608  * @param bits The flags to check for.
1609  *
1610  * @return False if the PA isn't a kernel-managed page, otherwise true/false
1611  *         depending on whether the bits are set.
1612  */
1613 static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa,pp_attr_t bits)1614 ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
1615 {
1616 	return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
1617 }
1618 
1619 /**
1620  * Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
1621  * in physical address is a kernel-managed page.
1622  *
1623  * @param pa The physical address for the entry to update.
1624  */
1625 static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)1626 ppattr_pa_set_modify(pmap_paddr_t pa)
1627 {
1628 	ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
1629 }
1630 
1631 /**
1632  * Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
1633  * passed in physical address is a kernel-managed page.
1634  *
1635  * @param pa The physical address for the entry to update.
1636  */
1637 static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)1638 ppattr_pa_clear_modify(pmap_paddr_t pa)
1639 {
1640 	ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
1641 }
1642 
1643 /**
1644  * Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1645  * passed in physical address is a kernel-managed page.
1646  *
1647  * @param pa The physical address for the entry to update.
1648  */
1649 static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)1650 ppattr_pa_set_reference(pmap_paddr_t pa)
1651 {
1652 	ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
1653 }
1654 
1655 /**
1656  * Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1657  * passed in physical address is a kernel-managed page.
1658  *
1659  * @param pa The physical address for the entry to update.
1660  */
1661 static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)1662 ppattr_pa_clear_reference(pmap_paddr_t pa)
1663 {
1664 	ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
1665 }
1666 
1667 /**
1668  * Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1669  *
1670  * @param pai The physical address index for the entry to update.
1671  */
1672 static inline void
ppattr_set_internal(unsigned int pai)1673 ppattr_set_internal(unsigned int pai)
1674 {
1675 	ppattr_set_bits(pai, PP_ATTR_INTERNAL);
1676 }
1677 
1678 /**
1679  * Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1680  *
1681  * @param pai The physical address index for the entry to update.
1682  */
1683 static inline void
ppattr_clear_internal(unsigned int pai)1684 ppattr_clear_internal(unsigned int pai)
1685 {
1686 	ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
1687 }
1688 
1689 /**
1690  * Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
1691  *
1692  * @param pai The physical address index for the entry to test.
1693  */
1694 static inline bool
ppattr_test_internal(unsigned int pai)1695 ppattr_test_internal(unsigned int pai)
1696 {
1697 	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
1698 }
1699 
1700 /**
1701  * Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
1702  *
1703  * @param pai The physical address index for the entry to update.
1704  */
1705 static inline void
ppattr_set_reusable(unsigned int pai)1706 ppattr_set_reusable(unsigned int pai)
1707 {
1708 	ppattr_set_bits(pai, PP_ATTR_REUSABLE);
1709 }
1710 
1711 /**
1712  * Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
1713  *
1714  * @param pai The physical address index for the entry to update.
1715  */
1716 static inline void
ppattr_clear_reusable(unsigned int pai)1717 ppattr_clear_reusable(unsigned int pai)
1718 {
1719 	ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
1720 }
1721 
1722 /**
1723  * Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
1724  *
1725  * @param pai The physical address index for the entry to test.
1726  */
1727 static inline bool
ppattr_test_reusable(unsigned int pai)1728 ppattr_test_reusable(unsigned int pai)
1729 {
1730 	return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
1731 }
1732 
1733 /**
1734  * Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1735  *
1736  * @note This is only valid when the ALTACCT flag is being tracked using the
1737  *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1738  *       PP_ATTR_ALTACCT definitions for more information.
1739  *
1740  * @param pai The physical address index for the entry to update.
1741  */
1742 static inline void
ppattr_set_altacct(unsigned int pai)1743 ppattr_set_altacct(unsigned int pai)
1744 {
1745 	ppattr_set_bits(pai, PP_ATTR_ALTACCT);
1746 }
1747 
1748 /**
1749  * Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1750  *
1751  * @note This is only valid when the ALTACCT flag is being tracked using the
1752  *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1753  *       PP_ATTR_ALTACCT definitions for more information.
1754  *
1755  * @param pai The physical address index for the entry to update.
1756  */
1757 static inline void
ppattr_clear_altacct(unsigned int pai)1758 ppattr_clear_altacct(unsigned int pai)
1759 {
1760 	ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
1761 }
1762 
1763 /**
1764  * Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
1765  *
1766  * @note This is only valid when the ALTACCT flag is being tracked using the
1767  *       pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
1768  *       PP_ATTR_ALTACCT definitions for more information.
1769  *
1770  * @param pai The physical address index for the entry to test.
1771  *
1772  * @return True if the passed in page uses alternate accounting, false
1773  *         otherwise.
1774  */
1775 static inline bool
ppattr_is_altacct(unsigned int pai)1776 ppattr_is_altacct(unsigned int pai)
1777 {
1778 	return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
1779 }
1780 
1781 /**
1782  * Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1783  *
1784  * @note This is only valid when the INTERNAL flag is being tracked using the
1785  *       pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
1786  *       PP_ATTR_INTERNAL definitions for more information.
1787  *
1788  * @param pai The physical address index for the entry to test.
1789  *
1790  * @return True if the passed in page is accounted for as "internal", false
1791  *         otherwise.
1792  */
1793 static inline bool
ppattr_is_internal(unsigned int pai)1794 ppattr_is_internal(unsigned int pai)
1795 {
1796 	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
1797 }
1798 
1799 /**
1800  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1801  * depending on whether there are one or multiple mappings to a page. This
1802  * function abstracts out the difference between single and multiple mappings to
1803  * a page and provides a single function for determining whether alternate
1804  * accounting is set for a mapping.
1805  *
1806  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1807  *       definitions for more information.
1808  *
1809  * @param pai The physical address index for the entry to test.
1810  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1811  * @param idx Index of the chosen PTE pointer inside the PVE.
1812  *
1813  * @return True if the passed in page uses alternate accounting, false
1814  *         otherwise.
1815  */
1816 static inline bool
ppattr_pve_is_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1817 ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1818 {
1819 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
1820 }
1821 
1822 /**
1823  * The "internal" (INTERNAL) status for a page is tracked differently
1824  * depending on whether there are one or multiple mappings to a page. This
1825  * function abstracts out the difference between single and multiple mappings to
1826  * a page and provides a single function for determining whether "internal"
1827  * is set for a mapping.
1828  *
1829  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1830  *       definitions for more information.
1831  *
1832  * @param pai The physical address index for the entry to test.
1833  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1834  * @param idx Index of the chosen PTE pointer inside the PVE.
1835  *
1836  * @return True if the passed in page is "internal", false otherwise.
1837  */
1838 static inline bool
ppattr_pve_is_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1839 ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1840 {
1841 	return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
1842 }
1843 
1844 /**
1845  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1846  * depending on whether there are one or multiple mappings to a page. This
1847  * function abstracts out the difference between single and multiple mappings to
1848  * a page and provides a single function for setting the alternate accounting status
1849  * for a mapping.
1850  *
1851  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1852  *       definitions for more information.
1853  *
1854  * @param pai The physical address index for the entry to update.
1855  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1856  * @param idx Index of the chosen PTE pointer inside the PVE.
1857  */
1858 static inline void
ppattr_pve_set_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1859 ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1860 {
1861 	if (pvep == PV_ENTRY_NULL) {
1862 		ppattr_set_altacct(pai);
1863 	} else {
1864 		pve_set_altacct(pvep, idx);
1865 	}
1866 }
1867 
1868 /**
1869  * The "internal" (INTERNAL) status for a page is tracked differently
1870  * depending on whether there are one or multiple mappings to a page. This
1871  * function abstracts out the difference between single and multiple mappings to
1872  * a page and provides a single function for setting the "internal" status
1873  * for a mapping.
1874  *
1875  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1876  *       definitions for more information.
1877  *
1878  * @param pai The physical address index for the entry to update.
1879  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1880  * @param idx Index of the chosen PTE pointer inside the PVE.
1881  */
1882 static inline void
ppattr_pve_set_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1883 ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1884 {
1885 	if (pvep == PV_ENTRY_NULL) {
1886 		ppattr_set_internal(pai);
1887 	} else {
1888 		pve_set_internal(pvep, idx);
1889 	}
1890 }
1891 
1892 /**
1893  * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1894  * depending on whether there are one or multiple mappings to a page. This
1895  * function abstracts out the difference between single and multiple mappings to
1896  * a page and provides a single function for clearing the alternate accounting status
1897  * for a mapping.
1898  *
1899  * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1900  *       definitions for more information.
1901  *
1902  * @param pai The physical address index for the entry to update.
1903  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1904  * @param idx Index of the chosen PTE pointer inside the PVE.
1905  */
1906 static inline void
ppattr_pve_clr_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1907 ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1908 {
1909 	if (pvep == PV_ENTRY_NULL) {
1910 		ppattr_clear_altacct(pai);
1911 	} else {
1912 		pve_clr_altacct(pvep, idx);
1913 	}
1914 }
1915 
1916 /**
1917  * The "internal" (INTERNAL) status for a page is tracked differently
1918  * depending on whether there are one or multiple mappings to a page. This
1919  * function abstracts out the difference between single and multiple mappings to
1920  * a page and provides a single function for clearing the "internal" status
1921  * for a mapping.
1922  *
1923  * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1924  *       definitions for more information.
1925  *
1926  * @param pai The physical address index for the entry to update.
1927  * @param pvep Pointer to the pv_entry_t object containing that mapping.
1928  * @param idx Index of the chosen PTE pointer inside the PVE.
1929  */
1930 static inline void
ppattr_pve_clr_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1931 ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1932 {
1933 	if (pvep == PV_ENTRY_NULL) {
1934 		ppattr_clear_internal(pai);
1935 	} else {
1936 		pve_clr_internal(pvep, idx);
1937 	}
1938 }
1939 
1940 /**
1941  * Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
1942  *
1943  * @param pai The physical address index for the entry to update.
1944  */
1945 static inline void
ppattr_set_reffault(unsigned int pai)1946 ppattr_set_reffault(unsigned int pai)
1947 {
1948 	ppattr_set_bits(pai, PP_ATTR_REFFAULT);
1949 }
1950 
1951 /**
1952  * Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
1953  *
1954  * @param pai The physical address index for the entry to update.
1955  */
1956 static inline void
ppattr_clear_reffault(unsigned int pai)1957 ppattr_clear_reffault(unsigned int pai)
1958 {
1959 	ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
1960 }
1961 
1962 /**
1963  * Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
1964  *
1965  * @param pai The physical address index for the entry to test.
1966  */
1967 static inline bool
ppattr_test_reffault(unsigned int pai)1968 ppattr_test_reffault(unsigned int pai)
1969 {
1970 	return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
1971 }
1972 
1973 /**
1974  * Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
1975  *
1976  * @param pai The physical address index for the entry to update.
1977  */
1978 static inline void
ppattr_set_modfault(unsigned int pai)1979 ppattr_set_modfault(unsigned int pai)
1980 {
1981 	ppattr_set_bits(pai, PP_ATTR_MODFAULT);
1982 }
1983 
1984 /**
1985  * Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
1986  *
1987  * @param pai The physical address index for the entry to update.
1988  */
1989 static inline void
ppattr_clear_modfault(unsigned int pai)1990 ppattr_clear_modfault(unsigned int pai)
1991 {
1992 	ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
1993 }
1994 
1995 /**
1996  * Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
1997  *
1998  * @param pai The physical address index for the entry to test.
1999  */
2000 static inline bool
ppattr_test_modfault(unsigned int pai)2001 ppattr_test_modfault(unsigned int pai)
2002 {
2003 	return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
2004 }
2005 
2006 /**
2007  * Retype epoch operations:
2008  *
2009  * The retype epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
2010  * can ensure all CPUs have observed updated mapping state before retyping a physical page.
2011  *
2012  * There are certain cases in which the pmap, while issuing an SPTM call that modifies
2013  * mappings, cannot hold locks such as the PVH lock which would prevent the page from
2014  * being concurrently retyped.  This is particularly true for batched operations such
2015  * as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
2016  * In these cases, the pmap may call pmap_retype_epoch_enter() to note that it is
2017  * performing such a sensitive operation on the local CPU.  It must then call
2018  * pmap_retype_epoch_exit() upon completion of the sensitive operation.
2019  *
2020  * Then, for any instance in which the pmap needs to retype a page without being
2021  * otherwise guaranteed (e.g. by VM layer locking or the existing page type) that such
2022  * a sensitive operation is not in progress on some other CPU, it must drain these
2023  * sensitive operations from other CPUs.  Specifically, it must ensure that any
2024  * sensitive operation which may have observed prior mapping state of the page that
2025  * is to be retyped has completed.  This is accomplished by first calling
2026  * pmap_retype_epoch_prepare_drain() to record the initial retype epoch state of
2027  * all CPUs, followed by pmap_retype_epoch_drain() to ensure all remote CPUs are
2028  * either not in an epoch or have advanced beyond the initially recorded epoch.
2029  * These are exposed as two separate functions in order to allow the calling CPU
2030  * to do other work between calling pmap_retype_epoch_prepare_drain() and
2031  * pmap_retype_epoch_drain(), as a best-effort attempt to minimize time wasted
2032  * spinning in pmap_retype_epoch_drain().
2033  *
2034  * When draining the retype epoch, the following assumptions must hold true:
2035  *
2036  * 1) The calling thread must guarantee that prior updates needed to bring the page
2037  * into the correct mapping state for retyping have already been performed and made
2038  * globally visible using the appropriate barriers.  In most cases this means that
2039  * all existing mappings of the page must have been removed.  For any alterations
2040  * of mapping state, global visibility is conveniently already guaranteed by the
2041  * DSBs that are architecturally required to synchronize PTE updates and the TLBIs
2042  * that follow them.
2043  *
 * 2) The calling thread must have some means of ensuring that no new mappings can
 * be added to the page which would bring it out of the correct state for retyping.
2046  * This is typically done by holding the PVH lock and/or the exclusive pmap lock
2047  * such that pmap_enter() cannot concurrently execute against the page.
2048  *
2049  * 3) The calling thread must not perform any operation which requires preemptibility
2050  * between calling pmap_retype_epoch_prepare_drain() and pmap_retype_epoch_drain().
2051  */
2052 
/**
 * Enter the retype epoch on the local CPU to indicate an in-progress SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function increments the thread's preemption disable count and returns
 *       with preemption disabled.  The matching pmap_retype_epoch_exit() call
 *       re-enables preemption.
 *
 * @note This function issues all required barriers to ensure correct ordering of
 *       the epoch update relative to ensuing SPTM accesses.
 */
static inline void
pmap_retype_epoch_enter(void)
{
	/* Disable preemption first so the per-CPU epoch state can't migrate under us. */
	mp_disable_preemption();
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	assert(!preemption_enabled());

	/* Must not already be in a retype epoch on this CPU (local_seq == 0 means idle). */
	assert(retype_epoch->local_seq == 0);
	/* Advance the per-CPU sequence and publish it as the active epoch. */
	retype_epoch->local_seq = ++retype_epoch->next_seq;
	/* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
	assert(retype_epoch->local_seq != 0);

	/**
	 * Issue a store-load barrier to ensure that remote observers of any ensuing
	 * SPTM accesses will also observe the epoch update.
	 */
	os_atomic_thread_fence(seq_cst);
}
2082 
/**
 * Exit the retype epoch on the local CPU to indicate completion of an SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function must be called with preemption disabled (as left by
 *       pmap_retype_epoch_enter()) and will decrement the current thread's
 *       preemption disable count.
 */
static inline void
pmap_retype_epoch_exit(void)
{
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	assert(!preemption_enabled());
	/* The active epoch must be the one most recently allocated on this CPU. */
	assert(retype_epoch->local_seq == retype_epoch->next_seq);

	/**
	 * Clear the sequence using a store-release operation to ensure that prior
	 * SPTM modifications will be visible to remote observers before the absence
	 * of an epoch is visible.
	 */
	os_atomic_store(&retype_epoch->local_seq, 0, release);
	mp_enable_preemption();
}
2105 
/**
 * Prepare the local CPU to perform an epoch drain operation by recording the retype
 * epoch state of other CPUs.
 *
 * @note This function increments the current thread's preemption disable count and
 *       returns with preemption disabled.  pmap_retype_epoch_drain() must be
 *       called afterwards to complete the drain and re-enable preemption.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 *       retype operation is not speculated ahead of the epoch sampling.
 *
 * @note This function does NOT issue any barriers to ensure that prior updates of
 *       mapping state are globally visible and have proper store-load ordering with
 *       respect to the scan performed here.  In the cases where this function is
 *       intended to be used, this ordering should be guaranteed automatically by
 *       the DSBs used to synchronize prior mapping updates issued by the caller.
 *       If this function is ever used in a situation where that cannot be guaranteed,
 *       the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
 *       thread_fence) before calling this function.
 */
static inline void
pmap_retype_epoch_prepare_drain(void)
{
	mp_disable_preemption();
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	/* A prior prepare must have been consumed by a drain before preparing again. */
	assert(retype_epoch->flags == 0);
	unsigned int i = 0;
	uint8_t flags = PMAP_RETYPE_EPOCH_PREPARED;

	/* Sample each CPU's epoch state. */
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		const uint64_t remote_epoch =
		    os_atomic_load(&pmap_pcpu->retype_epoch.local_seq, relaxed);
		/* Record the sampled sequence so the drain can compare against it. */
		retype_epoch->remote_seq[i] = remote_epoch;

		/**
		 * If the remote CPU has an active epoch, make a note to ourselves that
		 * we'll need to drain it.
		 */
		if (remote_epoch != 0) {
			flags |= PMAP_RETYPE_EPOCH_DRAIN_REQUIRED;
		}
		++i;
	}
	retype_epoch->flags = flags;

	/**
	 * Issue a load-load barrier to ensure subsequent drain or retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2157 
/**
 * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
 * most recent call to pmap_retype_epoch_prepare_drain().
 *
 * @note This function expects to be called with preemption disabled and will decrement
 *       the current thread's preemption disable count.
 *
 * @note pmap_retype_epoch_prepare_drain() must have been called on the local CPU
 *       prior to calling this function.  This function will return immediately if
 *       this prior call did not observe any active epochs on remote CPUs.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 *       retype operation is not speculated ahead of the epoch sampling.
 */
static inline void
pmap_retype_epoch_drain(void)
{
	assert(!preemption_enabled());
	pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
	const uint8_t flags = retype_epoch->flags;
	/* prepare_drain() must have run on this CPU without an intervening drain. */
	assert(flags & PMAP_RETYPE_EPOCH_PREPARED);
	retype_epoch->flags = 0;
	/* Fast path: no remote CPU was in an epoch when we sampled; nothing to wait on. */
	if (!(flags & PMAP_RETYPE_EPOCH_DRAIN_REQUIRED)) {
		mp_enable_preemption();
		return;
	}
	unsigned int i = 0;
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		if (retype_epoch->remote_seq[i] != 0) {
			/* The remote sequence can only advance or drop to 0, never regress. */
			assert((pmap_pcpu->retype_epoch.local_seq == 0) ||
			    (pmap_pcpu->retype_epoch.local_seq >= retype_epoch->remote_seq[i]));
			/**
			 * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
			 * or advances to a new epoch.
			 */
			while ((os_atomic_load_exclusive(&pmap_pcpu->retype_epoch.local_seq, relaxed) ==
			    retype_epoch->remote_seq[i])) {
				__builtin_arm_wfe();
			}
			/* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
			os_atomic_clear_exclusive();
		}
		++i;
	}
	mp_enable_preemption();
	/**
	 * Issue a load-load barrier to ensure subsequent retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2209 
2210 /**
2211  * Helper to determine whether a frame type is one that requires automatic
2212  * retyping (by the pmap layer) back to XNU_DEFAULT when all mappings of the
2213  * page are gone.
2214  *
2215  * @return true if the type requires auto-retyping, false otherwise.
2216  */
2217 static inline bool
pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)2218 pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)
2219 {
2220 	return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) ||
2221 	       (frame_type == XNU_USER_JIT) || (frame_type == XNU_ROZONE) ||
2222 	       (frame_type == XNU_KERNEL_RESTRICTED);
2223 }
2224 
2225 
2226 /**
2227  * If necessary, prepare a physical page for being retyped back to XNU_DEFAULT
2228  * after the last CPU mapping has been removed.  This is only needed for pages of
2229  * certain special types such as the various executable types and the kernel RO
2230  * zone type.
2231  *
2232  * @note The PVH lock for the physical page that is getting a new mapping
2233  *       registered must already be held.
2234  *
2235  * @param pa The physical address of the recently-unmapped page.
2236  *
2237  * @return true if the page will need to be retyped, false otherwise.
2238  */
2239 static inline bool
pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)2240 pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)
2241 {
2242 	pvh_assert_locked(pa_index(pa));
2243 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2244 	if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2245 		pmap_retype_epoch_prepare_drain();
2246 		return true;
2247 	}
2248 	return false;
2249 }
2250 
2251 /**
2252  * If necessary, retype a physical page back to XNU_DEFAULT after the last CPU
2253  * mapping has been removed.  This is only needed for pages of certain special
2254  * types such as the various executable types, the kernel RO zone type,
2255  * and XNU_KERNEL_RESTRICTED.
2256  *
2257  * @note The PVH lock for the physical page that is getting a new mapping
2258  *       registered must already be held.
2259  *
2260  * @param pa The physical address of the recently-unmapped page.
2261  *
2262  * @return true if the page needed to be retyped, false otherwise.
2263  */
2264 static inline bool
pmap_retype_unmapped_page(pmap_paddr_t pa)2265 pmap_retype_unmapped_page(pmap_paddr_t pa)
2266 {
2267 	pvh_assert_locked(pa_index(pa));
2268 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
2269 	if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
2270 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2271 		pmap_retype_epoch_drain();
2272 		sptm_retype(pa, frame_type, XNU_DEFAULT, retype_params);
2273 		return true;
2274 	}
2275 	return false;
2276 }
2277 
2278 /**
2279  * The minimum number of pages to keep in the PPL page free list.
2280  *
2281  * We define our target as 8 pages: enough for 2 page table pages, a PTD page,
2282  * and a PV page; in essence, twice as many pages as may be necessary to satisfy
2283  * a single pmap_enter request.
2284  */
2285 #define PMAP_MIN_FREE_PPL_PAGES 8
2286 
2287 /**
2288  * Flags passed to various page allocation functions, usually accessed through
2289  * the pmap_page_alloc() API. Each function that can take these flags as
2290  * a part of its option field, will describe these flags in its function header.
2291  */
2292 
2293 /* Can be used when no allocation flags are wanted. */
2294 #define PMAP_PAGE_ALLOCATE_NONE 0x0
2295 
2296 /**
2297  * Instruct the allocation function to return immediately if no pages are
2298  * current available. Without this flag, the function will spin and wait for a
2299  * page to become available. This flag can be required in some circumstances
2300  * (for instance, when allocating pages from within the PPL).
2301  */
2302 #define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
2303 
2304 /**
2305  * Instructs an allocation function to fallback to reclaiming a userspace page
2306  * table if it failed to allocate a page from the free lists. This can be useful
2307  * when allocating from within the PPL because refilling the free lists requires
2308  * exiting and re-entering the PPL (which incurs extra latency).
2309  *
2310  * This is a quick way of allocating a page at the expense of having to
2311  * reallocate the table the next time one of its mappings is accessed.
2312  */
2313 #define PMAP_PAGE_RECLAIM_NOWAIT 0x2
2314 
2315 /**
2316  * Instructs an allocation function to avoid zero-filling the newly-allocated
2317  * page.  This should be used only if you know the page will be fully initialized
2318  * by some other means on the relevant allocation path.
2319  */
2320 #define PMAP_PAGE_NOZEROFILL 0x4
2321 
2322 /**
2323  * Global variables exported to the rest of the internal pmap implementation.
2324  */
2325 extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
2326 extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
2327 extern unsigned int inuse_pmap_pages_count;
2328 extern vm_object_t pmap_object;
2329 extern uint32_t pv_alloc_initial_target;
2330 extern uint32_t pv_kern_alloc_initial_target;
2331 
2332 /**
2333  * Functions exported to the rest of the internal pmap implementation.
2334  */
2335 extern void pmap_data_bootstrap(void);
2336 extern void pmap_enqueue_pages(vm_page_t);
2337 extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
2338 extern void pmap_page_free(pmap_paddr_t);
2339 
2340 /**
2341  * The modes in which a pmap lock can be acquired. Note that shared access
2342  * doesn't necessarily mean "read-only". As long as data is atomically updated
2343  * correctly (to account for multi-cpu accesses) data can still get written with
2344  * a shared lock held. Care just needs to be taken so as to not introduce any
2345  * race conditions when there are multiple writers.
2346  *
2347  * This is here in pmap_data.h because it's a needed parameter for pv_alloc()
2348  * and pmap_enter_pv(). This header is always included in pmap_internal.h before
2349  * the rest of the pmap locking code is defined so there shouldn't be any issues
2350  * with missing types.
2351  */
2352 OS_ENUM(pmap_lock_mode, uint8_t,
2353     PMAP_LOCK_SHARED,
2354     PMAP_LOCK_EXCLUSIVE,
2355     PMAP_LOCK_HELD);
2356 
2357 /**
2358  * Possible return values for pv_alloc(). See the pv_alloc() function header for
2359  * a description of each of these values.
2360  */
2361 typedef enum {
2362 	PV_ALLOC_SUCCESS,
2363 	PV_ALLOC_RETRY,
2364 	PV_ALLOC_FAIL
2365 } pv_alloc_return_t;
2366 
/* Allocate a PV entry; may drop/retake locks, hence the pv_alloc_return_t result (see enum above). */
extern pv_alloc_return_t pv_alloc(
	pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);

/* Return a single PV entry to the free lists. */
extern void pv_free(pv_entry_t *);

/* Free a chain of PV entries; presumably (head, tail, count) — confirm against the definition. */
extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);

/* Recompute pv_alloc_initial_target / pv_kern_alloc_initial_target (declared above). */
extern void pmap_compute_pv_targets(void);

/* Record a new mapping in the PV list; returns pv_alloc_return_t because it may need to allocate a PV entry. */
extern pv_alloc_return_t pmap_enter_pv(
	pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
2374 
/**
 * Possible return values for pmap_remove_pv().
 */
typedef enum {
	PV_REMOVE_SUCCESS, /* found a mapping */
	PV_REMOVE_FAIL /* no mapping found */
} pv_remove_return_t;

/* Remove the PV entry for a mapping; the two bool out-parameters' meanings aren't visible here — see pmap_data.c. */
extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
2381 
/* Bootstrap the page-table-descriptor (ptd) subsystem with an initial array of descriptors. */
extern void ptd_bootstrap(pt_desc_t *, unsigned int);

/* Allocate a page table descriptor not yet linked to a pmap — contrast with ptd_alloc() below. */
extern pt_desc_t *ptd_alloc_unlinked(unsigned int);

/* Allocate a page table descriptor associated with the given pmap. */
extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);

/* Release a page table descriptor obtained from ptd_alloc()/ptd_alloc_unlinked(). */
extern void ptd_deallocate(pt_desc_t *);

/* Initialize the per-table info of a descriptor for a (pmap, VA, level?, PTE) tuple — confirm the unsigned int's meaning. */
extern void ptd_info_init(
	pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);
2388 
/* Credit/debit an amount against one of the pmap's ledger entries (int selects the ledger entry index). */
extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);
2391 
/* Panic-on-failure validation of a pmap pointer; the const char * is the caller's name for the panic string. */
extern void validate_pmap_internal(const volatile struct pmap *, const char *);
extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);

/**
 * Macro function wrappers around pmap validation so that the calling function
 * can be printed in the panic strings for easier validation failure debugging.
 */
#define validate_pmap(x) validate_pmap_internal(x, __func__)
#define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
2401 
2402 /**
2403  * This structure describes a SPTM-owned I/O range.
2404  *
2405  * @note This doesn't necessarily have to represent "I/O" only, this can also
2406  *       represent non-kernel-managed DRAM (e.g., iBoot carveouts). Any physical
2407  *       address region that isn't considered "kernel-managed" is fair game.
2408  *
2409  * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
2410  *       device tree nodes. Astris (through the LowGlobals) also depends on the
2411  *       consistency of this structure.
2412  *
2413  * @note These definitions are copied to SPTM and they need to be in sync.
2414  */
2415 typedef struct pmap_io_range {
2416 	/* Physical address of the PPL-owned I/O range. */
2417 	uint64_t addr;
2418 
2419 	/* Length (in bytes) of the PPL-owned I/O range. */
2420 	uint64_t len;
2421 
2422 	/* Strong DSB required for pages in this range. */
2423 	#define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31)
2424 
2425 	/* Corresponds to memory carved out by bootloader. */
2426 	#define PMAP_IO_RANGE_CARVEOUT (1UL << 30)
2427 
2428 	/* Pages in this range need to be included in the hibernation image */
2429 	#define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29)
2430 
2431 	/* Mark the range as 'owned' by a given subsystem */
2432 	#define PMAP_IO_RANGE_OWNED (1UL << 28)
2433 
2434 	/**
2435 	 * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
2436 	 * mapping flags (defined above).
2437 	 */
2438 	uint32_t wimg;
2439 
2440 	/* 4 Character Code (4CC) describing what this range is. */
2441 	uint32_t signature;
2442 } pmap_io_range_t;
2443 
2444 /* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
2445 _Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
2446 
2447 extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);
2448 
2449 extern void pmap_cpu_data_init_internal(unsigned int);
2450