1 /*
2 * Copyright (c) 2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /**
29 * This header file is used to store the types, prototypes, and inline functions
30 * that define some of the most important data structures used in the pmap. This
31 * header is only meant for sharing types within the pmap; if a type is meant to
32 * be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
33 */
34 #pragma once
35
36 #include <stdint.h>
37
38 #include <kern/ledger.h>
39 #include <mach/vm_types.h>
40 #include <mach_assert.h>
41 #include <vm/vm_page.h>
42
43 #include <arm/cpu_data.h>
44 #include <arm/machine_routines.h>
45 #include <arm64/proc_reg.h>
46
47 #if HIBERNATION
48 #include <arm64/hibernate_secure_hmac.h>
49 #endif /* HIBERNATION */
50
51 /* Temporary include before moving all ledger functions into pmap_data.c */
52 #include <os/refcnt.h>
53
54 /**
55 * These headers are safe to be included in this file since they shouldn't rely
56 * on any of the internal pmap header files (so no circular dependencies).
57 */
58 #include <arm64/sptm/pmap/pmap.h>
59 #include <arm64/sptm/pmap/pmap_pt_geometry.h>
60
61 #include <arm64/sptm/sptm.h>
62
63 /**
64 * These values represent the first and last kernel-managed physical addresses.
65 * We keep track of extra metadata on kernel-managed pages compared to other
66 * pages (usually iBoot carved out memory or I/O).
67 */
68 extern pmap_paddr_t vm_first_phys, vm_last_phys;
69
70 #define PMAP_HIB_STATE_REACHED(states) false
71 #define PMAP_ASSERT_NOT_WRITING_HIB()
72 #define PMAP_IS_HIBERNATING() false
73
74 /**
75 * Return whether the given address represents a kernel-managed physical page.
76 *
77 * Whether a page is considered "kernel-managed" is determined by the BootArgs
78 * passed by the bootloader. Typically memory carved out by the bootloader as
79 * well as I/O memory should return false.
80 *
81 * @param pa The physical address to check.
82 */
83 static inline bool
pa_valid(pmap_paddr_t pa)84 pa_valid(pmap_paddr_t pa)
85 {
86 return (pa >= vm_first_phys) && (pa < vm_last_phys);
87 }
88
/* Sentinel value indicating an invalid physical address index. */
90 #define INVALID_PAI UINT_MAX
91
92 /**
93 * The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
94 * contain an entry for every kernel-managed page in the system. These systems
95 * are indexed with physical address indices ("pai") generated by this function.
96 *
97 * The logic is simple since there should be one entry in each of these data
98 * structures for each kernel-managed physical page in the system. These data
99 * structures are allocated on boot based on the amount of memory available.
100 *
101 * @note PAIs are defined using the VM page size, which might not be identical
102 * to the underlying hardware page size for an arbitrary address space.
103 * This means that the data structures relying on PAIs will contain one
104 * entry for each VM page, not hardware page.
105 *
106 * @note This function is only valid for physical addresses that are
107 * kernel-managed.
108 */
109 static inline unsigned int
pa_index(pmap_paddr_t pa)110 pa_index(pmap_paddr_t pa)
111 {
112 return (unsigned int)atop(pa - vm_first_phys);
113 }
114
115 /**
116 * Convert from a physical address index (pai) back to a raw physical address.
117 *
118 * @param pai The physical address index to convert to a PA.
119 *
120 * @return The page-aligned physical address corresponding to [pai].
121 */
122 static inline pmap_paddr_t
pai_to_pa(unsigned int pai)123 pai_to_pa(unsigned int pai)
124 {
125 return ptoa((pmap_paddr_t)pai) + vm_first_phys;
126 }
127
128 /* See the definition of pv_head_table for more information. */
129 extern uintptr_t *pv_head_table;
130
131 /* Represents a NULL entry in the pv_head_table. */
132 #define PV_ENTRY_NULL ((pv_entry_t *) 0)
133
134 /**
135 * Given a physical address index, return the corresponding pv_head_table entry.
136 *
137 * @note The returned entry might be invalid, or a pointer to a pt_entry_t,
138 * pv_entry_t, or pt_desc_t depending on the type for this entry.
139 * Determine the type using pvh_test_type().
140 *
141 * @param pai The index returned by pa_index() for the page whose pv_head_table
142 * entry should be retrieved.
143 */
144 static inline uintptr_t
pai_to_pvh(unsigned int pai)145 pai_to_pvh(unsigned int pai)
146 {
147 return pv_head_table[pai];
148 }
149
150 /**
151 * Each pv_head_table entry can be one of four different types:
152 *
153 * - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
154 * physical aperture. Physical aperture mappings are not
155 * tracked in the pv_head_table.
156 *
157 * - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
158 * These entries are linked lists of pv_entry_t objects (which
159 * each contain a pointer to the associated PTE and a pointer
160 * to the next entry in the list).
161 *
162 * - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
163 * mappings are created, this entry will get upgraded to an
164 * entry of type PVH_TYPE_PVEP. These entries are pointers
165 * directly to the page table entry that contain the mapping
166 * (pt_entry_t*).
167 *
168 * - PVH_TYPE_PTDP: The physical page is being used as a page table. These
169 * entries are pointers to page table descriptor structures
170 * (pt_desc_t) which contain metadata related to each page
171 * table.
172 *
173 * The type is stored in the bottom two bits of each pv_head_table entry. That
174 * type needs to be checked before dereferencing the pointer to determine which
175 * pointer type to dereference as.
176 */
177 __enum_closed_decl(pvh_type_t, uint8_t, {
178 PVH_TYPE_NULL = 0b00,
179 PVH_TYPE_PVEP = 0b01,
180 PVH_TYPE_PTEP = 0b10,
181 PVH_TYPE_PTDP = 0b11,
182 });
183
184 #define PVH_TYPE_MASK (0x3UL)
185
186
187 /**
188 * PV_HEAD_TABLE Flags.
189 *
190 * All flags listed below are stored in the pv_head_table entry/pointer
191 * (per-physical-page) unless otherwise noted.
192 *
193 * Please update the pv_walk LLDB macro if these flags are changed or added to.
194 */
195
196 /**
197 * This flag is set for every mapping created by an IOMMU.
198 *
199 * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
200 * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
201 */
202 #define PVH_FLAG_IOMMU 0x4UL
203
204 /**
205 * This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
206 * this bit is set, then the PTE pointer points directly into the IOMMU page
207 * table for this mapping. If this bit is cleared, then the "PTE pointer" is
208 * actually a pointer to the IOMMU descriptor object that owns this mapping.
209 *
210 * There are cases where it's not easy to tie an IOMMU mapping directly to a
211 * specific page table, so this allows us to at least get a pointer to which
212 * IOMMU created this mapping which is useful for debugging purposes.
213 *
214 * Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
215 * entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
216 */
217 #define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
218
219 /**
220 * This flag is set when the first CPU (non-IOMMU) mapping is created. This is
221 * important to keep track of because various accounting statistics are based on
222 * the options specified for the first CPU mapping. This flag, and thus the
223 * accounting statistics, will persist as long as there *any* mappings of the
224 * page (including IOMMU mappings). This works because the accounting for a page
225 * should not need to change until the page is recycled by the VM layer, and we
226 * double-check that there are no mappings (CPU or IOMMU) when a page is
227 * recycled (see: pmap_verify_free()).
228 */
229 #define PVH_FLAG_CPU (1ULL << 62)
230
231 /* This bit is used as a lock when modifying a pv_head_table entry. */
232 #define PVH_LOCK_BIT 61
233 #define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
234
235 /**
236 * This flag is set when there are any executable mappings to this physical
237 * page. This is used to prevent any writable mappings from being created at
238 * the same time an executable mapping exists.
239 */
240 #define PVH_FLAG_EXEC (1ULL << 60)
241
242 /**
243 * This flag is used to mark that a page has been hashed into the hibernation
244 * image.
245 *
246 * The hibernation driver will use this to ensure that all PPL-owned memory is
247 * correctly included into the hibernation image (a missing PPL page could be
248 * a security concern when coming out of hibernation).
249 */
250 #define PVH_FLAG_HASHED (1ULL << 58)
251
252 /**
253 * Marking a pv_head_table entry with this flag denotes that this page is
254 * retired without any mappings and never should be mapped again.
255 */
256 #define PVH_FLAG_RETIRED (1ULL << 55)
257
258 /**
259 * This flag is used to mark that a PV head entry has been placed into
260 * "sleep mode", which typically happens when the lock owner needs to
261 * process a long PV list. If this bit is set, threads which contend
262 * on the PVH lock must call thread_block() to wait until they are awakened
263 * by the current lock owner releasing the lock.
264 */
265 #define PVH_FLAG_SLEEP (1ULL << 54)
266
267 /**
268 * These bits need to be set to safely dereference a pv_head_table
269 * entry/pointer.
270 *
271 * Any change to this #define should also update the copy located in the pmap.py
272 * LLDB macros file.
273 */
274 #define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED | PVH_FLAG_RETIRED)
275
276 #define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
277
278 #define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
279
280 /* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
281 #define PVH_LIST_MASK (~PVH_TYPE_MASK)
282
283 /* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
284 #define PVH_LOCK_WORD 1 /* Assumes little-endian */
285
286 /**
287 * Assert that a pv_head_table entry is locked. Will panic if the lock isn't
288 * acquired.
289 *
290 * @param index The physical address index to check.
291 */
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
	/*
	 * Either PVH_FLAG_LOCK (bit lock held) or PVH_FLAG_SLEEP (sleep mode,
	 * lock logically held by the sleeping owner) counts as "locked" here.
	 * The entry is loaded twice with relaxed ordering; the value printed in
	 * the panic message may therefore differ from the value tested.
	 */
	assertf(os_atomic_load(&pv_head_table[index], relaxed) & PVH_LOCK_FLAGS,
	    "%s: PVH %p (=%p) for pai 0x%x not locked or in sleep mode", __func__,
	    &pv_head_table[index], (void*)(os_atomic_load(&pv_head_table[index], relaxed)), index);
}
299
300 /**
301 * Helper function for returning the 32-bit PVH lock word corresponding
302 * to a physical address index.
303 *
304 * @param index The physical address index of the pv_head_table entry
305 *
306 * @return A pointer to the 32-bit word containing the lock bit
307 */
308 static inline uint32_t*
pvh_lock_word(unsigned int index)309 pvh_lock_word(unsigned int index)
310 {
311 return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
312 }
313
314 /**
315 * Helper macro for computing the lock bit offset within the 32-bit
316 * lock word for each PV head entry.
317 *
318 * @return A 32-bit integer containing the lock bit offset.
319 */
320 #define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
321
322 /**
323 * Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
324 *
325 * @param index The physical address index of the pv_head_table entry to lock.
326 *
327 * @return A wrapper object with the contents of the locked pv_head_table entry.
328 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
	extern unsigned int not_in_kdp;
	/*
	 * This function may block (thread_block below), so it requires preemption
	 * to be enabled unless we're in early boot, hibernating, or running in
	 * the kernel debugger context (!not_in_kdp).
	 */
	const bool was_preemptible = preemption_enabled();
	assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) ||
	    PMAP_IS_HIBERNATING() || !not_in_kdp);

	/* Predicate for the bit-lock spin loop: bail out early when a preemption
	 * is pending, but only if we were preemptible to begin with. */
	bool (^check_preemption)(void) = ^bool (void) {
		return was_preemptible && pmap_pending_preemption();
	};

	hw_lock_status_t ret;
	locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
	do {
		ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
		    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);

		if (ret == HW_LOCK_ACQUIRED) {
			locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
			if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
				/*
				 * Entry is in sleep mode: register for a wakeup on the
				 * entry's address, drop the bit lock so the owner can
				 * make progress, block until woken, then retry the
				 * acquisition from scratch.
				 */
				wait_result_t wres;
				wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
				hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
				assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
				thread_block(THREAD_CONTINUE_NULL);
				ret = HW_LOCK_CONTENDED;
			}
		}
	} while (ret != HW_LOCK_ACQUIRED);

	return locked_pvh;
}
362
363 /**
364 * Lock a pvh_head_table entry, possibly in a preemption-disabled context.
365 *
366 * @note This function is only meant for special use cases in which pmap
367 * functions must be invoked with preemption disabled. These cases
368 * are expected to be rare and limited. If you think you need to
369 * use this in more places, you're probably wrong.
370 *
371 * @param index The physical address index of the pv_head_table entry to lock.
372 *
373 * @return A wrapper object with the contents of the locked pv_head_table entry.
374 */
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
	/* If preemption happens to be enabled, no special handling is needed;
	 * defer to the normal (possibly-blocking) lock path. */
	if (__improbable(preemption_enabled())) {
		return pvh_lock(index);
	}
	/* Preemption is disabled: spin for the bit lock; blocking is not an option. */
	hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
	const locked_pvh_t locked_pvh = {.pvh = os_atomic_load(&pv_head_table[index], relaxed), .pai = index};

	/* A sleep-mode entry would require thread_block(), which is illegal with
	 * preemption disabled, so encountering one here is a fatal misuse. */
	if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
		panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
	}

	return locked_pvh;
}
390
391 /**
392 * Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
393 *
394 * @param index The physical address index of the pv_head_table entry to lock.
395 *
396 * @return A wrapper object with the contents of the locked pv_head_table entry if successful,
397 * 0 otherwise.
398 */
399 static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)400 pvh_try_lock(unsigned int index)
401 {
402 locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
403 bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
404
405 if (locked) {
406 locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
407 assert(locked_pvh.pvh != 0);
408 if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
409 hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
410 locked_pvh.pvh = 0;
411 }
412 }
413
414 return locked_pvh;
415 }
416
417 /**
418 * Helper for determining whether a preceding pvh_try_lock() call succeeded.
419 *
420 * @param locked_pvh A wrapper representing a possibly-locked PV head table entry
421 * returned by pvh_try_lock().
422 *
423 * @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
424 */
425 static inline bool
pvh_try_lock_success(const locked_pvh_t * locked_pvh)426 pvh_try_lock_success(const locked_pvh_t *locked_pvh)
427 {
428 assert(locked_pvh != NULL);
429 return locked_pvh->pvh != 0;
430 }
431
432 /**
433 * Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
434 * lock will sleep until this thread calls pvh_unlock().
435 *
436 * @note It is legal to call this function if the lock is already in sleep mode.
437 * In that case, the call will have no effect.
438 * @note This function must not be called with preemption disabled by any other agent
439 * but [locked_pvh] itself. Preemption must be fully re-enabled by the time
440 * this function returns, either because it was already enabled (because the
441 * lock was already in sleep mode), or because this function enabled it by placing
442 * the lock in sleep mode.
443 *
444 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
445 */
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	if (!(old_pvh & PVH_FLAG_SLEEP)) {
		/*
		 * Not already in sleep mode: publish PVH_FLAG_SLEEP first, then drop
		 * the bit lock so contending threads observe the flag and block in
		 * pvh_lock() instead of spinning. Dropping the bit lock also
		 * re-enables preemption (see the function contract above).
		 */
		assert(old_pvh & PVH_FLAG_LOCK);
		os_atomic_store(&pv_head_table[index], old_pvh | PVH_FLAG_SLEEP, relaxed);
		/**
		 * Tell the scheduler that this thread may need a priority boost if it needs to go
		 * off-core, to reduce the likelihood of priority inversion.
		 */
		locked_pvh->pri_token = thread_priority_floor_start();
		hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
	}

	/* Hibernation runs single-core so we can skip this check. */
	assert(preemption_enabled() || PMAP_IS_HIBERNATING());
}
468
469 /**
470 * Check that a pv_head_table entry/pointer is a specific type.
471 *
472 * @param pvh The pv_head_table entry/pointer to check.
473 * @param type The type to check for.
474 *
475 * @return True if the pv_head_table entry is of the passed in type, false
476 * otherwise.
477 */
478 static inline bool
pvh_test_type(uintptr_t pvh,pvh_type_t type)479 pvh_test_type(uintptr_t pvh, pvh_type_t type)
480 {
481 return (pvh & PVH_TYPE_MASK) == type;
482 }
483
484 /**
485 * Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
486 *
487 * @note Only the non-lock flags, pointer, and type fields of the entry will be updated
488 * according to the passed-in value. PVH_LOCK_FLAGS will be ignored as they are
489 * directly manipulated by this function.
490 *
491 * @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
492 * The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
493 * in pv_head_table to reflect any updates that may have been performed on the PV list
494 * while the lock was held.
495 */
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(locked_pvh->pvh != 0);
	unsigned int index = locked_pvh->pai;
	pvh_assert_locked(index);
	const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
	bool pri_floor_end = false;

	if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
		/*
		 * The entry was placed in sleep mode: the bit lock was dropped by
		 * pvh_lock_enter_sleep_mode(), so re-acquire it before publishing
		 * the caller's updated entry (with SLEEP cleared) and waking any
		 * threads blocked on the entry's address.
		 */
		pri_floor_end = true;
		const bool was_preemptible = preemption_enabled();
		bool (^check_preemption)(void) = ^bool (void) {
			return was_preemptible && pmap_pending_preemption();
		};

		hw_lock_status_t ret;
		do {
			ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
			    &hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
		} while (ret != HW_LOCK_ACQUIRED);

		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
		thread_wakeup(&pv_head_table[index]);
	} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
		/*
		 * The caller modified the entry while holding the lock; store the
		 * updated value. PVH_LOCK_FLAGS in the caller's copy are ignored:
		 * this function manages the lock bits itself.
		 */
		os_atomic_store(&pv_head_table[index],
		    (locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
	}
	hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);

	if (__improbable(pri_floor_end)) {
		/* Drop the priority floor taken when the entry entered sleep mode. */
		thread_priority_floor_end(&locked_pvh->pri_token);
	}

	/* Invalidate the wrapper so accidental reuse trips the asserts above. */
	locked_pvh->pvh = 0;
}
534
535 /**
536 * Convert a pv_head_table entry/pointer into a page table entry pointer. This
537 * should only be done if the type of this entry is PVH_TYPE_PTEP.
538 *
539 * @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
540 *
 * @return Return back a safe-to-dereference pointer to the single mapping of this
542 * physical page by masking off the TYPE bits and adding any missing
543 * flags to the upper portion of the pointer.
544 */
545 static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)546 pvh_ptep(uintptr_t pvh)
547 {
548 assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
549 return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
550 }
551
552 /**
553 * Convert a pv_head_table entry/pointer into a PVE list pointer. This
554 * should only be done if the type of this entry is PVH_TYPE_PVEP.
555 *
556 * @param pvh The pv_head_table entry/pointer to convert into a safe to
557 * dereference pv_entry_t*.
558 *
 * @return Return back a safe-to-dereference pointer to the first mapping of this
560 * physical page by masking off the TYPE bits and adding any missing
561 * flags to the upper portion of the pointer.
562 */
563 static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)564 pvh_pve_list(uintptr_t pvh)
565 {
566 assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
567 return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
568 }
569
570 /**
571 * Return the mutable flags associated with a pv_head_table entry/pointer.
572 *
573 * @param pvh The pv_head_table entry whose flags to get.
574 *
575 * @return The mutable flags encoded in [pvh].
576 */
577 static inline uintptr_t
pvh_get_flags(uintptr_t pvh)578 pvh_get_flags(uintptr_t pvh)
579 {
580 return pvh & PVH_MUTABLE_FLAGS;
581 }
582
583 /**
584 * Update the flags associated with a pv_head_table entry/pointer.
585 *
586 * @note This function does not actually modify the pv_head_table,
587 * it only installs an updated pv_head_table entry in [locked_pvh]
588 * that can later be passed to pvh_unlock() to update the actual array
589 * entry.
590 *
591 * @param locked_pvh A wrapper struct containing the pv_head_table
592 * entry/pointer to update.
593 *
594 */
595 static inline void
pvh_set_flags(locked_pvh_t * locked_pvh,uintptr_t flags)596 pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
597 {
598 locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
599 }
600
601 /**
602 * Update a pv_head_table entry/pointer to be a different type and/or point to
603 * a different object.
604 *
605 * @note This function does not actually modify the pv_head_table,
606 * it only installs an updated pv_head_table entry in [locked_pvh]
607 * that can later be passed to pvh_unlock() to update the actual array
608 * entry.
609 *
610 * @param locked_pvh A wrapper struct containing the pv_head_table
611 * entry/pointer to update.
612 * @param pvep The new entry to use. This could be either a pt_entry_t*,
613 * pv_entry_t*, or pt_desc_t* depending on the type.
614 * @param type The type of the new entry.
615 */
616 static inline void
pvh_update_head(locked_pvh_t * locked_pvh,void * pvep,unsigned int type)617 pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
618 {
619 assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
620 const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
621 locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
622 }
623
624 /**
625 * Given a page table entry pointer retrieved from the pv_head_table (from an
626 * entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
627 * an IOMMU mapping.
628 *
629 * @note The way this function determines whether the passed in pointer is
630 * pointing to an IOMMU PTE, is by checking for a special flag stored in
631 * the lower bits of the pointer. This flag is only set on pointers stored
632 * in the pv_head_table, and as such, this function will only work on
633 * pointers retrieved from the pv_head_table. If a pointer to a PTE was
634 * directly retrieved from an IOMMU's page tables, this function would
635 * always return false despite actually being an IOMMU PTE.
636 *
637 * @param ptep A PTE pointer obtained from the pv_head_table to check.
638 *
639 * @return True if the entry is an IOMMU mapping, false otherwise.
640 */
641 static inline bool
pvh_ptep_is_iommu(const pt_entry_t * ptep)642 pvh_ptep_is_iommu(const pt_entry_t *ptep)
643 {
644 #ifdef PVH_FLAG_IOMMU
645 return (uintptr_t)ptep & PVH_FLAG_IOMMU;
646 #else /* PVH_FLAG_IOMMU */
647 #pragma unused(ptep)
648 return false;
649 #endif /* PVH_FLAG_IOMMU */
650 }
651
652 /**
653 * Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
654 * type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
655 * strips out those flags and returns back a dereferencable pointer.
656 *
657 * @param ptep The PTE pointer to strip out the unwanted flags.
658 *
659 * @return A valid dereferencable pointer to the page table entry.
660 */
661 static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t * ptep)662 pvh_strip_ptep(const pt_entry_t *ptep)
663 {
664 #ifdef PVH_FLAG_IOMMU
665 const uintptr_t pte_va = (uintptr_t)ptep;
666 return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
667 #else /* PVH_FLAG_IOMMU */
668 return ptep;
669 #endif /* PVH_FLAG_IOMMU */
670 }
671
672 /**
673 * PVH_TYPE_PVEP Helper Functions.
674 *
675 * The following are methods used to manipulate PVE lists. This is the type of
676 * pv_head_table entry used when there are multiple mappings to a single
677 * physical page.
678 */
679
680 /**
681 * Whether a physical page is using "alternate accounting" (ALTACCT) for its
682 * ledger statistics is something that needs to be tracked on a per-mapping
683 * basis, not on a per-physical-page basis. Because of that, it's tracked
684 * differently depending on whether there's a single mapping to a page
685 * (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
686 * tracked in the pp_attr_table. But when there are multiple mappings, the least
687 * significant bit of the corresponding "pve_pte" pointer in each pv_entry object
688 * is used as a marker for pages using alternate accounting.
689 *
690 * @note See the definition for PP_ATTR_ALTACCT for a more detailed description
691 * of what "alternate accounting" actually means in respect to the
692 * footprint ledger.
693 *
 * Since some code (KernelDiskImages, e.g.) might map a physical page as
695 * "device" memory (i.e. external) while it's also being used as regular
696 * "anonymous" memory (i.e. internal) in user space, we have to manage the
697 * "internal" attribute per mapping rather than per physical page.
698 * When there are multiple mappings, we use the next least significant bit of
699 * the corresponding "pve_pte" pointer for that.
700 */
701 #define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
702 #define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
703 #define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
704
705 /**
706 * Set the ALTACCT bit for a specific PTE pointer.
707 *
708 * @param pvep A pointer to the current pv_entry mapping in the linked list of
709 * mappings.
710 * @param idx Index of the chosen PTE pointer inside the PVE.
711 */
712 static inline void
pve_set_altacct(pv_entry_t * pvep,unsigned idx)713 pve_set_altacct(pv_entry_t *pvep, unsigned idx)
714 {
715 assert(idx < PTE_PER_PVE);
716 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
717 }
718
719 /**
720 * Set the INTERNAL bit for a specific PTE pointer.
721 *
722 * @param pvep A pointer to the current pv_entry mapping in the linked list of
723 * mappings.
724 * @param idx Index of the chosen PTE pointer inside the PVE.
725 */
726 static inline void
pve_set_internal(pv_entry_t * pvep,unsigned idx)727 pve_set_internal(pv_entry_t *pvep, unsigned idx)
728 {
729 assert(idx < PTE_PER_PVE);
730 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
731 }
732
733 /**
734 * Clear the ALTACCT bit for a specific PTE pointer.
735 *
736 * @param pvep A pointer to the current pv_entry mapping in the linked list of
737 * mappings.
738 * @param idx Index of the chosen PTE pointer inside the PVE.
739 */
740 static inline void
pve_clr_altacct(pv_entry_t * pvep,unsigned idx)741 pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
742 {
743 assert(idx < PTE_PER_PVE);
744 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
745 }
746
747 /**
748 * Clear the INTERNAL bit for a specific PTE pointer.
749 *
750 * @param pvep A pointer to the current pv_entry mapping in the linked list of
751 * mappings.
752 * @param idx Index of the chosen PTE pointer inside the PVE.
753 */
754 static inline void
pve_clr_internal(pv_entry_t * pvep,unsigned idx)755 pve_clr_internal(pv_entry_t *pvep, unsigned idx)
756 {
757 assert(idx < PTE_PER_PVE);
758 pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
759 }
760
761 /**
762 * Return the ALTACCT bit for a specific PTE pointer.
763 *
764 * @param pvep A pointer to the current pv_entry mapping in the linked list of
765 * mappings.
766 * @param idx Index of the chosen PTE pointer inside the PVE.
767 */
768 static inline bool
pve_get_altacct(pv_entry_t * pvep,unsigned idx)769 pve_get_altacct(pv_entry_t *pvep, unsigned idx)
770 {
771 assert(idx < PTE_PER_PVE);
772 return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
773 }
774
775 /**
776 * Return the INTERNAL bit for a specific PTE pointer.
777 *
778 * @param pvep A pointer to the current pv_entry mapping in the linked list of
779 * mappings.
780 * @param idx Index of the chosen PTE pointer inside the PVE.
781 */
782 static inline bool
pve_get_internal(pv_entry_t * pvep,unsigned idx)783 pve_get_internal(pv_entry_t *pvep, unsigned idx)
784 {
785 assert(idx < PTE_PER_PVE);
786 return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
787 }
788
789 /**
790 * Return the next mapping (pv_entry) in a linked list of mappings. This applies
791 * to pv_head_table entries of type PVH_TYPE_PVEP.
792 *
793 * @param pvep A pointer to the current pv_entry mapping in the linked list of
794 * mappings.
795 *
796 * @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
797 * end of the list has been reached.
798 */
799 static inline pv_entry_t *
pve_next(pv_entry_t * pvep)800 pve_next(pv_entry_t *pvep)
801 {
802 return pvep->pve_next;
803 }
804
805 /**
806 * Return a pointer to the pve_next field in a pv_entry. This value is used
807 * when adding and removing entries to a PVE list.
808 *
809 * @param pvep The pv_entry whose pve_next field is being accessed.
810 *
811 * @return Pointer to the pve_next field.
812 */
813 static inline pv_entry_t **
pve_next_ptr(pv_entry_t * pvep)814 pve_next_ptr(pv_entry_t *pvep)
815 {
816 return &pvep->pve_next;
817 }
818
819 /**
820 * Return a pointer to the page table entry for this mapping.
821 *
822 * @param pvep The pv_entry whose pve_ptep field is to be returned.
823 * @param idx Index of the chosen PTE pointer inside the PVE.
824 *
825 * @return Pointer to the page table entry.
826 */
827 static inline pt_entry_t *
pve_get_ptep(pv_entry_t * pvep,unsigned idx)828 pve_get_ptep(pv_entry_t *pvep, unsigned idx)
829 {
830 assert(idx < PTE_PER_PVE);
831 return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
832 }
833
834 /**
835 * Update the page table entry for a specific physical to virtual mapping.
836 *
837 * @param pvep The pv_entry to update.
838 * @param idx Index of the chosen PTE pointer inside the PVE.
839 * @param ptep_new The new page table entry.
840 */
841 static inline void
pve_set_ptep(pv_entry_t * pvep,unsigned idx,pt_entry_t * ptep_new)842 pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
843 {
844 assert(idx < PTE_PER_PVE);
845 pvep->pve_ptep[idx] = ptep_new;
846 }
847
848 /**
849 * Initialize all fields in a PVE to NULL.
850 *
851 * @param pvep The pv_entry to initialize.
852 */
853 static inline void
pve_init(pv_entry_t * pvep)854 pve_init(pv_entry_t *pvep)
855 {
856 pvep->pve_next = PV_ENTRY_NULL;
857 for (int i = 0; i < PTE_PER_PVE; i++) {
858 pvep->pve_ptep[i] = PT_ENTRY_NULL;
859 }
860 }
861
862 /**
863 * Find PTE pointer in PVE and return its index.
864 *
865 * @param pvep The PVE to search.
866 * @param ptep PTE to search for.
867 *
868 * @return Index of the found entry, or -1 if no entry exists.
869 */
870 static inline int
pve_find_ptep_index(pv_entry_t * pvep,pt_entry_t * ptep)871 pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
872 {
873 for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
874 if (pve_get_ptep(pvep, i) == ptep) {
875 return (int)i;
876 }
877 }
878
879 return -1;
880 }
881
882 /**
883 * Checks if no PTEs are currently associated with this PVE.
884 *
885 * @param pvep The PVE to search.
886 *
887 * @return True if no PTEs are currently associated with this PVE, or false.
888 */
889 static inline bool
pve_is_empty(pv_entry_t * pvep)890 pve_is_empty(pv_entry_t *pvep)
891 {
892 for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
893 if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
894 return false;
895 }
896 }
897
898 return true;
899 }
900
901 /**
902 * Prepend a new pv_entry node to a PVE list.
903 *
904 * @note This function does not actually modify the pv_head_table,
905 * it only installs an updated pv_head_table entry in [locked_pvh]
906 * that can later be passed to pvh_unlock() to update the actual array
907 * entry.
908 *
909 * @param locked_pvh A wrapper struct containing the pv_head_table
910 * entry/pointer to update. This entry represents
911 * the linked list of mappings to update.
912 * @param pvep The new mapping to add to the linked list.
913 */
914 static inline void
pve_add(locked_pvh_t * locked_pvh,pv_entry_t * pvep)915 pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
916 {
917 assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
918
919 pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
920 pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
921 }
922
923 /**
924 * Remove an entry from a PVE list of mappings.
925 *
926 * @note This function does not actually modify the pv_head_table,
927 * it only installs an updated pv_head_table entry in [locked_pvh]
928 * that can later be passed to pvh_unlock() to update the actual array
929 * entry.
930 *
931 * @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
932 * to update. This entry represents the linked list of mappings
933 * from which to remove an entry.
934 * @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
935 * is the first in the linked list of mappings, then NULL should be
936 * passed here and the removal will be reflected in the returned
937 * pv_head_table entry.
938 * @param pvep The entry that should be removed. Should be identical to a
939 * dereference of the pvepp parameter (unless it's the pv_head_table
940 * entry).
941 */
942 static inline void
pve_remove(locked_pvh_t * locked_pvh,pv_entry_t ** pvepp,pv_entry_t * pvep)943 pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
944 {
945 assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
946
947 if (pvepp == NULL) {
948 assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
949 __func__, (void*)locked_pvh->pvh, pvep);
950 if (pve_next(pvep) == PV_ENTRY_NULL) {
951 /* The last mapping to this page is being removed. */
952 pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
953 } else {
954 /**
955 * There are still mappings left, make the next one the new head of
956 * the list. This effectively removes the first entry from the list.
957 */
958 pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
959 }
960 } else {
961 /**
962 * Move the previous entry's next field to the entry after the one being
963 * removed. This will clobber the ALTACCT and INTERNAL bits.
964 */
965 *pvepp = pve_next(pvep);
966 }
967 }
968
969 /**
970 * PVH_TYPE_PTDP Types and Helper Functions.
971 *
972 * The following are types and methods used to manipulate page table descriptor
973 * (PTD) objects. This is the type of pv_head_table entry used when a page is
974 * being used as a page table.
975 */
976
977 /**
978 * Page table descriptor (PTD) info structure.
979 *
980 * Contains information about a page table. These pieces of data are separate
981 * from the PTD itself because in address spaces where the VM page size doesn't
982 * match the underlying hardware page size, one PTD could represent multiple
983 * page tables (and so will need multiple PTD info structures).
984 *
985 * These fields are also in their own struct so that they can be allocated
986 * separately from the associated pt_desc_t object. This allows us to allocate
987 * the counts in this structure in a way that ensures they don't fall within the
988 * same cache line as the main pt_desc_t object. This is important because the
989 * fields in this structure are atomically updated which could cause false
990 * sharing cache performance issues with the "va" field in pt_desc_t if all of
991 * the fields were within the same structure.
992 */
993 typedef struct {
994 /**
995 * For non-leaf pagetables, should be 0.
996 * For leaf pagetables, should reflect the number of wired entries.
997 * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
998 * operations are implicitly wired).
999 */
1000 unsigned short wiredcnt;
1001 } ptd_info_t;
1002
1003 /**
1004 * This type is used to identify a specific IOMMU driver and an instance of
1005 * that driver which owns a specific page or page table. This type will be used
1006 * within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
1007 * respectively.
1008 *
1009 * Despite the fact this value is not a pointer, we need to make this value sort
1010 * of look like a kernel pointer: the bottom 3-bits must be zero and the upper
1011 * bits must all be ones by default. This is due to the fact that this type can
1012 * be embedded into the PVH table to represent an IOMMU mapping. The PVH table
1013 * code expects "kernel-pointer-like" properties so it can store flags in those
1014 * areas of the 64-bit value.
1015 */
1016 typedef uint64_t iommu_instance_t;
1017
1018 /* 8-bit ID of the IOMMU driver which the instance derives from. */
1019 #define IOMMU_ID_SHIFT 8U
1020 #define IOMMU_ID_MASK 0x000000000000FF00ULL
1021
1022 #define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
1023 #define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)
1024
1025 /**
1026 * An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
1027 * This is strictly used to help with debugging and provides a mechanism to
1028 * trace a mapping or page table back to the exact IOMMU instance that owns it.
1029 * Typically, this would be the instance ID, but for drivers that use only a
1030 * single global instance, this could be something else like a root page table
1031 * ppnum_t.
1032 */
1033 #define IOMMU_TOKEN_SHIFT 16U
1034 #define IOMMU_TOKEN_MASK 0x0000FFFFFFFF0000ULL
1035
1036 #define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
1037 #define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)
1038
1039 /**
1040 * The default value for iommu_instance_t. See the type definition for more
1041 * details on why the upper bits need to initially be all ones.
1042 */
1043 #define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL
1044
1045 /**
1046 * Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
1047 * instance sets the ID and token to all ones as a sentinel invalid value.
1048 */
1049 #define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
1050
1051 /**
1052 * Page Table Descriptor (PTD).
1053 *
1054 * Provides a per-table data structure and a way of keeping track of all page
1055 * tables in the system.
1056 *
1057 * This structure is also used as a convenient way of keeping track of IOMMU
1058 * pages (which may or may not be used as page tables). In that case the SPTM
1059 * frame type for the page will be XNU_IOMMU, the "iommu" field will describe
1060 * the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
1061 * refcnt controlled by the IOMMU driver.
1062 */
1063 typedef struct pt_desc {
1064 /* Each page table is either owned by a pmap or a specific IOMMU. */
1065 union {
1066 struct pmap *pmap;
1067 };
1068
1069 /**
1070 * The following fields contain per-page-table properties, and as such,
1071 * might have multiple elements each. This is due to a single PTD
1072 * potentially representing multiple page tables (in address spaces where
1073 * the VM page size differs from the hardware page size). Use the
1074 * ptd_get_index() function to get the correct index for a specific page
1075 * table.
1076 */
1077
1078 /**
1079 * The first address of the virtual address space this page table is
1080 * translating for, or a value set by an IOMMU driver if this PTD is being
1081 * used to track an IOMMU page.
1082 */
1083 vm_offset_t va;
1084
1085 /**
1086 * ptd_info_t's are allocated separately so as to reduce false sharing
1087 * with the va field. This is desirable because ptd_info_t's are updated
1088 * atomically from all CPUs.
1089 */
1090 ptd_info_t *ptd_info;
1091 } pt_desc_t;
1092
1093 /**
1094 * Per-CPU structure for tracking in-flight SPTM retype operations.
1095 *
1096 * This structure is intended to be embedded in the pmap per-CPU data object,
1097 * and is meant to be used for situations in which the caller needs to ensure
1098 * that potentially sensitive concurrent SPTM operations have completed on other
1099 * CPUs prior to an operation (such as a retype) that requires page or mapping
1100 * state to be stable. When draining these concurrent operations, the caller
1101 * is also expected to have already taken steps to ensure the page/mapping
1102 * state requirements will be visible to any concurrent pmap operation initiated
1103 * after the drain operation is begun, so that only previously-initiated
1104 * operations will need to be purged.
1105 */
1106 typedef struct {
1107 /**
1108 * Critical section sequence number of the local CPU. A value of zero
1109 * indicates that no pmap epoch critical section is currently active on
1110 * the CPU.
1111 */
1112 uint64_t local_seq;
1113
1114 /**
1115 * The sequence number to use the next time a pmap epoch critical section
1116 * is entered on the local CPU. This should monotonically increase.
1117 */
1118 uint64_t next_seq;
1119
1120 /**
1121 * This array stores the retype sequence numbers observed on remote CPUs.
1122 * When the local CPU needs to wait for critical sections to complete on
1123 * other CPUs, this is intended to provide an initial sample of those other
1124 * CPUs' critical section state. The caller can then wait for each remote
1125 * CPU's sequence number to return to zero or advance beyond the value
1126 * stored in its entry in this array.
1127 */
1128 uint64_t remote_seq[MAX_CPUS];
1129
1130 /**
1131 * Flags used to track the state of an active pmap epoch drain operation
1132 * on the local CPU.
1133 */
1134
1135 /**
1136 * This flag indicates that a drain operation has been prepared on the
1137 * local CPU by sampling remote CPU epoch states into the remote_seq array.
1138 * This must be set before the drain operation can be performed.
1139 */
1140 #define PMAP_EPOCH_PREPARED (1 << 0)
1141
1142 /**
1143 * This flag indicates that one or more remote CPUs had a non-zero retype
1144 * epoch value when the remote_seq array was most recently sampled.
1145 * If this flag is not set, then we already know that no remote CPUs can
1146 * be in a critical section in which prior mapping state for the page to
1147 * be retyped may have been observed, so we can skip the drain operation.
1148 */
1149 #define PMAP_EPOCH_DRAIN_REQUIRED (1 << 1)
1150 uint8_t flags;
1151 } pmap_epoch_t;
1152
/* Alignment (in bytes) of the per-CPU SPTM data; must evenly divide PAGE_SIZE. */
#define PMAP_SPTM_PCPU_ALIGN (8192)

/* Per-CPU pmap/SPTM working state; one instance per CPU (see PERCPU_DECL below). */
typedef struct {
	/**
	 * Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
	 * during bootstrap.
	 */
	sptm_pte_t *sptm_prev_ptes;

	/**
	 * A piece of per-cpu scratch memory used by IOMMU drivers when passing data
	 * into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
	 */
	void *sptm_iommu_scratch;

	/* Accumulator for batched user pointer SPTM ops, to avoid excessive stack usage. */
	sptm_user_pointer_op_t sptm_user_pointer_ops[SPTM_MAPPING_LIMIT];

	/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
	sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];

	/* The two members below are never used simultaneously, so they share storage. */
	union {
		/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
		sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];

		/* Accumulator for PA arrays to be passed to the SPTM, to avoid excessive stack usage. */
		sptm_paddr_t sptm_paddrs[SPTM_MAPPING_LIMIT];
	};

	/* Base PA of user pointer ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_user_pointer_ops_pa;

	/* Base PA of ops array, for passing the ops into the SPTM. */
	pmap_paddr_t sptm_ops_pa;

	/* Base PA of templates array, for passing templates into the SPTM. */
	pmap_paddr_t sptm_templates_pa;

	/* Base PA of physical address array, for passing physical address lists into the SPTM. */
	pmap_paddr_t sptm_paddrs_pa;

	/* PMAP pagetable descriptors associated with each element of sptm_ops. */
	pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];

	/* PTD info objects associated with each pmap PTE pointer. */
	ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];

	/* Accounting-related flags for each element of sptm_ops. */
	#define PMAP_SPTM_FLAG_INTERNAL (0x1)
	#define PMAP_SPTM_FLAG_ALTACCT (0x2)
	uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];

	/* pmap epoch tracking structure. */
	pmap_epoch_t pmap_epoch;

	/* Guest virtual machine dispatch structure. */
	sptm_guest_dispatch_t sptm_guest_dispatch;

	/* Guest virtual machine dispatch structure physical address. */
	pmap_paddr_t sptm_guest_dispatch_paddr;

	/* SPTM Logical CPU ID */
	uint16_t sptm_cpu_id;

	/* Read index associated with this CPU's SPTM trace buffer */
	uint64_t sptm_trace_buffer_read_index;

	/* Previous SPTM state for use with sptm_trace_num_new_traces */
	uint64_t sptm_trace_prev_state;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;

_Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
    "SPTM per-CPU data alignment does not fit evenly within a page");
_Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
    "sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");

PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
1230
1231 /**
1232 * Convert a pv_head_table entry/pointer into a page table descriptor pointer.
1233 * This should only be done if the type of this entry is PVH_TYPE_PTDP.
1234 *
1235 * @param pvh The pv_head_table entry/pointer to convert into a safe to
1236 * dereference pt_desc_t*.
1237 *
1238 * @return Return back a safe to derefence pointer to the page table descriptor
1239 * for this physical page by masking off the TYPE bits and adding any
1240 * missing flags to the upper portion of the pointer.
1241 */
1242 static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)1243 pvh_ptd(uintptr_t pvh)
1244 {
1245 return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
1246 }
1247
1248 /**
1249 * Given an arbitrary page table entry, return back the page table descriptor
1250 * (PTD) object for the page table that contains that entry.
1251 *
1252 * @param ptep Pointer to a PTE whose page table descriptor object to return.
1253 *
1254 * @return The PTD object for the passed in page table.
1255 */
1256 static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t * ptep)1257 ptep_get_ptd(const pt_entry_t *ptep)
1258 {
1259 assert(ptep != NULL);
1260
1261 const vm_offset_t pt_base_va = (vm_offset_t)ptep;
1262 uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
1263
1264 if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1265 panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
1266 }
1267
1268 return pvh_ptd(pvh);
1269 }
1270
1271 /**
1272 * Given an arbitrary page table entry, return back the pmap that owns that
1273 * page table.
1274 *
1275 * @note This won't work correctly for page tables owned by IOMMUs, because
1276 * those table aren't owned by any specific pmap.
1277 *
1278 * @param ptep Pointer to a page table entry whose owner we're trying to return.
1279 *
1280 * @return The pmap that owns the given page table entry.
1281 */
1282 static inline struct pmap *
ptep_get_pmap(const pt_entry_t * ptep)1283 ptep_get_pmap(const pt_entry_t *ptep)
1284 {
1285 return ptep_get_ptd(ptep)->pmap;
1286 }
1287
1288
1289 /**
1290 * Given an arbitrary translation table entry, get the page table descriptor
1291 * (PTD) object for the page table pointed to by the TTE.
1292 *
1293 * @param tte The translation table entry to parse. For instance, if this is an
1294 * L2 TTE, then the PTD for the L3 table this entry points to will be
1295 * returned.
1296 *
1297 * @return The page table descriptor (PTD) for the page table pointed to by this
1298 * TTE.
1299 */
1300 static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)1301 tte_get_ptd(const tt_entry_t tte)
1302 {
1303 const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
1304 uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
1305
1306 if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
1307 panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
1308 }
1309
1310 return pvh_ptd(pvh);
1311 }
1312
1313 /**
1314 * This function returns the ptd_info_t structure associated with a given
1315 * page table descriptor.
1316 *
1317 * @param ptd The page table descriptor that's being accessed.
1318 *
1319 * @return ptd_info_t structure associated with [ptd].
1320 */
1321 static inline ptd_info_t *
ptd_get_info(pt_desc_t * ptd)1322 ptd_get_info(pt_desc_t *ptd)
1323 {
1324 assert(ptd != NULL);
1325 return ptd->ptd_info;
1326 }
1327
1328 /**
1329 * Given a pointer to a page table entry, return back the ptd_info structure
1330 * for the page table that contains that entry.
1331 *
1332 * @param ptep Pointer to a PTE whose ptd_info object to return.
1333 *
1334 * @return The ptd_info object for the page table that contains the passed in
1335 * page table entry.
1336 */
1337 static inline ptd_info_t *
ptep_get_info(const pt_entry_t * ptep)1338 ptep_get_info(const pt_entry_t *ptep)
1339 {
1340 return ptd_get_info(ptep_get_ptd(ptep));
1341 }
1342
1343 /**
1344 * Return the virtual address mapped by the passed in leaf page table entry,
1345 * using an already-retrieved pagetable descriptor.
1346 *
1347 * @param ptdp pointer to the descriptor for the pagetable containing ptep
1348 * @param ptep Pointer to a PTE to parse
1349 */
1350 static inline vm_map_address_t
ptd_get_va(const pt_desc_t * ptdp,const pt_entry_t * ptep)1351 ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
1352 {
1353 const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
1354
1355 vm_map_address_t va = ptdp->va;
1356
1357 const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptdp->pmap));
1358 const vm_offset_t ptep_page = (vm_offset_t)ptep >> pmap_page_shift;
1359
1360 /**
1361 * Use the difference between the VM page shift and the hardware page shift
1362 * to get the index of the correct page table. In practice, this equates to
1363 * masking out the bottom two bits of the L3 table index in address spaces
1364 * where the VM page size is greater than the hardware page size. In address
1365 * spaces where they're identical, the index will always be zero.
1366 */
1367 const unsigned int ttep_index = ptep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
1368 va += ttep_index * pt_attr_twig_size(pt_attr);
1369
1370 /* Increment VA now to target the VA space covered by this specific PTE */
1371 const vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
1372 va += (ptep_index << pt_attr_leaf_shift(pt_attr));
1373
1374 return va;
1375 }
1376
1377 /**
1378 * Return the virtual address that is being mapped by the passed in leaf page
1379 * table entry.
1380 *
1381 * @param ptep Pointer to a PTE to parse.
1382 */
1383 static inline vm_map_address_t
ptep_get_va(const pt_entry_t * ptep)1384 ptep_get_va(const pt_entry_t *ptep)
1385 {
1386 return ptd_get_va(ptep_get_ptd(ptep), ptep);
1387 }
1388
1389 /**
1390 * Physical Page Attribute Table (pp_attr_table) defines and helper functions.
1391 */
1392
1393 /* How many bits to use for flags on a per-VM-page basis. */
1394 typedef uint16_t pp_attr_t;
1395
1396 /* See the definition of pp_attr_table for more information. */
1397 extern volatile pp_attr_t* pp_attr_table;
1398
1399 /**
1400 * Flags stored in the pp_attr_table on a per-physical-page basis.
1401 *
1402 * Please update the pv_walk LLDB macro if these flags are changed or added to.
1403 */
1404
1405 /**
1406 * The bottom 6-bits are used to store the default WIMG (cacheability and memory
1407 * type) setting for this physical page. This can be changed by calling
1408 * pmap_set_cache_attributes().
1409 *
1410 * If a default WIMG setting isn't set for a page, then the default is Normal,
1411 * Cached memory (VM_WIMG_DEFAULT).
1412 */
1413 #define PP_ATTR_WIMG_MASK 0x003F
1414 #define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)
1415
1416 /**
1417 * The reference and modify bits keep track of whether a page has been accessed
1418 * or modified since the last time the bits were cleared. These bits are used to
1419 * enforce policy decisions in the VM layer.
1420 */
1421 #define PP_ATTR_REFERENCED 0x0040
1422 #define PP_ATTR_MODIFIED 0x0080
1423
1424 /**
1425 * This physical page is being used as anonymous memory that's internally
1426 * managed by the VM and is not connected to an external pager. This flag is
1427 * only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
1428 * subsequent mappings won't set/clear this flag until all mappings are removed
1429 * and a new CPU mapping is added.
1430 */
1431 #define PP_ATTR_INTERNAL 0x0100
1432
1433 /**
1434 * This flag is used to keep track of pages that are still resident but are not
1435 * considered dirty and can be reclaimed under memory pressure. These pages do
1436 * not count as a part of the memory footprint, so the footprint ledger does not
1437 * need to be updated for these pages. This is hinted to the VM by the
1438 * `madvise(MADV_FREE_REUSABLE)` system call.
1439 */
1440 #define PP_ATTR_REUSABLE 0x0200
1441
1442 /**
1443 * This flag denotes that a page is utilizing "alternate accounting". This means
1444 * that the pmap doesn't need to keep track of these pages with regards to the
1445 * footprint ledger because the VM is already accounting for them in a different
1446 * way. These include IOKit mappings (VM adds their entire virtual size to the
1447 * footprint), and purgeable pages (VM counts them only when non-volatile and
1448 * only for one "owner"), among others.
1449 *
1450 * Note that alternate accounting status is tracked on a per-mapping basis (not
1451 * per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
1452 * when there's a single mapping to a page. When there are multiple mappings,
1453 * the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
1454 * above).
1455 */
1456 #define PP_ATTR_ALTACCT 0x0400
1457
1458 /**
1459 * This bit was originally used on x86 to keep track of what pages to not
1460 * encrypt during the hibernation process as a performance optimization when
1461 * encryption was done in software. This doesn't apply to the ARM
1462 * hibernation process because all pages are automatically encrypted using
1463 * hardware acceleration. Despite that, the pmap still keeps track of this flag
1464 * as a debugging aid on internal builds.
1465 *
1466 * TODO: This bit can probably be reclaimed:
1467 * rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
1468 */
1469 #define PP_ATTR_NOENCRYPT 0x0800
1470
1471 /**
1472 * These bits denote that a physical page is expecting the next access or
1473 * modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
1474 * respectively.
1475 */
1476 #define PP_ATTR_REFFAULT 0x1000
1477 #define PP_ATTR_MODFAULT 0x2000
1478
1479 /**
1480 * Atomically set some flags in a pp_attr_table entry.
1481 *
1482 * @param pai The physical address index for the entry to update.
1483 * @param bits The flags to set in the entry.
1484 */
1485 static inline void
ppattr_set_bits(unsigned int pai,pp_attr_t bits)1486 ppattr_set_bits(unsigned int pai, pp_attr_t bits)
1487 {
1488 volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1489 os_atomic_or(ppattr, bits, relaxed);
1490 }
1491
1492 /**
1493 * Atomically clear some flags in a pp_attr_table entry.
1494 *
1495 * @param pai The physical address index for the entry to update.
1496 * @param bits The flags to clear in the entry.
1497 */
1498 static inline void
ppattr_clear_bits(unsigned int pai,pp_attr_t bits)1499 ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
1500 {
1501 volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1502 os_atomic_andnot(ppattr, bits, relaxed);
1503 }
1504
1505 /**
1506 * General-purpose function for atomically modifying flags in a pp_attr_table entry.
1507 *
1508 * @param pai The physical address index for the entry to update.
1509 * @param bits_to_clear Mask of bits to atomically clear from the entry.
1510 * @param bits_to_set Mask of bits to atomically set in the entry.
1511 *
1512 * @note [bits_to_clear] and [bits_to_set] must not overlap.
1513 */
1514 static inline void
ppattr_modify_bits(unsigned int pai,pp_attr_t bits_to_clear,pp_attr_t bits_to_set)1515 ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
1516 {
1517 assert((bits_to_set & bits_to_clear) == 0);
1518 pp_attr_t prev_ppattr, new_ppattr;
1519 os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
1520 new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
1521 });
1522 }
1523
1524 /**
1525 * Return true if the pp_attr_table entry contains the passed in bits.
1526 *
1527 * @param pai The physical address index for the entry to test.
1528 * @param bits The flags to check for.
1529 */
1530 static inline bool
ppattr_test_bits(unsigned int pai,pp_attr_t bits)1531 ppattr_test_bits(unsigned int pai, pp_attr_t bits)
1532 {
1533 const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
1534 return (*ppattr & bits) == bits;
1535 }
1536
1537 /**
1538 * Only set some flags in a pp_attr_table entry if the passed in physical
1539 * address is a kernel-managed address.
1540 *
1541 * @param pa The physical address for the entry to update.
1542 * @param bits The flags to set in the entry.
1543 */
1544 static inline void
ppattr_pa_set_bits(pmap_paddr_t pa,pp_attr_t bits)1545 ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
1546 {
1547 if (pa_valid(pa)) {
1548 ppattr_set_bits(pa_index(pa), bits);
1549 }
1550 }
1551
1552 /**
1553 * Only clear some flags in a pp_attr_table entry if the passed in physical
1554 * address is a kernel-managed address.
1555 *
1556 * @param pa The physical address for the entry to update.
1557 * @param bits The flags to clear in the entry.
1558 */
1559 static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa,pp_attr_t bits)1560 ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
1561 {
1562 if (pa_valid(pa)) {
1563 ppattr_clear_bits(pa_index(pa), bits);
1564 }
1565 }
1566
1567 /**
1568 * Only test flags in a pp_attr_table entry if the passed in physical address
1569 * is a kernel-managed page.
1570 *
1571 * @param pa The physical address for the entry to test.
1572 * @param bits The flags to check for.
1573 *
1574 * @return False if the PA isn't a kernel-managed page, otherwise true/false
1575 * depending on whether the bits are set.
1576 */
1577 static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa,pp_attr_t bits)1578 ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
1579 {
1580 return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
1581 }
1582
1583 /**
1584 * Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
1585 * in physical address is a kernel-managed page.
1586 *
1587 * @param pa The physical address for the entry to update.
1588 */
1589 static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)1590 ppattr_pa_set_modify(pmap_paddr_t pa)
1591 {
1592 ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
1593 }
1594
1595 /**
1596 * Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
1597 * passed in physical address is a kernel-managed page.
1598 *
1599 * @param pa The physical address for the entry to update.
1600 */
1601 static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)1602 ppattr_pa_clear_modify(pmap_paddr_t pa)
1603 {
1604 ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
1605 }
1606
1607 /**
1608 * Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1609 * passed in physical address is a kernel-managed page.
1610 *
1611 * @param pa The physical address for the entry to update.
1612 */
1613 static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)1614 ppattr_pa_set_reference(pmap_paddr_t pa)
1615 {
1616 ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
1617 }
1618
1619 /**
1620 * Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
1621 * passed in physical address is a kernel-managed page.
1622 *
1623 * @param pa The physical address for the entry to update.
1624 */
1625 static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)1626 ppattr_pa_clear_reference(pmap_paddr_t pa)
1627 {
1628 ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
1629 }
1630
1631 /**
1632 * Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1633 *
1634 * @param pai The physical address index for the entry to update.
1635 */
1636 static inline void
ppattr_set_internal(unsigned int pai)1637 ppattr_set_internal(unsigned int pai)
1638 {
1639 ppattr_set_bits(pai, PP_ATTR_INTERNAL);
1640 }
1641
1642 /**
1643 * Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
1644 *
1645 * @param pai The physical address index for the entry to update.
1646 */
1647 static inline void
ppattr_clear_internal(unsigned int pai)1648 ppattr_clear_internal(unsigned int pai)
1649 {
1650 ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
1651 }
1652
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the PP_ATTR_INTERNAL flag is set on the entry, false otherwise.
 */
static inline bool
ppattr_test_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1663
/**
 * Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_set_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reusable(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REUSABLE);
}
1674
/**
 * Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_clear_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reusable(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
}
1685
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the PP_ATTR_REUSABLE flag is set on the entry, false otherwise.
 */
static inline bool
ppattr_test_reusable(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
}
1696
/**
 * Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 * PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Thin wrapper around ppattr_set_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_altacct(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_ALTACCT);
}
1711
/**
 * Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 * PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Thin wrapper around ppattr_clear_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_altacct(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
}
1726
/**
 * Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the ALTACCT flag is being tracked using the
 * pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
 * PP_ATTR_ALTACCT definitions for more information.
 *
 * @note Only consults the pp_attr_table; use ppattr_pve_is_altacct() when the
 * page may have multiple mappings tracked through a PV entry.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page uses alternate accounting, false
 *         otherwise.
 */
static inline bool
ppattr_is_altacct(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
}
1744
/**
 * Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
 *
 * @note This is only valid when the INTERNAL flag is being tracked using the
 * pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
 * PP_ATTR_INTERNAL definitions for more information.
 *
 * @note Only consults the pp_attr_table; use ppattr_pve_is_internal() when the
 * page may have multiple mappings tracked through a PV entry.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the passed in page is accounted for as "internal", false
 *         otherwise.
 */
static inline bool
ppattr_is_internal(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
1762
1763 /**
1764 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1765 * depending on whether there are one or multiple mappings to a page. This
1766 * function abstracts out the difference between single and multiple mappings to
1767 * a page and provides a single function for determining whether alternate
1768 * accounting is set for a mapping.
1769 *
1770 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1771 * definitions for more information.
1772 *
1773 * @param pai The physical address index for the entry to test.
1774 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1775 * @param idx Index of the chosen PTE pointer inside the PVE.
1776 *
1777 * @return True if the passed in page uses alternate accounting, false
1778 * otherwise.
1779 */
1780 static inline bool
ppattr_pve_is_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1781 ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1782 {
1783 return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
1784 }
1785
1786 /**
1787 * The "internal" (INTERNAL) status for a page is tracked differently
1788 * depending on whether there are one or multiple mappings to a page. This
1789 * function abstracts out the difference between single and multiple mappings to
1790 * a page and provides a single function for determining whether "internal"
1791 * is set for a mapping.
1792 *
1793 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1794 * definitions for more information.
1795 *
1796 * @param pai The physical address index for the entry to test.
1797 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1798 * @param idx Index of the chosen PTE pointer inside the PVE.
1799 *
1800 * @return True if the passed in page is "internal", false otherwise.
1801 */
1802 static inline bool
ppattr_pve_is_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1803 ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1804 {
1805 return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
1806 }
1807
1808 /**
1809 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1810 * depending on whether there are one or multiple mappings to a page. This
1811 * function abstracts out the difference between single and multiple mappings to
1812 * a page and provides a single function for setting the alternate accounting status
1813 * for a mapping.
1814 *
1815 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1816 * definitions for more information.
1817 *
1818 * @param pai The physical address index for the entry to update.
1819 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1820 * @param idx Index of the chosen PTE pointer inside the PVE.
1821 */
1822 static inline void
ppattr_pve_set_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1823 ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1824 {
1825 if (pvep == PV_ENTRY_NULL) {
1826 ppattr_set_altacct(pai);
1827 } else {
1828 pve_set_altacct(pvep, idx);
1829 }
1830 }
1831
1832 /**
1833 * The "internal" (INTERNAL) status for a page is tracked differently
1834 * depending on whether there are one or multiple mappings to a page. This
1835 * function abstracts out the difference between single and multiple mappings to
1836 * a page and provides a single function for setting the "internal" status
1837 * for a mapping.
1838 *
1839 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1840 * definitions for more information.
1841 *
1842 * @param pai The physical address index for the entry to update.
1843 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1844 * @param idx Index of the chosen PTE pointer inside the PVE.
1845 */
1846 static inline void
ppattr_pve_set_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1847 ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1848 {
1849 if (pvep == PV_ENTRY_NULL) {
1850 ppattr_set_internal(pai);
1851 } else {
1852 pve_set_internal(pvep, idx);
1853 }
1854 }
1855
1856 /**
1857 * The "alternate accounting" (ALTACCT) status for a page is tracked differently
1858 * depending on whether there are one or multiple mappings to a page. This
1859 * function abstracts out the difference between single and multiple mappings to
1860 * a page and provides a single function for clearing the alternate accounting status
1861 * for a mapping.
1862 *
1863 * @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
1864 * definitions for more information.
1865 *
1866 * @param pai The physical address index for the entry to update.
1867 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1868 * @param idx Index of the chosen PTE pointer inside the PVE.
1869 */
1870 static inline void
ppattr_pve_clr_altacct(unsigned int pai,pv_entry_t * pvep,unsigned idx)1871 ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1872 {
1873 if (pvep == PV_ENTRY_NULL) {
1874 ppattr_clear_altacct(pai);
1875 } else {
1876 pve_clr_altacct(pvep, idx);
1877 }
1878 }
1879
1880 /**
1881 * The "internal" (INTERNAL) status for a page is tracked differently
1882 * depending on whether there are one or multiple mappings to a page. This
1883 * function abstracts out the difference between single and multiple mappings to
1884 * a page and provides a single function for clearing the "internal" status
1885 * for a mapping.
1886 *
1887 * @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
1888 * definitions for more information.
1889 *
1890 * @param pai The physical address index for the entry to update.
1891 * @param pvep Pointer to the pv_entry_t object containing that mapping.
1892 * @param idx Index of the chosen PTE pointer inside the PVE.
1893 */
1894 static inline void
ppattr_pve_clr_internal(unsigned int pai,pv_entry_t * pvep,unsigned idx)1895 ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
1896 {
1897 if (pvep == PV_ENTRY_NULL) {
1898 ppattr_clear_internal(pai);
1899 } else {
1900 pve_clr_internal(pvep, idx);
1901 }
1902 }
1903
/**
 * Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_set_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_reffault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_REFFAULT);
}
1914
/**
 * Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_clear_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_reffault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
}
1925
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the PP_ATTR_REFFAULT flag is set on the entry, false otherwise.
 */
static inline bool
ppattr_test_reffault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
}
1936
/**
 * Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_set_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_set_modfault(unsigned int pai)
{
	ppattr_set_bits(pai, PP_ATTR_MODFAULT);
}
1947
/**
 * Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
 *
 * @note Thin wrapper around ppattr_clear_bits().
 *
 * @param pai The physical address index for the entry to update.
 */
static inline void
ppattr_clear_modfault(unsigned int pai)
{
	ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
}
1958
/**
 * Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
 *
 * @param pai The physical address index for the entry to test.
 *
 * @return True if the PP_ATTR_MODFAULT flag is set on the entry, false otherwise.
 */
static inline bool
ppattr_test_modfault(unsigned int pai)
{
	return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
}
1969
1970 /**
1971 * pmap epoch operations:
1972 *
1973 * The pmap epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
1974 * can ensure all CPUs have observed updated mapping state before performing an operation
1975 * such as a retype which requires that no other operations be in-flight against the
1976 * prior mapping state.
1977 *
1978 * There are certain cases in which the pmap, while issuing an SPTM call that modifies
1979 * mappings, cannot hold locks such as the PVH lock which would prevent the mapped page
1980 * from being concurrently retyped. This is particularly true for batched operations
1981 * such as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
1982 * In these cases, the pmap may call pmap_epoch_enter() to note that it is performing such
1983 * a sensitive operation on the local CPU. It must then call pmap_epoch_exit() upon
1984 * completion of the sensitive operation. While retyping is the most common case that
1985 * requires epoch synchronization, there are a few other cases as well, such as marking
1986 * a leaf page table as unnested so that all subsequent mappings in it will be non-global.
1987 *
1988 * For any instance in which the pmap needs to retype a page (or otherwise alter mapping
1989 * policy) without being guaranteed (e.g. by VM layer locking or the existing page type)
1990 * that such a sensitive operation is not in progress on some other CPU, it must drain these
1991 * sensitive operations from other CPUs. Specifically, it must ensure that any
1992 * sensitive operation which may have observed mapping state under the prior mapping policy
1993 * has completed. This is accomplished by first calling pmap_epoch_prepare_drain() to
1994 * record the initial pmap epoch state of all CPUs, followed by pmap_epoch_drain() to ensure
1995 * all remote CPUs are either not in an epoch or have advanced beyond the initially recorded
1996 * epoch. These are exposed as two separate functions in order to allow the calling CPU to
1997 * do other work between calling pmap_epoch_prepare_drain() and pmap_epoch_drain(), as a
1998 * best-effort attempt to minimize time wasted spinning in pmap_epoch_drain().
1999 *
2000 * When draining the epoch, the following assumptions must hold true:
2001 *
2002 * 1) The calling thread must guarantee that prior updates needed to apply the new mapping
2003 * policy have already been performed and made globally visible using the appropriate
2004 * barriers. In the most common (retype) case, this means all existing mappings of the
2005 * page must have been removed. For any alterations of mapping state, global visibility is
2006 * conveniently already guaranteed by the DSBs that are architecturally required to
2007 * synchronize PTE updates and the TLBIs that follow them.
2008 *
2009 * 2) For operations that require exclusive in-flight page references such as retyping,
2010 * the calling thread must have some means of ensuring that new mappings cannot be added
2011 * for the page that would bring it out of the correct state for the operation, or that
2012 * would cause an SPTM violation due to a shared/exclusive in-flight reference conflict.
2013 * For retyping this is typically done by holding the PVH lock such that pmap_enter()
2014 * cannot concurrently execute against the page.
2015 *
2016 * 3) The calling thread must not perform any operation which requires preemptibility
2017 * between calling pmap_epoch_prepare_drain() and pmap_epoch_drain().
2018 */
2019
/**
 * Enter the pmap epoch on the local CPU to indicate an in-progress SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function increments the thread's preemption disable count and returns
 * with preemption disabled.
 *
 * @note This function issues all required barriers to ensure correct ordering of
 * the epoch update relative to ensuing SPTM accesses.
 */
static inline void
pmap_epoch_enter(void)
{
	/* Disable preemption first so we cannot migrate off this CPU mid-epoch. */
	mp_disable_preemption();
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	assert(!preemption_enabled());

	/* Must not already be in a pmap epoch on this CPU. */
	assert(pmap_epoch->local_seq == 0);
	/* Publish a fresh, non-zero sequence number as this CPU's active epoch. */
	pmap_epoch->local_seq = ++pmap_epoch->next_seq;
	/* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
	assert(pmap_epoch->local_seq != 0);

	/**
	 * Issue a store-load barrier to ensure that remote observers of any ensuing
	 * SPTM accesses will also observe the epoch update.
	 */
	os_atomic_thread_fence(seq_cst);
}
2049
/**
 * Exit the pmap epoch on the local CPU to indicate completion of an SPTM operation
 * that may be sensitive to a concurrent retype operation on another CPU.
 *
 * @note This function must be called with preemption disabled and will decrement
 * the current thread's preemption disable count.
 */
static inline void
pmap_epoch_exit(void)
{
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	assert(!preemption_enabled());
	/* We must be exiting the epoch most recently entered on this CPU. */
	assert(pmap_epoch->local_seq == pmap_epoch->next_seq);

	/**
	 * Clear the sequence using a store-release operation to ensure that prior
	 * SPTM modifications will be visible to remote observers before the absence
	 * of an epoch is visible.
	 */
	os_atomic_store(&pmap_epoch->local_seq, 0, release);
	mp_enable_preemption();
}
2072
2073 /**
2074 * Helper for determining whether the current CPU is within an epoch.
2075 *
2076 * @return true if the current CPU holds the epoch, false otherwise.
2077 */
2078 static inline bool
pmap_in_epoch(void)2079 pmap_in_epoch(void)
2080 {
2081 return !preemption_enabled() && (PERCPU_GET(pmap_sptm_percpu)->pmap_epoch.local_seq != 0);
2082 }
2083
/**
 * Prepare the local CPU to perform an epoch drain operation by recording the retype
 * epoch state of other CPUs.
 *
 * @note This function increments the current thread's preemption disable count and
 * returns with preemption disabled.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 * retype operation is not speculated ahead of the epoch sampling.
 *
 * @note This function does NOT issue any barriers to ensure that prior updates of
 * mapping state are globally visible and have proper store-load ordering with
 * respect to the scan performed here. In the cases where this function is
 * intended to be used, this ordering should be guaranteed automatically by
 * the DSBs used to synchronize prior mapping updates issued by the caller.
 * If this function is ever used in a situation where that cannot be guaranteed,
 * the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
 * thread_fence) before calling this function.
 */
static inline void
pmap_epoch_prepare_drain(void)
{
	mp_disable_preemption();
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	/* A prior prepare_drain without a matching drain would have left flags set. */
	assert(pmap_epoch->flags == 0);
	unsigned int i = 0;
	uint8_t flags = PMAP_EPOCH_PREPARED;

	/* Sample each CPU's epoch state. */
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		const uint64_t remote_epoch =
		    os_atomic_load(&pmap_pcpu->pmap_epoch.local_seq, relaxed);
		/* Record the sampled sequence so pmap_epoch_drain() can compare against it. */
		pmap_epoch->remote_seq[i] = remote_epoch;

		/**
		 * If the remote CPU has an active epoch, make a note to ourselves that
		 * we'll need to drain it.
		 */
		if (remote_epoch != 0) {
			flags |= PMAP_EPOCH_DRAIN_REQUIRED;
		}
		++i;
	}
	pmap_epoch->flags = flags;

	/**
	 * Issue a load-load barrier to ensure subsequent drain or retype operations will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2135
/**
 * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
 * most recent call to pmap_epoch_prepare_drain().
 *
 * @note This function expects to be called with preemption disabled and will decrement
 * the current thread's preemption disable count.
 *
 * @note pmap_epoch_prepare_drain() must have been called on the local CPU
 * prior to calling this function. This function will return immediately if
 * this prior call did not observe any active epochs on remote CPUs.
 *
 * @note This function issues all necessary barriers to ensure that the subsequent
 * retype operation is not speculated ahead of the epoch sampling.
 */
static inline void
pmap_epoch_drain(void)
{
	assert(!preemption_enabled());
	pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch;
	const uint8_t flags = pmap_epoch->flags;
	assert(flags & PMAP_EPOCH_PREPARED);
	/* Consume the prepared state; the prepare/drain pair is complete once we return. */
	pmap_epoch->flags = 0;
	if (!(flags & PMAP_EPOCH_DRAIN_REQUIRED)) {
		/* prepare_drain() saw no active remote epochs; nothing to wait for. */
		mp_enable_preemption();
		return;
	}
	unsigned int i = 0;
	percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
		if (pmap_epoch->remote_seq[i] != 0) {
			/* Per-CPU sequences only advance, so the remote CPU must have either
			 * exited its epoch (0) or moved to a later-or-equal sequence. */
			assert((pmap_pcpu->pmap_epoch.local_seq == 0) ||
			    (pmap_pcpu->pmap_epoch.local_seq >= pmap_epoch->remote_seq[i]));
			/**
			 * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
			 * or advances to a new epoch.
			 */
			while ((os_atomic_load_exclusive(&pmap_pcpu->pmap_epoch.local_seq, relaxed) ==
			    pmap_epoch->remote_seq[i])) {
				__builtin_arm_wfe();
			}
			/* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
			os_atomic_clear_exclusive();
		}
		++i;
	}
	mp_enable_preemption();
	/**
	 * Issue a load-load barrier to ensure subsequent accesses to sensitive state will
	 * not be speculated ahead of the sampling we just did.
	 */
	os_atomic_thread_fence(acquire);
}
2187
2188 /**
2189 * Helper to determine whether a frame type is one that requires automatic
2190 * retyping (by the pmap layer) back to XNU_DEFAULT when the page is about
2191 * to be recycled by the VM layer.
2192 *
2193 * @return true if the type requires auto-retyping, false otherwise.
2194 */
2195 static inline bool
pmap_type_requires_retype_on_recycle(sptm_frame_type_t frame_type)2196 pmap_type_requires_retype_on_recycle(sptm_frame_type_t frame_type)
2197 {
2198 return sptm_type_is_user_executable(frame_type) ||
2199 (frame_type == XNU_ROZONE) || (frame_type == XNU_KERNEL_RESTRICTED);
2200 }
2201
2202 static inline boolean_t
pmap_is_preemptible(void)2203 pmap_is_preemptible(void)
2204 {
2205 return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT) || PMAP_IS_HIBERNATING();
2206 }
2207
/**
 * This helper function ensures that potentially-long-running batched operations are
 * called in preemptible context before entering the SPTM, so that the SPTM call may
 * periodically exit to allow pending urgent ASTs to be taken.
 *
 * @note This is an assert-only check; it is a no-op when assertions are
 *       compiled out.
 */
static inline void
pmap_verify_preemptible(void)
{
	assert(pmap_is_preemptible());
}
2218
2219 /**
2220 * The minimum number of pages to keep in the PPL page free list.
2221 *
2222 * We define our target as 8 pages: enough for 2 page table pages, a PTD page,
2223 * and a PV page; in essence, twice as many pages as may be necessary to satisfy
2224 * a single pmap_enter request.
2225 */
2226 #define PMAP_MIN_FREE_PPL_PAGES 8
2227
2228 /**
2229 * Flags passed to various page allocation functions, usually accessed through
2230 * the pmap_page_alloc() API. Each function that can take these flags as
2231 * a part of its option field, will describe these flags in its function header.
2232 */
2233
2234 /* Can be used when no allocation flags are wanted. */
2235 #define PMAP_PAGE_ALLOCATE_NONE 0x0
2236
/**
 * Instruct the allocation function to return immediately if no pages are
 * currently available. Without this flag, the function will spin and wait for a
 * page to become available. This flag can be required in some circumstances
 * (for instance, when allocating pages from within the PPL).
 */
2243 #define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
2244
/**
 * Instructs an allocation function to fall back to reclaiming a userspace page
 * table if it failed to allocate a page from the free lists. This can be useful
 * when allocating from within the PPL because refilling the free lists requires
 * exiting and re-entering the PPL (which incurs extra latency).
 *
 * This is a quick way of allocating a page at the expense of having to
 * reallocate the table the next time one of its mappings is accessed.
 */
2254 #define PMAP_PAGE_RECLAIM_NOWAIT 0x2
2255
2256 /**
2257 * Instructs an allocation function to avoid zero-filling the newly-allocated
2258 * page. This should be used only if you know the page will be fully initialized
2259 * by some other means on the relevant allocation path.
2260 */
2261 #define PMAP_PAGE_NOZEROFILL 0x4
2262
2263 /**
2264 * Global variables exported to the rest of the internal pmap implementation.
2265 */
2266 extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
2267 extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
2268 extern unsigned int inuse_pmap_pages_count;
2269 extern vm_object_t pmap_object;
2270 extern uint32_t pv_alloc_initial_target;
2271 extern uint32_t pv_kern_alloc_initial_target;
2272
2273 /**
2274 * Functions exported to the rest of the internal pmap implementation.
2275 */
2276 extern void pmap_data_bootstrap(void);
2277 extern void pmap_enqueue_pages(vm_page_t);
2278 extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
2279 extern void pmap_page_free(pmap_paddr_t);
2280
2281 /**
2282 * The modes in which a pmap lock can be acquired. Note that shared access
2283 * doesn't necessarily mean "read-only". As long as data is atomically updated
2284 * correctly (to account for multi-cpu accesses) data can still get written with
2285 * a shared lock held. Care just needs to be taken so as to not introduce any
2286 * race conditions when there are multiple writers.
2287 *
2288 * This is here in pmap_data.h because it's a needed parameter for pv_alloc()
2289 * and pmap_enter_pv(). This header is always included in pmap_internal.h before
2290 * the rest of the pmap locking code is defined so there shouldn't be any issues
2291 * with missing types.
2292 */
OS_ENUM(pmap_lock_mode, uint8_t,
    PMAP_LOCK_SHARED,        /* Acquire/hold the lock for shared (multi-reader) access. */
    PMAP_LOCK_EXCLUSIVE,     /* Acquire/hold the lock for exclusive access. */
    PMAP_LOCK_HELD);         /* NOTE(review): presumably "lock already held in either mode" -- confirm against the pmap locking code. */
2297
/**
 * Possible return values for pv_alloc(). See the pv_alloc() function header for
 * a description of each of these values.
 */
typedef enum {
	PV_ALLOC_SUCCESS, /* A PV entry was allocated. */
	PV_ALLOC_RETRY,   /* NOTE(review): presumably "locks were dropped, caller must retry" -- see pv_alloc(). */
	PV_ALLOC_FAIL     /* No PV entry could be allocated. */
} pv_alloc_return_t;
2307
2308 extern pv_alloc_return_t pv_alloc(
2309 pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);
2310 extern void pv_free(pv_entry_t *);
2311 extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);
2312 extern void pmap_compute_pv_targets(void);
2313 extern pv_alloc_return_t pmap_enter_pv(
2314 pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
2315
/* Possible return values for pmap_remove_pv(). */
typedef enum {
	PV_REMOVE_SUCCESS, /* found a mapping */
	PV_REMOVE_FAIL /* no mapping found */
} pv_remove_return_t;
2320
2321 extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
2322
2323 extern void ptd_bootstrap(pt_desc_t *, unsigned int);
2324 extern pt_desc_t *ptd_alloc_unlinked(unsigned int);
2325 extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);
2326 extern void ptd_deallocate(pt_desc_t *);
2327 extern void ptd_info_init(
2328 pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);
2329 extern void ptd_info_finalize(pt_desc_t *);
2330
2331 extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
2332 extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);
2333
2334 extern void validate_pmap_internal(const volatile struct pmap *, const char *);
2335 extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);
2336
2337 /**
2338 * Macro function wrappers around pmap validation so that the calling function
2339 * can be printed in the panic strings for easier validation failure debugging.
2340 */
2341 #define validate_pmap(x) validate_pmap_internal(x, __func__)
2342 #define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
2343
/**
 * This structure describes a SPTM-owned physical memory range.
 *
 * @note This doesn't necessarily have to represent "I/O" only, this
 *       can also represent non-kernel-managed DRAM (e.g., iBoot
 *       carveouts). In some special cases, this can also represent
 *       kernel-managed DRAM, when adding flags for special behavior
 *       (e.g. the range being off limits for hibtext). Such ranges
 *       must be marked with the PMAP_IO_RANGE_NOT_IO flag.
 *
 * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
 *       device tree nodes. Astris (through the LowGlobals) also depends on the
 *       consistency of this structure.
 *
 * @note These definitions are copied to SPTM and they need to be in sync.
 *
 * @note NOTE(review): the field comments below say "PPL-owned"; per the struct
 *       header these ranges are SPTM-owned on SPTM systems -- the terminology
 *       presumably predates SPTM; confirm.
 */
typedef struct pmap_io_range {
	/* Physical address of the PPL-owned I/O range. */
	uint64_t addr;

	/* Length (in bytes) of the PPL-owned I/O range. */
	uint64_t len;

	/* Strong DSB required for pages in this range. */
#define PMAP_IO_RANGE_STRONG_SYNC (1U << 31)

	/* Corresponds to memory carved out by bootloader. */
#define PMAP_IO_RANGE_CARVEOUT (1U << 30)

	/* Pages in this range need to be included in the hibernation image. */
#define PMAP_IO_RANGE_NEEDS_HIBERNATING (1U << 29)

	/* Mark the range as 'owned' by a given subsystem. */
#define PMAP_IO_RANGE_OWNED (1U << 28)

	/**
	 * Denotes a range that is *not* to be treated as an I/O range that
	 * needs to be mapped, but only to decorate arbitrary physical
	 * memory ranges (including of managed memory) with extra
	 * flags. I.e. this allows tagging of "ordinary" managed memory
	 * pages with flags like `PMAP_IO_RANGE_PROHIBIT_HIB_WRITE`, or
	 * informing the SPTM that some (nominally) managed memory pages are
	 * unavailable for some reason.
	 *
	 * Notably, `pmap_find_io_attr()`, and anything else that uses
	 * `pmap_io_range`s for denoting to-be-mapped I/O ranges, ignores
	 * entries with this flag.
	 */
#define PMAP_IO_RANGE_NOT_IO (1U << 27)

	/* Pages in this range may never be written during hibernation restore. */
#define PMAP_IO_RANGE_PROHIBIT_HIB_WRITE (1U << 26)

	/**
	 * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
	 * mapping flags (defined above).
	 */
	uint32_t wimg;

	/* 4 Character Code (4CC) describing what this range is. */
	uint32_t signature;
} pmap_io_range_t;
2406
2407 /* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
2408 _Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
2409
2410 extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);
2411
2412 extern void pmap_range_iterate(bool (^step) (pmap_io_range_t const *));
2413
/**
 * This structure describes a sub-page-size I/O region owned by SPTM but the kernel can write to.
 *
 * @note I/O filter software will use a collection of such data structures to determine access
 *       permissions to a page owned by SPTM.
 *
 * @note The {signature, offset} key is used to index a collection of such data structures to
 *       optimize for space in the case where one page layout is repeated for many devices, such
 *       as the memory controller channels.
 *
 * @note The layout must remain exactly 8 bytes (see the _Static_assert following
 *       this definition).
 */
typedef struct pmap_io_filter_entry {
	/* 4 Character Code (4CC) describing what this range (page) is. */
	uint32_t signature;

	/* Offset within the page. It has to be within [0, PAGE_SIZE). */
	uint16_t offset;

	/* Length of the range, and (offset + length) has to be within [0, PAGE_SIZE). */
	uint16_t length;
} pmap_io_filter_entry_t;
2434
2435 _Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t");
2436
2437 extern void pmap_cpu_data_init_internal(unsigned int);
2438
2439 /**
2440 * Convert a SURT PA to the containing SURT page's PA.
2441 *
2442 * @param surt_pa The SURT's physical addresss.
2443 *
2444 * @return The containing SURT page's PA.
2445 */
2446 static inline pmap_paddr_t
surt_page_pa_from_surt_pa(pmap_paddr_t surt_pa)2447 surt_page_pa_from_surt_pa(pmap_paddr_t surt_pa)
2448 {
2449 return surt_pa & ~PAGE_MASK;
2450 }
2451
2452 /**
2453 * Given a SURT PA, get its index in the containing SURT page.
2454 *
2455 * @param surt_pa The PA of the SURT.
2456 *
2457 * @return The index of the SURT in the containing SURT page.
2458 */
2459 static inline uint8_t
surt_index_from_surt_pa(pmap_paddr_t surt_pa)2460 surt_index_from_surt_pa(pmap_paddr_t surt_pa)
2461 {
2462 return (uint8_t)((surt_pa & PAGE_MASK) / SUBPAGE_USER_ROOT_TABLE_SIZE);
2463 }
2464
2465 /**
2466 * Given a SURT page PA and an index, compute the PA of the associated SURT.
2467 *
2468 * @param surt_page_pa The PA of the SURT page.
2469 * @param index THe index of the SURT in the SURT page.
2470 *
2471 * @return The computed PA of the SURT.
2472 */
2473 static inline pmap_paddr_t
surt_pa_from_surt_page_pa_and_index(pmap_paddr_t surt_page_pa,uint8_t index)2474 surt_pa_from_surt_page_pa_and_index(pmap_paddr_t surt_page_pa, uint8_t index)
2475 {
2476 assert((surt_page_pa & PAGE_MASK) == 0);
2477 return surt_page_pa + index * SUBPAGE_USER_ROOT_TABLE_SIZE;
2478 }
2479
2480 #if __ARM64_PMAP_SUBPAGE_L1__
2481 extern void surt_init(void);
2482 extern pmap_paddr_t surt_try_alloc(void);
2483 extern bool surt_free(pmap_paddr_t surt_pa);
2484 extern void surt_feed_page_with_first_table_allocated(pmap_paddr_t surt_page_pa);
2485 extern unsigned int surt_list_len(void);
2486 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2487
2488 #if DEBUG || DEVELOPMENT
2489 extern unsigned int pmap_wcrt_on_non_dram_count_get(void);
2490 extern void pmap_wcrt_on_non_dram_count_increment_atomic(void);
2491 #endif /* DEBUG || DEVELOPMENT */
2492