1 /*
2 * Copyright (c) 2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <arm/cpu_data_internal.h>
29 #include <kern/queue.h>
30 #include <libkern/OSAtomic.h>
31 #include <libkern/section_keywords.h>
32 #include <pexpert/device_tree.h>
33 #include <os/atomic_private.h>
34 #include <vm/cpm_internal.h>
35 #include <vm/vm_kern.h>
36 #include <vm/vm_protos.h>
37 #include <vm/vm_object_xnu.h>
38 #include <vm/vm_page_internal.h>
39 #include <vm/vm_pageout.h>
40
41 #include <arm64/sptm/pmap/pmap_internal.h>
42
/**
 * Physical Page Attribute Table.
 *
 * Array that contains a set of flags for each kernel-managed physical VM page.
 *
 * @note There can be a disparity between the VM page size and the underlying
 *       hardware page size for a specific address space. In those cases, it's
 *       possible that multiple hardware pages will share the same set of
 *       attributes. The VM operates on regions of memory by the VM page size
 *       and is aware that all hardware pages within each VM page share
 *       attributes.
 */
SECURITY_READ_ONLY_LATE(volatile pp_attr_t*) pp_attr_table = (volatile pp_attr_t*)NULL;

/**
 * Physical to Virtual Table.
 *
 * Data structure that contains a list of virtual mappings for each kernel-
 * managed physical page. Other flags and metadata are also stored in this
 * structure on a per-physical-page basis.
 *
 * This structure is arranged as an array of pointers, where each pointer can
 * point to one of three different types of data (single mapping, multiple
 * mappings, or page table descriptor). Metadata about each page (including the
 * type of pointer) are located in the lower and upper bits of the pointer.
 * These bits need to be set/masked out to be able to dereference the pointer,
 * so it's recommended to use the provided API in pmap_data.h to access the
 * pv_head_table since it handles these details for you.
 */
SECURITY_READ_ONLY_LATE(uintptr_t*) pv_head_table = NULL;

/* Simple linked-list structure used in various page free lists. */
typedef struct page_free_entry {
    /**
     * The first word in an empty page on a free list is used as a pointer to
     * the next free page in the list.
     */
    struct page_free_entry *next;
} page_free_entry_t;

/* Represents a NULL entry in various page free lists. */
#define PAGE_FREE_ENTRY_NULL ((page_free_entry_t *) 0)

/**
 * This VM object will contain every VM page being used by the pmap. This acts
 * as a convenient place to put pmap pages to keep the VM from reusing them, as
 * well as providing a way for looping over every page being used by the pmap.
 */
struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED;

/* Pointer to the pmap's VM object that can't be modified after machine_lockdown(). */
SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store;

/**
 * This variable, used for debugging purposes only, keeps track of how many pages
 * are currently in use by the pmap layer. Once a page is given back to the VM,
 * then inuse_pmap_pages_count will be decremented.
 *
 * Even if a page is sitting in one of the pmap's various free lists and hasn't
 * been allocated for usage, it is still considered "used" by the pmap, from
 * the perspective of the VM.
 */
unsigned int inuse_pmap_pages_count = 0;
106
/**
 * Default watermark values used to keep a healthy supply of physical-to-virtual
 * entries (PVEs) always available. These values can be overridden by the device
 * tree (see pmap_compute_pv_targets() for more info).
 */
#define PV_KERN_LOW_WATER_MARK_DEFAULT (0x400)
#define PV_ALLOC_CHUNK_INITIAL (0x400)
#define PV_KERN_ALLOC_CHUNK_INITIAL (0x400)

/**
 * The pv_free array acts as a ring buffer where each entry points to a linked
 * list of PVEs that have a length set by this define.
 */
#define PV_BATCH_SIZE (PAGE_SIZE / sizeof(pv_entry_t))

/* The batch allocation code assumes that a batch can fit within a single page. */
#if __ARM_16K_PG__
/**
 * PAGE_SIZE is a variable on arm64 systems with 4K VM pages, so no static
 * assert on those systems.
 */
static_assert((PV_BATCH_SIZE * sizeof(pv_entry_t)) <= PAGE_SIZE);
#endif /* __ARM_16K_PG__ */

/**
 * The number of PVEs to attempt to keep in the kernel-dedicated free list. If
 * the number of entries is below this value, then allocate more.
 */
static uint32_t pv_kern_low_water_mark MARK_AS_PMAP_DATA = PV_KERN_LOW_WATER_MARK_DEFAULT;

/**
 * The initial number of PVEs to allocate during bootstrap (can be overridden in
 * the device tree, see pmap_compute_pv_targets() for more info).
 */
uint32_t pv_alloc_initial_target MARK_AS_PMAP_DATA = PV_ALLOC_CHUNK_INITIAL * MAX_CPUS;
uint32_t pv_kern_alloc_initial_target MARK_AS_PMAP_DATA = PV_KERN_ALLOC_CHUNK_INITIAL;

/**
 * Global variables strictly used for debugging purposes. These variables keep
 * track of the number of pages being used for PVE objects, PTD objects, and the
 * total number of PVEs that have been added to the global or kernel-dedicated
 * free lists respectively.
 */
static _Atomic unsigned int pv_page_count MARK_AS_PMAP_DATA = 0;
static unsigned int ptd_page_count MARK_AS_PMAP_DATA = 0;
static unsigned pmap_reserve_replenish_stat MARK_AS_PMAP_DATA = 0;
static unsigned pmap_kern_reserve_alloc_stat MARK_AS_PMAP_DATA = 0;

/**
 * Number of linked lists of PVEs ("batches") in the global PV free ring buffer.
 * This must be a power of two for the pv_free_array_n_elems() logic to work.
 */
#define PV_FREE_ARRAY_SIZE (256U)

/**
 * A ring buffer where each entry in the buffer is a linked list of PV entries
 * (called "batches"). Allocations out of this array will always operate on
 * a PV_BATCH_SIZE amount of entries at a time.
 */
static pv_free_list_t pv_free_ring[PV_FREE_ARRAY_SIZE] MARK_AS_PMAP_DATA = {0};

/* Read and write indices for the pv_free ring buffer. */
static uint16_t pv_free_read_idx MARK_AS_PMAP_DATA = 0;
static uint16_t pv_free_write_idx MARK_AS_PMAP_DATA = 0;

/**
 * Make sure the PV free array is small enough so that all elements can be
 * properly indexed by pv_free_[read/write]_idx.
 */
static_assert(PV_FREE_ARRAY_SIZE <= (1 << (sizeof(pv_free_read_idx) * 8)));
177
178 /**
179 * Return the number of free batches available for allocation out of the PV free
180 * ring buffer. Each batch is a linked list of PVEs with length PV_BATCH_SIZE.
181 *
182 * @note This function requires that PV_FREE_ARRAY_SIZE is a power of two.
183 */
184 static inline uint16_t
pv_free_array_n_elems(void)185 pv_free_array_n_elems(void)
186 {
187 return (pv_free_write_idx - pv_free_read_idx) & (PV_FREE_ARRAY_SIZE - 1);
188 }
189
/* Free list of PV entries dedicated for usage by the kernel. */
static pv_free_list_t pv_kern_free MARK_AS_PMAP_DATA = {0};

/* Locks for the global and kernel-dedicated PV free lists. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_free_array_lock, 0);
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_kern_free_list_lock, 0);

/* Represents a null page table descriptor (PTD). */
#define PTD_ENTRY_NULL ((pt_desc_t *) 0)

/* Running free list of PTD nodes. */
static pt_desc_t *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL;

/* The number of free PTD nodes available in the free list. */
static unsigned int ptd_free_count MARK_AS_PMAP_DATA = 0;

/**
 * The number of PTD objects located in each page being used by the PTD
 * allocator. The PTD objects share each page with their associated ptd_info_t
 * objects (with cache-line alignment padding between them). The maximum number
 * of PTDs that can be placed into a single page is calculated once at boot
 * (see pmap_data_bootstrap()).
 */
static SECURITY_READ_ONLY_LATE(unsigned) ptd_per_page = 0;

/**
 * The offset in bytes from the beginning of a page of PTD objects where you
 * start seeing the associated ptd_info_t objects. This is calculated once
 * during boot to maximize the number of PTD and ptd_info_t objects that can
 * reside within a page without sharing a cache-line.
 */
static SECURITY_READ_ONLY_LATE(unsigned) ptd_info_offset = 0;

/* Lock to protect accesses to the PTD free list. */
static decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA);

/**
 * Dummy _internal() prototypes so Clang doesn't complain about missing
 * prototypes on a non-static function. These functions can't be marked as
 * static because they need to be called from pmap_ppl_interface.c where the
 * PMAP_SUPPORT_PROTOTYPES() macro will auto-generate the prototype implicitly.
 */
kern_return_t mapping_free_prime_internal(void);

/**
 * Flag indicating whether any I/O regions that require strong DSB are present.
 * If not, certain TLB maintenance operations can be streamlined.
 */
SECURITY_READ_ONLY_LATE(bool) sdsb_io_rgns_present = false;

/**
 * Sorted representation of the pmap-io-ranges nodes in the device tree. These
 * nodes describe all of the SPTM/PPL-owned I/O ranges.
 */
SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table = (pmap_io_range_t*)0;

/* The number of ranges described by io_attr_table. */
SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0;

/**
 * Sorted representation of the pmap-io-filter entries in the device tree.
 * The entries are sorted and queried by {signature, range}.
 */
SECURITY_READ_ONLY_LATE(pmap_io_filter_entry_t*) io_filter_table = (pmap_io_filter_entry_t*)0;

/* Number of total pmap-io-filter entries. */
SECURITY_READ_ONLY_LATE(unsigned int) num_io_filter_entries = 0;

/**
 * A list of pages that define the per-cpu scratch areas used by IOMMU drivers
 * when preparing data to be passed into the SPTM. The size allocated per-cpu is
 * defined by PMAP_IOMMU_SCRATCH_SIZE.
 *
 * SPTM TODO: Only have these variables on systems with IOMMU drivers (H11+).
 */
#define PMAP_IOMMU_SCRATCH_SIZE (PMAP_IOMMU_NUM_SCRATCH_PAGES * PAGE_SIZE)
SECURITY_READ_ONLY_LATE(pmap_paddr_t) sptm_cpu_iommu_scratch_start = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) sptm_cpu_iommu_scratch_end = 0;

/* Prototypes used by pmap_data_bootstrap(). */
void pmap_cpu_data_array_init(void);
270
/**
 * This function is called once during pmap_bootstrap() to allocate and
 * initialize many of the core data structures that are implemented in this
 * file.
 *
 * Memory for these data structures is carved out of `avail_start` which is a
 * global setup by arm_vm_init() that points to a physically contiguous region
 * used for bootstrap allocations.
 *
 * @note There is no guaranteed alignment of `avail_start` when this function
 *       returns. If avail_start needs to be aligned to a specific value then it
 *       must be done so by the caller before they use it for more allocations.
 */
void
pmap_data_bootstrap(void)
{
    /**
     * Set ptd_per_page to the maximum number of (pt_desc_t + ptd_info_t) we can
     * fit in a single page. We need to allow for some padding between the two,
     * so that no ptd_info_t shares a cache line with a pt_desc_t.
     */
    const unsigned ptd_info_size = sizeof(ptd_info_t);
    const unsigned l2_cline_bytes = 1 << MAX_L2_CLINE;
    /* Worst-case lower bound: reserve a full cache line's worth of padding. */
    ptd_per_page = (PAGE_SIZE - (l2_cline_bytes - 1)) / (sizeof(pt_desc_t) + ptd_info_size);
    unsigned increment = 0;
    bool try_next = true;

    /**
     * The current ptd_per_page calculation was done assuming the worst-case
     * scenario in terms of padding between the two object arrays that reside in
     * the same page. The following loop attempts to optimize this further by
     * finding the smallest possible amount of padding while still ensuring that
     * the two object arrays don't share a cache line.
     */
    while (try_next) {
        increment++;
        const unsigned pt_desc_total_size =
            PMAP_ALIGN((ptd_per_page + increment) * sizeof(pt_desc_t), l2_cline_bytes);
        const unsigned ptd_info_total_size = (ptd_per_page + increment) * ptd_info_size;
        try_next = (pt_desc_total_size + ptd_info_total_size) <= PAGE_SIZE;
    }
    /* The loop exits one increment past the last fitting count; back off by one. */
    ptd_per_page += increment - 1;
    assert(ptd_per_page > 0);

    /**
     * ptd_info objects reside after the ptd descriptor objects, with some
     * padding in between if necessary to ensure that they don't co-exist in the
     * same cache line.
     */
    const unsigned pt_desc_bytes = ptd_per_page * sizeof(pt_desc_t);
    ptd_info_offset = PMAP_ALIGN(pt_desc_bytes, l2_cline_bytes);

    /* The maximum amount of padding should be (l2_cline_bytes - 1). */
    assert((ptd_info_offset - pt_desc_bytes) < l2_cline_bytes);

    /**
     * Allocate enough initial PTDs to map twice the available physical memory.
     *
     * To do this, start by calculating the number of leaf page tables that are
     * needed to cover all of kernel-managed physical memory.
     */
    const uint32_t num_leaf_page_tables =
        (uint32_t)(mem_size / ((PAGE_SIZE / sizeof(pt_entry_t)) * ARM_PGBYTES));

    /**
     * There should be one PTD per page table (times 2 since we want twice the
     * number of required PTDs), plus round the number of PTDs up to the next
     * `ptd_per_page` value so there's no wasted space.
     */
    const uint32_t ptd_root_table_n_ptds =
        (ptd_per_page * ((num_leaf_page_tables * 2) / ptd_per_page)) + ptd_per_page;

    /* Lastly, calculate the number of VM pages and bytes these PTDs take up. */
    const uint32_t num_ptd_pages = ptd_root_table_n_ptds / ptd_per_page;
    vm_size_t ptd_root_table_size = num_ptd_pages * PAGE_SIZE;

    /* Number of VM pages that span all of kernel-managed memory. */
    unsigned int npages = (unsigned int)atop(mem_size);


    /* The pv_head_table and pp_attr_table both have one entry per VM page. */
    const vm_size_t pp_attr_table_size = npages * sizeof(pp_attr_t);
    const vm_size_t pv_head_size = round_page(npages * sizeof(*pv_head_table));

    /* Scan the device tree and override heuristics in the PV entry management code. */
    pmap_compute_pv_targets();

    /* The SPTM hands us pre-built, sorted I/O range and I/O filter tables. */
    io_attr_table = (pmap_io_range_t *) SPTMArgs->sptm_pmap_io_ranges;
    num_io_rgns = SPTMArgs->sptm_pmap_io_ranges_count;
    io_filter_table = (pmap_io_filter_entry_t *) SPTMArgs->sptm_pmap_io_filters;
    num_io_filter_entries = SPTMArgs->sptm_pmap_io_filters_count;

    /**
     * Don't make any assumptions about the alignment of avail_start before
     * execution of this function. Always re-align it to ensure the first
     * allocated data structure is aligned correctly.
     */
    avail_start = PMAP_ALIGN(avail_start, __alignof(pp_attr_t));

    /**
     * Keep track of where the data structures start so we can clear this memory
     * later.
     */
    const pmap_paddr_t pmap_struct_start = avail_start;

    pp_attr_table = (pp_attr_t *)phystokv(avail_start);
    avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pv_entry_t *));

    pv_head_table = (uintptr_t *)phystokv(avail_start);

    /**
     * ptd_root_table must start on a page boundary because all of the math for
     * associating pt_desc_t objects with ptd_info objects assumes the first
     * pt_desc_t in a page starts at the beginning of the page it resides in.
     */
    avail_start = round_page(avail_start + pv_head_size);

    pt_desc_t *ptd_root_table = (pt_desc_t *)phystokv(avail_start);
    avail_start = round_page(avail_start + ptd_root_table_size);

    /* Zero out every bootstrap-allocated structure in one pass. */
    memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);

    /* This function assumes that ptd_root_table has been zeroed out already. */
    ptd_bootstrap(ptd_root_table, num_ptd_pages);

    /* Setup the pmap per-cpu data structures. */
    pmap_cpu_data_array_init();
}
399
400 /**
401 * Add a queue of VM pages to the pmap's VM object. This informs the VM that
402 * these pages are being used by the pmap and shouldn't be reused.
403 *
404 * This also means that the pmap_object can be used as a convenient way to loop
405 * through every page currently being used by the pmap. For instance, this queue
406 * of pages is exposed to the debugger through the Low Globals, where it's used
407 * to ensure that all pmap data is saved in an active core dump.
408 *
409 * @param mem The head of the queue of VM pages to add to the pmap's VM object.
410 */
411 void
pmap_enqueue_pages(vm_page_t mem)412 pmap_enqueue_pages(vm_page_t mem)
413 {
414 vm_page_t m_prev;
415 vm_object_lock(pmap_object);
416 while (mem != VM_PAGE_NULL) {
417 const vm_object_offset_t offset =
418 (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(mem))) - gPhysBase);
419
420 vm_page_insert_wired(mem, pmap_object, offset, VM_KERN_MEMORY_PTE);
421 m_prev = mem;
422 mem = NEXT_PAGE(m_prev);
423 *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
424 }
425 vm_object_unlock(pmap_object);
426 }
427
/**
 * Allocate a page from the VM for usage within the pmap.
 *
 * @param ppa Output parameter to store the physical address of the allocated
 *            page if one was able to be allocated (NULL otherwise).
 * @param options The following options can be specified:
 *     - PMAP_PAGE_ALLOCATE_NOWAIT: If the VM page free list doesn't have
 *       any free pages available then don't wait for one, just return
 *       immediately without allocating a page.
 *
 *     - PMAP_PAGE_RECLAIM_NOWAIT: If memory can't be allocated from the VM,
 *       then fall back to attempting to reclaim a userspace page table. This
 *       should only be specified in paths that absolutely can't take the
 *       latency hit of waiting for the VM to allocate more pages. This flag
 *       doesn't make much sense unless it's paired with
 *       PMAP_PAGE_ALLOCATE_NOWAIT.
 *
 *     - PMAP_PAGE_NOZEROFILL: don't zero-fill the pages. This should only be
 *       used if you know that something else in the relevant code path will
 *       zero-fill or otherwise fully initialize the page with consistent data.
 *       This is mostly intended for cases in which sptm_retype() is guaranteed
 *       to zero-fill the page for us.
 *
 * @return KERN_SUCCESS if a page was successfully allocated, or
 *         KERN_RESOURCE_SHORTAGE if a page failed to get allocated. This should
 *         only be returned if PMAP_PAGE_ALLOCATE_NOWAIT is passed or if
 *         preemption is disabled after early boot since allocating memory from
 *         the VM requires grabbing a mutex. If PMAP_PAGE_ALLOCATE_NOWAIT is not
 *         passed and the system is in a preemptable state, then the return
 *         value should always be KERN_SUCCESS (as the thread will block until
 *         there are free pages available).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_page_alloc(pmap_paddr_t *ppa, unsigned options)
{
    assert(ppa != NULL);
    pmap_paddr_t pa = 0;
    PMAP_ASSERT_NOT_WRITING_HIB();
    vm_page_t mem = VM_PAGE_NULL;
    thread_t self = current_thread();

    /**
     * It's not possible to allocate memory from the VM in a preemption disabled
     * environment except during early boot (since the VM needs to grab a mutex).
     * In those cases just return a resource shortage error and let the caller
     * deal with it.
     *
     * We don't panic here as there are genuinely some cases where pmap_enter()
     * is called with preemption disabled, and it's better to return an error
     * to those callers to notify them to try again with preemption enabled.
     */
    if (!pmap_is_preemptible()) {
        return KERN_RESOURCE_SHORTAGE;
    }

    *ppa = 0;

    /**
     * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
     * the VM of this.
     *
     * This field should only be modified by the local thread itself, so no lock
     * needs to be taken.
     */
    uint16_t thread_options = self->options;
    self->options |= TH_OPT_VMPRIV;

    /**
     * If we're only allocating a single page, just grab one off the VM's
     * global page free list.
     */
    vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
    while ((mem = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
        if (options & PMAP_PAGE_ALLOCATE_NOWAIT) {
            break;
        }

        /* Block until the VM has free pages, then retry the grab. */
        VM_PAGE_WAIT();
    }

    if (mem != VM_PAGE_NULL) {
        /* Wire the page so the VM won't page it out from under us. */
        vm_page_lock_queues();
        vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
        vm_page_unlock_queues();
    }

    /* Restore the thread's original options (drops TH_OPT_VMPRIV if we set it). */
    self->options = thread_options;

    if (mem == VM_PAGE_NULL) {
        return KERN_RESOURCE_SHORTAGE;
    }

    pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));

    /* Add the allocated VM page(s) to the pmap's VM object. */
    pmap_enqueue_pages(mem);

    /* Pages are considered "in use" by the pmap until returned to the VM. */
    OSAddAtomic(1, &inuse_pmap_pages_count);

    /* SPTM TODO: assert that the returned page is of type XNU_DEFAULT in frame table */
    if (!(options & PMAP_PAGE_NOZEROFILL)) {
        bzero((void*)phystokv(pa), PAGE_SIZE);
    }
    *ppa = pa;
    return KERN_SUCCESS;
}
535
536 /**
537 * Free memory previously allocated through pmap_page_alloc() back to the VM.
538 *
539 * @param pa Physical address of the page(s) to free.
540 */
541 void
pmap_page_free(pmap_paddr_t pa)542 pmap_page_free(pmap_paddr_t pa)
543 {
544 /* SPTM TODO: assert that the page to be freed is of type XNU_DEFAULT in frame table */
545
546 /* Pages are considered "in use" until given back to the VM. */
547 OSAddAtomic(-1, &inuse_pmap_pages_count);
548
549 vm_page_t mem = VM_PAGE_NULL;
550 vm_object_lock(pmap_object);
551
552 /**
553 * Remove the page from the pmap's VM object and return it back to the
554 * VM's global free list of pages.
555 */
556 mem = vm_page_lookup(pmap_object, (pa - gPhysBase));
557 assert(mem != VM_PAGE_NULL);
558 assert(VM_PAGE_WIRED(mem));
559 vm_page_lock_queues();
560 vm_page_free(mem);
561 vm_page_unlock_queues();
562 vm_object_unlock(pmap_object);
563 }
564
/**
 * Called by the VM to reclaim pages that we can reclaim quickly and cheaply.
 * This will take pages in the pmap's VM object and add them back to the VM's
 * global list of free pages.
 *
 * @return The number of pages returned to the VM. The SPTM pmap currently
 *         never releases pages through this path, so this is always zero.
 */
uint64_t
pmap_release_pages_fast(void)
{
    const uint64_t pages_released = 0;
    return pages_released;
}
577
578 /**
579 * Allocates a batch (list) of pv_entry_t's from the global PV free array.
580 *
581 * @return A pointer to the head of the newly-allocated batch, or PV_ENTRY_NULL
582 * if empty.
583 */
584 MARK_AS_PMAP_TEXT static pv_entry_t *
pv_free_array_get_batch(void)585 pv_free_array_get_batch(void)
586 {
587 pv_entry_t *new_batch = PV_ENTRY_NULL;
588
589 pmap_simple_lock(&pv_free_array_lock);
590 if (pv_free_array_n_elems() > 0) {
591 /**
592 * The global PV array acts as a ring buffer where each entry points to
593 * a linked list of PVEs of length PV_BATCH_SIZE. Get the next free
594 * batch.
595 */
596 const size_t index = pv_free_read_idx++ & (PV_FREE_ARRAY_SIZE - 1);
597 pv_free_list_t *free_list = &pv_free_ring[index];
598
599 assert((free_list->count == PV_BATCH_SIZE) && (free_list->list != PV_ENTRY_NULL));
600 new_batch = free_list->list;
601 }
602 pmap_simple_unlock(&pv_free_array_lock);
603
604 return new_batch;
605 }
606
607 /**
608 * Frees a batch (list) of pv_entry_t's into the global PV free array.
609 *
610 * @param batch_head Pointer to the first entry in the batch to be returned to
611 * the array. This must be a linked list of pv_entry_t's of
612 * length PV_BATCH_SIZE.
613 *
614 * @return KERN_SUCCESS, or KERN_FAILURE if the global array is full.
615 */
616 MARK_AS_PMAP_TEXT static kern_return_t
pv_free_array_give_batch(pv_entry_t * batch_head)617 pv_free_array_give_batch(pv_entry_t *batch_head)
618 {
619 assert(batch_head != NULL);
620
621 pmap_simple_lock(&pv_free_array_lock);
622 if (pv_free_array_n_elems() == (PV_FREE_ARRAY_SIZE - 1)) {
623 pmap_simple_unlock(&pv_free_array_lock);
624 return KERN_FAILURE;
625 }
626
627 const size_t index = pv_free_write_idx++ & (PV_FREE_ARRAY_SIZE - 1);
628 pv_free_list_t *free_list = &pv_free_ring[index];
629 free_list->list = batch_head;
630 free_list->count = PV_BATCH_SIZE;
631 pmap_simple_unlock(&pv_free_array_lock);
632
633 return KERN_SUCCESS;
634 }
635
636 /**
637 * Helper function for allocating a single PVE from an arbitrary free list.
638 *
639 * @param free_list The free list to allocate a node from.
640 * @param pvepp Output parameter that will get updated with a pointer to the
641 * allocated node if the free list isn't empty, or a pointer to
642 * NULL if the list is empty.
643 */
644 MARK_AS_PMAP_TEXT static void
pv_free_list_alloc(pv_free_list_t * free_list,pv_entry_t ** pvepp)645 pv_free_list_alloc(pv_free_list_t *free_list, pv_entry_t **pvepp)
646 {
647 assert(pvepp != NULL);
648 assert(((free_list->list != NULL) && (free_list->count > 0)) ||
649 ((free_list->list == NULL) && (free_list->count == 0)));
650
651 if ((*pvepp = free_list->list) != NULL) {
652 pv_entry_t *pvep = *pvepp;
653 free_list->list = pvep->pve_next;
654 pvep->pve_next = PV_ENTRY_NULL;
655 free_list->count--;
656 }
657 }
658
659 /**
660 * Allocates a PVE from the kernel-dedicated list.
661 *
662 * @note This is only called when the global free list is empty, so don't bother
663 * trying to allocate more nodes from that list.
664 *
665 * @param pvepp Output parameter that will get updated with a pointer to the
666 * allocated node if the free list isn't empty, or a pointer to
667 * NULL if the list is empty. This pointer can't already be
668 * pointing to a valid entry before allocation.
669 */
670 MARK_AS_PMAP_TEXT static void
pv_list_kern_alloc(pv_entry_t ** pvepp)671 pv_list_kern_alloc(pv_entry_t **pvepp)
672 {
673 assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
674 pmap_simple_lock(&pv_kern_free_list_lock);
675 if (pv_kern_free.count > 0) {
676 pmap_kern_reserve_alloc_stat++;
677 }
678 pv_free_list_alloc(&pv_kern_free, pvepp);
679 pmap_simple_unlock(&pv_kern_free_list_lock);
680 }
681
682 /**
683 * Returns a list of PVEs to the kernel-dedicated free list.
684 *
685 * @param pve_head Head of the list to be returned.
686 * @param pve_tail Tail of the list to be returned.
687 * @param pv_cnt Number of elements in the list to be returned.
688 */
689 MARK_AS_PMAP_TEXT static void
pv_list_kern_free(pv_entry_t * pve_head,pv_entry_t * pve_tail,int pv_cnt)690 pv_list_kern_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
691 {
692 assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
693
694 pmap_simple_lock(&pv_kern_free_list_lock);
695 pve_tail->pve_next = pv_kern_free.list;
696 pv_kern_free.list = pve_head;
697 pv_kern_free.count += pv_cnt;
698 pmap_simple_unlock(&pv_kern_free_list_lock);
699 }
700
701 /**
702 * Attempts to allocate from the per-cpu free list of PVEs, and if that fails,
703 * then replenish the per-cpu free list with a batch of PVEs from the global
704 * PVE free list.
705 *
706 * @param pvepp Output parameter that will get updated with a pointer to the
707 * allocated node if the free lists aren't empty, or a pointer to
708 * NULL if both the per-cpu and global lists are empty. This
709 * pointer can't already be pointing to a valid entry before
710 * allocation.
711 */
712 MARK_AS_PMAP_TEXT static void
pv_list_alloc(pv_entry_t ** pvepp)713 pv_list_alloc(pv_entry_t **pvepp)
714 {
715 assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
716
717 /* Disable preemption while working with per-CPU data. */
718 mp_disable_preemption();
719
720 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
721 pv_free_list_alloc(&pmap_cpu_data->pv_free, pvepp);
722
723 if (*pvepp != PV_ENTRY_NULL) {
724 goto pv_list_alloc_done;
725 }
726
727 if (pv_kern_free.count < pv_kern_low_water_mark) {
728 /**
729 * If the kernel reserved pool is low, let non-kernel mappings wait for
730 * a page from the VM.
731 */
732 goto pv_list_alloc_done;
733 }
734
735 /**
736 * Attempt to replenish the local list off the global one, and return the
737 * first element. If the global list is empty, then the allocation failed.
738 */
739 pv_entry_t *new_batch = pv_free_array_get_batch();
740
741 if (new_batch != PV_ENTRY_NULL) {
742 pmap_cpu_data->pv_free.count = PV_BATCH_SIZE - 1;
743 pmap_cpu_data->pv_free.list = new_batch->pve_next;
744 assert(pmap_cpu_data->pv_free.list != NULL);
745
746 new_batch->pve_next = PV_ENTRY_NULL;
747 *pvepp = new_batch;
748 }
749
750 pv_list_alloc_done:
751 mp_enable_preemption();
752
753 return;
754 }
755
/**
 * Adds a list of PVEs to the per-CPU PVE free list. May spill out some entries
 * to the global or the kernel PVE free lists if the per-CPU list contains too
 * many PVEs.
 *
 * @param pve_head Head of the list to be returned.
 * @param pve_tail Tail of the list to be returned.
 * @param pv_cnt Number of elements in the list to be returned.
 */
MARK_AS_PMAP_TEXT void
pv_list_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, unsigned int pv_cnt)
{
	assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));

	/* Disable preemption while working with per-CPU data. */
	disable_preemption();

	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	/**
	 * How many more PVEs need to be added to the last allocated batch to get it
	 * back up to a PV_BATCH_SIZE number of objects.
	 *
	 * Note: when the current count is an exact multiple of PV_BATCH_SIZE
	 * (including zero) this evaluates to a full PV_BATCH_SIZE.
	 */
	const uint32_t available = PV_BATCH_SIZE - (pmap_cpu_data->pv_free.count % PV_BATCH_SIZE);

	/**
	 * The common case is that the number of PVEs to be freed fit in the current
	 * PV_BATCH_SIZE boundary. If that is the case, quickly prepend the whole
	 * list and return.
	 *
	 * If the count is a non-zero multiple of PV_BATCH_SIZE, the last batch is
	 * exactly full and the spill marker may need updating as nodes are added,
	 * so fall through to the slow path below in that case.
	 */
	if (__probable((pv_cnt <= available) &&
	    ((pmap_cpu_data->pv_free.count % PV_BATCH_SIZE != 0) || (pmap_cpu_data->pv_free.count == 0)))) {
		pve_tail->pve_next = pmap_cpu_data->pv_free.list;
		pmap_cpu_data->pv_free.list = pve_head;
		pmap_cpu_data->pv_free.count += pv_cnt;
		goto pv_list_free_done;
	}

	/* Number of PVEs processed since the last preemption check. */
	unsigned int freed_count = 0;

	/**
	 * In the degenerate case, we need to process PVEs one by one, to make sure
	 * we spill out to the global list, or update the spill marker as
	 * appropriate.
	 */
	while (pv_cnt) {
		/**
		 * Check for (and if necessary reenable) preemption every PV_BATCH_SIZE PVEs to
		 * avoid leaving preemption disabled for an excessive duration if we happen to be
		 * processing a very large PV list.
		 */
		if (__improbable(freed_count == PV_BATCH_SIZE)) {
			freed_count = 0;
			if (__improbable(pmap_pending_preemption())) {
				enable_preemption();
				assert(preemption_enabled() || PMAP_IS_HIBERNATING());
				disable_preemption();
				/* We may have migrated to a new CPU; reload the per-CPU data pointer. */
				pmap_cpu_data = pmap_get_cpu_data();
			}
		}

		/**
		 * Take the node off the top of the passed in list and prepend it to the
		 * per-cpu list.
		 */
		pv_entry_t *pv_next = pve_head->pve_next;
		pve_head->pve_next = pmap_cpu_data->pv_free.list;
		pmap_cpu_data->pv_free.list = pve_head;
		pve_head = pv_next;
		pmap_cpu_data->pv_free.count++;
		pv_cnt--;
		freed_count++;

		if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE + 1))) {
			/**
			 * A full batch of entries have been freed to the per-cpu list.
			 * Update the spill marker which is used to remember the end of a
			 * batch (remember, we prepend nodes) to eventually return back to
			 * the global list (we try to only keep one PV_BATCH_SIZE worth of
			 * nodes in any single per-cpu list).
			 */
			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
		} else if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE * 2) + 1)) {
			/* Spill out excess PVEs to the global PVE array */
			pv_entry_t *spill_head = pmap_cpu_data->pv_free.list->pve_next;
			pv_entry_t *spill_tail = pmap_cpu_data->pv_free_spill_marker;
			/* Unlink the marked batch from the middle of the per-CPU list. */
			pmap_cpu_data->pv_free.list->pve_next = pmap_cpu_data->pv_free_spill_marker->pve_next;
			spill_tail->pve_next = PV_ENTRY_NULL;
			pmap_cpu_data->pv_free.count -= PV_BATCH_SIZE;
			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;

			if (__improbable(pv_free_array_give_batch(spill_head) != KERN_SUCCESS)) {
				/**
				 * This is extremely unlikely to happen, as it would imply that
				 * we have (PV_FREE_ARRAY_SIZE * PV_BATCH_SIZE) PVEs sitting in
				 * the global array. Just in case, push the excess down to the
				 * kernel PVE free list.
				 */
				pv_list_kern_free(spill_head, spill_tail, PV_BATCH_SIZE);
			}
		}
	}

pv_list_free_done:
	enable_preemption();

	return;
}
864
865 /**
866 * Adds a single page to the PVE allocation subsystem.
867 *
868 * @note This function operates under the assumption that a PV_BATCH_SIZE amount
869 * of PVEs can fit within a single page. One page is always allocated for
870 * one batch, so if there's empty space in the page after the batch of
871 * PVEs, it'll go unused (so it's best to keep the batch size at an amount
872 * that utilizes a whole page).
873 *
874 * @param alloc_flags Allocation flags passed to pmap_page_alloc(). See
875 * the definition of that function for a detailed description
876 * of the available flags.
877 *
878 * @return KERN_SUCCESS, or the value returned by pmap_page_alloc() upon
879 * failure.
880 */
881 MARK_AS_PMAP_TEXT static kern_return_t
pve_feed_page(unsigned alloc_flags)882 pve_feed_page(unsigned alloc_flags)
883 {
884 kern_return_t kr = KERN_FAILURE;
885
886 pv_entry_t *pve_head = PV_ENTRY_NULL;
887 pv_entry_t *pve_tail = PV_ENTRY_NULL;
888 pmap_paddr_t pa = 0;
889
890 kr = pmap_page_alloc(&pa, alloc_flags);
891
892 if (kr != KERN_SUCCESS) {
893 return kr;
894 }
895
896 /* Update statistics globals. See the variables' definitions for more info. */
897 os_atomic_inc(&pv_page_count, relaxed);
898 pmap_reserve_replenish_stat += PV_BATCH_SIZE;
899
900 /* Prepare a new list by linking all of the entries in advance. */
901 pve_head = (pv_entry_t *)phystokv(pa);
902 pve_tail = &pve_head[PV_BATCH_SIZE - 1];
903
904 for (int i = 0; i < PV_BATCH_SIZE; i++) {
905 pve_head[i].pve_next = &pve_head[i + 1];
906 }
907 pve_head[PV_BATCH_SIZE - 1].pve_next = PV_ENTRY_NULL;
908
909 /**
910 * Add the new list to the kernel PVE free list if we are running low on
911 * kernel-dedicated entries or the global free array is full.
912 */
913 if ((pv_kern_free.count < pv_kern_low_water_mark) ||
914 (pv_free_array_give_batch(pve_head) != KERN_SUCCESS)) {
915 pv_list_kern_free(pve_head, pve_tail, PV_BATCH_SIZE);
916 }
917
918 return KERN_SUCCESS;
919 }
920
/**
 * Allocate a PV node from one of many different free lists (per-cpu, global, or
 * kernel-specific).
 *
 * @note This function is very tightly coupled with pmap_enter_pv(). If
 *       modifying this code, please ensure that pmap_enter_pv() doesn't break.
 *
 * @note The pmap lock must already be held if the new mapping is a CPU mapping.
 *
 * @note The PVH lock for the physical page that is getting a new mapping
 *       registered must already be held.
 *
 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
 *             an IOMMU translation.
 * @param lock_mode Which state the pmap lock is being held in if the mapping is
 *                  owned by a pmap, otherwise this is a don't care.
 * @param options PMAP_OPTIONS_* family of options passed from the caller.
 * @param pvepp Output parameter that will get updated with a pointer to the
 *              allocated node if none of the free lists are empty, or a pointer
 *              to NULL otherwise. This pointer can't already be pointing to a
 *              valid entry before allocation.
 * @param locked_pvh Input/output parameter pointing to the wrapped value of the
 *                   pv_head_table entry previously obtained from pvh_lock().
 *                   This value will be updated if [locked_pvh->pai] needs to be
 *                   re-locked.
 * @param refcountp Pointer to a reference count that will be temporarily
 *                  atomically incremented in the event that [pmap]'s lock needs
 *                  to be temporarily dropped in order to satisfy the allocation.
 *                  This is typically used to prevent a page table from being
 *                  reclaimed while the lock is dropped. May be NULL.
 *
 * @return These are the possible return values:
 *     PV_ALLOC_SUCCESS: A PVE object was successfully allocated.
 *     PV_ALLOC_FAIL: No objects were available for allocation, and
 *                    allocating a new page failed.
 *     PV_ALLOC_RETRY: No objects were available on the free lists, so a new
 *                     page of PVE objects needed to be allocated. To do that,
 *                     the pmap and PVH locks were dropped. The caller may have
 *                     depended on these locks for consistency, so return and
 *                     let the caller retry the PVE allocation with the locks
 *                     held. Note that the locks have already been re-acquired
 *                     before this function exits.
 */
MARK_AS_PMAP_TEXT pv_alloc_return_t
pv_alloc(
	pmap_t pmap,
	pmap_lock_mode_t lock_mode,
	unsigned int options,
	pv_entry_t **pvepp,
	locked_pvh_t *locked_pvh,
	volatile uint16_t *refcountp)
{
	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
	assert(locked_pvh != NULL);

	if (pmap != NULL) {
		pmap_assert_locked(pmap, lock_mode);
	}

	/* Fast path: try the per-CPU (and then global) free lists first. */
	pv_list_alloc(pvepp);
	if (PV_ENTRY_NULL != *pvepp) {
		return PV_ALLOC_SUCCESS;
	}

	unsigned alloc_flags = 0;

	/**
	 * We got here because both the per-CPU and the global lists are empty. If
	 * this allocation is for the kernel pmap or an IOMMU kernel driver, we try
	 * to get an entry from the kernel list next.
	 */
	if ((pmap == NULL) || (kernel_pmap == pmap)) {
		pv_list_kern_alloc(pvepp);
		if (PV_ENTRY_NULL != *pvepp) {
			return PV_ALLOC_SUCCESS;
		}
	}

	/**
	 * Make sure we have PMAP_PAGE_ALLOCATE_NOWAIT set in alloc_flags when the
	 * input options argument has PMAP_OPTIONS_NOWAIT set.
	 */
	alloc_flags |= (options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;

	/**
	 * We ran out of PV entries all across the board, or this allocation is not
	 * for the kernel. Let's make sure that the kernel list is not too full
	 * (very unlikely), in which case we can rebalance here.
	 */
	if (__improbable(pv_kern_free.count > (PV_BATCH_SIZE * 2))) {
		pmap_simple_lock(&pv_kern_free_list_lock);
		/* Re-check, now that the lock is held. */
		if (pv_kern_free.count > (PV_BATCH_SIZE * 2)) {
			/* Detach one PV_BATCH_SIZE worth of nodes from the kernel list. */
			pv_entry_t *pve_head = pv_kern_free.list;
			pv_entry_t *pve_tail = pve_head;

			for (int i = 0; i < (PV_BATCH_SIZE - 1); i++) {
				pve_tail = pve_tail->pve_next;
			}

			pv_kern_free.list = pve_tail->pve_next;
			pv_kern_free.count -= PV_BATCH_SIZE;
			pve_tail->pve_next = PV_ENTRY_NULL;
			pmap_simple_unlock(&pv_kern_free_list_lock);

			/* Return back every node except the first one to the free lists. */
			pv_list_free(pve_head->pve_next, pve_tail, PV_BATCH_SIZE - 1);
			pve_head->pve_next = PV_ENTRY_NULL;
			*pvepp = pve_head;
			return PV_ALLOC_SUCCESS;
		}
		pmap_simple_unlock(&pv_kern_free_list_lock);
	}

	/**
	 * If all else fails, try to get a new pmap page so that the allocation
	 * succeeds once the caller retries it.
	 */
	kern_return_t kr = KERN_FAILURE;
	pv_alloc_return_t pv_status = PV_ALLOC_FAIL;
	/* Remember the PAI so the PVH lock can be re-acquired after the drop. */
	const unsigned int pai = locked_pvh->pai;

	/**
	 * Drop the lock during page allocation since that can take a while and
	 * because preemption must be enabled when attempting to allocate memory
	 * from the VM (which requires grabbing a mutex).
	 */
	pvh_unlock(locked_pvh);
	if (pmap != NULL) {
		/**
		 * Bump the provided refcount before we drop the pmap lock in order to prevent
		 * page table reclamation while the lock is dropped.
		 */
		if (__improbable((refcountp != NULL) && (os_atomic_inc_orig(refcountp, relaxed) == UINT16_MAX))) {
			panic("%s: pmap %p refcount %p overflow", __func__, pmap, refcountp);
		}
		pmap_unlock(pmap, lock_mode);
	}

	if ((kr = pve_feed_page(alloc_flags)) == KERN_SUCCESS) {
		/**
		 * Since the lock was dropped, even though we successfully allocated a
		 * new page to be used for PVE nodes, the code that relies on this
		 * function might have depended on the lock being held for consistency,
		 * so return out early and let them retry the allocation with the lock
		 * re-held.
		 */
		pv_status = PV_ALLOC_RETRY;
	} else {
		pv_status = PV_ALLOC_FAIL;
	}

	if (pmap != NULL) {
		pmap_lock(pmap, lock_mode);
		/* Drop the temporary refcount taken above now that the pmap lock is re-held. */
		if (__improbable((refcountp != NULL) && (os_atomic_dec_orig(refcountp, relaxed) == 0))) {
			panic("%s: pmap %p refcount %p underflow", __func__, pmap, refcountp);
		}
	}

	/**
	 * Re-acquire the PVH lock. Use the nopreempt variant when the caller needs
	 * preemption to remain disabled across this call.
	 */
	if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
		*locked_pvh = pvh_lock_nopreempt(pai);
	} else {
		*locked_pvh = pvh_lock(pai);
	}

	/* Ensure that no node was created if we're not returning successfully. */
	assert(*pvepp == PV_ENTRY_NULL);

	return pv_status;
}
1091
1092 /**
1093 * Utility function for freeing a single PVE object back to the free lists.
1094 *
1095 * @param pvep Pointer to the PVE object to free.
1096 */
1097 MARK_AS_PMAP_TEXT void
pv_free(pv_entry_t * pvep)1098 pv_free(pv_entry_t *pvep)
1099 {
1100 assert(pvep != PV_ENTRY_NULL);
1101
1102 pv_list_free(pvep, pvep, 1);
1103 }
1104
1105 /**
1106 * This function provides a mechanism for the device tree to override the
1107 * default PV allocation amounts and the watermark level which determines how
1108 * many PVE objects are kept in the kernel-dedicated free list.
1109 */
1110 MARK_AS_PMAP_TEXT void
pmap_compute_pv_targets(void)1111 pmap_compute_pv_targets(void)
1112 {
1113 DTEntry entry = NULL;
1114 void const *prop = NULL;
1115 int err = 0;
1116 unsigned int prop_size = 0;
1117
1118 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1119 assert(err == kSuccess);
1120
1121 if (kSuccess == SecureDTGetProperty(entry, "pmap-pv-count", &prop, &prop_size)) {
1122 if (prop_size != sizeof(pv_alloc_initial_target)) {
1123 panic("pmap-pv-count property is not a 32-bit integer");
1124 }
1125 pv_alloc_initial_target = *((uint32_t const *)prop);
1126 }
1127
1128 if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-count", &prop, &prop_size)) {
1129 if (prop_size != sizeof(pv_kern_alloc_initial_target)) {
1130 panic("pmap-kern-pv-count property is not a 32-bit integer");
1131 }
1132 pv_kern_alloc_initial_target = *((uint32_t const *)prop);
1133 }
1134
1135 if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-min", &prop, &prop_size)) {
1136 if (prop_size != sizeof(pv_kern_low_water_mark)) {
1137 panic("pmap-kern-pv-min property is not a 32-bit integer");
1138 }
1139 pv_kern_low_water_mark = *((uint32_t const *)prop);
1140 }
1141 }
1142
/**
 * Stub kept for interface compatibility with other platforms. Elsewhere this
 * would tune the number of available PVE objects, but on this platform that
 * tuning happens dynamically at runtime, leaving nothing to do here.
 */
void
mapping_adjust(void)
{
	/* Intentionally a no-op on arm/arm64. */
}
1152
1153 /**
1154 * Creates a target number of free pv_entry_t objects for the kernel free list
1155 * and the general free list.
1156 *
1157 * @note This function is called once during early boot, in kernel_bootstrap().
1158 *
1159 * @return KERN_SUCCESS if the objects were successfully allocated, or the
1160 * return value from pve_feed_page() on failure (could be caused by not
1161 * being able to allocate a page).
1162 */
1163 MARK_AS_PMAP_TEXT kern_return_t
mapping_free_prime_internal(void)1164 mapping_free_prime_internal(void)
1165 {
1166 kern_return_t kr = KERN_FAILURE;
1167
1168 /*
1169 * We do not need to hold the pv_free_array lock to calculate the number of
1170 * elements in it because no other core is running at this point.
1171 */
1172 while (((pv_free_array_n_elems() * PV_BATCH_SIZE) < pv_alloc_initial_target) ||
1173 (pv_kern_free.count < pv_kern_alloc_initial_target)) {
1174 if ((kr = pve_feed_page(0)) != KERN_SUCCESS) {
1175 return kr;
1176 }
1177 }
1178
1179 return KERN_SUCCESS;
1180 }
1181
/**
 * Helper function for pmap_enter_pv (hereby shortened to "pepv") which converts
 * a PVH entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP which will transform the
 * entry into a linked list of mappings.
 *
 * @note This should only be called from pmap_enter_pv().
 *
 * @note The PVH lock for the passed in page must already be held and the type
 *       must be PVH_TYPE_PTEP (wouldn't make sense to call this otherwise).
 *
 * @param pmap Either the pmap that owns the mapping being registered in
 *             pmap_enter_pv(), or NULL if this is an IOMMU mapping.
 * @param lock_mode Which state the pmap lock is being held in if the mapping is
 *                  owned by a pmap, otherwise this is a don't care.
 * @param options PMAP_OPTIONS_* family of options.
 * @param locked_pvh Input/output parameter pointing to the wrapped value of the
 *                   pv_head_table entry previously obtained from pvh_lock().
 *                   This value will be updated if [locked_pvh->pai] needs to be
 *                   re-locked or if the allocation is successful and the PVH
 *                   entry is updated with the new PVE pointer.
 *
 * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully converted
 *         into PVH_TYPE_PVEP, or the return value of pv_alloc() otherwise. See
 *         pv_alloc()'s function header for a detailed explanation of the
 *         possible return values.
 */
MARK_AS_PMAP_TEXT static pv_alloc_return_t
pepv_convert_ptep_to_pvep(
	pmap_t pmap,
	pmap_lock_mode_t lock_mode,
	unsigned int options,
	locked_pvh_t *locked_pvh)
{
	assert(locked_pvh != NULL);
	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP));

	pv_entry_t *pvep = PV_ENTRY_NULL;
	/* Note: pv_alloc() may drop and re-acquire the PVH/pmap locks (PV_ALLOC_RETRY). */
	pv_alloc_return_t ret = pv_alloc(pmap, lock_mode, options, &pvep, locked_pvh, NULL);
	if (ret != PV_ALLOC_SUCCESS) {
		return ret;
	}

	const unsigned int pai = locked_pvh->pai;

	/* If we've gotten this far then a node should've been allocated. */
	assert(pvep != PV_ENTRY_NULL);

	/* The new PVE should have the same PTE pointer as the previous PVH entry. */
	pve_init(pvep);
	pve_set_ptep(pvep, 0, pvh_ptep(locked_pvh->pvh));

	/* A freshly initialized PVE must not carry stale accounting bits. */
	assert(!pve_get_internal(pvep, 0));
	assert(!pve_get_altacct(pvep, 0));
	if (ppattr_is_internal(pai)) {
		/**
		 * Transfer "internal" status from pp_attr to this pve. See the comment
		 * above PP_ATTR_INTERNAL for more information on this.
		 */
		ppattr_clear_internal(pai);
		pve_set_internal(pvep, 0);
	}
	if (ppattr_is_altacct(pai)) {
		/**
		 * Transfer "altacct" status from pp_attr to this pve. See the comment
		 * above PP_ATTR_ALTACCT for more information on this.
		 */
		ppattr_clear_altacct(pai);
		pve_set_altacct(pvep, 0);
	}

	/* Install the single-element PVE list as the new PV head. */
	pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);

	return PV_ALLOC_SUCCESS;
}
1256
/**
 * Register a new mapping into the pv_head_table. This is the main data
 * structure used for performing a reverse physical to virtual translation and
 * finding all mappings to a physical page. Whenever a new page table mapping is
 * created (regardless of whether it's for a CPU or an IOMMU), it should be
 * registered with a call to this function.
 *
 * @note The pmap lock must already be held if the new mapping is a CPU mapping.
 *
 * @note The PVH lock for the physical page that is getting a new mapping
 *       registered must already be held.
 *
 * @note This function cannot be called during the hibernation process because
 *       it modifies critical pmap data structures that need to be dumped into
 *       the hibernation image in a consistent state.
 *
 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
 *             an IOMMU translation.
 * @param ptep The new mapping to register.
 * @param options Flags that can potentially be set on a per-page basis:
 *     PMAP_OPTIONS_INTERNAL: If this is the first CPU mapping, then
 *         mark the page as being "internal". See the definition of
 *         PP_ATTR_INTERNAL for more info.
 *     PMAP_OPTIONS_REUSABLE: If this is the first CPU mapping, and
 *         this page is also marked internal, then mark the page as
 *         being "reusable". See the definition of PP_ATTR_REUSABLE
 *         for more info.
 * @param lock_mode Which state the pmap lock is being held in if the mapping is
 *                  owned by a pmap, otherwise this is a don't care.
 * @param locked_pvh Input/output parameter pointing to the wrapped value of the
 *                   pv_head_table entry previously obtained from pvh_lock().
 *                   If the registration is successful, locked_pvh->pvh will be
 *                   updated to reflect the new PV list head.
 * @param new_pvepp An output parameter that is updated with a pointer to the
 *                  PVE object where the PTEP was allocated into. In the event
 *                  of failure, or if the pointer passed in is NULL,
 *                  it's not modified.
 * @param new_pve_ptep_idx An output parameter that is updated with the index
 *                  into the PVE object where the PTEP was allocated into.
 *                  In the event of failure, or if new_pvepp in is NULL,
 *                  it's not modified.
 *
 * @return PV_ALLOC_SUCCESS if the entry at [locked_pvh->pai] was successfully
 *         updated with the new mapping, or the return value of pv_alloc()
 *         otherwise. See pv_alloc()'s function header for a detailed explanation
 *         of the possible return values.
 */
MARK_AS_PMAP_TEXT pv_alloc_return_t
pmap_enter_pv(
	pmap_t pmap,
	pt_entry_t *ptep,
	unsigned int options,
	pmap_lock_mode_t lock_mode,
	locked_pvh_t *locked_pvh,
	pv_entry_t **new_pvepp,
	int *new_pve_ptep_idx)
{
	assert(ptep != PT_ENTRY_NULL);
	assert(locked_pvh != NULL);

	bool first_cpu_mapping = false;

	PMAP_ASSERT_NOT_WRITING_HIB();

	if (pmap != NULL) {
		pmap_assert_locked(pmap, lock_mode);
	}

	uintptr_t pvh_flags = pvh_get_flags(locked_pvh->pvh);
	const unsigned int pai = locked_pvh->pai;


	/**
	 * An IOMMU mapping may already be present for a page that hasn't yet had a
	 * CPU mapping established, so we use PVH_FLAG_CPU to determine if this is
	 * the first CPU mapping. We base internal/reusable accounting on the
	 * options specified for the first CPU mapping. PVH_FLAG_CPU, and thus this
	 * accounting, will then persist as long as there are *any* mappings of the
	 * page. The accounting for a page should not need to change until the page
	 * is recycled by the VM layer, and we assert that there are no mappings
	 * when a page is recycled. An IOMMU mapping of a freed/recycled page is
	 * considered a security violation & potential DMA corruption path.
	 */
	first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU));
	if (first_cpu_mapping) {
		pvh_flags |= PVH_FLAG_CPU;
		pvh_set_flags(locked_pvh, pvh_flags);
	}

	/**
	 * Internal/reusable flags are based on the first CPU mapping made to a
	 * page. These will persist until all mappings to the page are removed.
	 */
	if (first_cpu_mapping) {
		if ((options & PMAP_OPTIONS_INTERNAL) &&
		    (options & PMAP_OPTIONS_REUSABLE)) {
			ppattr_set_reusable(pai);
		} else {
			ppattr_clear_reusable(pai);
		}
	}

	/* Visit the definitions for the PVH_TYPEs to learn more about each one. */
	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_NULL)) {
		/* If this is the first mapping, upgrade the type to store a single PTEP. */
		pvh_update_head(locked_pvh, ptep, PVH_TYPE_PTEP);
	} else {
		pv_alloc_return_t ret = PV_ALLOC_FAIL;

		if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP)) {
			/**
			 * There was already a single mapping to the page. Convert the PVH
			 * entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP so that multiple
			 * mappings can be tracked. If PVEs cannot hold more than a single
			 * mapping, a second PVE will be added farther down.
			 */
			if ((ret = pepv_convert_ptep_to_pvep(pmap, lock_mode, options, locked_pvh)) != PV_ALLOC_SUCCESS) {
				return ret;
			}

			/**
			 * At this point, the PVH flags have been clobbered due to updating
			 * PTEP->PVEP, but that's ok because the locks are being held and
			 * the flags will get set again below before pv_alloc() is called
			 * and the locks are potentially dropped again.
			 */
		} else if (__improbable(!pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP))) {
			panic("%s: unexpected PV head %p, ptep=%p pmap=%p",
			    __func__, (void*)locked_pvh->pvh, ptep, pmap);
		}

		/**
		 * Check if we have room for one more mapping in this PVE
		 */
		pv_entry_t *pvep = pvh_pve_list(locked_pvh->pvh);
		assert(pvep != PV_ENTRY_NULL);

		/* Look for an empty PTE slot in the PVE at the head of the list. */
		int pve_ptep_idx = pve_find_ptep_index(pvep, PT_ENTRY_NULL);

		if (pve_ptep_idx == -1) {
			/**
			 * Set up the pv_entry for this new mapping and then add it to the list
			 * for this physical page.
			 */
			pve_ptep_idx = 0;
			pvep = PV_ENTRY_NULL;
			if ((ret = pv_alloc(pmap, lock_mode, options, &pvep, locked_pvh, NULL)) != PV_ALLOC_SUCCESS) {
				return ret;
			}

			/* If we've gotten this far then a node should've been allocated. */
			assert(pvep != PV_ENTRY_NULL);
			pve_init(pvep);
			pve_add(locked_pvh, pvep);
		}

		/* Record the new mapping in the chosen PVE slot. */
		pve_set_ptep(pvep, pve_ptep_idx, ptep);

		/*
		 * The PTEP was successfully entered into the PVE object.
		 * If the caller requests it, set new_pvepp and new_pve_ptep_idx
		 * appropriately.
		 */
		if (new_pvepp != NULL) {
			*new_pvepp = pvep;
			*new_pve_ptep_idx = pve_ptep_idx;
		}
	}

	return PV_ALLOC_SUCCESS;
}
1428
/**
 * Remove a mapping that was registered with the pv_head_table. This needs to be
 * done for every mapping that was previously registered using pmap_enter_pv()
 * when the mapping is removed.
 *
 * @note The PVH lock for the physical page that is getting a new mapping
 *       registered must already be held.
 *
 * @note This function cannot be called during the hibernation process because
 *       it modifies critical pmap data structures that need to be dumped into
 *       the hibernation image in a consistent state.
 *
 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
 *             an IOMMU translation.
 * @param ptep The mapping that's getting removed.
 * @param locked_pvh Input/output parameter pointing to the wrapped value of the
 *                   pv_head_table entry previously obtained from pvh_lock().
 *                   If the removal is successful, locked_pvh->pvh may be updated
 *                   to reflect a new PV list head.
 * @param is_internal_p The internal bit of the PTE that was removed.
 * @param is_altacct_p The altacct bit of the PTE that was removed.
 * @return These are the possible return values:
 *     PV_REMOVE_SUCCESS: A PV entry matching the PTE was found and
 *                        removed.
 *     PV_REMOVE_FAIL: No matching PV entry was found. This may not be a fatal
 *                     condition; for example, pmap_disconnect() on another
 *                     thread may have removed the PV entry between removal
 *                     of the mapping and acquisition of the PV lock in
 *                     pmap_remove();
 */
pv_remove_return_t
pmap_remove_pv(
	pmap_t pmap __assert_only,
	pt_entry_t *ptep,
	locked_pvh_t *locked_pvh,
	bool *is_internal_p,
	bool *is_altacct_p)
{
	PMAP_ASSERT_NOT_WRITING_HIB();
	assert(locked_pvh != NULL);

	pv_remove_return_t ret = PV_REMOVE_SUCCESS;
	const unsigned int pai = locked_pvh->pai;
	bool is_internal = false;
	bool is_altacct = false;


	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH must point at exactly this PTE. */
		if (__improbable((ptep != pvh_ptep(locked_pvh->pvh)))) {
			return PV_REMOVE_FAIL;
		}

		pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
		/* In the PTEP case, internal/altacct state lives in pp_attr, not a PVE. */
		is_internal = ppattr_is_internal(pai);
		is_altacct = ppattr_is_altacct(pai);
	} else if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP)) {
		pv_entry_t **pvepp = NULL;
		pv_entry_t *pvep = pvh_pve_list(locked_pvh->pvh);
		assert(pvep != PV_ENTRY_NULL);
		unsigned int npves = 0;
		int pve_pte_idx = 0;
		/* Find the PVE that represents the mapping we're removing. */
		while ((pvep != PV_ENTRY_NULL) && ((pve_pte_idx = pve_find_ptep_index(pvep, ptep)) == -1)) {
			/* On very long chains, downgrade to a sleepable lock to avoid starving other CPUs. */
			if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
				pvh_lock_enter_sleep_mode(locked_pvh);
			}
			pvepp = pve_next_ptr(pvep);
			pvep = pve_next(pvep);
			npves++;
		}

		if (__improbable((pvep == PV_ENTRY_NULL))) {
			return PV_REMOVE_FAIL;
		}

		is_internal = pve_get_internal(pvep, pve_pte_idx);
		is_altacct = pve_get_altacct(pvep, pve_pte_idx);
		pve_set_ptep(pvep, pve_pte_idx, PT_ENTRY_NULL);

#if MACH_ASSERT
		/**
		 * Ensure that the mapping didn't accidentally have multiple PVEs
		 * associated with it (there should only be one PVE per mapping). This
		 * checking only occurs on configurations that can accept the perf hit
		 * that walking the PVE chain on every unmap entails.
		 *
		 * This is skipped for IOMMU mappings because some IOMMUs don't use
		 * normal page tables (e.g., NVMe) to map pages, so the `ptep` field in
		 * the associated PVE won't actually point to a real page table (see the
		 * definition of PVH_FLAG_IOMMU_TABLE for more info). Because of that,
		 * it's perfectly possible for duplicate IOMMU PVEs to exist.
		 */
		if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
			pv_entry_t *check_pvep = pvep;

			do {
				if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
					pvh_lock_enter_sleep_mode(locked_pvh);
				}
				if (pve_find_ptep_index(check_pvep, ptep) != -1) {
					panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
					    "pvep=%p, pai=0x%x", __func__, ptep, pmap,
					    (void*)locked_pvh->pvh, pvep, pai);
				}
				npves++;
			} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
		}
#endif /* MACH_ASSERT */

		/* pvepp is only set when the loop above advanced past the list head. */
		const bool pve_is_first = (pvepp == NULL);
		const bool pve_is_last = (pve_next(pvep) == PV_ENTRY_NULL);
		/* With two PTE slots per PVE, the "other" slot index is the logical NOT. */
		const int other_pte_idx = !pve_pte_idx;

		if (pve_is_empty(pvep)) {
			/*
			 * This PVE doesn't contain any mappings. We can get rid of it.
			 */
			pve_remove(locked_pvh, pvepp, pvep);
			pv_free(pvep);
		} else if (!pve_is_first) {
			/*
			 * This PVE contains a single mapping. See if we can coalesce it with the one
			 * at the top of the list.
			 */
			pv_entry_t *head_pvep = pvh_pve_list(locked_pvh->pvh);
			int head_pve_pte_empty_idx;
			if ((head_pve_pte_empty_idx = pve_find_ptep_index(head_pvep, PT_ENTRY_NULL)) != -1) {
				/* Migrate the remaining mapping (and its accounting bits) into the head PVE. */
				pve_set_ptep(head_pvep, head_pve_pte_empty_idx, pve_get_ptep(pvep, other_pte_idx));
				if (pve_get_internal(pvep, other_pte_idx)) {
					pve_set_internal(head_pvep, head_pve_pte_empty_idx);
				}
				if (pve_get_altacct(pvep, other_pte_idx)) {
					pve_set_altacct(head_pvep, head_pve_pte_empty_idx);
				}
				pve_remove(locked_pvh, pvepp, pvep);
				pv_free(pvep);
			} else {
				/*
				 * We could not coalesce it. Move it to the start of the list, so that it
				 * can be coalesced against in the future.
				 */
				*pvepp = pve_next(pvep);
				pve_add(locked_pvh, pvep);
			}
		} else if (pve_is_first && pve_is_last) {
			/*
			 * This PVE contains a single mapping, and it's the last mapping for this PAI.
			 * Collapse this list back into the head, turning it into a PVH_TYPE_PTEP entry.
			 */
			assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
			    __func__, (void*)locked_pvh->pvh, pvep);
			pvh_update_head(locked_pvh, pve_get_ptep(pvep, other_pte_idx), PVH_TYPE_PTEP);
			/* Accounting moves back from the PVE to pp_attr for the PTEP representation. */
			pp_attr_t attrs_to_set = 0;
			if (pve_get_internal(pvep, other_pte_idx)) {
				attrs_to_set |= PP_ATTR_INTERNAL;
			}
			if (pve_get_altacct(pvep, other_pte_idx)) {
				attrs_to_set |= PP_ATTR_ALTACCT;
			}
			if (attrs_to_set != 0) {
				ppattr_modify_bits(pai, 0, attrs_to_set);
			}
			pv_free(pvep);
		}
	} else {
		/*
		 * A concurrent disconnect operation may have already cleared the PVH to PVH_TYPE_NULL.
		 * It's also possible that a subsequent page table allocation may have transitioned
		 * the PVH to PVH_TYPE_PTDP.
		 */
		return PV_REMOVE_FAIL;
	}

	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_NULL)) {
		/* The last mapping is gone: reset flags and per-page attributes. */
		pvh_set_flags(locked_pvh, 0);
		const pmap_paddr_t pa = pai_to_pa(pai);
		pmap_prepare_unmapped_page_for_retype(pa);
		pp_attr_t attrs_to_clear = 0;
		if (is_internal) {
			attrs_to_clear |= PP_ATTR_INTERNAL;
		}
		if (is_altacct) {
			attrs_to_clear |= PP_ATTR_ALTACCT;
		}
		if (attrs_to_clear != 0) {
			ppattr_modify_bits(pai, attrs_to_clear, 0);
		}
		/* If removing the last mapping to a specially-protected page, retype the page back to XNU_DEFAULT. */
		pmap_retype_unmapped_page(pa);
	}

	*is_internal_p = is_internal;
	*is_altacct_p = is_altacct;
	return ret;
}
1624
1625 /**
1626 * Bootstrap the initial Page Table Descriptor (PTD) node free list.
1627 *
1628 * @note It's not safe to allocate PTD nodes until after this function is
1629 * invoked.
1630 *
1631 * @note The maximum number of PTD objects that can reside within one page
1632 * (`ptd_per_page`) must have already been calculated before calling this
1633 * function.
1634 *
1635 * @param ptdp Pointer to the virtually-contiguous memory used for the initial
1636 * free list.
1637 * @param num_pages The number of virtually-contiguous pages pointed to by
1638 * `ptdp` that will be used to prime the PTD allocator.
1639 */
1640 MARK_AS_PMAP_TEXT void
ptd_bootstrap(pt_desc_t * ptdp,unsigned int num_pages)1641 ptd_bootstrap(pt_desc_t *ptdp, unsigned int num_pages)
1642 {
1643 assert(ptd_per_page > 0);
1644 assert((ptdp != NULL) && (((uintptr_t)ptdp & PAGE_MASK) == 0) && (num_pages > 0));
1645
1646 /**
1647 * Region represented by ptdp should be cleared by pmap_bootstrap().
1648 *
1649 * Only part of each page is being used for PTD objects (the rest is used
1650 * for each PTD's associated ptd_info_t object) so link together the last
1651 * PTD element of each page to the first element of the previous page.
1652 */
1653 for (int i = 0; i < num_pages; i++) {
1654 *((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
1655 ptd_free_list = ptdp;
1656 ptdp = (void *)(((uint8_t *)ptdp) + PAGE_SIZE);
1657 }
1658
1659 ptd_free_count = num_pages * ptd_per_page;
1660 simple_lock_init(&ptd_free_list_lock, 0);
1661 }
1662
1663 /**
1664 * Allocate a page table descriptor (PTD) object from the PTD free list, but
1665 * don't add it to the list of reclaimable userspace page table pages just yet
1666 * and don't associate the PTD with a specific pmap (that's what "unlinked"
1667 * means here).
1668 *
1669 * @param alloc_flags Allocation flags passed to pmap_page_alloc(). See the
1670 * definition of that function for a detailed description of
1671 * the available flags.
1672 *
1673 * @return The page table descriptor object if the allocation was successful, or
1674 * NULL otherwise (which indicates that a page failed to be allocated
1675 * for new nodes).
1676 */
1677 MARK_AS_PMAP_TEXT pt_desc_t*
ptd_alloc_unlinked(unsigned int alloc_flags)1678 ptd_alloc_unlinked(unsigned int alloc_flags)
1679 {
1680 pt_desc_t *ptdp = PTD_ENTRY_NULL;
1681
1682 pmap_simple_lock(&ptd_free_list_lock);
1683
1684 assert(ptd_per_page != 0);
1685
1686 /**
1687 * Ensure that we either have a free list with nodes available, or a
1688 * completely empty list to allocate and prepend new nodes to.
1689 */
1690 assert(((ptd_free_list != NULL) && (ptd_free_count > 0)) ||
1691 ((ptd_free_list == NULL) && (ptd_free_count == 0)));
1692
1693 if (__improbable(ptd_free_count == 0)) {
1694 pmap_paddr_t pa = 0;
1695
1696 /**
1697 * Drop the lock while allocating pages since that can take a while and
1698 * because preemption has to be enabled when allocating memory.
1699 */
1700 pmap_simple_unlock(&ptd_free_list_lock);
1701
1702 if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
1703 return NULL;
1704 }
1705 ptdp = (pt_desc_t *)phystokv(pa);
1706
1707 pmap_simple_lock(&ptd_free_list_lock);
1708 ptd_page_count++;
1709
1710 /**
1711 * Since the lock was dropped while allocating, it's possible another
1712 * CPU already allocated a page. To be safe, prepend the current free
1713 * list (which may or may not be empty now) to the page of nodes just
1714 * allocated and update the head to point to these new nodes.
1715 */
1716 *((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
1717 ptd_free_list = ptdp;
1718 ptd_free_count += ptd_per_page;
1719 }
1720
1721 /* There should be available nodes at this point. */
1722 if (__improbable((ptd_free_count == 0) || (ptd_free_list == PTD_ENTRY_NULL))) {
1723 panic_plain("%s: out of PTD entries and for some reason didn't "
1724 "allocate more %d %p", __func__, ptd_free_count, ptd_free_list);
1725 }
1726
1727 /* Grab the top node off of the free list to return later. */
1728 ptdp = ptd_free_list;
1729
1730 /**
1731 * Advance the free list to the next node.
1732 *
1733 * Each free pt_desc_t-sized object in this free list uses the first few
1734 * bytes of the object to point to the next object in the list. When an
1735 * object is deallocated (in ptd_deallocate()) the object is prepended onto
1736 * the free list by setting its first few bytes to point to the current free
1737 * list head. Then the head is updated to point to that object.
1738 *
1739 * When a new page is allocated for PTD nodes, it's left zeroed out. Once we
1740 * use up all of the previously deallocated nodes, the list will point
1741 * somewhere into the last allocated, empty page. We know we're pointing at
1742 * this page because the first few bytes of the object will be NULL. In
1743 * that case just set the head to this empty object.
1744 *
1745 * This empty page can be thought of as a "reserve" of empty nodes for the
1746 * case where more nodes are being allocated than there are nodes being
1747 * deallocated.
1748 */
1749 pt_desc_t *const next_node = (pt_desc_t *)(*(void **)ptd_free_list);
1750
1751 /**
1752 * If the next node in the list is NULL but there are supposed to still be
1753 * nodes left, then we've hit the previously allocated empty page of nodes.
1754 * Go ahead and advance the free list to the next free node in that page.
1755 */
1756 if ((next_node == PTD_ENTRY_NULL) && (ptd_free_count > 1)) {
1757 ptd_free_list = ptd_free_list + 1;
1758 } else {
1759 ptd_free_list = next_node;
1760 }
1761
1762 ptd_free_count--;
1763
1764 pmap_simple_unlock(&ptd_free_list_lock);
1765
1766 ptdp->pmap = NULL;
1767
1768 /**
1769 * Calculate and stash the address of the ptd_info_t associated with this
1770 * PTD. This can be done easily because both structures co-exist in the same
1771 * page, with ptd_info_t's starting at a given offset from the start of the
1772 * page.
1773 *
1774 * Each PTD is associated with a ptd_info_t of the same index. For example,
1775 * the 15th PTD will use the 15th ptd_info_t in the same page.
1776 */
1777 const unsigned ptd_index = ((uintptr_t)ptdp & PAGE_MASK) / sizeof(pt_desc_t);
1778 assert(ptd_index < ptd_per_page);
1779
1780 const uintptr_t start_of_page = (uintptr_t)ptdp & ~PAGE_MASK;
1781 ptd_info_t *first_ptd_info = (ptd_info_t *)(start_of_page + ptd_info_offset);
1782 ptdp->ptd_info = &first_ptd_info[ptd_index];
1783
1784 ptdp->va = (vm_offset_t)-1;
1785 ptdp->ptd_info->wiredcnt = 0;
1786
1787 return ptdp;
1788 }
1789
1790 /**
1791 * Allocate a single page table descriptor (PTD) object.
1792 *
1793 * @param pmap The pmap object that will be owning the page table(s) that this
1794 * descriptor object represents.
1795 * @param alloc_flags Allocation flags passed to ptd_alloc_unlinked(). See the
1796 * definition of that function for a detailed description of
1797 * the available flags.
1798 *
1799 * @return The allocated PTD object, or NULL if one failed to get allocated
1800 * (which indicates that memory wasn't able to get allocated).
1801 */
1802 MARK_AS_PMAP_TEXT pt_desc_t*
ptd_alloc(pmap_t pmap,unsigned int alloc_flags)1803 ptd_alloc(pmap_t pmap, unsigned int alloc_flags)
1804 {
1805 pt_desc_t *ptdp = ptd_alloc_unlinked(alloc_flags);
1806
1807 if (ptdp == NULL) {
1808 return NULL;
1809 }
1810
1811 ptdp->pmap = pmap;
1812
1813 pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
1814 return ptdp;
1815 }
1816
1817 /**
1818 * Deallocate a single page table descriptor (PTD) object.
1819 *
1820 * @note Ledger statistics are tracked on a per-pmap basis, so for those pages
1821 * which are not associated with any specific pmap (e.g., IOMMU pages),
1822 * the caller must ensure that the pmap/iommu field in the PTD object is
1823 * NULL before calling this function.
1824 *
1825 * @param ptdp Pointer to the PTD object to deallocate.
1826 */
1827 MARK_AS_PMAP_TEXT void
ptd_deallocate(pt_desc_t * ptdp)1828 ptd_deallocate(pt_desc_t *ptdp)
1829 {
1830 pmap_t pmap = ptdp->pmap;
1831
1832 /* Prepend the deallocated node to the free list. */
1833 pmap_simple_lock(&ptd_free_list_lock);
1834 (*(void **)ptdp) = (void *)ptd_free_list;
1835 ptd_free_list = (pt_desc_t *)ptdp;
1836 ptd_free_count++;
1837 pmap_simple_unlock(&ptd_free_list_lock);
1838
1839 /**
1840 * If this PTD was being used to represent an IOMMU page then there won't be
1841 * an associated pmap, and therefore no ledger statistics to update.
1842 */
1843 if ((uintptr_t)pmap != IOMMU_INSTANCE_NULL) {
1844 pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
1845 }
1846 }
1847
1848 /**
1849 * In address spaces where the VM page size is larger than the underlying
1850 * hardware page size, one page table descriptor (PTD) object can represent
1851 * multiple page tables. Some fields (like the reference counts) still need to
1852 * be tracked on a per-page-table basis. Because of this, those values are
1853 * stored in a separate array of ptd_info_t objects within the PTD where there's
1854 * one ptd_info_t for every page table a single PTD can manage.
1855 *
1856 * This function initializes the correct ptd_info_t field within a PTD based on
1857 * the page table it's representing.
1858 *
1859 * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to
1860 * update. Must match up with the `pmap` and `ptep` parameters.
1861 * @param pmap The pmap that owns the page table managed by the passed in PTD.
1862 * @param va Any virtual address that resides within the virtual address space
1863 * being mapped by the page table pointed to by `ptep`.
1864 * @param level The level in the page table hierarchy that the table resides.
1865 * @param ptep A pointer into a page table that the passed in PTD manages. This
1866 * page table must be owned by `pmap` and be the PTE that maps `va`.
1867 */
1868 MARK_AS_PMAP_TEXT void
ptd_info_init(pt_desc_t * ptdp,pmap_t pmap,vm_map_address_t va,unsigned int level,pt_entry_t * ptep)1869 ptd_info_init(
1870 pt_desc_t *ptdp,
1871 pmap_t pmap,
1872 vm_map_address_t va,
1873 unsigned int level,
1874 pt_entry_t *ptep)
1875 {
1876 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1877
1878 if (ptdp->pmap != pmap) {
1879 panic("%s: pmap mismatch, ptdp=%p, pmap=%p, va=%p, level=%u, ptep=%p",
1880 __func__, ptdp, pmap, (void*)va, level, ptep);
1881 }
1882
1883 /**
1884 * Root tables are managed separately, and can be accessed through the
1885 * pmap structure itself (there's only one root table per address space).
1886 */
1887 assert(level > pt_attr_root_level(pt_attr));
1888
1889 /**
1890 * The "va" field represents the first virtual address that this page table
1891 * is translating for. Naturally, this is dependent on the level the page
1892 * table resides at since more VA space is mapped the closer the page
1893 * table's level is to the root.
1894 */
1895 ptdp->va = (vm_offset_t) va & ~pt_attr_ln_pt_offmask(pt_attr, level - 1);
1896 }
1897
1898 /**
1899 * Credit a specific ledger entry within the passed in pmap's ledger object.
1900 *
1901 * @param pmap The pmap whose ledger should be updated.
1902 * @param entry The specifc ledger entry to update. This needs to be one of the
1903 * task_ledger entries.
1904 * @param amount The amount to credit from the ledger.
1905 *
1906 * @return The return value from the credit operation.
1907 */
1908 kern_return_t
pmap_ledger_credit(pmap_t pmap,int entry,ledger_amount_t amount)1909 pmap_ledger_credit(pmap_t pmap, int entry, ledger_amount_t amount)
1910 {
1911 assert(pmap != NULL);
1912
1913 return ledger_credit(pmap->ledger, entry, amount);
1914 }
1915
1916 /**
1917 * Debit a specific ledger entry within the passed in pmap's ledger object.
1918 *
1919 * @param pmap The pmap whose ledger should be updated.
1920 * @param entry The specifc ledger entry to update. This needs to be one of the
1921 * task_ledger entries.
1922 * @param amount The amount to debit from the ledger.
1923 *
1924 * @return The return value from the debit operation.
1925 */
1926 kern_return_t
pmap_ledger_debit(pmap_t pmap,int entry,ledger_amount_t amount)1927 pmap_ledger_debit(pmap_t pmap, int entry, ledger_amount_t amount)
1928 {
1929 assert(pmap != NULL);
1930
1931 return ledger_debit(pmap->ledger, entry, amount);
1932 }
1933
1934 /**
1935 * Validate that the pointer passed into this method is a valid pmap object.
1936 *
1937 * @param pmap The pointer to validate.
1938 * @param func The stringized function name of the caller that will be printed
1939 * in the case that the validation fails.
1940 */
1941 void
validate_pmap_internal(const volatile struct pmap * pmap,const char * func)1942 validate_pmap_internal(const volatile struct pmap *pmap, const char *func)
1943 {
1944 #pragma unused(pmap, func)
1945 assert(pmap != NULL);
1946 }
1947
1948 /**
1949 * Validate that the pointer passed into this method is a valid pmap object and
1950 * is safe to both read and write.
1951 *
1952 * @param pmap The pointer to validate.
1953 * @param func The stringized function name of the caller that will be printed
1954 * in the case that the validation fails.
1955 */
1956 void
validate_pmap_mutable_internal(const volatile struct pmap * pmap,const char * func)1957 validate_pmap_mutable_internal(const volatile struct pmap *pmap, const char *func)
1958 {
1959 #pragma unused(pmap, func)
1960 assert(pmap != NULL);
1961 }
1962
1963 /**
1964 * Validate that the passed in pmap pointer is a pmap object that was allocated
1965 * by the pmap and not just random memory.
1966 *
1967 * This function will panic if the validation fails.
1968 *
1969 * @param pmap The object to validate.
1970 */
1971 void
pmap_require(pmap_t pmap)1972 pmap_require(pmap_t pmap)
1973 {
1974 if (pmap != kernel_pmap) {
1975 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
1976 }
1977 }
1978
1979 /**
1980 * Helper function used when sorting and searching SPTM/PPL I/O ranges.
1981 *
1982 * @param a The first SPTM/PPL I/O range to compare.
1983 * @param b The second SPTM/PPL I/O range to compare.
1984 *
1985 * @return < 0 for a < b
1986 * 0 for a == b
1987 * > 0 for a > b
1988 */
1989 static int
cmp_io_rgns(const void * a,const void * b)1990 cmp_io_rgns(const void *a, const void *b)
1991 {
1992 const pmap_io_range_t *range_a = a;
1993 const pmap_io_range_t *range_b = b;
1994
1995 if ((range_b->addr + range_b->len) <= range_a->addr) {
1996 return 1;
1997 } else if ((range_a->addr + range_a->len) <= range_b->addr) {
1998 return -1;
1999 } else {
2000 return 0;
2001 }
2002 }
2003
2004 /**
2005 * Find and return the SPTM/PPL I/O range that contains the passed in physical
2006 * address.
2007 *
2008 * @note This function performs a binary search on the already sorted
2009 * io_attr_table, so it should be reasonably fast.
2010 *
2011 * @param paddr The physical address to query a specific I/O range for.
2012 *
2013 * @return A pointer to the pmap_io_range_t structure if one of the ranges
2014 * contains the passed in physical address. Otherwise, NULL.
2015 */
2016 pmap_io_range_t*
pmap_find_io_attr(pmap_paddr_t paddr)2017 pmap_find_io_attr(pmap_paddr_t paddr)
2018 {
2019 unsigned int begin = 0;
2020 unsigned int end = num_io_rgns - 1;
2021
2022 /**
2023 * If there are no I/O ranges, or the wanted address is below the lowest
2024 * range or above the highest range, then there's no point in searching
2025 * since it won't be here.
2026 */
2027 if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) ||
2028 (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) {
2029 return NULL;
2030 }
2031
2032 /**
2033 * A dummy I/O range to compare against when searching for a range that
2034 * includes `paddr`.
2035 */
2036 const pmap_io_range_t wanted_range = {
2037 .addr = paddr & ~PAGE_MASK,
2038 .len = PAGE_SIZE
2039 };
2040
2041 /* Perform a binary search to find the wanted I/O range. */
2042 for (;;) {
2043 const unsigned int middle = (begin + end) / 2;
2044 const int cmp = cmp_io_rgns(&wanted_range, &io_attr_table[middle]);
2045
2046 if (cmp == 0) {
2047 /* Success! Found the wanted I/O range. */
2048 return &io_attr_table[middle];
2049 } else if (begin == end) {
2050 /* We've checked every range and didn't find a match. */
2051 break;
2052 } else if (cmp > 0) {
2053 /* The wanted range is above the middle. */
2054 begin = middle + 1;
2055 } else {
2056 /* The wanted range is below the middle. */
2057 end = middle;
2058 }
2059 }
2060
2061 return NULL;
2062 }
2063
2064 /**
2065 * Initialize the pmap per-CPU data structure for a single CPU. This is called
2066 * once for each CPU in the system, on the CPU whose per-cpu data needs to be
2067 * initialized.
2068 *
2069 * In reality, many of the per-cpu data fields will have either already been
2070 * initialized or will rely on the fact that the per-cpu data is either zeroed
2071 * out during allocation (on non-PPL systems), or the data itself is a global
2072 * variable which will be zeroed by default (on PPL systems).
2073 *
2074 * @param cpu_number The number of the CPU whose pmap per-cpu data should be
2075 * initialized. This number should correspond to the CPU
2076 * executing this code.
2077 */
2078 MARK_AS_PMAP_TEXT void
pmap_cpu_data_init_internal(unsigned int cpu_number)2079 pmap_cpu_data_init_internal(unsigned int cpu_number)
2080 {
2081 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
2082
2083 pmap_cpu_data->cpu_number = cpu_number;
2084
2085 /* Setup per-cpu fields used when calling into the SPTM. */
2086 pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
2087 assert(((uintptr_t)sptm_pcpu & (PMAP_SPTM_PCPU_ALIGN - 1)) == 0);
2088 sptm_pcpu->sptm_ops_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_ops);
2089 sptm_pcpu->sptm_templates_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_templates);
2090 sptm_pcpu->sptm_paddrs_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_paddrs);
2091 sptm_pcpu->sptm_guest_dispatch_paddr = kvtophys_nofail((vm_offset_t)&sptm_pcpu->sptm_guest_dispatch);
2092
2093 const uint16_t sptm_cpu_number = sptm_cpu_id(ml_get_topology_info()->cpus[cpu_number].phys_id);
2094 sptm_pcpu->sptm_cpu_id = sptm_cpu_number;
2095
2096 const pmap_paddr_t iommu_scratch =
2097 sptm_cpu_iommu_scratch_start + (sptm_cpu_number * PMAP_IOMMU_SCRATCH_SIZE);
2098 assert(iommu_scratch <= (sptm_cpu_iommu_scratch_end - PMAP_IOMMU_SCRATCH_SIZE));
2099 sptm_pcpu->sptm_iommu_scratch = (void*)phystokv(iommu_scratch);
2100 sptm_pcpu->sptm_prev_ptes = (sptm_pte_t *)((uintptr_t)(SPTMArgs->sptm_prev_ptes) + (PAGE_SIZE * sptm_cpu_number));
2101 sptm_pcpu->sptm_cpu_id = sptm_cpu_number;
2102 }
2103
2104 /**
2105 * Initialize the pmap per-cpu data for the bootstrap CPU (the other CPUs should
2106 * just call pmap_cpu_data_init() directly).
2107 */
2108 void
pmap_cpu_data_array_init(void)2109 pmap_cpu_data_array_init(void)
2110 {
2111 /**
2112 * The EL2 portion of the IOMMU drivers need to have some memory they can
2113 * use to pass data into the SPTM. To save memory (since most IOMMU drivers
2114 * need this) and to preclude the need for IOMMU drivers to dynamically
2115 * allocate memory in their mapping/unmapping paths, memory is pre-allocated
2116 * here per-cpu for their usage.
2117 *
2118 * SPTM TODO: Only allocate this memory on systems that have IOMMU drivers.
2119 */
2120 sptm_cpu_iommu_scratch_start = avail_start;
2121 avail_start += MAX_CPUS * PMAP_IOMMU_SCRATCH_SIZE;
2122 sptm_cpu_iommu_scratch_end = avail_start;
2123
2124 pmap_cpu_data_init();
2125 }
2126
2127 /**
2128 * Retrieve the pmap per-cpu data for the current CPU.
2129 *
2130 * @return The per-cpu pmap data for the current CPU.
2131 */
2132 pmap_cpu_data_t *
pmap_get_cpu_data(void)2133 pmap_get_cpu_data(void)
2134 {
2135 pmap_cpu_data_t *pmap_cpu_data = NULL;
2136
2137 pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data;
2138 return pmap_cpu_data;
2139 }
2140
2141 /**
2142 * Retrieve the pmap per-cpu data for the specified cpu index.
2143 *
2144 * @return The per-cpu pmap data for the CPU
2145 */
2146 pmap_cpu_data_t *
pmap_get_remote_cpu_data(unsigned int cpu)2147 pmap_get_remote_cpu_data(unsigned int cpu)
2148 {
2149 cpu_data_t *cpu_data = cpu_datap((int)cpu);
2150 if (cpu_data == NULL) {
2151 return NULL;
2152 } else {
2153 return &cpu_data->cpu_pmap_cpu_data;
2154 }
2155 }
2156
2157 /**
2158 * Define the resources we need for spinning
2159 * until a paddr is not inflight.
2160 */
2161 __abortlike
2162 static hw_spin_timeout_status_t
hw_lck_paddr_timeout_panic(void * _lock,hw_spin_timeout_t to,hw_spin_state_t st)2163 hw_lck_paddr_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
2164 {
2165 panic("paddr spinlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
2166 HW_SPIN_TIMEOUT_DETAILS_FMT,
2167 _lock, HW_SPIN_TIMEOUT_ARG(to, st),
2168 HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
2169 }
2170
/**
 * Spin policy used while waiting for an inflight paddr: spin up to the
 * system LockTimeOut, then panic via hw_lck_paddr_timeout_panic().
 */
static const struct hw_spin_policy hw_paddr_inflight_spin_policy = {
	.hwsp_name = "hw_lck_paddr_lock",
	.hwsp_timeout_atomic = &LockTimeOut,
	.hwsp_op_timeout = hw_lck_paddr_timeout_panic,
};
2176
2177 /**
2178 * Barrier function for spinning until the given physical page is
2179 * no longer inflight.
2180 *
2181 * @param paddr The physical address we want to spin until is not inflight.
2182 */
2183 static __attribute__((noinline)) void
pmap_paddr_inflight_barrier(pmap_paddr_t paddr)2184 pmap_paddr_inflight_barrier(pmap_paddr_t paddr)
2185 {
2186 hw_spin_policy_t pol = &hw_paddr_inflight_spin_policy;
2187 hw_spin_timeout_t to;
2188 hw_spin_state_t state = { };
2189
2190 disable_preemption();
2191 to = hw_spin_compute_timeout(pol);
2192 while (sptm_paddr_is_inflight(paddr) &&
2193 hw_spin_should_keep_spinning((void*)paddr, pol, to, &state)) {
2194 ;
2195 }
2196 enable_preemption();
2197 }
2198
2199 /**
2200 * Convenience function for checking if a given physical page is inflight.
2201 *
2202 * @param paddr The physical address to query.
2203 *
2204 * @return true if the page in question has no mappings, false otherwise.
2205 */
2206 inline bool
pmap_is_page_free(pmap_paddr_t paddr)2207 pmap_is_page_free(pmap_paddr_t paddr)
2208 {
2209 /**
2210 * We can't query the paddr refcounts if the physical page
2211 * is currently inflight. If it does, we spin until it's not.
2212 */
2213 if (__improbable(sptm_paddr_is_inflight(paddr))) {
2214 pmap_paddr_inflight_barrier(paddr);
2215 }
2216
2217 /**
2218 * A barrier from the last inflight operation. This allows us
2219 * to have proper visibility for the refcounts. Otherwise,
2220 * sptm_frame_is_last_mapping() might see stale values.
2221 */
2222 os_atomic_thread_fence(acquire);
2223
2224 /**
2225 * If SPTM returns TRUE for SPTM_REFCOUNT_NONE, it means
2226 * the physical page has no mappings.
2227 */
2228 return sptm_frame_is_last_mapping(paddr, SPTM_REFCOUNT_NONE);
2229 }
2230