xref: /xnu-12377.1.9/osfmk/arm64/sptm/pmap/pmap_data.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <arm/cpu_data_internal.h>
29 #include <kern/queue.h>
30 #include <libkern/OSAtomic.h>
31 #include <libkern/section_keywords.h>
32 #include <pexpert/device_tree.h>
33 #include <os/atomic_private.h>
34 #include <vm/cpm_internal.h>
35 #include <vm/vm_kern.h>
36 #include <vm/vm_protos.h>
37 #include <vm/vm_object_xnu.h>
38 #include <vm/vm_page_internal.h>
39 #include <vm/vm_pageout.h>
40 
41 #include <arm64/sptm/pmap/pmap_internal.h>
42 
43 /**
44  * Physical Page Attribute Table.
45  *
46  * Array that contains a set of flags for each kernel-managed physical VM page.
47  *
48  * @note There can be a disparity between the VM page size and the underlying
49  *       hardware page size for a specific address space. In those cases, it's
50  *       possible that multiple hardware pages will share the same set of
51  *       attributes. The VM operates on regions of memory by the VM page size
52  *       and is aware that all hardware pages within each VM page share
53  *       attributes.
54  */
55 SECURITY_READ_ONLY_LATE(volatile pp_attr_t*) pp_attr_table = (volatile pp_attr_t*)NULL;
56 
57 /**
58  * Physical to Virtual Table.
59  *
60  * Data structure that contains a list of virtual mappings for each kernel-
61  * managed physical page. Other flags and metadata are also stored in this
62  * structure on a per-physical-page basis.
63  *
64  * This structure is arranged as an array of pointers, where each pointer can
65  * point to one of three different types of data (single mapping, multiple
66  * mappings, or page table descriptor). Metadata about each page (including the
67  * type of pointer) is stored in the lower and upper bits of the pointer.
68  * These bits need to be masked off to be able to dereference the pointer,
69  * so it's recommended to use the provided API in pmap_data.h to access the
70  * pv_head_table since it handles these details for you.
71  */
72 SECURITY_READ_ONLY_LATE(uintptr_t*) pv_head_table = NULL;
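
/**
 * Illustrative sketch only (the authoritative accessors live in pmap_data.h):
 * callers are expected to inspect a pv_head_table entry through the pvh_*
 * helpers rather than dereferencing the raw value, e.g.:
 *
 *     if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
 *         pt_entry_t *ptep = pvh_ptep(locked_pvh.pvh);      // single mapping
 *     } else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
 *         pv_entry_t *pvep = pvh_pve_list(locked_pvh.pvh);  // list of mappings
 *     }
 *
 * The helpers mask off the type and flag bits before returning a usable
 * pointer, so callers never touch those bits directly.
 */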
73 
74 /* Simple linked-list structure used in various page free lists. */
75 typedef struct page_free_entry {
76 	/**
77 	 * The first word in an empty page on a free list is used as a pointer to
78 	 * the next free page in the list.
79 	 */
80 	struct page_free_entry *next;
81 } page_free_entry_t;
82 
83 /* Represents a NULL entry in various page free lists. */
84 #define PAGE_FREE_ENTRY_NULL ((page_free_entry_t *) 0)
85 
86 /**
87  * This VM object will contain every VM page being used by the pmap. This acts
88  * as a convenient place to put pmap pages to keep the VM from reusing them, as
89  * well as providing a way to loop over every page being used by the pmap.
90  */
91 struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED;
92 
93 /* Pointer to the pmap's VM object that can't be modified after machine_lockdown(). */
94 SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store;
95 
96 /**
97  * This variable, used for debugging purposes only, keeps track of how many pages
98  * are currently in use by the pmap layer. Once a page is given back to the VM,
99  * inuse_pmap_pages_count will be decremented.
100  *
101  * Even if a page is sitting in one of the pmap's various free lists and hasn't
102  * been allocated for usage, it is still considered "used" by the pmap, from
103  * the perspective of the VM.
104  */
105 unsigned int inuse_pmap_pages_count = 0;
106 
107 /**
108  * Default watermark values used to keep a healthy supply of physical-to-virtual
109  * entries (PVEs) always available. These values can be overridden by the device
110  * tree (see pmap_compute_pv_targets() for more info).
111  */
112 #define PV_KERN_LOW_WATER_MARK_DEFAULT (0x400)
113 #define PV_ALLOC_CHUNK_INITIAL         (0x400)
114 #define PV_KERN_ALLOC_CHUNK_INITIAL    (0x400)
115 
116 /**
117  * The pv_free array acts as a ring buffer where each entry points to a linked
118  * list of PVEs whose length is set by this define.
119  */
120 #define PV_BATCH_SIZE (PAGE_SIZE / sizeof(pv_entry_t))
121 
122 /* The batch allocation code assumes that a batch can fit within a single page. */
123 #if __ARM_16K_PG__
124 /**
125  * PAGE_SIZE is a variable on arm64 systems with 4K VM pages, so no static
126  * assert on those systems.
127  */
128 static_assert((PV_BATCH_SIZE * sizeof(pv_entry_t)) <= PAGE_SIZE);
129 #endif /* __ARM_16K_PG__ */
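
/**
 * Worked example (sizes are hypothetical): with 16K pages and a pv_entry_t of
 * 16 bytes, PV_BATCH_SIZE would be 16384 / 16 = 1024 entries and one batch
 * would exactly fill one page. A larger pv_entry_t simply yields a smaller
 * batch; the invariant checked above is only that a batch never spills past
 * a single page.
 */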
130 
131 /**
132  * The number of PVEs to attempt to keep in the kernel-dedicated free list. If
133  * the number of entries is below this value, then allocate more.
134  */
135 static uint32_t pv_kern_low_water_mark MARK_AS_PMAP_DATA = PV_KERN_LOW_WATER_MARK_DEFAULT;
136 
137 /**
138  * The initial number of PVEs to allocate during bootstrap (can be overridden in
139  * the device tree, see pmap_compute_pv_targets() for more info).
140  */
141 uint32_t pv_alloc_initial_target MARK_AS_PMAP_DATA = PV_ALLOC_CHUNK_INITIAL * MAX_CPUS;
142 uint32_t pv_kern_alloc_initial_target MARK_AS_PMAP_DATA = PV_KERN_ALLOC_CHUNK_INITIAL;
143 
144 /**
145  * Global variables strictly used for debugging purposes. These variables keep
146  * track of the number of pages being used for PVE objects, the number of pages
147  * being used for PTD objects, and the total number of PVEs that have been added
148  * to the global and kernel-dedicated free lists, respectively.
149  */
150 static _Atomic unsigned int pv_page_count MARK_AS_PMAP_DATA = 0;
151 static unsigned int ptd_page_count MARK_AS_PMAP_DATA = 0;
152 static unsigned pmap_reserve_replenish_stat MARK_AS_PMAP_DATA = 0;
153 static unsigned pmap_kern_reserve_alloc_stat MARK_AS_PMAP_DATA = 0;
154 
155 /**
156  * Number of linked lists of PVEs ("batches") in the global PV free ring buffer.
157  * This must be a power of two for the pv_free_array_n_elems() logic to work.
158  */
159 #define PV_FREE_ARRAY_SIZE (256U)
160 
161 /**
162  * A ring buffer where each entry in the buffer is a linked list of PV entries
163  * (called "batches"). Allocations out of this array will always operate on
164  * PV_BATCH_SIZE entries at a time.
165  */
166 static pv_free_list_t pv_free_ring[PV_FREE_ARRAY_SIZE] MARK_AS_PMAP_DATA = {0};
167 
168 /* Read and write indices for the pv_free ring buffer. */
169 static uint16_t pv_free_read_idx MARK_AS_PMAP_DATA = 0;
170 static uint16_t pv_free_write_idx MARK_AS_PMAP_DATA = 0;
171 
172 /**
173  * Make sure the PV free array is small enough so that all elements can be
174  * properly indexed by pv_free_[read/write]_idx.
175  */
176 static_assert(PV_FREE_ARRAY_SIZE <= (1 << (sizeof(pv_free_read_idx) * 8)));
177 
178 /**
179  * Return the number of free batches available for allocation out of the PV free
180  * ring buffer. Each batch is a linked list of PVEs with length PV_BATCH_SIZE.
181  *
182  * @note This function requires that PV_FREE_ARRAY_SIZE is a power of two.
183  */
184 static inline uint16_t
185 pv_free_array_n_elems(void)
186 {
187 	return (pv_free_write_idx - pv_free_read_idx) & (PV_FREE_ARRAY_SIZE - 1);
188 }
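
/**
 * Worked example of the index math above: the indices are never wrapped
 * explicitly, so with PV_FREE_ARRAY_SIZE == 256, pv_free_write_idx == 65534
 * and pv_free_read_idx == 65530, the uint16_t subtraction yields 4 and the
 * mask leaves 4 batches available. Even after the 16-bit counters roll over
 * (e.g. write_idx == 2, read_idx == 65534), the masked difference is still 4,
 * which is why PV_FREE_ARRAY_SIZE must be a power of two and must fit in the
 * 16-bit index type.
 */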
189 
190 /* Free list of PV entries dedicated for usage by the kernel. */
191 static pv_free_list_t pv_kern_free MARK_AS_PMAP_DATA = {0};
192 
193 /* Locks for the global and kernel-dedicated PV free lists. */
194 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_free_array_lock, 0);
195 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_kern_free_list_lock, 0);
196 
197 /* Represents a null page table descriptor (PTD). */
198 #define PTD_ENTRY_NULL ((pt_desc_t *) 0)
199 
200 /* Running free list of PTD nodes. */
201 static pt_desc_t *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL;
202 
203 /* The number of free PTD nodes available in the free list. */
204 static unsigned int ptd_free_count MARK_AS_PMAP_DATA = 0;
205 
206 /**
207  * The number of PTD objects located in each page being used by the PTD
208  * allocator. The PTD objects share each page with their associated ptd_info_t
209  * objects (with cache-line alignment padding between them). The maximum number
210  * of PTDs that can be placed into a single page is calculated once at boot.
211  */
212 static SECURITY_READ_ONLY_LATE(unsigned) ptd_per_page = 0;
213 
214 /**
215  * The offset in bytes from the beginning of a page of PTD objects where you
216  * start seeing the associated ptd_info_t objects. This is calculated once
217  * during boot to maximize the number of PTD and ptd_info_t objects that can
218  * reside within a page without sharing a cache-line.
219  */
220 static SECURITY_READ_ONLY_LATE(unsigned) ptd_info_offset = 0;
221 
222 /* Lock to protect accesses to the PTD free list. */
223 static decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA);
224 
225 /**
226  * Dummy _internal() prototypes so Clang doesn't complain about a missing
227  * prototype on a non-static function. These functions can't be marked as
228  * static because they need to be called from pmap_ppl_interface.c, where the
229  * PMAP_SUPPORT_PROTOTYPES() macro will auto-generate the prototype.
230  */
231 kern_return_t mapping_free_prime_internal(void);
232 
233 /**
234  * Flag indicating whether any I/O regions that require strong DSB are present.
235  * If not, certain TLB maintenance operations can be streamlined.
236  */
237 SECURITY_READ_ONLY_LATE(bool) sdsb_io_rgns_present = false;
238 
239 /**
240  * Sorted representation of the pmap-io-ranges nodes in the device tree. These
241  * nodes describe all of the SPTM/PPL-owned I/O ranges.
242  */
243 SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table = (pmap_io_range_t*)0;
244 
245 /* The number of ranges described by io_attr_table. */
246 SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0;
247 
248 /**
249  * Sorted representation of the pmap-io-filter entries in the device tree.
250  * The entries are sorted and queried by {signature, range}.
251  */
252 SECURITY_READ_ONLY_LATE(pmap_io_filter_entry_t*) io_filter_table = (pmap_io_filter_entry_t*)0;
253 
254 /* Number of total pmap-io-filter entries. */
255 SECURITY_READ_ONLY_LATE(unsigned int) num_io_filter_entries = 0;
256 
257 /**
258  * A list of pages that define the per-cpu scratch areas used by IOMMU drivers
259  * when preparing data to be passed into the SPTM. The size allocated per-cpu is
260  * defined by PMAP_IOMMU_SCRATCH_SIZE.
261  *
262  * SPTM TODO: Only have these variables on systems with IOMMU drivers (H11+).
263  */
264 #define PMAP_IOMMU_SCRATCH_SIZE (PMAP_IOMMU_NUM_SCRATCH_PAGES * PAGE_SIZE)
265 SECURITY_READ_ONLY_LATE(pmap_paddr_t) sptm_cpu_iommu_scratch_start = 0;
266 SECURITY_READ_ONLY_LATE(pmap_paddr_t) sptm_cpu_iommu_scratch_end = 0;
267 
268 /* Prototypes used by pmap_data_bootstrap(). */
269 void pmap_cpu_data_array_init(void);
270 
271 #if __ARM64_PMAP_SUBPAGE_L1__
272 /* A list of subpage user root table page tracking structures. */
273 queue_head_t surt_list;
274 
275 /**
276  * A mutex protecting surt_list related operations.
277  */
278 decl_lck_mtx_data(, surt_lock);
279 
280 /* Is the SURT subsystem initialized? */
281 bool surt_ready = false;
282 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
283 
284 #if DEBUG || DEVELOPMENT
285 /* Track number of instances a WC/RT mapping request is converted to Device-GRE. */
286 static _Atomic unsigned int pmap_wcrt_on_non_dram_count = 0;
287 #endif /* DEBUG || DEVELOPMENT */
288 
289 /**
290  * This function is called once during pmap_bootstrap() to allocate and
291  * initialize many of the core data structures that are implemented in this
292  * file.
293  *
294  * Memory for these data structures is carved out of `avail_start`, which is a
295  * global set up by arm_vm_init() that points to a physically contiguous region
296  * used for bootstrap allocations.
297  *
298  * @note There is no guaranteed alignment of `avail_start` when this function
299  *       returns. If avail_start needs to be aligned to a specific value, then
300  *       the caller must align it before using it for more allocations.
301  */
302 void
303 pmap_data_bootstrap(void)
304 {
305 	/**
306 	 * Set ptd_per_page to the maximum number of (pt_desc_t + ptd_info_t) we can
307 	 * fit in a single page. We need to allow for some padding between the two,
308 	 * so that no ptd_info_t shares a cache line with a pt_desc_t.
309 	 */
310 	const unsigned ptd_info_size = sizeof(ptd_info_t);
311 	const unsigned l2_cline_bytes = 1 << MAX_L2_CLINE;
312 	ptd_per_page = (PAGE_SIZE - (l2_cline_bytes - 1)) / (sizeof(pt_desc_t) + ptd_info_size);
313 	unsigned increment = 0;
314 	bool try_next = true;
315 
316 	/**
317 	 * The current ptd_per_page calculation was done assuming the worst-case
318 	 * scenario in terms of padding between the two object arrays that reside in
319 	 * the same page. The following loop attempts to optimize this further by
320 	 * finding the smallest possible amount of padding while still ensuring that
321 	 * the two object arrays don't share a cache line.
322 	 */
323 	while (try_next) {
324 		increment++;
325 		const unsigned pt_desc_total_size =
326 		    PMAP_ALIGN((ptd_per_page + increment) * sizeof(pt_desc_t), l2_cline_bytes);
327 		const unsigned ptd_info_total_size = (ptd_per_page + increment) * ptd_info_size;
328 		try_next = (pt_desc_total_size + ptd_info_total_size) <= PAGE_SIZE;
329 	}
330 	ptd_per_page += increment - 1;
331 	assert(ptd_per_page > 0);
332 
333 	/**
334 	 * ptd_info objects reside after the ptd descriptor objects, with some
335 	 * padding in between if necessary to ensure that they don't co-exist in the
336 	 * same cache line.
337 	 */
338 	const unsigned pt_desc_bytes = ptd_per_page * sizeof(pt_desc_t);
339 	ptd_info_offset = PMAP_ALIGN(pt_desc_bytes, l2_cline_bytes);
340 
341 	/* The maximum amount of padding should be (l2_cline_bytes - 1). */
342 	assert((ptd_info_offset - pt_desc_bytes) < l2_cline_bytes);
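
	/**
	 * Illustrative example (all sizes hypothetical): with a 16K page, 128-byte
	 * L2 cache lines, an 80-byte pt_desc_t, and a 16-byte ptd_info_t, the
	 * conservative first estimate above would be (16384 - 127) / 96 = 169 PTDs
	 * per page. The loop then grows that count for as long as the cache-line-
	 * aligned pt_desc_t array plus the ptd_info_t array still fit in one page,
	 * and ptd_info_offset ends up being the aligned end of the pt_desc_t array.
	 */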
343 
344 	/**
345 	 * Allocate enough initial PTDs to map twice the available physical memory.
346 	 *
347 	 * To do this, start by calculating the number of leaf page tables that are
348 	 * needed to cover all of kernel-managed physical memory.
349 	 */
350 	const uint32_t num_leaf_page_tables =
351 	    (uint32_t)(mem_size / ((PAGE_SIZE / sizeof(pt_entry_t)) * ARM_PGBYTES));
352 
353 	/**
354  * There should be one PTD per page table (times 2, since we want twice the
355  * number of required PTDs), with the number of PTDs rounded up to the next
356  * multiple of `ptd_per_page` so there's no wasted space.
357 	 */
358 	const uint32_t ptd_root_table_n_ptds =
359 	    (ptd_per_page * ((num_leaf_page_tables * 2) / ptd_per_page)) + ptd_per_page;
360 
361 	/* Lastly, calculate the number of VM pages and bytes these PTDs take up. */
362 	const uint32_t num_ptd_pages = ptd_root_table_n_ptds / ptd_per_page;
363 	vm_size_t ptd_root_table_size = num_ptd_pages * PAGE_SIZE;
364 
365 	/* Number of VM pages that span all of kernel-managed memory. */
366 	unsigned int npages = (unsigned int)atop(mem_size);
367 
368 
369 	/* The pv_head_table and pp_attr_table both have one entry per VM page. */
370 	const vm_size_t pp_attr_table_size = npages * sizeof(pp_attr_t);
371 	const vm_size_t pv_head_size = round_page(npages * sizeof(*pv_head_table));
372 
373 	/* Scan the device tree and override heuristics in the PV entry management code. */
374 	pmap_compute_pv_targets();
375 
376 	io_attr_table = (pmap_io_range_t *) SPTMArgs->sptm_pmap_io_ranges;
377 	num_io_rgns = SPTMArgs->sptm_pmap_io_ranges_count;
378 	io_filter_table = (pmap_io_filter_entry_t *) SPTMArgs->sptm_pmap_io_filters;
379 	num_io_filter_entries = SPTMArgs->sptm_pmap_io_filters_count;
380 
381 	/**
382 	 * Don't make any assumptions about the alignment of avail_start before
383 	 * execution of this function. Always re-align it to ensure the first
384 	 * allocated data structure is aligned correctly.
385 	 */
386 	avail_start = PMAP_ALIGN(avail_start, __alignof(pp_attr_t));
387 
388 	/**
389 	 * Keep track of where the data structures start so we can clear this memory
390 	 * later.
391 	 */
392 	const pmap_paddr_t pmap_struct_start = avail_start;
393 
394 	pp_attr_table = (pp_attr_t *)phystokv(avail_start);
395 	avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pv_entry_t *));
396 
397 	pv_head_table = (uintptr_t *)phystokv(avail_start);
398 
399 	/**
400 	 * ptd_root_table must start on a page boundary because all of the math for
401 	 * associating pt_desc_t objects with ptd_info objects assumes the first
402 	 * pt_desc_t in a page starts at the beginning of the page it resides in.
403 	 */
404 	avail_start = round_page(avail_start + pv_head_size);
405 
406 	pt_desc_t *ptd_root_table = (pt_desc_t *)phystokv(avail_start);
407 	avail_start = round_page(avail_start + ptd_root_table_size);
408 
409 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
410 
411 	/* This function assumes that ptd_root_table has been zeroed out already. */
412 	ptd_bootstrap(ptd_root_table, num_ptd_pages);
413 
414 	/* Setup the pmap per-cpu data structures. */
415 	pmap_cpu_data_array_init();
416 }
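
/**
 * Rough sketch of the bootstrap carve-out performed above. All regions come
 * out of the physically contiguous memory at avail_start, in this order:
 *
 *     pmap_struct_start
 *       +-- pp_attr_table   (npages * sizeof(pp_attr_t), pp_attr_t-aligned)
 *       +-- pv_head_table   (npages * sizeof(uintptr_t), rounded to a page)
 *       +-- ptd_root_table  (num_ptd_pages pages, page-aligned)
 *     avail_start (new, page-aligned value)
 *
 * The entire range is zeroed with a single memset before ptd_bootstrap() runs.
 */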
417 
418 /**
419  * Add a queue of VM pages to the pmap's VM object. This informs the VM that
420  * these pages are being used by the pmap and shouldn't be reused.
421  *
422  * This also means that the pmap_object can be used as a convenient way to loop
423  * through every page currently being used by the pmap. For instance, this queue
424  * of pages is exposed to the debugger through the Low Globals, where it's used
425  * to ensure that all pmap data is saved in an active core dump.
426  *
427  * @param mem The head of the queue of VM pages to add to the pmap's VM object.
428  */
429 void
430 pmap_enqueue_pages(vm_page_t mem)
431 {
432 	vm_page_t m_prev;
433 	vm_object_lock(pmap_object);
434 	while (mem != VM_PAGE_NULL) {
435 		const vm_object_offset_t offset =
436 		    (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(mem))) - gPhysBase);
437 
438 		vm_page_insert_wired(mem, pmap_object, offset, VM_KERN_MEMORY_PTE);
439 		m_prev = mem;
440 		mem = NEXT_PAGE(m_prev);
441 		*(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
442 	}
443 	vm_object_unlock(pmap_object);
444 }
445 
446 /**
447  * Allocate a page from the VM for usage within the pmap.
448  *
449  * @param ppa Output parameter to store the physical address of the allocated
450  *           page if one was able to be allocated (zero otherwise).
451  * @param options The following options can be specified:
452  *     - PMAP_PAGE_ALLOCATE_NOWAIT: If the VM page free list doesn't have
453  *       any free pages available then don't wait for one, just return
454  *       immediately without allocating a page.
455  *
456  *     - PMAP_PAGE_RECLAIM_NOWAIT: If memory can't be allocated from the VM,
457  *       then fall back to attempting to reclaim a userspace page table. This
458  *       should only be specified in paths that absolutely can't take the
459  *       latency hit of waiting for the VM to allocate more pages. This flag
460  *       doesn't make much sense unless it's paired with
461  *       PMAP_PAGE_ALLOCATE_NOWAIT.
462  *
463  *     - PMAP_PAGE_NOZEROFILL: don't zero-fill the pages. This should only be
464  *       used if you know that something else in the relevant code path will
465  *       zero-fill or otherwise fully initialize the page with consistent data.
466  *       This is mostly intended for cases in which sptm_retype() is guaranteed
467  *       to zero-fill the page for us.
468  *
469  * @return KERN_SUCCESS if a page was successfully allocated, or
470  *         KERN_RESOURCE_SHORTAGE if a page failed to get allocated. This should
471  *         only be returned if PMAP_PAGE_ALLOCATE_NOWAIT is passed or if
472  *         preemption is disabled after early boot since allocating memory from
473  *         the VM requires grabbing a mutex. If PMAP_PAGE_ALLOCATE_NOWAIT is not
474  *         passed and the system is in a preemptable state, then the return
475  *         value should always be KERN_SUCCESS (as the thread will block until
476  *         there are free pages available).
477  */
478 MARK_AS_PMAP_TEXT kern_return_t
479 pmap_page_alloc(pmap_paddr_t *ppa, unsigned options)
480 {
481 	assert(ppa != NULL);
482 	pmap_paddr_t pa = 0;
483 	PMAP_ASSERT_NOT_WRITING_HIB();
484 	vm_page_t mem = VM_PAGE_NULL;
485 	thread_t self = current_thread();
486 
487 	/**
488 	 * It's not possible to allocate memory from the VM in a preemption disabled
489 	 * environment except during early boot (since the VM needs to grab a mutex).
490 	 * In those cases just return a resource shortage error and let the caller
491 	 * deal with it.
492 	 *
493 	 * We don't panic here as there are genuinely some cases where pmap_enter()
494 	 * is called with preemption disabled, and it's better to return an error
495 	 * to those callers to notify them to try again with preemption enabled.
496 	 */
497 	if (!pmap_is_preemptible()) {
498 		return KERN_RESOURCE_SHORTAGE;
499 	}
500 
501 	*ppa = 0;
502 
503 	/**
504 	 * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
505 	 * the VM of this.
506 	 *
507 	 * This field should only be modified by the local thread itself, so no lock
508 	 * needs to be taken.
509 	 */
510 	uint16_t thread_options = self->options;
511 	self->options |= TH_OPT_VMPRIV;
512 
513 	/**
514 	 * If we're only allocating a single page, just grab one off the VM's
515 	 * global page free list.
516 	 */
517 	vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
518 	while ((mem = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
519 		if (options & PMAP_PAGE_ALLOCATE_NOWAIT) {
520 			break;
521 		}
522 
523 		VM_PAGE_WAIT();
524 	}
525 
526 	if (mem != VM_PAGE_NULL) {
527 		vm_page_lock_queues();
528 		vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
529 		vm_page_unlock_queues();
530 	}
531 
532 	self->options = thread_options;
533 
534 	if (mem == VM_PAGE_NULL) {
535 		return KERN_RESOURCE_SHORTAGE;
536 	}
537 
538 	pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));
539 
540 	/* Add the allocated VM page(s) to the pmap's VM object. */
541 	pmap_enqueue_pages(mem);
542 
543 	/* Pages are considered "in use" by the pmap until returned to the VM. */
544 	OSAddAtomic(1, &inuse_pmap_pages_count);
545 
546 	/* SPTM TODO: assert that the returned page is of type XNU_DEFAULT in frame table */
547 	if (!(options & PMAP_PAGE_NOZEROFILL)) {
548 		bzero((void*)phystokv(pa), PAGE_SIZE);
549 	}
550 	*ppa = pa;
551 	return KERN_SUCCESS;
552 }
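
/**
 * Hypothetical usage sketch (not an actual call site): a caller on a
 * latency-sensitive path that must not block on the VM might do:
 *
 *     pmap_paddr_t pa = 0;
 *     kern_return_t kr = pmap_page_alloc(&pa, PMAP_PAGE_ALLOCATE_NOWAIT);
 *     if (kr == KERN_RESOURCE_SHORTAGE) {
 *         // No page was immediately available; retry later or fall back
 *         // to a reclamation path.
 *     }
 *
 * Callers that can tolerate blocking pass 0 for the options, in which case
 * the call only fails if preemption happens to be disabled.
 */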
553 
554 /**
555  * Free memory previously allocated through pmap_page_alloc() back to the VM.
556  *
557  * @param pa Physical address of the page(s) to free.
558  */
559 void
560 pmap_page_free(pmap_paddr_t pa)
561 {
562 	/* SPTM TODO: assert that the page to be freed is of type XNU_DEFAULT in frame table */
563 
564 	/* Pages are considered "in use" until given back to the VM. */
565 	OSAddAtomic(-1, &inuse_pmap_pages_count);
566 
567 	vm_page_t mem = VM_PAGE_NULL;
568 	vm_object_lock(pmap_object);
569 
570 	/**
571 	 * Remove the page from the pmap's VM object and return it back to the
572 	 * VM's global free list of pages.
573 	 */
574 	mem = vm_page_lookup(pmap_object, (pa - gPhysBase));
575 	assert(mem != VM_PAGE_NULL);
576 	assert(VM_PAGE_WIRED(mem));
577 	vm_page_lock_queues();
578 	vm_page_free(mem);
579 	vm_page_unlock_queues();
580 	vm_object_unlock(pmap_object);
581 }
582 
583 /**
584  * Called by the VM to reclaim pages that we can reclaim quickly and cheaply.
585  * This will take pages in the pmap's VM object and add them back to the VM's
586  * global list of free pages.
587  *
588  * @return The number of pages returned to the VM.
589  */
590 uint64_t
591 pmap_release_pages_fast(void)
592 {
593 	return 0;
594 }
595 
596 /**
597  * Allocates a batch (list) of pv_entry_t's from the global PV free array.
598  *
599  * @return A pointer to the head of the newly-allocated batch, or PV_ENTRY_NULL
600  *         if the array is empty.
601  */
602 MARK_AS_PMAP_TEXT static pv_entry_t *
603 pv_free_array_get_batch(void)
604 {
605 	pv_entry_t *new_batch = PV_ENTRY_NULL;
606 
607 	pmap_simple_lock(&pv_free_array_lock);
608 	if (pv_free_array_n_elems() > 0) {
609 		/**
610 		 * The global PV array acts as a ring buffer where each entry points to
611 		 * a linked list of PVEs of length PV_BATCH_SIZE. Get the next free
612 		 * batch.
613 		 */
614 		const size_t index = pv_free_read_idx++ & (PV_FREE_ARRAY_SIZE - 1);
615 		pv_free_list_t *free_list = &pv_free_ring[index];
616 
617 		assert((free_list->count == PV_BATCH_SIZE) && (free_list->list != PV_ENTRY_NULL));
618 		new_batch = free_list->list;
619 	}
620 	pmap_simple_unlock(&pv_free_array_lock);
621 
622 	return new_batch;
623 }
624 
625 /**
626  * Frees a batch (list) of pv_entry_t's into the global PV free array.
627  *
628  * @param batch_head Pointer to the first entry in the batch to be returned to
629  *                   the array. This must be a linked list of pv_entry_t's of
630  *                   length PV_BATCH_SIZE.
631  *
632  * @return KERN_SUCCESS, or KERN_FAILURE if the global array is full.
633  */
634 MARK_AS_PMAP_TEXT static kern_return_t
635 pv_free_array_give_batch(pv_entry_t *batch_head)
636 {
637 	assert(batch_head != NULL);
638 
639 	pmap_simple_lock(&pv_free_array_lock);
640 	if (pv_free_array_n_elems() == (PV_FREE_ARRAY_SIZE - 1)) {
641 		pmap_simple_unlock(&pv_free_array_lock);
642 		return KERN_FAILURE;
643 	}
644 
645 	const size_t index = pv_free_write_idx++ & (PV_FREE_ARRAY_SIZE - 1);
646 	pv_free_list_t *free_list = &pv_free_ring[index];
647 	free_list->list = batch_head;
648 	free_list->count = PV_BATCH_SIZE;
649 	pmap_simple_unlock(&pv_free_array_lock);
650 
651 	return KERN_SUCCESS;
652 }
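
/**
 * Note on capacity: the check above deliberately treats the ring as full at
 * PV_FREE_ARRAY_SIZE - 1 batches, sacrificing one slot so that a full ring
 * and an empty ring (read index == write index) remain distinguishable. With
 * the hypothetical 1024-entry batches used in the examples above, that caps
 * the global array at 255 batches (~261K PVEs); any further surplus is pushed
 * to the kernel-dedicated free list by the callers of this function.
 */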
653 
654 /**
655  * Helper function for allocating a single PVE from an arbitrary free list.
656  *
657  * @param free_list The free list to allocate a node from.
658  * @param pvepp Output parameter that will get updated with a pointer to the
659  *              allocated node if the free list isn't empty, or a pointer to
660  *              NULL if the list is empty.
661  */
662 MARK_AS_PMAP_TEXT static void
663 pv_free_list_alloc(pv_free_list_t *free_list, pv_entry_t **pvepp)
664 {
665 	assert(pvepp != NULL);
666 	assert(((free_list->list != NULL) && (free_list->count > 0)) ||
667 	    ((free_list->list == NULL) && (free_list->count == 0)));
668 
669 	if ((*pvepp = free_list->list) != NULL) {
670 		pv_entry_t *pvep = *pvepp;
671 		free_list->list = pvep->pve_next;
672 		pvep->pve_next = PV_ENTRY_NULL;
673 		free_list->count--;
674 	}
675 }
676 
677 /**
678  * Allocates a PVE from the kernel-dedicated list.
679  *
680  * @note This is only called when the global free list is empty, so don't bother
681  *       trying to allocate more nodes from that list.
682  *
683  * @param pvepp Output parameter that will get updated with a pointer to the
684  *              allocated node if the free list isn't empty, or a pointer to
685  *              NULL if the list is empty. This pointer can't already be
686  *              pointing to a valid entry before allocation.
687  */
688 MARK_AS_PMAP_TEXT static void
689 pv_list_kern_alloc(pv_entry_t **pvepp)
690 {
691 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
692 	pmap_simple_lock(&pv_kern_free_list_lock);
693 	if (pv_kern_free.count > 0) {
694 		pmap_kern_reserve_alloc_stat++;
695 	}
696 	pv_free_list_alloc(&pv_kern_free, pvepp);
697 	pmap_simple_unlock(&pv_kern_free_list_lock);
698 }
699 
700 /**
701  * Returns a list of PVEs to the kernel-dedicated free list.
702  *
703  * @param pve_head Head of the list to be returned.
704  * @param pve_tail Tail of the list to be returned.
705  * @param pv_cnt Number of elements in the list to be returned.
706  */
707 MARK_AS_PMAP_TEXT static void
708 pv_list_kern_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
709 {
710 	assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
711 
712 	pmap_simple_lock(&pv_kern_free_list_lock);
713 	pve_tail->pve_next = pv_kern_free.list;
714 	pv_kern_free.list = pve_head;
715 	pv_kern_free.count += pv_cnt;
716 	pmap_simple_unlock(&pv_kern_free_list_lock);
717 }
718 
719 /**
720  * Attempts to allocate from the per-cpu free list of PVEs, and if that fails,
721  * then replenish the per-cpu free list with a batch of PVEs from the global
722  * PVE free list.
723  *
724  * @param pvepp Output parameter that will get updated with a pointer to the
725  *              allocated node if the free lists aren't empty, or a pointer to
726  *              NULL if both the per-cpu and global lists are empty. This
727  *              pointer can't already be pointing to a valid entry before
728  *              allocation.
729  */
730 MARK_AS_PMAP_TEXT static void
731 pv_list_alloc(pv_entry_t **pvepp)
732 {
733 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
734 
735 	/* Disable preemption while working with per-CPU data. */
736 	mp_disable_preemption();
737 
738 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
739 	pv_free_list_alloc(&pmap_cpu_data->pv_free, pvepp);
740 
741 	if (*pvepp != PV_ENTRY_NULL) {
742 		goto pv_list_alloc_done;
743 	}
744 
745 	if (pv_kern_free.count < pv_kern_low_water_mark) {
746 		/**
747 		 * If the kernel reserved pool is low, let non-kernel mappings wait for
748 		 * a page from the VM.
749 		 */
750 		goto pv_list_alloc_done;
751 	}
752 
753 	/**
754 	 * Attempt to replenish the local list off the global one, and return the
755 	 * first element. If the global list is empty, then the allocation failed.
756 	 */
757 	pv_entry_t *new_batch = pv_free_array_get_batch();
758 
759 	if (new_batch != PV_ENTRY_NULL) {
760 		pmap_cpu_data->pv_free.count = PV_BATCH_SIZE - 1;
761 		pmap_cpu_data->pv_free.list = new_batch->pve_next;
762 		assert(pmap_cpu_data->pv_free.list != NULL);
763 
764 		new_batch->pve_next = PV_ENTRY_NULL;
765 		*pvepp = new_batch;
766 	}
767 
768 pv_list_alloc_done:
769 	mp_enable_preemption();
770 
771 	return;
772 }
773 
774 /**
775  * Adds a list of PVEs to the per-CPU PVE free list. May spill out some entries
776  * to the global or the kernel PVE free lists if the per-CPU list contains too
777  * many PVEs.
778  *
779  * @param pve_head Head of the list to be returned.
780  * @param pve_tail Tail of the list to be returned.
781  * @param pv_cnt Number of elements in the list to be returned.
782  */
783 MARK_AS_PMAP_TEXT void
784 pv_list_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, unsigned int pv_cnt)
785 {
786 	assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
787 
788 	/* Disable preemption while working with per-CPU data. */
789 	disable_preemption();
790 
791 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
792 
793 	/**
794 	 * How many more PVEs need to be added to the last allocated batch to get it
795 	 * back up to a PV_BATCH_SIZE number of objects.
796 	 */
797 	 * back up to PV_BATCH_SIZE objects.
798 
799 	/**
800 	 * The common case is that the number of PVEs to be freed fits in the current
801 	 * PV_BATCH_SIZE boundary. If that is the case, quickly prepend the whole
802 	 * list and return.
803 	 */
804 	if (__probable((pv_cnt <= available) &&
805 	    ((pmap_cpu_data->pv_free.count % PV_BATCH_SIZE != 0) || (pmap_cpu_data->pv_free.count == 0)))) {
806 		pve_tail->pve_next = pmap_cpu_data->pv_free.list;
807 		pmap_cpu_data->pv_free.list = pve_head;
808 		pmap_cpu_data->pv_free.count += pv_cnt;
809 		goto pv_list_free_done;
810 	}
811 
812 	unsigned int freed_count = 0;
813 
814 	/**
815 	 * In the degenerate case, we need to process PVEs one by one, to make sure
816 	 * we spill out to the global list, or update the spill marker as
817 	 * appropriate.
818 	 */
819 	while (pv_cnt) {
820 		/**
821 		 * Check for (and if necessary reenable) preemption every PV_BATCH_SIZE PVEs to
822 		 * avoid leaving preemption disabled for an excessive duration if we happen to be
823 		 * processing a very large PV list.
824 		 */
825 		if (__improbable(freed_count == PV_BATCH_SIZE)) {
826 			freed_count = 0;
827 			if (__improbable(pmap_pending_preemption())) {
828 				enable_preemption();
829 				assert(preemption_enabled() || PMAP_IS_HIBERNATING());
830 				disable_preemption();
831 				pmap_cpu_data = pmap_get_cpu_data();
832 			}
833 		}
834 
835 		/**
836 		 * Take the node off the top of the passed in list and prepend it to the
837 		 * per-cpu list.
838 		 */
839 		pv_entry_t *pv_next = pve_head->pve_next;
840 		pve_head->pve_next = pmap_cpu_data->pv_free.list;
841 		pmap_cpu_data->pv_free.list = pve_head;
842 		pve_head = pv_next;
843 		pmap_cpu_data->pv_free.count++;
844 		pv_cnt--;
845 		freed_count++;
846 
847 		if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE + 1))) {
848 			/**
849 			 * A full batch of entries has been freed to the per-cpu list.
850 			 * Update the spill marker which is used to remember the end of a
851 			 * batch (remember, we prepend nodes) to eventually return back to
852 			 * the global list (we try to only keep one PV_BATCH_SIZE worth of
853 			 * nodes in any single per-cpu list).
854 			 */
855 			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
856 		} else if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE * 2) + 1)) {
857 			/* Spill out excess PVEs to the global PVE array */
858 			pv_entry_t *spill_head = pmap_cpu_data->pv_free.list->pve_next;
859 			pv_entry_t *spill_tail = pmap_cpu_data->pv_free_spill_marker;
860 			pmap_cpu_data->pv_free.list->pve_next = pmap_cpu_data->pv_free_spill_marker->pve_next;
861 			spill_tail->pve_next = PV_ENTRY_NULL;
862 			pmap_cpu_data->pv_free.count -= PV_BATCH_SIZE;
863 			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
864 
865 			if (__improbable(pv_free_array_give_batch(spill_head) != KERN_SUCCESS)) {
866 				/**
867 				 * This is extremely unlikely to happen, as it would imply that
868 				 * we have (PV_FREE_ARRAY_SIZE * PV_BATCH_SIZE) PVEs sitting in
869 				 * the global array. Just in case, push the excess down to the
870 				 * kernel PVE free list.
871 				 */
872 				pv_list_kern_free(spill_head, spill_tail, PV_BATCH_SIZE);
873 			}
874 		}
875 	}
876 
877 pv_list_free_done:
878 	enable_preemption();
879 
880 	return;
881 }
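
/**
 * Worked example of the spill logic above (counts only, batch size again
 * assumed to be 1024 for illustration): as a CPU frees entries one at a time,
 * reaching a count of 1025 (PV_BATCH_SIZE + 1) points the spill marker at the
 * current list head, remembering where that batch ends as further entries are
 * prepended. Reaching 2049 ((PV_BATCH_SIZE * 2) + 1) unlinks the 1024 entries
 * between the head's successor and the spill marker and hands them to the
 * global ring as one batch, dropping the count back to 1025. In steady state
 * the per-CPU list therefore hovers around one batch worth of entries.
 */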
882 
883 /**
884  * Adds a single page to the PVE allocation subsystem.
885  *
886  * @note This function operates under the assumption that PV_BATCH_SIZE PVEs
887  *       can fit within a single page. One page is always allocated for
888  *       one batch, so if there's empty space in the page after the batch of
889  *       PVEs, it'll go unused (so it's best to keep the batch size at an amount
890  *       that utilizes a whole page).
891  *
892  * @param alloc_flags Allocation flags passed to pmap_page_alloc(). See
893  *                    the definition of that function for a detailed description
894  *                    of the available flags.
895  *
896  * @return KERN_SUCCESS, or the value returned by pmap_page_alloc() upon
897  *         failure.
898  */
899 MARK_AS_PMAP_TEXT static kern_return_t
900 pve_feed_page(unsigned alloc_flags)
901 {
902 	kern_return_t kr = KERN_FAILURE;
903 
904 	pv_entry_t *pve_head = PV_ENTRY_NULL;
905 	pv_entry_t *pve_tail = PV_ENTRY_NULL;
906 	pmap_paddr_t pa = 0;
907 
908 	kr = pmap_page_alloc(&pa, alloc_flags);
909 
910 	if (kr != KERN_SUCCESS) {
911 		return kr;
912 	}
913 
914 	/* Update statistics globals. See the variables' definitions for more info. */
915 	os_atomic_inc(&pv_page_count, relaxed);
916 	pmap_reserve_replenish_stat += PV_BATCH_SIZE;
917 
918 	/* Prepare a new list by linking all of the entries in advance. */
919 	pve_head = (pv_entry_t *)phystokv(pa);
920 	pve_tail = &pve_head[PV_BATCH_SIZE - 1];
921 
922 	for (int i = 0; i < PV_BATCH_SIZE; i++) {
923 		pve_head[i].pve_next = &pve_head[i + 1];
924 	}
925 	pve_head[PV_BATCH_SIZE - 1].pve_next = PV_ENTRY_NULL;
926 
927 	/**
928 	 * Add the new list to the kernel PVE free list if we are running low on
929 	 * kernel-dedicated entries or the global free array is full.
930 	 */
931 	if ((pv_kern_free.count < pv_kern_low_water_mark) ||
932 	    (pv_free_array_give_batch(pve_head) != KERN_SUCCESS)) {
933 		pv_list_kern_free(pve_head, pve_tail, PV_BATCH_SIZE);
934 	}
935 
936 	return KERN_SUCCESS;
937 }
938 
939 /**
940  * Allocate a PV node from one of many different free lists (per-cpu, global, or
941  * kernel-specific).
942  *
943  * @note This function is very tightly coupled with pmap_enter_pv(). If
944  *       modifying this code, please ensure that pmap_enter_pv() doesn't break.
945  *
946  * @note The pmap lock must already be held if the new mapping is a CPU mapping.
947  *
948  * @note The PVH lock for the physical page that is getting a new mapping
949  *       registered must already be held.
950  *
951  * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
952  *             an IOMMU translation.
953  * @param lock_mode Which state the pmap lock is being held in if the mapping is
954  *                  owned by a pmap, otherwise this is a don't care.
955  * @param options PMAP_OPTIONS_* family of options passed from the caller.
956  * @param pvepp Output parameter that will get updated with a pointer to the
957  *              allocated node if none of the free lists are empty, or a pointer
958  *              to NULL otherwise. This pointer can't already be pointing to a
959  *              valid entry before allocation.
960  * @param locked_pvh Input/output parameter pointing to the wrapped value of the
961  *                   pv_head_table entry previously obtained from pvh_lock().
962  *                   This value will be updated if [locked_pvh->pai] needs to be
963  *                   re-locked.
964  * @param refcountp Pointer to a reference count that will be temporarily
965  *                  atomically incremented in the event that [pmap]'s lock needs
966  *                  to be temporarily dropped in order to satisfy the allocation.
967  *                  This is typically used to prevent a page table from being
968  *                  reclaimed while the lock is dropped.  May be NULL.
969  *
970  * @return These are the possible return values:
971  *     PV_ALLOC_SUCCESS: A PVE object was successfully allocated.
972  *     PV_ALLOC_FAIL: No objects were available for allocation, and
973  *                    allocating a new page failed.
974  *     PV_ALLOC_RETRY: No objects were available on the free lists, so a new
975  *                     page of PVE objects needed to be allocated. To do that,
976  *                     the pmap and PVH locks were dropped. The caller may have
977  *                     depended on these locks for consistency, so return and
978  *                     let the caller retry the PVE allocation with the locks
979  *                     held. Note that the locks have already been re-acquired
980  *                     before this function exits.
981  */
982 MARK_AS_PMAP_TEXT pv_alloc_return_t
983 pv_alloc(
984 	pmap_t pmap,
985 	pmap_lock_mode_t lock_mode,
986 	unsigned int options,
987 	pv_entry_t **pvepp,
988 	locked_pvh_t *locked_pvh,
989 	volatile uint16_t *refcountp)
990 {
991 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
992 	assert(locked_pvh != NULL);
993 
994 	if (pmap != NULL) {
995 		pmap_assert_locked(pmap, lock_mode);
996 	}
997 
998 	pv_list_alloc(pvepp);
999 	if (PV_ENTRY_NULL != *pvepp) {
1000 		return PV_ALLOC_SUCCESS;
1001 	}
1002 
1003 	unsigned alloc_flags = 0;
1004 
1005 	/**
1006 	 * We got here because both the per-CPU and the global lists are empty. If
1007 	 * this allocation is for the kernel pmap or an IOMMU kernel driver, we try
1008 	 * to get an entry from the kernel list next.
1009 	 */
1010 	if ((pmap == NULL) || (kernel_pmap == pmap)) {
1011 		pv_list_kern_alloc(pvepp);
1012 		if (PV_ENTRY_NULL != *pvepp) {
1013 			return PV_ALLOC_SUCCESS;
1014 		}
1015 	}
1016 
1017 	/**
1018 	 * Make sure we have PMAP_PAGE_ALLOCATE_NOWAIT set in alloc_flags when the
1019 	 * input options argument has PMAP_OPTIONS_NOWAIT set.
1020 	 */
1021 	alloc_flags |= (options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;
1022 
1023 	/**
1024 	 * We ran out of PV entries all across the board, or this allocation is not
1025 	 * for the kernel. Check whether the kernel list has grown too full (very
1026 	 * unlikely), in which case we can rebalance from it here.
1027 	 */
1028 	if (__improbable(pv_kern_free.count > (PV_BATCH_SIZE * 2))) {
1029 		pmap_simple_lock(&pv_kern_free_list_lock);
1030 		/* Re-check, now that the lock is held. */
1031 		if (pv_kern_free.count > (PV_BATCH_SIZE * 2)) {
1032 			pv_entry_t *pve_head = pv_kern_free.list;
1033 			pv_entry_t *pve_tail = pve_head;
1034 
1035 			for (int i = 0; i < (PV_BATCH_SIZE - 1); i++) {
1036 				pve_tail = pve_tail->pve_next;
1037 			}
1038 
1039 			pv_kern_free.list = pve_tail->pve_next;
1040 			pv_kern_free.count -= PV_BATCH_SIZE;
1041 			pve_tail->pve_next = PV_ENTRY_NULL;
1042 			pmap_simple_unlock(&pv_kern_free_list_lock);
1043 
1044 			/* Return back every node except the first one to the free lists. */
1045 			pv_list_free(pve_head->pve_next, pve_tail, PV_BATCH_SIZE - 1);
1046 			pve_head->pve_next = PV_ENTRY_NULL;
1047 			*pvepp = pve_head;
1048 			return PV_ALLOC_SUCCESS;
1049 		}
1050 		pmap_simple_unlock(&pv_kern_free_list_lock);
1051 	}
1052 
1053 	/**
1054 	 * If all else fails, try to get a new pmap page so that the allocation
1055 	 * succeeds once the caller retries it.
1056 	 */
1057 	kern_return_t kr = KERN_FAILURE;
1058 	pv_alloc_return_t pv_status = PV_ALLOC_FAIL;
1059 	const unsigned int pai = locked_pvh->pai;
1060 
1061 	/**
1062 	 * Drop the lock during page allocation since that can take a while and
1063 	 * because preemption must be enabled when attempting to allocate memory
1064 	 * from the VM (which requires grabbing a mutex).
1065 	 */
1066 	pvh_unlock(locked_pvh);
1067 	if (pmap != NULL) {
1068 		/**
1069 		 * Bump the provided refcount before we drop the pmap lock in order to prevent
1070 		 * page table reclamation while the lock is dropped.
1071 		 */
1072 		if (__improbable((refcountp != NULL) && (os_atomic_inc_orig(refcountp, relaxed) == UINT16_MAX))) {
1073 			panic("%s: pmap %p refcount %p overflow", __func__, pmap, refcountp);
1074 		}
1075 		pmap_unlock(pmap, lock_mode);
1076 	}
1077 
1078 	if ((kr = pve_feed_page(alloc_flags)) == KERN_SUCCESS) {
1079 		/**
1080 		 * Since the lock was dropped, even though we successfully allocated a
1081 		 * new page to be used for PVE nodes, the code that relies on this
1082 		 * function might have depended on the lock being held for consistency,
1083 		 * so return out early and let them retry the allocation with the lock
1084 		 * re-held.
1085 		 */
1086 		pv_status = PV_ALLOC_RETRY;
1087 	} else {
1088 		pv_status = PV_ALLOC_FAIL;
1089 	}
1090 
1091 	if (pmap != NULL) {
1092 		pmap_lock(pmap, lock_mode);
1093 		if (__improbable((refcountp != NULL) && (os_atomic_dec_orig(refcountp, relaxed) == 0))) {
1094 			panic("%s: pmap %p refcount %p underflow", __func__, pmap, refcountp);
1095 		}
1096 	}
1097 
1098 	if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
1099 		*locked_pvh = pvh_lock_nopreempt(pai);
1100 	} else {
1101 		*locked_pvh = pvh_lock(pai);
1102 	}
1103 
1104 	/* Ensure that no node was created if we're not returning successfully. */
1105 	assert(*pvepp == PV_ENTRY_NULL);
1106 
1107 	return pv_status;
1108 }
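
/**
 * Hypothetical caller pattern (a sketch, not an actual call site): because
 * PV_ALLOC_RETRY means the PVH and pmap locks were dropped and re-acquired
 * while a fresh page was obtained, callers typically wrap the allocation in
 * a retry loop along these lines:
 *
 *     pv_alloc_return_t ret;
 *     do {
 *         ret = pv_alloc(pmap, lock_mode, options, &pvep, &locked_pvh, NULL);
 *         // On PV_ALLOC_RETRY, re-validate any state derived from the
 *         // previously-held locks before trying again.
 *     } while (ret == PV_ALLOC_RETRY);
 *
 * pmap_enter_pv() below calls pv_alloc() and propagates these return values
 * to its own callers, which are expected to retry in the same way.
 */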
1109 
1110 /**
1111  * Utility function for freeing a single PVE object back to the free lists.
1112  *
1113  * @param pvep Pointer to the PVE object to free.
1114  */
1115 MARK_AS_PMAP_TEXT void
1116 pv_free(pv_entry_t *pvep)
1117 {
1118 	assert(pvep != PV_ENTRY_NULL);
1119 
1120 	pv_list_free(pvep, pvep, 1);
1121 }
1122 
1123 /**
1124  * This function provides a mechanism for the device tree to override the
1125  * default PV allocation amounts and the watermark level which determines how
1126  * many PVE objects are kept in the kernel-dedicated free list.
1127  */
1128 MARK_AS_PMAP_TEXT void
1129 pmap_compute_pv_targets(void)
1130 {
1131 	DTEntry entry = NULL;
1132 	void const *prop = NULL;
1133 	int err = 0;
1134 	unsigned int prop_size = 0;
1135 
1136 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1137 	assert(err == kSuccess);
1138 
1139 	if (kSuccess == SecureDTGetProperty(entry, "pmap-pv-count", &prop, &prop_size)) {
1140 		if (prop_size != sizeof(pv_alloc_initial_target)) {
1141 			panic("pmap-pv-count property is not a 32-bit integer");
1142 		}
1143 		pv_alloc_initial_target = *((uint32_t const *)prop);
1144 	}
1145 
1146 	if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-count", &prop, &prop_size)) {
1147 		if (prop_size != sizeof(pv_kern_alloc_initial_target)) {
1148 			panic("pmap-kern-pv-count property is not a 32-bit integer");
1149 		}
1150 		pv_kern_alloc_initial_target = *((uint32_t const *)prop);
1151 	}
1152 
1153 	if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-min", &prop, &prop_size)) {
1154 		if (prop_size != sizeof(pv_kern_low_water_mark)) {
1155 			panic("pmap-kern-pv-min property is not a 32-bit integer");
1156 		}
1157 		pv_kern_low_water_mark = *((uint32_t const *)prop);
1158 	}
1159 }
1160 
1161 /**
1162  * This would normally be used to adjust the number of PVE objects available in
1163  * the system, but we do that dynamically at runtime anyway so this is unneeded.
1164  */
1165 void
1166 mapping_adjust(void)
1167 {
1168 	/* Not implemented for arm/arm64. */
1169 }
1170 
1171 /**
1172  * Creates a target number of free pv_entry_t objects for the kernel free list
1173  * and the general free list.
1174  *
1175  * @note This function is called once during early boot, in kernel_bootstrap().
1176  *
1177  * @return KERN_SUCCESS if the objects were successfully allocated, or the
1178  *         return value from pve_feed_page() on failure (could be caused by not
1179  *         being able to allocate a page).
1180  */
1181 MARK_AS_PMAP_TEXT kern_return_t
1182 mapping_free_prime_internal(void)
1183 {
1184 	kern_return_t kr = KERN_FAILURE;
1185 
1186 	/*
1187 	 * We do not need to hold the pv_free_array lock to calculate the number of
1188 	 * elements in it because no other core is running at this point.
1189 	 */
1190 	while (((pv_free_array_n_elems() * PV_BATCH_SIZE) < pv_alloc_initial_target) ||
1191 	    (pv_kern_free.count < pv_kern_alloc_initial_target)) {
1192 		if ((kr = pve_feed_page(0)) != KERN_SUCCESS) {
1193 			return kr;
1194 		}
1195 	}
1196 
1197 	return KERN_SUCCESS;
1198 }
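
/**
 * Worked example (all sizes hypothetical): with a 1024-entry PV_BATCH_SIZE,
 * MAX_CPUS == 8, and the default targets, pv_alloc_initial_target is 8192 and
 * pv_kern_alloc_initial_target is 1024. The first pve_feed_page() call lands
 * in the kernel free list (its count starts below the low-water mark); the
 * next eight calls each add one batch to the global ring, after which both
 * targets are satisfied and priming stops, having consumed nine pages total.
 */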
1199 
1200 /**
1201  * Helper function for pmap_enter_pv (hereafter shortened to "pepv") which converts
1202  * a PVH entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP which will transform the
1203  * entry into a linked list of mappings.
1204  *
1205  * @note This should only be called from pmap_enter_pv().
1206  *
1207  * @note The PVH lock for the passed in page must already be held and the type
1208  *       must be PVH_TYPE_PTEP (wouldn't make sense to call this otherwise).
1209  *
1210  * @param pmap Either the pmap that owns the mapping being registered in
1211  *             pmap_enter_pv(), or NULL if this is an IOMMU mapping.
1212  * @param lock_mode Which state the pmap lock is being held in if the mapping is
1213  *                  owned by a pmap, otherwise this is a don't care.
1214  * @param options PMAP_OPTIONS_* family of options.
1215  * @param locked_pvh Input/output parameter pointing to the wrapped value of the
1216  *                   pv_head_table entry previously obtained from pvh_lock().
1217  *                   This value will be updated if [locked_pvh->pai] needs to be
1218  *                   re-locked or if the allocation is successful and the PVH
1219  *                   entry is updated with the new PVE pointer.
1220  *
1221  * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully converted
1222  *         into PVH_TYPE_PVEP, or the return value of pv_alloc() otherwise. See
1223  *         pv_alloc()'s function header for a detailed explanation of the
1224  *         possible return values.
1225  */
1226 MARK_AS_PMAP_TEXT static pv_alloc_return_t
1227 pepv_convert_ptep_to_pvep(
1228 	pmap_t pmap,
1229 	pmap_lock_mode_t lock_mode,
1230 	unsigned int options,
1231 	locked_pvh_t *locked_pvh)
1232 {
1233 	assert(locked_pvh != NULL);
1234 	assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP));
1235 
1236 	pv_entry_t *pvep = PV_ENTRY_NULL;
1237 	pv_alloc_return_t ret = pv_alloc(pmap, lock_mode, options, &pvep, locked_pvh, NULL);
1238 	if (ret != PV_ALLOC_SUCCESS) {
1239 		return ret;
1240 	}
1241 
1242 	const unsigned int pai = locked_pvh->pai;
1243 
1244 	/* If we've gotten this far then a node should've been allocated. */
1245 	assert(pvep != PV_ENTRY_NULL);
1246 
1247 	/* The new PVE should have the same PTE pointer as the previous PVH entry. */
1248 	pve_init(pvep);
1249 	pve_set_ptep(pvep, 0, pvh_ptep(locked_pvh->pvh));
1250 
1251 	assert(!pve_get_internal(pvep, 0));
1252 	assert(!pve_get_altacct(pvep, 0));
1253 	if (ppattr_is_internal(pai)) {
1254 		/**
1255 		 * Transfer "internal" status from pp_attr to this pve. See the comment
1256 		 * above PP_ATTR_INTERNAL for more information on this.
1257 		 */
1258 		ppattr_clear_internal(pai);
1259 		pve_set_internal(pvep, 0);
1260 	}
1261 	if (ppattr_is_altacct(pai)) {
1262 		/**
1263 		 * Transfer "altacct" status from pp_attr to this pve. See the comment
1264 		 * above PP_ATTR_ALTACCT for more information on this.
1265 		 */
1266 		ppattr_clear_altacct(pai);
1267 		pve_set_altacct(pvep, 0);
1268 	}
1269 
1270 	pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
1271 
1272 	return PV_ALLOC_SUCCESS;
1273 }
1274 
1275 /**
1276  * Register a new mapping into the pv_head_table. This is the main data
1277  * structure used for performing a reverse physical to virtual translation and
1278  * finding all mappings to a physical page. Whenever a new page table mapping is
1279  * created (regardless of whether it's for a CPU or an IOMMU), it should be
1280  * registered with a call to this function.
1281  *
1282  * @note The pmap lock must already be held if the new mapping is a CPU mapping.
1283  *
1284  * @note The PVH lock for the physical page that is getting a new mapping
1285  *       registered must already be held.
1286  *
1287  * @note This function cannot be called during the hibernation process because
1288  *       it modifies critical pmap data structures that need to be dumped into
1289  *       the hibernation image in a consistent state.
1290  *
1291  * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
1292  *             an IOMMU translation.
1293  * @param ptep The new mapping to register.
1294  * @param options Flags that can potentially be set on a per-page basis:
1295  *                PMAP_OPTIONS_INTERNAL: If this is the first CPU mapping, then
1296  *                    mark the page as being "internal". See the definition of
1297  *                    PP_ATTR_INTERNAL for more info.
1298  *                PMAP_OPTIONS_REUSABLE: If this is the first CPU mapping, and
1299  *                    this page is also marked internal, then mark the page as
1300  *                    being "reusable". See the definition of PP_ATTR_REUSABLE
1301  *                    for more info.
1302  * @param lock_mode Which state the pmap lock is being held in if the mapping is
1303  *                  owned by a pmap, otherwise this is a don't care.
1304  * @param locked_pvh Input/output parameter pointing to the wrapped value of the
1305  *                   pv_head_table entry previously obtained from pvh_lock().
1306  *                   If the registration is successful, locked_pvh->pvh will be
1307  *                   updated to reflect the new PV list head.
1308  * @param new_pvepp An output parameter that is updated with a pointer to the
1309  *                  PVE object where the PTEP was allocated into. In the event
1310  *                  of failure, or if the pointer passed in is NULL,
1311  *                  it's not modified.
1312  * @param new_pve_ptep_idx An output parameter that is updated with the index
1313  *                  into the PVE object where the PTEP was allocated into.
1314  *                  In the event of failure, or if new_pvepp in is NULL,
1315  *                  it's not modified.
1316  *
1317  * @return PV_ALLOC_SUCCESS if the entry at [locked_pvh->pai] was successfully
1318  *         updated with the new mapping, or the return value of pv_alloc()
1319  *         otherwise. See pv_alloc()'s function header for a detailed explanation
1320  *         of the possible return values.
1321  */
1322 MARK_AS_PMAP_TEXT pv_alloc_return_t
1323 pmap_enter_pv(
1324 	pmap_t pmap,
1325 	pt_entry_t *ptep,
1326 	unsigned int options,
1327 	pmap_lock_mode_t lock_mode,
1328 	locked_pvh_t *locked_pvh,
1329 	pv_entry_t **new_pvepp,
1330 	int *new_pve_ptep_idx)
1331 {
1332 	assert(ptep != PT_ENTRY_NULL);
1333 	assert(locked_pvh != NULL);
1334 
1335 	bool first_cpu_mapping = false;
1336 
1337 	PMAP_ASSERT_NOT_WRITING_HIB();
1338 
1339 	if (pmap != NULL) {
1340 		pmap_assert_locked(pmap, lock_mode);
1341 	}
1342 
1343 	uintptr_t pvh_flags = pvh_get_flags(locked_pvh->pvh);
1344 	const unsigned int pai = locked_pvh->pai;
1345 
1346 
1347 	/**
1348 	 * An IOMMU mapping may already be present for a page that hasn't yet had a
1349 	 * CPU mapping established, so we use PVH_FLAG_CPU to determine if this is
1350 	 * the first CPU mapping. We base internal/reusable accounting on the
1351 	 * options specified for the first CPU mapping. PVH_FLAG_CPU, and thus this
1352 	 * accounting, will then persist as long as there are *any* mappings of the
1353 	 * page. The accounting for a page should not need to change until the page
1354 	 * is recycled by the VM layer, and we assert that there are no mappings
1355 	 * when a page is recycled. An IOMMU mapping of a freed/recycled page is
1356 	 * considered a security violation & potential DMA corruption path.
1357 	 */
1358 	first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU));
1359 	if (first_cpu_mapping) {
1360 		pvh_flags |= PVH_FLAG_CPU;
1361 		pvh_set_flags(locked_pvh, pvh_flags);
1362 	}
1363 
1364 	/**
1365 	 * Internal/reusable flags are based on the first CPU mapping made to a
1366 	 * page. These will persist until all mappings to the page are removed.
1367 	 */
1368 	if (first_cpu_mapping) {
1369 		if ((options & PMAP_OPTIONS_INTERNAL) &&
1370 		    (options & PMAP_OPTIONS_REUSABLE)) {
1371 			ppattr_set_reusable(pai);
1372 		} else {
1373 			ppattr_clear_reusable(pai);
1374 		}
1375 	}
1376 
1377 	/* Visit the definitions for the PVH_TYPEs to learn more about each one. */
1378 	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_NULL)) {
1379 		/* If this is the first mapping, upgrade the type to store a single PTEP. */
1380 		pvh_update_head(locked_pvh, ptep, PVH_TYPE_PTEP);
1381 	} else {
1382 		pv_alloc_return_t ret = PV_ALLOC_FAIL;
1383 
1384 		if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP)) {
1385 			/**
1386 			 * There was already a single mapping to the page. Convert the PVH
1387 			 * entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP so that multiple
1388 			 * mappings can be tracked. If PVEs cannot hold more than a single
1389 			 * mapping, a second PVE will be added farther down.
1390 			 */
1391 			if ((ret = pepv_convert_ptep_to_pvep(pmap, lock_mode, options, locked_pvh)) != PV_ALLOC_SUCCESS) {
1392 				return ret;
1393 			}
1394 
1395 			/**
1396 			 * At this point, the PVH flags have been clobbered due to updating
1397 			 * PTEP->PVEP, but that's ok because the locks are being held and
1398 			 * the flags will get set again below before pv_alloc() is called
1399 			 * and the locks are potentially dropped again.
1400 			 */
1401 		} else if (__improbable(!pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP))) {
1402 			panic("%s: unexpected PV head %p, ptep=%p pmap=%p",
1403 			    __func__, (void*)locked_pvh->pvh, ptep, pmap);
1404 		}
1405 
1406 		/**
1407 		 * Check if we have room for one more mapping in this PVE.
1408 		 */
1409 		pv_entry_t *pvep = pvh_pve_list(locked_pvh->pvh);
1410 		assert(pvep != PV_ENTRY_NULL);
1411 
1412 		int pve_ptep_idx = pve_find_ptep_index(pvep, PT_ENTRY_NULL);
1413 
1414 		if (pve_ptep_idx == -1) {
1415 			/**
1416 			 * Set up the pv_entry for this new mapping and then add it to the list
1417 			 * for this physical page.
1418 			 */
1419 			pve_ptep_idx = 0;
1420 			pvep = PV_ENTRY_NULL;
1421 			if ((ret = pv_alloc(pmap, lock_mode, options, &pvep, locked_pvh, NULL)) != PV_ALLOC_SUCCESS) {
1422 				return ret;
1423 			}
1424 
1425 			/* If we've gotten this far then a node should've been allocated. */
1426 			assert(pvep != PV_ENTRY_NULL);
1427 			pve_init(pvep);
1428 			pve_add(locked_pvh, pvep);
1429 		}
1430 
1431 		pve_set_ptep(pvep, pve_ptep_idx, ptep);
1432 
1433 		/*
1434 		 * The PTEP was successfully entered into the PVE object.
1435 		 * If the caller requests it, set new_pvepp and new_pve_ptep_idx
1436 		 * appropriately.
1437 		 */
1438 		if (new_pvepp != NULL) {
1439 			*new_pvepp = pvep;
1440 			*new_pve_ptep_idx = pve_ptep_idx;
1441 		}
1442 	}
1443 
1444 	return PV_ALLOC_SUCCESS;
1445 }
1446 
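/*
 * A minimal, self-contained sketch of the "upgrade on second mapping" idea used
 * in pmap_enter_pv() above: a head word encodes its payload type in the low
 * bits, starts out holding a single entry, and is converted into a list head
 * only when a second entry arrives. All names here (ex_head_add, ex_node_t,
 * EX_TYPE_*) are illustrative placeholders, not the kernel's pv_head_table API.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define EX_TYPE_MASK  0x3UL
#define EX_TYPE_NULL  0x0UL   /* no entries */
#define EX_TYPE_ONE   0x1UL   /* head stores a single entry pointer */
#define EX_TYPE_LIST  0x2UL   /* head stores a list of nodes */

typedef struct ex_node {
	struct ex_node *next;
	void *entry;
} ex_node_t;

static uintptr_t
ex_pack(void *ptr, uintptr_t type)
{
	return ((uintptr_t)ptr & ~EX_TYPE_MASK) | type;
}

static void *
ex_unpack(uintptr_t head)
{
	return (void *)(head & ~EX_TYPE_MASK);
}

/* Register a new entry, upgrading a single-entry head to a list as needed. */
static void
ex_head_add(uintptr_t *head, void *entry)
{
	switch (*head & EX_TYPE_MASK) {
	case EX_TYPE_NULL:
		/* First entry: store it directly in the head word. */
		*head = ex_pack(entry, EX_TYPE_ONE);
		return;
	case EX_TYPE_ONE: {
		/* Convert the existing single entry into the first list node. */
		ex_node_t *first = malloc(sizeof(*first));
		assert(first != NULL);
		first->next = NULL;
		first->entry = ex_unpack(*head);
		*head = ex_pack(first, EX_TYPE_LIST);
	}
	/* FALLTHROUGH: now prepend the new entry as another node. */
	case EX_TYPE_LIST: {
		ex_node_t *node = malloc(sizeof(*node));
		assert(node != NULL);
		node->entry = entry;
		node->next = ex_unpack(*head);
		*head = ex_pack(node, EX_TYPE_LIST);
		return;
	}
	default:
		abort();
	}
}
#endif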
1447 /**
1448  * Remove a mapping that was registered with the pv_head_table. This needs to be
1449  * done for every mapping that was previously registered using pmap_enter_pv()
1450  * when the mapping is removed.
1451  *
1452  * @note The PVH lock for the physical page that is having a mapping removed
1453  *       must already be held.
1454  *
1455  * @note This function cannot be called during the hibernation process because
1456  *       it modifies critical pmap data structures that need to be dumped into
1457  *       the hibernation image in a consistent state.
1458  *
1459  * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
1460  *             an IOMMU translation.
1461  * @param ptep The mapping that's getting removed.
1462  * @param locked_pvh Input/output parameter pointing to the wrapped value of the
1463  *                   pv_head_table entry previously obtained from pvh_lock().
1464  *                   If the removal is successful, locked_pvh->pvh may be updated
1465  *                   to reflect a new PV list head.
1466  * @param is_internal_p The internal bit of the PTE that was removed.
1467  * @param is_altacct_p The altacct bit of the PTE that was removed.
1468  * @return These are the possible return values:
1469  *     PV_REMOVE_SUCCESS: A PV entry matching the PTE was found and
1470  *                        removed.
1471  *     PV_REMOVE_FAIL: No matching PV entry was found.  This may not be a fatal
1472  *                        condition; for example, pmap_disconnect() on another
1473  *                        thread may have removed the PV entry between removal
1474  *                        of the mapping and acquisition of the PV lock in
1475  *                        pmap_remove().
1476  */
1477 pv_remove_return_t
1478 pmap_remove_pv(
1479 	pmap_t pmap __assert_only,
1480 	pt_entry_t *ptep,
1481 	locked_pvh_t *locked_pvh,
1482 	bool *is_internal_p,
1483 	bool *is_altacct_p)
1484 {
1485 	PMAP_ASSERT_NOT_WRITING_HIB();
1486 	assert(locked_pvh != NULL);
1487 
1488 	pv_remove_return_t ret = PV_REMOVE_SUCCESS;
1489 	const unsigned int pai = locked_pvh->pai;
1490 	bool is_internal = false;
1491 	bool is_altacct = false;
1492 
1493 
1494 	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PTEP)) {
1495 		if (__improbable((ptep != pvh_ptep(locked_pvh->pvh)))) {
1496 			return PV_REMOVE_FAIL;
1497 		}
1498 
1499 		pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
1500 		is_internal = ppattr_is_internal(pai);
1501 		is_altacct = ppattr_is_altacct(pai);
1502 	} else if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP)) {
1503 		pv_entry_t **pvepp = NULL;
1504 		pv_entry_t *pvep = pvh_pve_list(locked_pvh->pvh);
1505 		assert(pvep != PV_ENTRY_NULL);
1506 		unsigned int npves = 0;
1507 		int pve_pte_idx = 0;
1508 		/* Find the PVE that represents the mapping we're removing. */
1509 		while ((pvep != PV_ENTRY_NULL) && ((pve_pte_idx = pve_find_ptep_index(pvep, ptep)) == -1)) {
1510 			if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
1511 				pvh_lock_enter_sleep_mode(locked_pvh);
1512 			}
1513 			pvepp = pve_next_ptr(pvep);
1514 			pvep = pve_next(pvep);
1515 			npves++;
1516 		}
1517 
1518 		if (__improbable((pvep == PV_ENTRY_NULL))) {
1519 			return PV_REMOVE_FAIL;
1520 		}
1521 
1522 		is_internal = pve_get_internal(pvep, pve_pte_idx);
1523 		is_altacct = pve_get_altacct(pvep, pve_pte_idx);
1524 		pve_set_ptep(pvep, pve_pte_idx, PT_ENTRY_NULL);
1525 
1526 #if MACH_ASSERT
1527 		/**
1528 		 * Ensure that the mapping didn't accidentally have multiple PVEs
1529 		 * associated with it (there should only be one PVE per mapping). This
1530 		 * checking only occurs on configurations that can accept the perf hit
1531 		 * that walking the PVE chain on every unmap entails.
1532 		 *
1533 		 * This is skipped for IOMMU mappings because some IOMMUs don't use
1534 		 * normal page tables (e.g., NVMe) to map pages, so the `ptep` field in
1535 		 * the associated PVE won't actually point to a real page table (see the
1536 		 * definition of PVH_FLAG_IOMMU_TABLE for more info). Because of that,
1537 		 * it's perfectly possible for duplicate IOMMU PVEs to exist.
1538 		 */
1539 		if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
1540 			pv_entry_t *check_pvep = pvep;
1541 
1542 			do {
1543 				if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
1544 					pvh_lock_enter_sleep_mode(locked_pvh);
1545 				}
1546 				if (pve_find_ptep_index(check_pvep, ptep) != -1) {
1547 					panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
1548 					    "pvep=%p, pai=0x%x", __func__, ptep, pmap,
1549 					    (void*)locked_pvh->pvh, pvep, pai);
1550 				}
1551 				npves++;
1552 			} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
1553 		}
1554 #endif /* MACH_ASSERT */
1555 
1556 		const bool pve_is_first = (pvepp == NULL);
1557 		const bool pve_is_last = (pve_next(pvep) == PV_ENTRY_NULL);
1558 		const int other_pte_idx = !pve_pte_idx;
1559 
1560 		if (pve_is_empty(pvep)) {
1561 			/*
1562 			 * This PVE doesn't contain any mappings. We can get rid of it.
1563 			 */
1564 			pve_remove(locked_pvh, pvepp, pvep);
1565 			pv_free(pvep);
1566 		} else if (!pve_is_first) {
1567 			/*
1568 			 * This PVE contains a single mapping. See if we can coalesce it with the one
1569 			 * at the top of the list.
1570 			 */
1571 			pv_entry_t *head_pvep = pvh_pve_list(locked_pvh->pvh);
1572 			int head_pve_pte_empty_idx;
1573 			if ((head_pve_pte_empty_idx = pve_find_ptep_index(head_pvep, PT_ENTRY_NULL)) != -1) {
1574 				pve_set_ptep(head_pvep, head_pve_pte_empty_idx, pve_get_ptep(pvep, other_pte_idx));
1575 				if (pve_get_internal(pvep, other_pte_idx)) {
1576 					pve_set_internal(head_pvep, head_pve_pte_empty_idx);
1577 				}
1578 				if (pve_get_altacct(pvep, other_pte_idx)) {
1579 					pve_set_altacct(head_pvep, head_pve_pte_empty_idx);
1580 				}
1581 				pve_remove(locked_pvh, pvepp, pvep);
1582 				pv_free(pvep);
1583 			} else {
1584 				/*
1585 				 * We could not coalesce it. Move it to the start of the list, so that it
1586 				 * can be coalesced against in the future.
1587 				 */
1588 				*pvepp = pve_next(pvep);
1589 				pve_add(locked_pvh, pvep);
1590 			}
1591 		} else if (pve_is_first && pve_is_last) {
1592 			/*
1593 			 * This PVE contains a single mapping, and it's the last mapping for this PAI.
1594 			 * Collapse this list back into the head, turning it into a PVH_TYPE_PTEP entry.
1595 			 */
1596 			assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
1597 			    __func__, (void*)locked_pvh->pvh, pvep);
1598 			pvh_update_head(locked_pvh, pve_get_ptep(pvep, other_pte_idx), PVH_TYPE_PTEP);
1599 			pp_attr_t attrs_to_set = 0;
1600 			if (pve_get_internal(pvep, other_pte_idx)) {
1601 				attrs_to_set |= PP_ATTR_INTERNAL;
1602 			}
1603 			if (pve_get_altacct(pvep, other_pte_idx)) {
1604 				attrs_to_set |= PP_ATTR_ALTACCT;
1605 			}
1606 			if (attrs_to_set != 0) {
1607 				ppattr_modify_bits(pai, 0, attrs_to_set);
1608 			}
1609 			pv_free(pvep);
1610 		}
1611 	} else {
1612 		/*
1613 		 * A concurrent disconnect operation may have already cleared the PVH to PVH_TYPE_NULL.
1614 		 * It's also possible that a subsequent page table allocation may have transitioned
1615 		 * the PVH to PVH_TYPE_PTDP.
1616 		 */
1617 		return PV_REMOVE_FAIL;
1618 	}
1619 
1620 	if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_NULL)) {
1621 		pvh_set_flags(locked_pvh, 0);
1622 		pp_attr_t attrs_to_clear = 0;
1623 		if (is_internal) {
1624 			attrs_to_clear |= PP_ATTR_INTERNAL;
1625 		}
1626 		if (is_altacct) {
1627 			attrs_to_clear |= PP_ATTR_ALTACCT;
1628 		}
1629 		if (attrs_to_clear != 0) {
1630 			ppattr_modify_bits(pai, attrs_to_clear, 0);
1631 		}
1632 	}
1633 
1634 	*is_internal_p = is_internal;
1635 	*is_altacct_p = is_altacct;
1636 	return ret;
1637 }
1638 
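/*
 * A minimal sketch of the "pointer to the previous next field" walk used in the
 * removal path above: by remembering the address of the link through which the
 * current node was reached, the node can be unlinked without a second traversal.
 * The types and names here are illustrative placeholders, not the kernel's PVE
 * structures.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <stddef.h>

typedef struct ex_pve {
	struct ex_pve *next;
	int key;
} ex_pve_t;

/* Remove the first node whose key matches; returns the removed node or NULL. */
static ex_pve_t *
ex_list_remove(ex_pve_t **head, int key)
{
	ex_pve_t **linkp = head;        /* address of the link pointing at the current node */
	ex_pve_t *node = *head;

	while (node != NULL && node->key != key) {
		linkp = &node->next;
		node = node->next;
	}
	if (node != NULL) {
		*linkp = node->next;    /* unlink in place, no re-walk needed */
	}
	return node;
}
#endif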
1639 /**
1640  * Bootstrap the initial Page Table Descriptor (PTD) node free list.
1641  *
1642  * @note It's not safe to allocate PTD nodes until after this function is
1643  *       invoked.
1644  *
1645  * @note The maximum number of PTD objects that can reside within one page
1646  *       (`ptd_per_page`) must have already been calculated before calling this
1647  *       function.
1648  *
1649  * @param ptdp Pointer to the virtually-contiguous memory used for the initial
1650  *             free list.
1651  * @param num_pages The number of virtually-contiguous pages pointed to by
1652  *                  `ptdp` that will be used to prime the PTD allocator.
1653  */
1654 MARK_AS_PMAP_TEXT void
1655 ptd_bootstrap(pt_desc_t *ptdp, unsigned int num_pages)
1656 {
1657 	assert(ptd_per_page > 0);
1658 	assert((ptdp != NULL) && (((uintptr_t)ptdp & PAGE_MASK) == 0) && (num_pages > 0));
1659 
1660 	/**
1661 	 * Region represented by ptdp should be cleared by pmap_bootstrap().
1662 	 *
1663 	 * Only part of each page is being used for PTD objects (the rest is used
1664 	 * for each PTD's associated ptd_info_t object) so link together the last
1665 	 * PTD element of each page to the first element of the previous page.
1666 	 */
1667 	for (unsigned int i = 0; i < num_pages; i++) {
1668 		*((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
1669 		ptd_free_list = ptdp;
1670 		ptdp = (void *)(((uint8_t *)ptdp) + PAGE_SIZE);
1671 	}
1672 
1673 	ptd_free_count = num_pages * ptd_per_page;
1674 	simple_lock_init(&ptd_free_list_lock, 0);
1675 }
1676 
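/*
 * A minimal sketch, under simplified assumptions, of the free-list priming and
 * allocation scheme used by ptd_bootstrap()/ptd_alloc_unlinked(): each zeroed
 * page contributes `per_page` objects, only the last slot of a page is
 * explicitly linked to the previous list head, and the allocator treats a NULL
 * link (with objects still remaining) as "advance to the next slot in the same
 * page". Unlike the kernel code, this sketch assumes the whole page holds
 * objects; all names are illustrative.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <assert.h>

#define EX_PAGE_SIZE 4096u

typedef struct ex_obj {
	char payload[64];
} ex_obj_t;

static ex_obj_t *ex_free_list;
static unsigned  ex_free_count;

static void
ex_prime(void *pages, unsigned num_pages)
{
	ex_obj_t *objp = pages;
	const unsigned per_page = EX_PAGE_SIZE / sizeof(ex_obj_t);

	for (unsigned i = 0; i < num_pages; i++) {
		/* Link only the last slot of this page to the current head. */
		*(void **)&objp[per_page - 1] = ex_free_list;
		ex_free_list = objp;
		objp = (ex_obj_t *)((char *)objp + EX_PAGE_SIZE);
	}
	ex_free_count = num_pages * per_page;
}

static ex_obj_t *
ex_alloc(void)
{
	assert(ex_free_count > 0);
	ex_obj_t *obj = ex_free_list;
	ex_obj_t *next = *(ex_obj_t **)obj;

	/* A NULL link with objects left means we're inside a zeroed "reserve" page. */
	ex_free_list = (next == NULL && ex_free_count > 1) ? obj + 1 : next;
	ex_free_count--;
	return obj;
}

static void
ex_free(ex_obj_t *obj)
{
	*(ex_obj_t **)obj = ex_free_list;   /* prepend the freed object */
	ex_free_list = obj;
	ex_free_count++;
}
#endif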
1677 /**
1678  * Allocate a page table descriptor (PTD) object from the PTD free list, but
1679  * don't add it to the list of reclaimable userspace page table pages just yet
1680  * and don't associate the PTD with a specific pmap (that's what "unlinked"
1681  * means here).
1682  *
1683  * @param alloc_flags Allocation flags passed to pmap_page_alloc(). See the
1684  *                    definition of that function for a detailed description of
1685  *                    the available flags.
1686  *
1687  * @return The page table descriptor object if the allocation was successful, or
1688  *         NULL otherwise (which indicates that a page failed to be allocated
1689  *         for new nodes).
1690  */
1691 MARK_AS_PMAP_TEXT pt_desc_t*
1692 ptd_alloc_unlinked(unsigned int alloc_flags)
1693 {
1694 	pt_desc_t *ptdp = PTD_ENTRY_NULL;
1695 
1696 	pmap_simple_lock(&ptd_free_list_lock);
1697 
1698 	assert(ptd_per_page != 0);
1699 
1700 	/**
1701 	 * Ensure that we either have a free list with nodes available, or a
1702 	 * completely empty list to allocate and prepend new nodes to.
1703 	 */
1704 	assert(((ptd_free_list != NULL) && (ptd_free_count > 0)) ||
1705 	    ((ptd_free_list == NULL) && (ptd_free_count == 0)));
1706 
1707 	if (__improbable(ptd_free_count == 0)) {
1708 		pmap_paddr_t pa = 0;
1709 
1710 		/**
1711 		 * Drop the lock while allocating pages since that can take a while and
1712 		 * because preemption has to be enabled when allocating memory.
1713 		 */
1714 		pmap_simple_unlock(&ptd_free_list_lock);
1715 
1716 		if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
1717 			return NULL;
1718 		}
1719 		ptdp = (pt_desc_t *)phystokv(pa);
1720 
1721 		pmap_simple_lock(&ptd_free_list_lock);
1722 		ptd_page_count++;
1723 
1724 		/**
1725 		 * Since the lock was dropped while allocating, it's possible another
1726 		 * CPU already allocated a page. To be safe, prepend the current free
1727 		 * list (which may or may not be empty now) to the page of nodes just
1728 		 * allocated and update the head to point to these new nodes.
1729 		 */
1730 		*((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
1731 		ptd_free_list = ptdp;
1732 		ptd_free_count += ptd_per_page;
1733 	}
1734 
1735 	/* There should be available nodes at this point. */
1736 	if (__improbable((ptd_free_count == 0) || (ptd_free_list == PTD_ENTRY_NULL))) {
1737 		panic_plain("%s: out of PTD entries and for some reason didn't "
1738 		    "allocate more %d %p", __func__, ptd_free_count, ptd_free_list);
1739 	}
1740 
1741 	/* Grab the top node off of the free list to return later. */
1742 	ptdp = ptd_free_list;
1743 
1744 	/**
1745 	 * Advance the free list to the next node.
1746 	 *
1747 	 * Each free pt_desc_t-sized object in this free list uses the first few
1748 	 * bytes of the object to point to the next object in the list. When an
1749 	 * object is deallocated (in ptd_deallocate()) the object is prepended onto
1750 	 * the free list by setting its first few bytes to point to the current free
1751 	 * list head. Then the head is updated to point to that object.
1752 	 *
1753 	 * When a new page is allocated for PTD nodes, it's left zeroed out. Once we
1754 	 * use up all of the previously deallocated nodes, the list will point
1755 	 * somewhere into the last allocated, empty page. We know we're pointing at
1756 	 * this page because the first few bytes of the object will be NULL. In
1757 	 * that case just set the head to this empty object.
1758 	 *
1759 	 * This empty page can be thought of as a "reserve" of empty nodes for the
1760 	 * case where more nodes are being allocated than there are nodes being
1761 	 * deallocated.
1762 	 */
1763 	pt_desc_t *const next_node = (pt_desc_t *)(*(void **)ptd_free_list);
1764 
1765 	/**
1766 	 * If the next node in the list is NULL but there are supposed to still be
1767 	 * nodes left, then we've hit the previously allocated empty page of nodes.
1768 	 * Go ahead and advance the free list to the next free node in that page.
1769 	 */
1770 	if ((next_node == PTD_ENTRY_NULL) && (ptd_free_count > 1)) {
1771 		ptd_free_list = ptd_free_list + 1;
1772 	} else {
1773 		ptd_free_list = next_node;
1774 	}
1775 
1776 	ptd_free_count--;
1777 
1778 	pmap_simple_unlock(&ptd_free_list_lock);
1779 
1780 	ptdp->pmap = NULL;
1781 
1782 	/**
1783 	 * Calculate and stash the address of the ptd_info_t associated with this
1784 	 * PTD. This can be done easily because both structures co-exist in the same
1785 	 * page, with ptd_info_t's starting at a given offset from the start of the
1786 	 * page.
1787 	 *
1788 	 * Each PTD is associated with a ptd_info_t of the same index. For example,
1789 	 * the 15th PTD will use the 15th ptd_info_t in the same page.
1790 	 */
1791 	const unsigned ptd_index = ((uintptr_t)ptdp & PAGE_MASK) / sizeof(pt_desc_t);
1792 	assert(ptd_index < ptd_per_page);
1793 
1794 	const uintptr_t start_of_page = (uintptr_t)ptdp & ~PAGE_MASK;
1795 	ptd_info_t *first_ptd_info = (ptd_info_t *)(start_of_page + ptd_info_offset);
1796 	ptdp->ptd_info = &first_ptd_info[ptd_index];
1797 
1798 	ptdp->va = (vm_offset_t)-1;
1799 	ptdp->ptd_info->wiredcnt = 0;
1800 
1801 	return ptdp;
1802 }
1803 
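/*
 * A minimal sketch of the "same index, same page" co-location used above to
 * find a descriptor's companion info structure: both arrays share one page,
 * with the info array starting at a fixed offset, so the companion's address
 * follows directly from the descriptor's index within its page. The sizes and
 * names here are illustrative stand-ins, not the kernel's pt_desc_t/ptd_info_t.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <assert.h>
#include <stdint.h>

#define EX_PAGE_SIZE  4096u
#define EX_PAGE_MASK  (EX_PAGE_SIZE - 1)

typedef struct { char bytes[48]; } ex_desc_t;        /* stand-in descriptor */
typedef struct { unsigned short wired; } ex_info_t;  /* stand-in per-descriptor info */

static ex_info_t *
ex_info_for(ex_desc_t *descp)
{
	/* Each page holds per_page descriptor/info pairs; infos follow the descriptors. */
	const unsigned per_page = EX_PAGE_SIZE / (sizeof(ex_desc_t) + sizeof(ex_info_t));
	const unsigned info_offset = per_page * (unsigned)sizeof(ex_desc_t);

	const uintptr_t page_start = (uintptr_t)descp & ~(uintptr_t)EX_PAGE_MASK;
	const unsigned index =
	    (unsigned)(((uintptr_t)descp & EX_PAGE_MASK) / sizeof(ex_desc_t));
	assert(index < per_page);

	ex_info_t *infos = (ex_info_t *)(page_start + info_offset);
	return &infos[index];
}
#endif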
1804 /**
1805  * Allocate a single page table descriptor (PTD) object.
1806  *
1807  * @param pmap The pmap object that will be owning the page table(s) that this
1808  *             descriptor object represents.
1809  * @param alloc_flags Allocation flags passed to ptd_alloc_unlinked(). See the
1810  *                    definition of that function for a detailed description of
1811  *                    the available flags.
1812  *
1813  * @return The allocated PTD object, or NULL if one failed to get allocated
1814  *         (which indicates that memory wasn't able to get allocated).
1815  */
1816 MARK_AS_PMAP_TEXT pt_desc_t*
1817 ptd_alloc(pmap_t pmap, unsigned int alloc_flags)
1818 {
1819 	pt_desc_t *ptdp = ptd_alloc_unlinked(alloc_flags);
1820 
1821 	if (ptdp == NULL) {
1822 		return NULL;
1823 	}
1824 
1825 	/**
1826 	 * For PTDs that are linked to pmaps, initialize the wired count to 1
1827 	 * to prevent pmap_remove() from concurrently attempting to free a
1828 	 * newly-installed page table page while it is still being initialized.
1829 	 * This wired reference will be atomically dropped in ptd_info_init()
1830 	 * once page table initialization is complete.
1831 	 */
1832 	ptdp->ptd_info->wiredcnt = 1;
1833 	ptdp->pmap = pmap;
1834 
1835 	pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
1836 	return ptdp;
1837 }
1838 
1839 /**
1840  * Deallocate a single page table descriptor (PTD) object.
1841  *
1842  * @note Ledger statistics are tracked on a per-pmap basis, so for those pages
1843  *       which are not associated with any specific pmap (e.g., IOMMU pages),
1844  *       the caller must ensure that the pmap/iommu field in the PTD object is
1845  *       NULL before calling this function.
1846  *
1847  * @param ptdp Pointer to the PTD object to deallocate.
1848  */
1849 MARK_AS_PMAP_TEXT void
1850 ptd_deallocate(pt_desc_t *ptdp)
1851 {
1852 	pmap_t pmap = ptdp->pmap;
1853 
1854 	/* Prepend the deallocated node to the free list. */
1855 	pmap_simple_lock(&ptd_free_list_lock);
1856 	(*(void **)ptdp) = (void *)ptd_free_list;
1857 	ptd_free_list = (pt_desc_t *)ptdp;
1858 	ptd_free_count++;
1859 	pmap_simple_unlock(&ptd_free_list_lock);
1860 
1861 	/**
1862 	 * If this PTD was being used to represent an IOMMU page then there won't be
1863 	 * an associated pmap, and therefore no ledger statistics to update.
1864 	 */
1865 	if ((uintptr_t)pmap != IOMMU_INSTANCE_NULL) {
1866 		pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
1867 	}
1868 }
1869 
1870 /**
1871  * This function initializes the VA within a PTD based on the page table it's
1872  * representing.  This function must be called before a newly-allocated page
1873  * table is installed via sptm_map_table(), as other threads will be able to
1874  * use that page table as soon as it is installed and will expect valid PTD
1875  * info at that point.  It is assumed that sptm_map_table() will issue barriers
1876  * which effectively guarantee the ordering of these updates.
1877  *
1878  * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to
1879  *             update. Must match up with the `pmap` and `ptep` parameters.
1880  * @param pmap The pmap that owns the page table managed by the passed in PTD.
1881  * @param va Any virtual address that resides within the virtual address space
1882  *           being mapped by the page table pointed to by `ptep`.
1883  * @param level The level in the page table hierarchy that the table resides.
1884  * @param ptep A pointer into a page table that the passed in PTD manages. This
1885  *             page table must be owned by `pmap` and be the PTE that maps `va`.
1886  */
1887 MARK_AS_PMAP_TEXT void
1888 ptd_info_init(
1889 	pt_desc_t *ptdp,
1890 	pmap_t pmap,
1891 	vm_map_address_t va,
1892 	unsigned int level,
1893 	pt_entry_t *ptep)
1894 {
1895 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1896 
1897 	if (ptdp->pmap != pmap) {
1898 		panic("%s: pmap mismatch, ptdp=%p, pmap=%p, va=%p, level=%u, ptep=%p",
1899 		    __func__, ptdp, pmap, (void*)va, level, ptep);
1900 	}
1901 
1902 	/**
1903 	 * Root tables are managed separately, and can be accessed through the
1904 	 * pmap structure itself (there's only one root table per address space).
1905 	 */
1906 	assert(level > pt_attr_root_level(pt_attr));
1907 
1908 	/**
1909 	 * The "va" field represents the first virtual address that this page table
1910 	 * is translating for. Naturally, this is dependent on the level the page
1911 	 * table resides at since more VA space is mapped the closer the page
1912 	 * table's level is to the root.
1913 	 */
1914 	ptdp->va = (vm_offset_t) va & ~pt_attr_ln_pt_offmask(pt_attr, level - 1);
1915 }
1916 
1917 /**
1918  * Performs final initialization of a newly-allocated page table descriptor.
1919  * This function effectively marks the linked page table as eligible for deallocation
1920  * and should therefore be called once initialization and mapping of the page table is
1921  * complete.
1922  *
1923  * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to
1924  *             finalize
1925  */
1926 void
1927 ptd_info_finalize(pt_desc_t *ptdp)
1928 {
1929 	/**
1930 	 * Atomically drop the wired count (previously initialized to 1) with
1931 	 * release ordering to ensure all prior page table initialization is visible
1932 	 * to any subsequent pmap operation that attempts to operate on the PTD.
1933 	 */
1934 	__assert_only unsigned short prev_refcnt =
1935 	    os_atomic_dec_orig(&ptdp->ptd_info->wiredcnt, release);
1936 	assert3u(prev_refcnt, >, 0);
1937 }
1938 
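/*
 * A minimal sketch, using C11 atomics, of the release-ordered reference drop
 * performed in ptd_info_finalize(): the initializing side publishes its work
 * with a release decrement, and a consumer that observes the dropped count with
 * an acquire load is guaranteed to see the initialized fields. Names and fields
 * are illustrative, not the kernel's wired-count machinery.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <assert.h>
#include <stdatomic.h>

typedef struct {
	int ready_field;            /* initialized before the drop */
	atomic_ushort wiredcnt;     /* starts at 1 to pin the object during init */
} ex_obj_t;

static void
ex_finalize(ex_obj_t *obj)
{
	obj->ready_field = 42;      /* the work being published */
	unsigned short prev =
	    atomic_fetch_sub_explicit(&obj->wiredcnt, 1, memory_order_release);
	assert(prev > 0);
}

static int
ex_try_consume(ex_obj_t *obj)
{
	/* Pairs with the release above; 0 means initialization is complete. */
	if (atomic_load_explicit(&obj->wiredcnt, memory_order_acquire) == 0) {
		return obj->ready_field;
	}
	return -1;
}
#endif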
1939 /**
1940  * Credit a specific ledger entry within the passed in pmap's ledger object.
1941  *
1942  * @param pmap The pmap whose ledger should be updated.
1943  * @param entry The specific ledger entry to update. This needs to be one of the
1944  *              task_ledger entries.
1945  * @param amount The amount to credit to the ledger.
1946  *
1947  * @return The return value from the credit operation.
1948  */
1949 kern_return_t
1950 pmap_ledger_credit(pmap_t pmap, int entry, ledger_amount_t amount)
1951 {
1952 	assert(pmap != NULL);
1953 
1954 	return ledger_credit(pmap->ledger, entry, amount);
1955 }
1956 
1957 /**
1958  * Debit a specific ledger entry within the passed in pmap's ledger object.
1959  *
1960  * @param pmap The pmap whose ledger should be updated.
1961  * @param entry The specific ledger entry to update. This needs to be one of the
1962  *              task_ledger entries.
1963  * @param amount The amount to debit from the ledger.
1964  *
1965  * @return The return value from the debit operation.
1966  */
1967 kern_return_t
1968 pmap_ledger_debit(pmap_t pmap, int entry, ledger_amount_t amount)
1969 {
1970 	assert(pmap != NULL);
1971 
1972 	return ledger_debit(pmap->ledger, entry, amount);
1973 }
1974 
1975 /**
1976  * Validate that the pointer passed into this method is a valid pmap object.
1977  *
1978  * @param pmap The pointer to validate.
1979  * @param func The stringized function name of the caller that will be printed
1980  *             in the case that the validation fails.
1981  */
1982 void
1983 validate_pmap_internal(const volatile struct pmap *pmap, const char *func)
1984 {
1985 	#pragma unused(pmap, func)
1986 	assert(pmap != NULL);
1987 }
1988 
1989 /**
1990  * Validate that the pointer passed into this method is a valid pmap object and
1991  * is safe to both read and write.
1992  *
1993  * @param pmap The pointer to validate.
1994  * @param func The stringized function name of the caller that will be printed
1995  *             in the case that the validation fails.
1996  */
1997 void
1998 validate_pmap_mutable_internal(const volatile struct pmap *pmap, const char *func)
1999 {
2000 	#pragma unused(pmap, func)
2001 	assert(pmap != NULL);
2002 }
2003 
2004 /**
2005  * Validate that the passed in pmap pointer is a pmap object that was allocated
2006  * by the pmap and not just random memory.
2007  *
2008  * This function will panic if the validation fails.
2009  *
2010  * @param pmap The object to validate.
2011  */
2012 void
2013 pmap_require(pmap_t pmap)
2014 {
2015 	if (pmap != kernel_pmap) {
2016 		zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2017 	}
2018 }
2019 
2020 /**
2021  * Helper function used when sorting and searching SPTM/PPL I/O ranges.
2022  *
2023  * @param a The first SPTM/PPL I/O range to compare.
2024  * @param b The second SPTM/PPL I/O range to compare.
2025  *
2026  * @return < 0 for a < b
2027  *           0 for a == b
2028  *         > 0 for a > b
2029  */
2030 static int
2031 cmp_io_rgns(const void *a, const void *b)
2032 {
2033 	const pmap_io_range_t *range_a = a;
2034 	const pmap_io_range_t *range_b = b;
2035 
2036 	if ((range_b->addr + range_b->len) <= range_a->addr) {
2037 		return 1;
2038 	} else if ((range_a->addr + range_a->len) <= range_b->addr) {
2039 		return -1;
2040 	} else {
2041 		return 0;
2042 	}
2043 }
2044 
2045 /**
2046  * Find and return the SPTM/PPL I/O range that contains the passed in physical
2047  * address.
2048  *
2049  * @note This function performs a binary search on the already sorted
2050  *       io_attr_table, so it should be reasonably fast.
2051  *
2052  * @param paddr The physical address to query a specific I/O range for.
2053  *
2054  * @return A pointer to the pmap_io_range_t structure if one of the ranges
2055  *         contains the passed in physical address. Otherwise, NULL.
2056  */
2057 pmap_io_range_t*
2058 pmap_find_io_attr(pmap_paddr_t paddr)
2059 {
2060 	unsigned int begin = 0;
2061 	unsigned int end = num_io_rgns - 1;
2062 
2063 	/**
2064 	 * If there are no I/O ranges, or the wanted address is below the lowest
2065 	 * range or above the highest range, then there's no point in searching
2066 	 * since it won't be here.
2067 	 */
2068 	if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) ||
2069 	    (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) {
2070 		return NULL;
2071 	}
2072 
2073 	/**
2074 	 * A dummy I/O range to compare against when searching for a range that
2075 	 * includes `paddr`.
2076 	 */
2077 	const pmap_io_range_t wanted_range = {
2078 		.addr = paddr & ~PAGE_MASK,
2079 		.len = PAGE_SIZE
2080 	};
2081 
2082 	/* Perform a binary search to find the wanted I/O range. */
2083 	for (;;) {
2084 		const unsigned int middle = (begin + end) / 2;
2085 		const int cmp = cmp_io_rgns(&wanted_range, &io_attr_table[middle]);
2086 
2087 		if (cmp == 0) {
2088 			pmap_io_range_t const *range = &io_attr_table[middle];
2089 			if (!(range->wimg & PMAP_IO_RANGE_NOT_IO)) {
2090 				/* Success! Found the wanted I/O range. */
2091 				return &io_attr_table[middle];
2092 			} else {
2093 				/* Ranges may not overlap, so we're not going to find anything. */
2094 				break;
2095 			}
2096 		} else if (begin == end) {
2097 			/* We've checked every range and didn't find a match. */
2098 			break;
2099 		} else if (cmp > 0) {
2100 			/* The wanted range is above the middle. */
2101 			begin = middle + 1;
2102 		} else {
2103 			/* The wanted range is below the middle. */
2104 			end = middle;
2105 		}
2106 	}
2107 
2108 	return NULL;
2109 }
2110 
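/*
 * A minimal, self-contained sketch of the range lookup above: ranges are sorted
 * and non-overlapping, the comparator reports 0 for any overlap, and a binary
 * search narrows [begin, end] until it either finds an overlapping range or
 * runs out of candidates. Types and names here are illustrative, not the
 * kernel's pmap_io_range_t machinery.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint64_t addr;
	uint64_t len;
} ex_range_t;

static int
ex_cmp(const ex_range_t *a, const ex_range_t *b)
{
	if (b->addr + b->len <= a->addr) {
		return 1;       /* a lies entirely above b */
	} else if (a->addr + a->len <= b->addr) {
		return -1;      /* a lies entirely below b */
	}
	return 0;               /* the ranges overlap */
}

static const ex_range_t *
ex_find(const ex_range_t *table, unsigned count, uint64_t addr)
{
	if (count == 0) {
		return NULL;
	}

	unsigned begin = 0, end = count - 1;
	const ex_range_t wanted = { .addr = addr, .len = 1 };

	/* Bail out early if the address lies below the lowest or above the highest range. */
	if (ex_cmp(&wanted, &table[begin]) < 0 || ex_cmp(&wanted, &table[end]) > 0) {
		return NULL;
	}

	for (;;) {
		const unsigned middle = (begin + end) / 2;
		const int cmp = ex_cmp(&wanted, &table[middle]);

		if (cmp == 0) {
			return &table[middle];
		} else if (begin == end) {
			return NULL;            /* every candidate has been checked */
		} else if (cmp > 0) {
			begin = middle + 1;     /* wanted range is above the middle */
		} else {
			end = middle;           /* wanted range is below the middle */
		}
	}
}
#endif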
2111 /**
2112  * Iterate over all pmap-io-ranges, call the given step function on
2113  * each of them, returning prematurely if the step function returns
2114  * false.
2115  *
2116  * @param step The step function applied to each range. If it returns
2117  *             false, iteration stops.
2118  */
2119 
2120 void
2121 pmap_range_iterate(bool (^step)(pmap_io_range_t const *))
2122 {
2123 	for (size_t i = 0; i < num_io_rgns; i++) {
2124 		if (!step(&io_attr_table[i])) {
2125 			return;
2126 		}
2127 	}
2128 }
2129 
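/*
 * A hedged usage sketch for pmap_range_iterate() above: count how many ranges
 * cover a given physical address, returning true from the block to keep
 * iterating. The caller name and variable names are illustrative; only the
 * iterator and its types come from this file.
 */
#if 0 /* standalone usage sketch; not part of the kernel build */
static unsigned
ex_count_ranges_covering(pmap_paddr_t paddr)
{
	__block unsigned matches = 0;

	pmap_range_iterate(^bool (pmap_io_range_t const *range) {
		if ((paddr >= range->addr) && (paddr < (range->addr + range->len))) {
			matches++;
		}
		return true;    /* keep iterating over the remaining ranges */
	});

	return matches;
}
#endif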
2130 /**
2131  * Initialize the pmap per-CPU data structure for a single CPU. This is called
2132  * once for each CPU in the system, on the CPU whose per-cpu data needs to be
2133  * initialized.
2134  *
2135  * In reality, many of the per-cpu data fields will have either already been
2136  * initialized or will rely on the fact that the per-cpu data is either zeroed
2137  * out during allocation (on non-PPL systems), or the data itself is a global
2138  * variable which will be zeroed by default (on PPL systems).
2139  *
2140  * @param cpu_number The number of the CPU whose pmap per-cpu data should be
2141  *                   initialized. This number should correspond to the CPU
2142  *                   executing this code.
2143  */
2144 MARK_AS_PMAP_TEXT void
2145 pmap_cpu_data_init_internal(unsigned int cpu_number)
2146 {
2147 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
2148 
2149 	pmap_cpu_data->cpu_number = cpu_number;
2150 
2151 	/* Setup per-cpu fields used when calling into the SPTM. */
2152 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
2153 	assert(((uintptr_t)sptm_pcpu & (PMAP_SPTM_PCPU_ALIGN - 1)) == 0);
2154 	sptm_pcpu->sptm_user_pointer_ops_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_user_pointer_ops);
2155 	sptm_pcpu->sptm_ops_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_ops);
2156 	sptm_pcpu->sptm_templates_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_templates);
2157 	sptm_pcpu->sptm_paddrs_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_paddrs);
2158 	sptm_pcpu->sptm_guest_dispatch_paddr = kvtophys_nofail((vm_offset_t)&sptm_pcpu->sptm_guest_dispatch);
2159 
2160 	const uint16_t sptm_cpu_number = sptm_cpu_id(ml_get_topology_info()->cpus[cpu_number].phys_id);
2161 	sptm_pcpu->sptm_cpu_id = sptm_cpu_number;
2162 
2163 	const pmap_paddr_t iommu_scratch =
2164 	    sptm_cpu_iommu_scratch_start + (sptm_cpu_number * PMAP_IOMMU_SCRATCH_SIZE);
2165 	assert(iommu_scratch <= (sptm_cpu_iommu_scratch_end - PMAP_IOMMU_SCRATCH_SIZE));
2166 	sptm_pcpu->sptm_iommu_scratch = (void*)phystokv(iommu_scratch);
2167 	sptm_pcpu->sptm_prev_ptes = (sptm_pte_t *)((uintptr_t)(SPTMArgs->sptm_prev_ptes) + (PAGE_SIZE * sptm_cpu_number));
2168 	sptm_pcpu->sptm_cpu_id = sptm_cpu_number;
2169 }
2170 
2171 /**
2172  * Initialize the pmap per-cpu data for the bootstrap CPU (the other CPUs should
2173  * just call pmap_cpu_data_init() directly).
2174  */
2175 void
2176 pmap_cpu_data_array_init(void)
2177 {
2178 	/**
2179 	 * The EL2 portion of each IOMMU driver needs some memory it can
2180 	 * use to pass data into the SPTM. To save memory (since most IOMMU drivers
2181 	 * need this) and to preclude the need for IOMMU drivers to dynamically
2182 	 * allocate memory in their mapping/unmapping paths, memory is pre-allocated
2183 	 * here per-cpu for their usage.
2184 	 *
2185 	 * SPTM TODO: Only allocate this memory on systems that have IOMMU drivers.
2186 	 */
2187 	sptm_cpu_iommu_scratch_start = avail_start;
2188 	avail_start += MAX_CPUS * PMAP_IOMMU_SCRATCH_SIZE;
2189 	sptm_cpu_iommu_scratch_end = avail_start;
2190 
2191 	pmap_cpu_data_init();
2192 }
2193 
2194 /**
2195  * Retrieve the pmap per-cpu data for the current CPU.
2196  *
2197  * @return The per-cpu pmap data for the current CPU.
2198  */
2199 pmap_cpu_data_t *
2200 pmap_get_cpu_data(void)
2201 {
2202 	pmap_cpu_data_t *pmap_cpu_data = NULL;
2203 
2204 	pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data;
2205 	return pmap_cpu_data;
2206 }
2207 
2208 /**
2209  * Retrieve the pmap per-cpu data for the specified cpu index.
2210  *
2211  * @return The per-cpu pmap data for the specified CPU.
2212  */
2213 pmap_cpu_data_t *
2214 pmap_get_remote_cpu_data(unsigned int cpu)
2215 {
2216 	cpu_data_t *cpu_data = cpu_datap((int)cpu);
2217 	if (cpu_data == NULL) {
2218 		return NULL;
2219 	} else {
2220 		return &cpu_data->cpu_pmap_cpu_data;
2221 	}
2222 }
2223 
2224 /**
2225  * Define the resources we need for spinning
2226  * until a paddr is not inflight.
2227  */
2228 __abortlike
2229 static hw_spin_timeout_status_t
2230 hw_lck_paddr_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
2231 {
2232 	panic("paddr spinlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
2233 	    HW_SPIN_TIMEOUT_DETAILS_FMT,
2234 	    _lock, HW_SPIN_TIMEOUT_ARG(to, st),
2235 	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
2236 }
2237 
2238 static const struct hw_spin_policy hw_paddr_inflight_spin_policy = {
2239 	.hwsp_name              = "hw_lck_paddr_lock",
2240 	.hwsp_timeout_atomic    = &LockTimeOut,
2241 	.hwsp_op_timeout        = hw_lck_paddr_timeout_panic,
2242 };
2243 
2244 /**
2245  * Barrier function for spinning until the given physical page is
2246  * no longer inflight.
2247  *
2248  * @param paddr The physical address to spin on until it is no longer inflight.
2249  */
2250 static __attribute__((noinline)) void
2251 pmap_paddr_inflight_barrier(pmap_paddr_t paddr)
2252 {
2253 	hw_spin_policy_t  pol = &hw_paddr_inflight_spin_policy;
2254 	hw_spin_timeout_t to;
2255 	hw_spin_state_t   state  = { };
2256 
2257 	disable_preemption();
2258 	to  = hw_spin_compute_timeout(pol);
2259 	while (sptm_paddr_is_inflight(paddr) &&
2260 	    hw_spin_should_keep_spinning((void*)paddr, pol, to, &state)) {
2261 		;
2262 	}
2263 	enable_preemption();
2264 }
2265 
2266 /**
2267  * Convenience function for checking if a given physical page is inflight.
2268  *
2269  * @param paddr The physical address to query.
2270  *
2271  * @return true if the page in question has no mappings, false otherwise.
2272  */
2273 inline bool
2274 pmap_is_page_free(pmap_paddr_t paddr)
2275 {
2276 	/**
2277 	 * We can't query the paddr refcounts if the physical page
2278 	 * is currently inflight. If it is, we spin until it's not.
2279 	 */
2280 	if (__improbable(sptm_paddr_is_inflight(paddr))) {
2281 		pmap_paddr_inflight_barrier(paddr);
2282 	}
2283 
2284 	/**
2285 	 * An acquire barrier ordering this check after the last inflight
2286 	 * operation. This gives us proper visibility of the refcounts;
2287 	 * otherwise, sptm_frame_is_last_mapping() might see stale values.
2288 	 */
2289 	os_atomic_thread_fence(acquire);
2290 
2291 	/**
2292 	 * If SPTM returns TRUE for SPTM_REFCOUNT_NONE, it means
2293 	 * the physical page has no mappings.
2294 	 */
2295 	return sptm_frame_is_last_mapping(paddr, SPTM_REFCOUNT_NONE);
2296 }
2297 
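/*
 * A minimal sketch, with C11 atomics, of the spin-then-fence pattern used in
 * pmap_is_page_free(): wait for an "in flight" flag to clear, then issue an
 * acquire fence so that reads made after the wait observe everything the other
 * side published before clearing the flag with release semantics. Names are
 * illustrative, and a real implementation would bound the spin.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <stdatomic.h>
#include <stdbool.h>

typedef struct {
	atomic_bool inflight;
	unsigned refcount;      /* published by the releasing side */
} ex_frame_t;

static bool
ex_frame_is_free(ex_frame_t *frame)
{
	/* Spin until the frame is no longer in flight. */
	while (atomic_load_explicit(&frame->inflight, memory_order_relaxed)) {
		/* spin */
	}

	/* Pair with the releasing store; make the published refcount visible. */
	atomic_thread_fence(memory_order_acquire);

	return frame->refcount == 0;
}
#endif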
2298 #if MACH_ASSERT
2299 /**
2300  * Verify that a given physical page contains no mappings (outside of the
2301  * default physical aperture mapping) and if it does, then panic.
2302  *
2303  * @note It's recommended to use pmap_verify_free() directly when operating in
2304  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2305  *       normally being called from outside of the PPL, and the pv_head_table
2306  *       can't be modified outside of the PPL).
2307  *
2308  * @param ppnum Physical page number to check there are no mappings to.
2309  */
2310 void
2311 pmap_assert_free(ppnum_t ppnum)
2312 {
2313 	const pmap_paddr_t pa = ptoa(ppnum);
2314 
2315 	/* Only mappings to kernel-managed physical memory are tracked. */
2316 	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2317 		return;
2318 	}
2319 
2320 	const unsigned int pai = pa_index(pa);
2321 	const uintptr_t pvh = pai_to_pvh(pai);
2322 
2323 	/**
2324 	 * This function is always called from outside of the PPL. Because of this,
2325 	 * the PVH entry can't be locked. This function is generally only called
2326 	 * before the VM reclaims a physical page and shouldn't be creating new
2327 	 * mappings. Even if a new mapping is created while parsing the hierarchy,
2328 	 * the worst case is that the system will panic in another way, and we were
2329 	 * already about to panic anyway.
2330 	 */
2331 
2332 	/**
2333 	 * Since pmap_verify_free() returned false, that means there is at least one
2334 	 * mapping left. Let's get some extra info on the first mapping we find to
2335 	 * dump in the panic string (the common case is that there is one spare
2336 	 * mapping that was never unmapped).
2337 	 */
2338 	pt_entry_t *first_ptep = PT_ENTRY_NULL;
2339 
2340 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2341 		first_ptep = pvh_ptep(pvh);
2342 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2343 		pv_entry_t *pvep = pvh_pve_list(pvh);
2344 
2345 		/* Each PVE can contain multiple PTEs. Let's find the first one. */
2346 		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2347 			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2348 			if (first_ptep != PT_ENTRY_NULL) {
2349 				break;
2350 			}
2351 		}
2352 
2353 		/* The PVE should have at least one valid PTE. */
2354 		assert(first_ptep != PT_ENTRY_NULL);
2355 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2356 		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2357 		    __func__, (void*)pvh, pai);
2358 	} else {
2359 		/**
2360 		 * The mapping disappeared between here and the pmap_verify_free() call.
2361 		 * The only way that can happen is if the VM was racing this call with
2362 		 * a call that unmaps PTEs. Operations on this page should not be
2363 		 * occurring at the same time as this check, and unfortunately we can't
2364 		 * lock the PVH entry to prevent it, so just panic instead.
2365 		 */
2366 		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2367 		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2368 		    __func__, (void*)pvh, pai);
2369 	}
2370 
2371 	/* Panic with a unique string identifying the first bad mapping and owner. */
2372 	{
2373 		/* First PTE is mapped by the main CPUs. */
2374 		pmap_t pmap = ptep_get_pmap(first_ptep);
2375 		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2376 
2377 		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2378 		    "%s CPU mapping (pmap: %p)",
2379 		    __func__, (uint64_t)pa, first_ptep, type, pmap);
2380 	}
2381 }
2382 #endif /* MACH_ASSERT */
2383 
2384 inline void
2385 pmap_recycle_page(ppnum_t pn)
2386 {
2387 	const bool is_freed = pmap_is_page_free(ptoa(pn));
2388 
2389 	if (__improbable(!is_freed)) {
2390 		/*
2391 		 * There is a redundancy here, but we are going to panic anyway,
2392 		 * and pmap_assert_free() traces useful information. So, we keep
2393 		 * this behavior.
2394 		 */
2395 #if MACH_ASSERT
2396 		pmap_assert_free(pn);
2397 #endif /* MACH_ASSERT */
2398 		panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn));
2399 	}
2400 
2401 	const pmap_paddr_t paddr = ptoa(pn);
2402 	const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
2403 	if (__improbable(pmap_type_requires_retype_on_recycle(frame_type))) {
2404 		const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2405 		sptm_retype(paddr, frame_type, XNU_DEFAULT, retype_params);
2406 	}
2407 }
2408 
2409 #if __ARM64_PMAP_SUBPAGE_L1__
2410 /* A structure tracking the state of a SURT page. */
2411 typedef struct {
2412 	/* The PA of the SURT page. */
2413 	pmap_paddr_t surt_page_pa;
2414 
2415 	/* A bitmap tracking the allocation status of the SURTs in the page. */
2416 	bitmap_t surt_page_free_bitmap[SUBPAGE_USER_ROOT_TABLE_INDEXES / (sizeof(bitmap_t) * 8)];
2417 
2418 	/* A queue chain chaining all the tracking structures together. */
2419 	queue_chain_t surt_chain;
2420 } surt_page_t;
2421 
2422 /**
2423  * Initialize the SURT subsystem.
2424  *
2425  * @note Expected to be called when pmap is being bootstrapped, before a user
2426  *       pmap is created.
2427  */
2428 void
2429 surt_init()
2430 {
2431 	if (__improbable(surt_ready)) {
2432 		panic("%s: initializing the SURT subsystem while it has already been initialized", __func__);
2433 	}
2434 
2435 	queue_init(&surt_list);
2436 	lck_mtx_init(&surt_lock, &pmap_lck_grp, LCK_ATTR_NULL);
2437 
2438 	/* A plain write is okay only in single-core early bootstrapping. */
2439 	surt_ready = true;
2440 }
2441 
2442 /**
2443  * Lock the SURT lock.
2444  */
2445 static inline void
2446 surt_lock_lock()
2447 {
2448 	assert(surt_ready);
2449 	lck_mtx_lock(&surt_lock);
2450 }
2451 
2452 /**
2453  * Unlock the SURT lock.
2454  */
2455 static inline void
2456 surt_lock_unlock()
2457 {
2458 	lck_mtx_unlock(&surt_lock);
2459 }
2460 
2461 /**
2462  * Try to find a SURT from the SURT page queue.
2463  *
2464  * @note This function doesn't block. If a SURT is not found, the caller is
2465  *       responsible for allocating a page and feeding it to the SURT subsystem.
2466  *
2467  * @return the PA of the SURT if one is found, 0 otherwise.
2468  */
2469 pmap_paddr_t
2470 surt_try_alloc()
2471 {
2472 	surt_lock_lock();
2473 	pmap_paddr_t surt_pa = 0ULL;
2474 
2475 	/* Look for a free table on existing SURT pages. */
2476 	surt_page_t *surt_page;
2477 	qe_foreach_element(surt_page, &surt_list, surt_chain) {
2478 		const int first_available_index = bitmap_lsb_first(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES);
2479 		if (first_available_index >= 0) {
2480 			surt_pa = surt_pa_from_surt_page_pa_and_index(surt_page->surt_page_pa, (uint8_t) first_available_index);
2481 			bitmap_clear(&surt_page->surt_page_free_bitmap[0], first_available_index);
2482 			break;
2483 		}
2484 	}
2485 
2486 	/**
2487 	 * Either return a non-zero PA of the found SURT or zero. A zero return
2488 	 * value indicates the caller should allocate a new SURT page.
2489 	 */
2490 	surt_lock_unlock();
2491 	return surt_pa;
2492 }
2493 
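/*
 * A minimal sketch of the bitmap scan performed by surt_try_alloc(): a page is
 * carved into fixed-size tables, a set bit means "free", and allocation takes
 * the lowest set bit. The structure, helper names, and 64-entry limit here are
 * illustrative and do not correspond to the kernel's bitmap_t API.
 */
#if 0 /* standalone illustrative sketch; not part of the kernel build */
#include <stdint.h>

typedef struct {
	uint64_t page_pa;       /* physical address of the backing page */
	uint64_t free_bitmap;   /* bit i set => table i is free */
} ex_table_page_t;

/* Returns the PA of an allocated table, or 0 if the page is fully used. */
static uint64_t
ex_table_alloc(ex_table_page_t *page, uint64_t table_size)
{
	if (page->free_bitmap == 0) {
		return 0;
	}
	const unsigned index = (unsigned)__builtin_ctzll(page->free_bitmap);
	page->free_bitmap &= ~(1ULL << index);              /* mark as allocated */
	return page->page_pa + (uint64_t)index * table_size;
}

/* Returns 1 when the whole page is free again after this release. */
static int
ex_table_free(ex_table_page_t *page, uint64_t table_pa, uint64_t table_size)
{
	const unsigned index = (unsigned)((table_pa - page->page_pa) / table_size);
	page->free_bitmap |= (1ULL << index);
	return page->free_bitmap == ~0ULL;
}
#endif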
2494 /**
2495  * Free the SURT at a physical address.
2496  *
2497  * @return True if the SURT page has no allocated SURT and has been removed
2498  *         from the queue so that the caller can repurpose the page. False
2499  *         otherwise.
2500  */
2501 bool
2502 surt_free(pmap_paddr_t surt_pa)
2503 {
2504 	if (__improbable(surt_pa & (SUBPAGE_USER_ROOT_TABLE_SIZE - 1))) {
2505 		panic("%s: surt_pa %p is expected to be %u-byte aligned",
2506 		    __func__, (void *)surt_pa, (unsigned int) SUBPAGE_USER_ROOT_TABLE_SIZE);
2507 	}
2508 
2509 	surt_lock_lock();
2510 	const uint8_t surt_index = (uint8_t) ((surt_pa & PAGE_MASK) / SUBPAGE_USER_ROOT_TABLE_SIZE);
2511 
2512 	/* Find the SURT page that contains the SURT being freed. */
2513 	surt_page_t *surt_page;
2514 	qe_foreach_element_safe(surt_page, &surt_list, surt_chain) {
2515 		if (surt_page->surt_page_pa == surt_page_pa_from_surt_pa(surt_pa)) {
2516 			/* Mark the SURT as free. */
2517 			bitmap_set(&surt_page->surt_page_free_bitmap[0], surt_index);
2518 
2519 			/* If the entire SURT page is free, remove it from the page queue. */
2520 			if (bitmap_is_full(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES)) {
2521 				remqueue(&surt_page->surt_chain);
2522 
2523 				/* Done with the page queue so unlock it before freeing surt_page. */
2524 				surt_lock_unlock();
2525 				kfree_type(surt_page_t, surt_page);
2526 				return true;
2527 			} else {
2528 				surt_lock_unlock();
2529 				return false;
2530 			}
2531 		}
2532 	}
2533 
2534 	panic("%s: no matching surt_page_t found for surt_pa: %p", __func__, (void *)surt_pa);
2535 }
2536 
2537 /**
2538  * Add a SURT page to the SURT page queue, with its SURT at index 0 allocated.
2539  *
2540  * @note Designed this way so that the caller can call into SPTM for SURT
2541  *       allocation before the page is seen by the other threads in the
2542  *       system.
2543  *
2544  * @param surt_page_pa The physical address of the SURT page.
2545  */
2546 void
2547 surt_feed_page_with_first_table_allocated(pmap_paddr_t surt_page_pa)
2548 {
2549 	surt_page_t *surt_page = kalloc_type(surt_page_t, Z_ZERO | Z_WAITOK);
2550 
2551 	if (__improbable(surt_page_pa & PAGE_MASK)) {
2552 		panic("%s: surt_page_pa %p is expected to be page aligned", __func__, (void *)surt_page_pa);
2553 	}
2554 
2555 	surt_lock_lock();
2556 	surt_page->surt_page_pa = surt_page_pa;
2557 	bitmap_full(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES);
2558 	bitmap_clear(&surt_page->surt_page_free_bitmap[0], 0);
2559 	enqueue_head(&surt_list, &surt_page->surt_chain);
2560 	surt_lock_unlock();
2561 }
2562 
2563 unsigned int
2564 surt_list_len()
2565 {
2566 	unsigned int len = 0;
2567 
2568 	surt_lock_lock();
2569 	__unused surt_page_t *surt_page;
2570 	qe_foreach_element(surt_page, &surt_list, surt_chain) {
2571 		len = len + 1;
2572 	}
2573 	surt_lock_unlock();
2574 	return len;
2575 }
2576 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2577 
2578 #if DEBUG || DEVELOPMENT
2579 /**
2580  * Get the value of the WC/RT on non-DRAM mapping request counter.
2581  *
2582  * @return The value of the counter.
2583  */
2584 unsigned int
2585 pmap_wcrt_on_non_dram_count_get()
2586 {
2587 	return os_atomic_load(&pmap_wcrt_on_non_dram_count, relaxed);
2588 }
2589 
2590 /**
2591  * Atomically increment the WC/RT on non-DRAM mapping request counter.
2592  */
2593 void
2594 pmap_wcrt_on_non_dram_count_increment_atomic()
2595 {
2596 	os_atomic_inc(&pmap_wcrt_on_non_dram_count, relaxed);
2597 }
2598 #endif /* DEBUG || DEVELOPMENT */
2599