/* xref: /xnu-12377.1.9/osfmk/arm/pmap/pmap_data.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea) */
/*
 * Copyright (c) 2020-2021, 2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <arm/cpu_data_internal.h>
#include <kern/queue.h>
#include <libkern/OSAtomic.h>
#include <libkern/section_keywords.h>
#include <pexpert/device_tree.h>
#include <os/atomic_private.h>
#include <vm/cpm_internal.h>
#include <vm/vm_kern.h>
#include <vm/vm_protos.h>
#include <vm/vm_object_xnu.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_pageout.h>

#include <arm/pmap/pmap_internal.h>

/**
 * Physical Page Attribute Table.
 *
 * Array that contains a set of flags for each kernel-managed physical VM page.
 *
 * @note There can be a disparity between the VM page size and the underlying
 *       hardware page size for a specific address space. In those cases, it's
 *       possible that multiple hardware pages will share the same set of
 *       attributes. The VM operates on regions of memory by the VM page size
 *       and is aware that all hardware pages within each VM page share
 *       attributes.
 */
SECURITY_READ_ONLY_LATE(volatile pp_attr_t*) pp_attr_table = (volatile pp_attr_t*)NULL;

/**
 * Physical to Virtual Table.
 *
 * Data structure that contains a list of virtual mappings for each kernel-
 * managed physical page. Other flags and metadata are also stored in this
 * structure on a per-physical-page basis.
 *
 * This structure is arranged as an array of pointers, where each pointer can
 * point to one of three different types of data (single mapping, multiple
 * mappings, or page table descriptor). Metadata about each page (including the
 * type of pointer) is located in the lower and upper bits of the pointer.
 * These bits need to be set/masked out to be able to dereference the pointer,
 * so it's recommended to use the provided API in pmap_data.h to access the
 * pv_head_table since it handles these details for you.
 */
SECURITY_READ_ONLY_LATE(pv_entry_t **) pv_head_table = (pv_entry_t**)NULL;
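
/*
 * Illustrative sketch (not part of this file's logic; it assumes the
 * pv_head_table accessors declared in pmap_data.h): the type/flag bits are
 * masked off by the helpers rather than by dereferencing the raw pointer.
 *
 *   pv_entry_t **pvh = pai_to_pvh(pa_index(pa));
 *   if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {    // "multiple mappings" case
 *       pv_entry_t *pvep = pvh_pve_list(pvh);   // flag bits already masked
 *       ...
 *   }
 */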

/**
 * Queue chain of userspace page table pages that can be quickly reclaimed by
 * pmap_page_reclaim() in cases where a page can't easily be allocated the
 * normal way, but the caller needs a page quickly.
 */
static queue_head_t pt_page_list MARK_AS_PMAP_DATA;

/* Lock for pt_page_list. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pt_pages_lock, 0);

/* Simple linked-list structure used in various page free lists. */
typedef struct page_free_entry {
	/**
	 * The first word in an empty page on a free list is used as a pointer to
	 * the next free page in the list.
	 */
	struct page_free_entry *next;
} page_free_entry_t;

/* Represents a NULL entry in various page free lists. */
#define PAGE_FREE_ENTRY_NULL ((page_free_entry_t *) 0)
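
/*
 * Usage sketch (the same pattern pmap_give_free_ppl_page() uses later in
 * this file): a free page's own first word serves as the link, so the list
 * needs no extra metadata storage.
 *
 *   page_free_entry_t *entry = (page_free_entry_t *)phystokv(pa);
 *   entry->next = free_list;    // push onto the head of a free list
 *   free_list = entry;
 */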

/**
 * pmap_page_reclaim() is called in critical, latency-sensitive code paths when
 * either the VM doesn't have any pages available (on non-PPL systems), or the
 * PPL page free lists are empty (on PPL systems). Before it attempts to reclaim
 * a userspace page table page (which will have performance penalties), it will
 * first try allocating a page from this high-priority free list.
 *
 * When the pmap is starved for memory and starts relying on
 * pmap_page_reclaim() to allocate memory, the next page being freed will be
 * placed onto this list for usage only by pmap_page_reclaim(). Typically that
 * page will be a userspace page table that was just reclaimed.
 */
static page_free_entry_t *pmap_page_reclaim_list MARK_AS_PMAP_DATA = PAGE_FREE_ENTRY_NULL;

/**
 * Current number of pending requests to reclaim a page table page. This is used
 * as an indicator to pmap_pages_free() to place any freed pages into the high
 * priority pmap_page_reclaim() free list so that the next invocations of
 * pmap_page_reclaim() can use them. Typically this will be a userspace page
 * table that was just reclaimed.
 */
static unsigned int pmap_pages_request_count MARK_AS_PMAP_DATA = 0;

/**
 * Total number of pages that have been requested from pmap_page_reclaim() since
 * cold boot.
 */
static unsigned long long pmap_pages_request_acum MARK_AS_PMAP_DATA = 0;

/* Lock for the pmap_page_reclaim() high-priority free list. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_page_reclaim_lock, 0);

#if XNU_MONITOR
/**
 * The PPL cannot invoke the VM in order to allocate memory, so we must maintain
 * a list of free pages that the PPL owns. The kernel can give the PPL
 * additional pages by grabbing pages from the VM and marking them as PPL-owned.
 * See pmap_alloc_page_for_ppl() for more information.
 */
static page_free_entry_t *pmap_ppl_free_page_list MARK_AS_PMAP_DATA = PAGE_FREE_ENTRY_NULL;

/* The current number of pages in the PPL page free list. */
uint64_t pmap_ppl_free_page_count MARK_AS_PMAP_DATA = 0;

/* Lock for the PPL page free list. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_ppl_free_page_lock, 0);
#endif /* XNU_MONITOR */

/**
 * This VM object will contain every VM page being used by the pmap. This acts
 * as a convenient place to put pmap pages to keep the VM from reusing them, as
 * well as providing a way for looping over every page being used by the pmap.
 */
struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED;

/* Pointer to the pmap's VM object that can't be modified after machine_lockdown(). */
SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store;

/**
 * Global variables strictly used for debugging purposes. These variables keep
 * track of the total number of pages that have been allocated from the VM for
 * pmap usage since cold boot, as well as how many are currently in use by the
 * pmap. Once a page is given back to the VM, inuse_pmap_pages_count is
 * decremented.
 *
 * Even if a page is sitting in one of the pmap's various free lists and hasn't
 * been allocated for usage, it is still considered "used" by the pmap from
 * the perspective of the VM.
 */
static uint64_t alloc_pmap_pages_count __attribute__((aligned(8))) = 0LL;
unsigned int inuse_pmap_pages_count = 0;

/**
 * Default watermark values used to keep a healthy supply of physical-to-virtual
 * entries (PVEs) always available. These values can be overridden by the device
 * tree (see pmap_compute_pv_targets() for more info).
 */
#if XNU_MONITOR
/*
 * Increase the padding for PPL devices to accommodate increased mapping
 * pressure from IOMMUs. This isn't strictly necessary, but will reduce the need
 * to retry mappings due to PV allocation failure.
 */
#define PV_KERN_LOW_WATER_MARK_DEFAULT (0x400)
#define PV_ALLOC_CHUNK_INITIAL         (0x400)
#define PV_KERN_ALLOC_CHUNK_INITIAL    (0x400)
#else /* XNU_MONITOR */
#define PV_KERN_LOW_WATER_MARK_DEFAULT (0x200)
#define PV_ALLOC_CHUNK_INITIAL         (0x200)
#define PV_KERN_ALLOC_CHUNK_INITIAL    (0x200)
#endif /* XNU_MONITOR */

/**
 * The pv_free_ring array acts as a ring buffer where each entry points to a
 * linked list of PVEs whose length is set by this define.
 */
#define PV_BATCH_SIZE (PAGE_SIZE / sizeof(pv_entry_t))

/* The batch allocation code assumes that a batch can fit within a single page. */
#if defined(__arm64__) && __ARM_16K_PG__
/**
 * PAGE_SIZE is a variable on arm64 systems with 4K VM pages, so no static
 * assert on those systems.
 */
static_assert((PV_BATCH_SIZE * sizeof(pv_entry_t)) <= PAGE_SIZE);
#endif /* defined(__arm64__) && __ARM_16K_PG__ */
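
/*
 * For a sense of scale (illustrative numbers only): on a 16K-page kernel,
 * if sizeof(pv_entry_t) were 16 bytes, PV_BATCH_SIZE would come out to
 * 16384 / 16 == 1024 PVEs per batch, i.e. one page's worth of entries.
 */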

/**
 * The number of PVEs to attempt to keep in the kernel-dedicated free list. If
 * the number of entries is below this value, then allocate more.
 */
static uint32_t pv_kern_low_water_mark MARK_AS_PMAP_DATA = PV_KERN_LOW_WATER_MARK_DEFAULT;

/**
 * The initial number of PVEs to allocate during bootstrap (can be overridden in
 * the device tree, see pmap_compute_pv_targets() for more info).
 */
uint32_t pv_alloc_initial_target MARK_AS_PMAP_DATA = PV_ALLOC_CHUNK_INITIAL * MAX_CPUS;
uint32_t pv_kern_alloc_initial_target MARK_AS_PMAP_DATA = PV_KERN_ALLOC_CHUNK_INITIAL;

/**
 * Global variables strictly used for debugging purposes. These variables keep
 * track of the number of pages being used for PVE objects, and the total number
 * of PVEs that have been added to the global or kernel-dedicated free lists
 * respectively.
 */
static uint32_t pv_page_count MARK_AS_PMAP_DATA = 0;
static unsigned pmap_reserve_replenish_stat MARK_AS_PMAP_DATA = 0;
static unsigned pmap_kern_reserve_alloc_stat MARK_AS_PMAP_DATA = 0;

/**
 * Number of linked lists of PVEs ("batches") in the global PV free ring buffer.
 * This must be a power of two for the pv_free_array_n_elems() logic to work.
 */
#define PV_FREE_ARRAY_SIZE (256U)

/**
 * A ring buffer where each entry in the buffer is a linked list of PV entries
 * (called "batches"). Allocations out of this array will always operate on
 * a PV_BATCH_SIZE amount of entries at a time.
 */
static pv_free_list_t pv_free_ring[PV_FREE_ARRAY_SIZE] MARK_AS_PMAP_DATA = {0};

/* Read and write indices for the pv_free ring buffer. */
static uint16_t pv_free_read_idx MARK_AS_PMAP_DATA = 0;
static uint16_t pv_free_write_idx MARK_AS_PMAP_DATA = 0;

/**
 * Make sure the PV free array is small enough so that all elements can be
 * properly indexed by pv_free_[read/write]_idx.
 */
static_assert(PV_FREE_ARRAY_SIZE <= (1 << (sizeof(pv_free_read_idx) * 8)));

/**
 * Return the number of free batches available for allocation out of the PV free
 * ring buffer. Each batch is a linked list of PVEs with length PV_BATCH_SIZE.
 *
 * @note This function requires that PV_FREE_ARRAY_SIZE is a power of two.
 */
static inline uint16_t
pv_free_array_n_elems(void)
{
	return (pv_free_write_idx - pv_free_read_idx) & (PV_FREE_ARRAY_SIZE - 1);
}
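
/*
 * Worked example of the wrap-around arithmetic above (PV_FREE_ARRAY_SIZE is
 * 256): with pv_free_write_idx == 3 and pv_free_read_idx == 65533, the
 * uint16_t subtraction wraps to 6, and masking with (256 - 1) keeps that in
 * range, so the count survives overflow of the free-running indices. This
 * only holds because PV_FREE_ARRAY_SIZE is a power of two.
 */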

/* Free list of PV entries dedicated for usage by the kernel. */
static pv_free_list_t pv_kern_free MARK_AS_PMAP_DATA = {0};

/* Locks for the global and kernel-dedicated PV free lists. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_free_array_lock, 0);
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_kern_free_list_lock, 0);

/* Represents a null page table descriptor (PTD). */
#define PTD_ENTRY_NULL ((pt_desc_t *) 0)

/* Running free list of PTD nodes. */
static pt_desc_t *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL;

/* The number of free PTD nodes available in the free list. */
static unsigned int ptd_free_count MARK_AS_PMAP_DATA = 0;

/**
 * The number of PTD objects located in each page being used by the PTD
 * allocator. The PTD objects share each page with their associated ptd_info_t
 * objects (with cache-line alignment padding between them). The maximum number
 * of PTDs that can be placed into a single page is calculated once at boot.
 */
static SECURITY_READ_ONLY_LATE(unsigned) ptd_per_page = 0;

/**
 * The offset in bytes from the beginning of a page of PTD objects where you
 * start seeing the associated ptd_info_t objects. This is calculated once
 * during boot to maximize the number of PTD and ptd_info_t objects that can
 * reside within a page without sharing a cache-line.
 */
static SECURITY_READ_ONLY_LATE(unsigned) ptd_info_offset = 0;

/* Lock to protect accesses to the PTD free list. */
static decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA);

/**
 * Dummy _internal() prototypes so Clang doesn't complain about missing
 * prototypes on a non-static function. These functions can't be marked as
 * static because they need to be called from pmap_ppl_interface.c, where the
 * PMAP_SUPPORT_PROTOTYPES() macro implicitly generates the prototype.
 */
kern_return_t mapping_free_prime_internal(void);

#if XNU_MONITOR

/**
 * These types and variables only exist on PPL-enabled systems because those are
 * the only systems that need to allocate and manage ledger/pmap objects
 * themselves. On non-PPL systems, those objects are allocated using a standard
 * zone allocator.
 */

/**
 * Specify that the maximum number of ledgers and pmap objects are to be
 * correlated with the maximum number of tasks allowed on the system (at most,
 * we'll have one pmap object per task). For ledger objects, give a small amount
 * of extra padding to account for allocation differences between pmap objects
 * and ledgers (i.e. ~10% of total number of iOS tasks = 200).
 *
 * These defines are only valid once `pmap_max_asids` is initialized in
 * pmap_bootstrap() (the value can change depending on the device tree).
 */
#define LEDGER_PTR_ARRAY_SIZE (pmap_max_asids + 200)
#define PMAP_PTR_ARRAY_SIZE (pmap_max_asids)

/**
 * Each ledger object consists of a variable number of ledger entries that is
 * determined by the template it's based on. The template used for pmap ledger
 * objects is the task_ledgers template.
 *
 * This define attempts to calculate how large each pmap ledger needs to be
 * based on how many ledger entries exist in the task_ledgers template. This is
 * found by counting how many integers exist in the task_ledgers structure (each
 * integer represents the index for a ledger_entry) and multiplying by the size
 * of a single ledger entry. That value is then added to the other fields in a
 * ledger structure to get the total size of a single pmap ledger.
 *
 * Some of the task ledger's entries use a smaller struct format. TASK_LEDGER_NUM_SMALL_INDICES
 * is used to determine how much memory we need for those entries.
 *
 * This assumed size will get validated when the task_ledgers template is
 * created and the system will panic if this calculation wasn't correct.
 */
#define PMAP_LEDGER_DATA_BYTES \
	(((sizeof(task_ledgers) / sizeof(int) - TASK_LEDGER_NUM_SMALL_INDICES) * sizeof(struct ledger_entry) \
	  + TASK_LEDGER_NUM_SMALL_INDICES * sizeof(struct ledger_entry_small)) \
	  + sizeof(struct ledger))
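
/*
 * Equivalently, with N = sizeof(task_ledgers) / sizeof(int) entry indices in
 * the template and S = TASK_LEDGER_NUM_SMALL_INDICES of them using the small
 * format, the calculation above is:
 *
 *   bytes = (N - S) * sizeof(struct ledger_entry)
 *         + S       * sizeof(struct ledger_entry_small)
 *         + sizeof(struct ledger);   // fixed header
 */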

/**
 * Opaque data structure that contains the exact number of bytes required to
 * hold a single ledger object based off of the task_ledgers template.
 */
typedef struct pmap_ledger_data {
	uint8_t pld_data[PMAP_LEDGER_DATA_BYTES];
} pmap_ledger_data_t;

/**
 * This struct contains the memory needed to hold a single ledger object used by
 * the pmap as well as an index into the pmap_ledger_ptr_array used for
 * validating ledger objects passed into the PPL.
 */
typedef struct pmap_ledger {
	/**
	 * Either contains the memory needed for a ledger object based on the
	 * task_ledgers template (if already allocated) or a pointer to the next
	 * ledger object in the free list if the object hasn't been allocated yet.
	 *
	 * This union has to be the first member of this struct so that the memory
	 * used by this struct can be correctly cast to a ledger_t and used
	 * as a normal ledger object by the standard ledger API.
	 */
	union {
		struct pmap_ledger_data pld_data;
		struct pmap_ledger *next;
	};

	/**
	 * This extra piece of information (not normally associated with generic
	 * ledger_t objects) is used to validate that a ledger passed into the PPL
	 * is indeed a ledger that was allocated by the PPL, and not just random
	 * memory being passed off as a ledger object. See pmap_ledger_validate()
	 * for more information on validating ledger objects.
	 */
	unsigned long array_index;
} pmap_ledger_t;
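
/*
 * Validation sketch (a simplified illustration; see pmap_ledger_validate()
 * for the real check): a ledger handed into the PPL is trusted only if the
 * PPL-owned pointer array points back at it.
 *
 *   if (ledger->array_index >= pmap_ledger_ptr_array_count ||
 *       pmap_ledger_ptr_array[ledger->array_index] != ledger) {
 *       panic("%s: invalid pmap ledger %p", __func__, ledger);
 *   }
 */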

/**
 * This flag is used to ensure that the assumed size of ledger objects
 * (PMAP_LEDGER_DATA_BYTES) has been checked against the actual size of a
 * ledger object before any objects start being allocated.
 */
static SECURITY_READ_ONLY_LATE(bool) pmap_ledger_size_verified = false;

/* Ledger free list lock. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_ledger_lock, 0);

/*
 * The pmap_ledger_t contents are allowed to be written outside the PPL,
 * so refcounts must be in a separate PPL-controlled array.
 */
static SECURITY_READ_ONLY_LATE(os_refcnt_t *) pmap_ledger_refcnt = NULL;

/**
 * The number of entries in the pmap ledger pointer and ledger refcnt arrays.
 * This determines the maximum number of pmap ledger objects that can be
 * allocated.
 *
 * This value might be slightly higher than LEDGER_PTR_ARRAY_SIZE because the
 * memory used for the array is rounded up to the nearest page boundary.
 */
static SECURITY_READ_ONLY_LATE(unsigned long) pmap_ledger_ptr_array_count = 0;

/**
 * This array is used to validate that ledger objects passed into the PPL were
 * allocated by the PPL and aren't just random memory being passed off as a
 * ledger object. It does this by associating each ledger object allocated by
 * the PPL with an index into this array. The value at that index will be a
 * pointer to the ledger object itself.
 *
 * Even though the ledger object is kernel-writable, this array is only
 * modifiable by the PPL. If a ledger object is passed into the PPL that has an
 * index into this array that doesn't match up, then the validation will fail.
 */
static SECURITY_READ_ONLY_LATE(pmap_ledger_t **) pmap_ledger_ptr_array = NULL;

/**
 * The next free index into pmap_ledger_ptr_array to be given to the next
 * allocated ledger object.
 */
static uint64_t pmap_ledger_ptr_array_free_index MARK_AS_PMAP_DATA = 0;

/* Free list of pmap ledger objects. */
static pmap_ledger_t *pmap_ledger_free_list MARK_AS_PMAP_DATA = NULL;

/**
 * This struct contains the memory needed to hold a single pmap object as well
 * as an index into the pmap_ptr_array used for validating pmap objects passed
 * into the PPL.
 */
typedef struct pmap_list_entry {
	/**
	 * Either contains the memory needed for a single pmap object or a pointer
	 * to the next pmap object in the free list if the object hasn't been
	 * allocated yet.
	 *
	 * This union has to be the first member of this struct so that the memory
	 * used by this struct can be correctly cast as either a pmap_list_entry_t
	 * or a pmap_t (depending on whether the array_index is needed).
	 */
	union {
		struct pmap pmap;
		struct pmap_list_entry *next;
	};

	/**
	 * This extra piece of information (not normally associated with generic
	 * pmap objects) is used to validate that a pmap object passed into the PPL
	 * is indeed a pmap object that was allocated by the PPL, and not just random
	 * memory being passed off as a pmap object. See validate_pmap()
	 * for more information on validating pmap objects.
	 */
	unsigned long array_index;
} pmap_list_entry_t;

/* Lock for the pmap free list. */
static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_free_list_lock, 0);

/**
 * The number of entries in the pmap pointer array. This determines the maximum
 * number of pmap objects that can be allocated.
 *
 * This value might be slightly higher than PMAP_PTR_ARRAY_SIZE because the
 * memory used for the array is rounded up to the nearest page boundary.
 */
static SECURITY_READ_ONLY_LATE(unsigned long) pmap_ptr_array_count = 0;

/**
 * This array is used to validate that pmap objects passed into the PPL were
 * allocated by the PPL and aren't just random memory being passed off as a pmap
 * object. It does this by associating each pmap object allocated by the PPL
 * with an index into this array. The value at that index will be a pointer to
 * the pmap object itself.
 *
 * If a pmap object is passed into the PPL that has an index into this array
 * that doesn't match up, then the validation will fail.
 */
static SECURITY_READ_ONLY_LATE(pmap_list_entry_t **) pmap_ptr_array = NULL;

/**
 * The next free index into pmap_ptr_array to be given to the next
 * allocated pmap object.
 */
static unsigned long pmap_ptr_array_free_index MARK_AS_PMAP_DATA = 0;

/* Free list of pmap objects. */
static pmap_list_entry_t *pmap_free_list MARK_AS_PMAP_DATA = NULL;

#endif /* XNU_MONITOR */

/**
 * Sorted representation of the pmap-io-ranges nodes in the device tree. These
 * nodes describe all of the PPL-owned I/O ranges.
 */
SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table = (pmap_io_range_t*)0;

/* The number of ranges described by io_attr_table. */
SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0;

/**
 * Sorted representation of the pmap-io-filter entries in the device tree.
 * The entries are sorted and queried by {signature, range}.
 */
SECURITY_READ_ONLY_LATE(pmap_io_filter_entry_t*) io_filter_table = (pmap_io_filter_entry_t*)0;

/* Number of total pmap-io-filter entries. */
SECURITY_READ_ONLY_LATE(unsigned int) num_io_filter_entries = 0;

#if XNU_MONITOR

/**
 * Per-cpu pmap data. On PPL-enabled systems, this memory is only modifiable by
 * the PPL itself and, because of that, needs to be managed separately from the
 * generic per-cpu data. The per-cpu pmap data exists on non-PPL systems as
 * well; it's just located within the general machine-specific per-cpu data.
 */
struct pmap_cpu_data_array_entry pmap_cpu_data_array[MAX_CPUS] MARK_AS_PMAP_DATA;

/**
 * The physical address spaces being used for the PPL stacks and PPL register
 * save area are stored in global variables so that their permissions can be
 * updated in pmap_static_allocations_done(). These regions are initialized by
 * pmap_cpu_data_array_init().
 */
SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_start_pa = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_end_pa = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_start = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_end = 0;

#if HAS_GUARDED_IO_FILTER
SECURITY_READ_ONLY_LATE(pmap_paddr_t) iofilter_stacks_start_pa = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) iofilter_stacks_end_pa = 0;
#endif /* HAS_GUARDED_IO_FILTER */

#endif /* XNU_MONITOR */

/* Prototypes used by pmap_data_bootstrap(). */
vm_size_t pmap_compute_io_rgns(void);
void pmap_load_io_rgns(void);
void pmap_cpu_data_array_init(void);

#if HAS_GUARDED_IO_FILTER
vm_size_t pmap_compute_io_filters(void);
void pmap_load_io_filters(void);
#endif /* HAS_GUARDED_IO_FILTER */

#if DEBUG || DEVELOPMENT
/* Track number of instances a WC/RT mapping request is converted to Device-GRE. */
static _Atomic unsigned int pmap_wcrt_on_non_dram_count = 0;
#endif /* DEBUG || DEVELOPMENT */

/**
 * This function is called once during pmap_bootstrap() to allocate and
 * initialize many of the core data structures that are implemented in this
 * file.
 *
 * Memory for these data structures is carved out of `avail_start`, a global
 * set up by arm_vm_init() that points to a physically contiguous region
 * used for bootstrap allocations.
 *
 * @note There is no guaranteed alignment of `avail_start` when this function
 *       returns. If avail_start needs to be aligned to a specific value then it
 *       must be done so by the caller before they use it for more allocations.
 */
void
pmap_data_bootstrap(void)
{
	/**
	 * Set ptd_per_page to the maximum number of (pt_desc_t + ptd_info_t) we can
	 * fit in a single page. We need to allow for some padding between the two,
	 * so that no ptd_info_t shares a cache line with a pt_desc_t.
	 */
	const unsigned ptd_info_size = sizeof(ptd_info_t) * PT_INDEX_MAX;
	const unsigned l2_cline_bytes = 1 << MAX_L2_CLINE;
	ptd_per_page = (PAGE_SIZE - (l2_cline_bytes - 1)) / (sizeof(pt_desc_t) + ptd_info_size);
	unsigned increment = 0;
	bool try_next = true;

	/**
	 * The current ptd_per_page calculation was done assuming the worst-case
	 * scenario in terms of padding between the two object arrays that reside in
	 * the same page. The following loop attempts to optimize this further by
	 * finding the smallest possible amount of padding while still ensuring that
	 * the two object arrays don't share a cache line.
	 */
	while (try_next) {
		increment++;
		const unsigned pt_desc_total_size =
		    PMAP_ALIGN((ptd_per_page + increment) * sizeof(pt_desc_t), l2_cline_bytes);
		const unsigned ptd_info_total_size = (ptd_per_page + increment) * ptd_info_size;
		try_next = (pt_desc_total_size + ptd_info_total_size) <= PAGE_SIZE;
	}
	ptd_per_page += increment - 1;
	assert(ptd_per_page > 0);

	/**
	 * ptd_info objects reside after the ptd descriptor objects, with some
	 * padding in between if necessary to ensure that they don't co-exist in the
	 * same cache line.
	 */
	const unsigned pt_desc_bytes = ptd_per_page * sizeof(pt_desc_t);
	ptd_info_offset = PMAP_ALIGN(pt_desc_bytes, l2_cline_bytes);

	/* The maximum amount of padding should be (l2_cline_bytes - 1). */
	assert((ptd_info_offset - pt_desc_bytes) < l2_cline_bytes);
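
	/*
	 * Resulting layout of each PTD page (N == ptd_per_page):
	 *
	 *   offset 0              ptd_info_offset
	 *   | pt_desc_t[0..N-1] | pad | ptd_info_t[0..(N * PT_INDEX_MAX) - 1] |
	 *
	 * where the pad is just enough to push the ptd_info_t array onto its
	 * own cache line.
	 */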

	/**
	 * Allocate enough initial PTDs to map twice the available physical memory.
	 *
	 * To do this, start by calculating the number of leaf page tables that are
	 * needed to cover all of kernel-managed physical memory.
	 */
	const uint32_t num_leaf_page_tables =
	    (uint32_t)(mem_size / ((PAGE_SIZE / sizeof(pt_entry_t)) * ARM_PGBYTES));
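
	/*
	 * For example (illustrative numbers, assuming 16K pages and 8-byte
	 * PTEs): one leaf table holds 16384 / 8 == 2048 entries and therefore
	 * maps 2048 * 16K == 32MB, so a device with 8GB of DRAM needs roughly
	 * 256 leaf tables.
	 */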

	/**
	 * There should be one PTD per page table (times 2 since we want twice the
	 * number of required PTDs), plus round the number of PTDs up to the next
	 * `ptd_per_page` value so there's no wasted space.
	 */
	const uint32_t ptd_root_table_n_ptds =
	    (ptd_per_page * ((num_leaf_page_tables * 2) / ptd_per_page)) + ptd_per_page;

	/* Lastly, calculate the number of VM pages and bytes these PTDs take up. */
	const uint32_t num_ptd_pages = ptd_root_table_n_ptds / ptd_per_page;
	vm_size_t ptd_root_table_size = num_ptd_pages * PAGE_SIZE;

	/* Number of VM pages that span all of kernel-managed memory. */
	unsigned int npages = (unsigned int)atop(mem_size);

	/* The pv_head_table and pp_attr_table both have one entry per VM page. */
	const vm_size_t pp_attr_table_size = npages * sizeof(pp_attr_t);
	const vm_size_t pv_head_size = round_page(npages * sizeof(pv_entry_t *));

	/* Scan the device tree and override heuristics in the PV entry management code. */
	pmap_compute_pv_targets();

	/* Scan the device tree and figure out how many PPL-owned I/O regions there are. */
	const vm_size_t io_attr_table_size = pmap_compute_io_rgns();

#if HAS_GUARDED_IO_FILTER
	/* Scan the device tree for the size of pmap-io-filter entries. */
	const vm_size_t io_filter_table_size = pmap_compute_io_filters();
#endif /* HAS_GUARDED_IO_FILTER */

	/**
	 * Don't make any assumptions about the alignment of avail_start before
	 * execution of this function. Always re-align it to ensure the first
	 * allocated data structure is aligned correctly.
	 */
	avail_start = PMAP_ALIGN(avail_start, __alignof(pp_attr_t));

	/**
	 * Keep track of where the data structures start so we can clear this memory
	 * later.
	 */
	const pmap_paddr_t pmap_struct_start = avail_start;

	pp_attr_table = (pp_attr_t *)phystokv(avail_start);
	avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pmap_io_range_t));

	io_attr_table = (pmap_io_range_t *)phystokv(avail_start);

#if HAS_GUARDED_IO_FILTER
	/* Align avail_start to size of I/O filter entry. */
	avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pmap_io_filter_entry_t));

	/* Allocate memory for io_filter_table. */
	if (num_io_filter_entries != 0) {
		io_filter_table = (pmap_io_filter_entry_t *)phystokv(avail_start);
	}

	/* Align avail_start for the next structure to be allocated. */
	avail_start = PMAP_ALIGN(avail_start + io_filter_table_size, __alignof(pv_entry_t *));
#else /* !HAS_GUARDED_IO_FILTER */
	avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pv_entry_t *));
#endif /* HAS_GUARDED_IO_FILTER */

	pv_head_table = (pv_entry_t **)phystokv(avail_start);

	/**
	 * ptd_root_table must start on a page boundary because all of the math for
	 * associating pt_desc_t objects with ptd_info objects assumes the first
	 * pt_desc_t in a page starts at the beginning of the page it resides in.
	 */
	avail_start = round_page(avail_start + pv_head_size);

	pt_desc_t *ptd_root_table = (pt_desc_t *)phystokv(avail_start);
	avail_start = round_page(avail_start + ptd_root_table_size);

	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);

	/* This function assumes that ptd_root_table has been zeroed out already. */
	ptd_bootstrap(ptd_root_table, num_ptd_pages);

	/* Load data about the PPL-owned I/O regions into io_attr_table and sort it. */
	pmap_load_io_rgns();

#if HAS_GUARDED_IO_FILTER
	/* Load the I/O filters into io_filter_table and sort them. */
	pmap_load_io_filters();
#endif /* HAS_GUARDED_IO_FILTER */

#if XNU_MONITOR
	/**
	 * Each of these PPL-only data structures is rounded to the nearest page
	 * beyond its predefined size so as to provide a small extra buffer of
	 * objects and to make it easy to perform page-sized operations on them if
	 * the need ever arises.
	 */
	const vm_map_address_t pmap_ptr_array_begin = phystokv(avail_start);
	pmap_ptr_array = (pmap_list_entry_t**)pmap_ptr_array_begin;
	avail_start += round_page(PMAP_PTR_ARRAY_SIZE * sizeof(*pmap_ptr_array));
	const vm_map_address_t pmap_ptr_array_end = phystokv(avail_start);

	pmap_ptr_array_count = ((pmap_ptr_array_end - pmap_ptr_array_begin) / sizeof(*pmap_ptr_array));

	const vm_map_address_t pmap_ledger_ptr_array_begin = phystokv(avail_start);
	pmap_ledger_ptr_array = (pmap_ledger_t**)pmap_ledger_ptr_array_begin;
	avail_start += round_page(LEDGER_PTR_ARRAY_SIZE * sizeof(*pmap_ledger_ptr_array));
	const vm_map_address_t pmap_ledger_ptr_array_end = phystokv(avail_start);
	pmap_ledger_ptr_array_count = ((pmap_ledger_ptr_array_end - pmap_ledger_ptr_array_begin) / sizeof(*pmap_ledger_ptr_array));

	pmap_ledger_refcnt = (os_refcnt_t*)phystokv(avail_start);
	avail_start += round_page(pmap_ledger_ptr_array_count * sizeof(*pmap_ledger_refcnt));
#endif /* XNU_MONITOR */

	/**
	 * Set up the pmap per-cpu data structures (includes the PPL stacks, and PPL
	 * register save area). The pmap per-cpu data is managed separately from the
	 * general machine-specific per-cpu data on PPL systems so it can be made
	 * only writable by the PPL.
	 */
	pmap_cpu_data_array_init();
}

/**
 * Helper function for pmap_page_reclaim (hereafter shortened to "ppr") which
 * scans the list of userspace page table pages for one(s) that can be
 * reclaimed. To be eligible, a page table must not have any wired PTEs, must
 * contain at least one valid PTE, can't be nested, and the pmap that owns that
 * page table must not already be locked.
 *
 * @note This should only be called from pmap_page_reclaim().
 *
 * @note If an eligible page table was found, then the pmap which contains that
 *       page table will be locked exclusively.
 *
 * @note On systems where multiple page tables exist within one page, all page
 *       tables within a page have to be eligible for that page to be considered
 *       reclaimable.
 *
 * @param ptdpp Output parameter which will contain a pointer to the page table
 *              descriptor for the page table(s) that can be reclaimed (if any
 *              were found). If no page table was found, this will be set to
 *              NULL.
 *
 * @return True if an eligible table was found, false otherwise. In the case
 *         that a page table was found, ptdpp will be a pointer to the page
 *         table descriptor for the table(s) that can be reclaimed. Otherwise
 *         it'll be set to NULL.
 */
MARK_AS_PMAP_TEXT static bool
ppr_find_eligible_pt_page(pt_desc_t **ptdpp)
{
	assert(ptdpp != NULL);

	pmap_simple_lock(&pt_pages_lock);
	pt_desc_t *ptdp = (pt_desc_t *)queue_first(&pt_page_list);

	while (!queue_end(&pt_page_list, (queue_entry_t)ptdp)) {
		/* Skip this pmap if it's nested or already locked. */
		if ((ptdp->pmap->type != PMAP_TYPE_USER) ||
		    (!pmap_try_lock(ptdp->pmap, PMAP_LOCK_EXCLUSIVE))) {
			ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
			continue;
		}

		assert(ptdp->pmap != kernel_pmap);

		unsigned refcnt_acc = 0;
		unsigned wiredcnt_acc = 0;
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);

		/**
		 * On systems where the VM page size differs from the hardware page
		 * size, multiple page tables can exist within one VM page.
		 */
		for (unsigned i = 0; i < (PAGE_SIZE / pt_attr_page_size(pt_attr)); i++) {
			/* Do not attempt to free a page that contains an L2 table. */
			if (ptdp->ptd_info[i].refcnt == PT_DESC_REFCOUNT) {
				refcnt_acc = 0;
				break;
			}

			refcnt_acc += ptdp->ptd_info[i].refcnt;
			wiredcnt_acc += ptdp->ptd_info[i].wiredcnt;
		}

		/**
		 * If we've found a page with no wired entries but valid PTEs, then
		 * choose it for reclamation.
		 */
		if ((wiredcnt_acc == 0) && (refcnt_acc != 0)) {
			*ptdpp = ptdp;
			pmap_simple_unlock(&pt_pages_lock);

			/**
			 * Leave ptdp->pmap locked here. We're about to reclaim a page table
			 * from it, so we don't want anyone else messing with it while we do
			 * that.
			 */
			return true;
		}

		/**
		 * This page table/PTD wasn't eligible; unlock its pmap and move to the
		 * next one in the queue.
		 */
		pmap_unlock(ptdp->pmap, PMAP_LOCK_EXCLUSIVE);
		ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
	}

	pmap_simple_unlock(&pt_pages_lock);
	*ptdpp = NULL;

	return false;
}

/**
 * Helper function for pmap_page_reclaim (hereafter shortened to "ppr") which
 * frees every page table within a page so that the page can be reclaimed.
 *
 * @note This should only be called from pmap_page_reclaim() and is only meant
 *       to delete page tables deemed eligible for reclaiming by
 *       ppr_find_eligible_pt_page().
 *
 * @param ptdp The page table descriptor whose page table(s) will get freed.
 *
 * @return KERN_SUCCESS on success. KERN_ABORTED if the page was not removed
 *         due to pending preemption.
 */
MARK_AS_PMAP_TEXT static kern_return_t
ppr_remove_pt_page(pt_desc_t *ptdp)
{
	assert(ptdp != NULL);

	bool need_strong_sync = false;
	tt_entry_t *ttep = TT_ENTRY_NULL;
	pt_entry_t *ptep = PT_ENTRY_NULL;
	pt_entry_t *begin_pte = PT_ENTRY_NULL;
	pt_entry_t *end_pte = PT_ENTRY_NULL;
	pmap_t pmap = ptdp->pmap;

	/**
	 * The pmap exclusive lock should have gotten locked when the eligible page
	 * table was found in ppr_find_eligible_pt_page().
	 */
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const uint64_t hw_page_size = pt_attr_page_size(pt_attr);

	/**
	 * On some systems, one page table descriptor can represent multiple page
	 * tables. In that case, remove every table within the wanted page so we
	 * can reclaim it.
	 */
	for (unsigned i = 0; i < (PAGE_SIZE / hw_page_size); i++) {
		const vm_map_address_t va = ptdp->va[i];

		/**
		 * If the VA is bogus, this may represent an unallocated region or one
		 * which is in transition (already being freed or expanded). Don't try
		 * to remove mappings here.
		 */
		if (va == (vm_offset_t)-1) {
			continue;
		}

		/* Get the twig table entry that points to the table to reclaim. */
		ttep = pmap_tte(pmap, va);

		/**
		 * If the twig entry is nonexistent, invalid, or a block mapping,
		 * skip it.
		 */
		if ((ttep == TT_ENTRY_NULL) || !tte_is_valid_table(*ttep)) {
			continue;
		}

		ptep = (pt_entry_t *)ttetokv(*ttep);
		begin_pte = &ptep[pte_index(pt_attr, va)];
		end_pte = begin_pte + (hw_page_size / sizeof(pt_entry_t));
		vm_map_address_t eva = 0;

		/**
		 * Remove all mappings in the page table being reclaimed.
		 *
		 * Use PMAP_OPTIONS_REMOVE to clear any "compressed" markers and
		 * update the "compressed" counter in the ledger. This means that
		 * we lose accounting for any compressed pages in this range but the
		 * alternative is to not be able to account for their future
		 * decompression, which could cause the counter to drift more and
		 * more.
		 */
		int pte_changed = pmap_remove_range_options(
			pmap, va, begin_pte, end_pte, &eva, &need_strong_sync, PMAP_OPTIONS_REMOVE);

		const vm_offset_t expected_va_end = va + (size_t)pt_attr_leaf_table_size(pt_attr);

		if (eva == expected_va_end) {
			/**
			 * Free the page table now that all of its mappings have been removed.
			 * Once all page tables within a page have been deallocated, then the
			 * page that contains the table(s) will be freed and made available for
			 * reuse.
			 */
			pmap_tte_deallocate(pmap, va, expected_va_end, need_strong_sync, ttep, pt_attr_twig_level(pt_attr));
			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE); /* pmap_tte_deallocate() dropped the lock */
		} else {
			/**
			 * pmap_remove_range_options() returned earlier than expected,
			 * indicating that urgent preemption is pending. We should bail
			 * out, even though some of the mappings were removed in vain.
			 * They'll have to take the penalty of page faults to be brought
			 * back, but we don't want to miss the preemption deadline and
			 * panic.
			 */
			assert(eva < expected_va_end);

			/**
			 * In the normal path, we expect pmap_tte_deallocate() to flush
			 * the TLB for us. However, on this abort path we need to handle
			 * it explicitly. If any mappings were updated, flush the TLB.
			 */
			if (pte_changed > 0) {
				pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (size_t) (eva - va), pmap, false, need_strong_sync);
				arm64_sync_tlb(need_strong_sync);
			}

			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			return KERN_ABORTED;
		}
	}

	/**
	 * We're done modifying page tables, so undo the lock that was grabbed when
	 * we found the table(s) to reclaim in ppr_find_eligible_pt_page().
	 */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return KERN_SUCCESS;
}

/**
 * Attempt to return a page by freeing an active page-table page. To be eligible
 * for reclaiming, a page-table page must be assigned to a non-kernel pmap, it
 * must not have any wired PTEs and must contain at least one valid PTE.
 *
 * @note This function is potentially invoked when PMAP_PAGE_RECLAIM_NOWAIT is
 *       passed as an option to pmap_pages_alloc_zeroed().
 *
 * @note Invocations of this function are only meant to occur in critical paths
 *       that absolutely can't take the latency hit of waiting for the VM or
 *       jumping out of the PPL to allocate more pages. Reclaiming a page table
 *       page can cause a performance hit when one of the removed mappings is
 *       next accessed (forcing the VM to fault and re-insert the mapping).
 *
 * @return The physical address of the page that was allocated, or zero if no
 *         suitable page was found on the page-table list.
 */
MARK_AS_PMAP_TEXT static pmap_paddr_t
pmap_page_reclaim(void)
{
	pmap_simple_lock(&pmap_page_reclaim_lock);
	pmap_pages_request_count++;
	pmap_pages_request_acum++;

	/* This loop never breaks out; the function just returns from within it. */
	while (1) {
		/**
		 * Attempt to allocate a page from the page free list reserved for this
		 * function. This free list is managed in tandem with pmap_pages_free()
		 * which will add a page to this list for each call to
		 * pmap_page_reclaim(). Most likely that page will come from a reclaimed
		 * userspace page table, but if there aren't any page tables to reclaim,
		 * then whatever the next freed page is will show up on this list for
		 * the next invocation of pmap_page_reclaim() to use.
		 */
		if (pmap_page_reclaim_list != PAGE_FREE_ENTRY_NULL) {
			page_free_entry_t *page_entry = pmap_page_reclaim_list;
			pmap_page_reclaim_list = pmap_page_reclaim_list->next;
			pmap_simple_unlock(&pmap_page_reclaim_lock);

			return ml_static_vtop((vm_offset_t)page_entry);
		}

		/* Drop the lock to allow pmap_pages_free() to add pages to the list. */
		pmap_simple_unlock(&pmap_page_reclaim_lock);

		/* Attempt to find an eligible page table page to reclaim. */
		pt_desc_t *ptdp = NULL;
		bool found_page = ppr_find_eligible_pt_page(&ptdp);

		if (!found_page) {
			/**
			 * No eligible page table was found. pmap_pages_free() will still
			 * add the next freed page to the reclaim free list, so the next
			 * invocation of this function should have better luck.
			 */
			return (pmap_paddr_t)0;
		}

		/**
		 * If we found a page table to reclaim, then ptdp should point to the
		 * descriptor for that table. Go ahead and remove it.
		 */
		if (ppr_remove_pt_page(ptdp) != KERN_SUCCESS) {
			/* Take the page-not-found path to bail out on pending preemption. */
			return (pmap_paddr_t)0;
		}

		/**
		 * Now that a page has hopefully been freed (and added to the reclaim
		 * page list), the next iteration of the loop will re-check the reclaim
		 * free list.
		 */
		pmap_simple_lock(&pmap_page_reclaim_lock);
	}
}

#if XNU_MONITOR
/**
 * Helper function for returning a PPL page back to the PPL page free list.
 *
 * @param pa Physical address of the page to add to the PPL page free list.
 *           This address must be aligned to the VM page size.
 */
MARK_AS_PMAP_TEXT static void
pmap_give_free_ppl_page(pmap_paddr_t pa)
{
	if ((pa & PAGE_MASK) != 0) {
		panic("%s: Unaligned address passed in, pa=0x%llx",
		    __func__, pa);
	}

	page_free_entry_t *page_entry = (page_free_entry_t *)phystokv(pa);
	pmap_simple_lock(&pmap_ppl_free_page_lock);

	/* Prepend the passed in page to the PPL page free list. */
	page_entry->next = pmap_ppl_free_page_list;
	pmap_ppl_free_page_list = page_entry;
	pmap_ppl_free_page_count++;

	pmap_simple_unlock(&pmap_ppl_free_page_lock);
}

/**
 * Helper function for getting a PPL page from the PPL page free list.
 *
 * @return The physical address of the page taken from the PPL page free list,
 *         or zero if there are no pages left in the free list.
 */
MARK_AS_PMAP_TEXT static pmap_paddr_t
pmap_get_free_ppl_page(void)
{
	pmap_paddr_t pa = 0;

	pmap_simple_lock(&pmap_ppl_free_page_lock);

	if (pmap_ppl_free_page_list != PAGE_FREE_ENTRY_NULL) {
		/**
		 * Pop a page off the front of the list. The second item in the list
		 * will become the new head.
		 */
		page_free_entry_t *page_entry = pmap_ppl_free_page_list;
		pmap_ppl_free_page_list = pmap_ppl_free_page_list->next;
		pa = kvtophys_nofail((vm_offset_t)page_entry);
		pmap_ppl_free_page_count--;
	} else {
		pa = 0L;
	}

	pmap_simple_unlock(&pmap_ppl_free_page_lock);
	assert((pa & PAGE_MASK) == 0);

	return pa;
}

/**
 * Claim a page on behalf of the PPL by marking it as PPL-owned and only
 * allowing the PPL to write to it. Also can potentially add the page to the
 * PPL page free list (see initially_free parameter).
 *
 * @note The page cannot have any mappings outside of the physical aperture.
 *
 * @param pa The physical address of the page to mark as PPL-owned.
 * @param initially_free Should the page be added to the PPL page free list.
 *                       This is typically "true" if a brand new page was just
 *                       allocated for the PPL's usage, and "false" if this is a
 *                       page already being used by other agents (e.g., IOMMUs).
 */
MARK_AS_PMAP_TEXT void
pmap_mark_page_as_ppl_page_internal(pmap_paddr_t pa, bool initially_free)
{
	pp_attr_t attr = 0;

	if (!pa_valid(pa)) {
		panic("%s: Non-kernel-managed (maybe I/O) address passed in, pa=0x%llx",
		    __func__, pa);
	}

	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	/* A page that the PPL already owns can't be given to the PPL. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: page already belongs to PPL, pa=0x%llx", __func__, pa);
	}

	if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
		panic("%s: page locked down, pa=0x%llx", __func__, pa);
	}

	/* The page cannot be mapped outside of the physical aperture. */
	if (__improbable(!pmap_verify_free((ppnum_t)atop(pa)))) {
		panic("%s: page still has mappings, pa=0x%llx", __func__, pa);
	}

	do {
		attr = pp_attr_table[pai];
		if (__improbable(attr & PP_ATTR_NO_MONITOR)) {
			panic("%s: page excluded from PPL, pa=0x%llx", __func__, pa);
		}
	} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_MONITOR, &pp_attr_table[pai]));

	/* Ensure only the PPL has write access to the physical aperture mapping. */
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	pvh_unlock(pai);

	if (initially_free) {
		pmap_give_free_ppl_page(pa);
	}
}
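
/*
 * Typical kernel-side flow that ends up here (a sketch of the
 * pmap_alloc_page_for_ppl() path referenced above; see that function for
 * the authoritative logic): the kernel grabs a page from the VM and then
 * traps into the PPL to transfer ownership.
 *
 *   pmap_paddr_t pa = pmap_alloc_page_for_kern(options);  // outside the PPL
 *   if (pa != 0) {
 *       pmap_mark_page_as_ppl_page(pa);  // PPL entry point wrapping this
 *   }
 */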

/**
 * Helper function for converting a PPL page back into a kernel-writable page.
 * This removes the PPL-ownership for that page and updates the physical
 * aperture mapping of that page so it's kernel-writable again.
 *
 * @param pa The physical address of the PPL page to be made kernel-writable.
 */
MARK_AS_PMAP_TEXT void
pmap_mark_page_as_kernel_page(pmap_paddr_t pa)
{
	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	if (!ppattr_pa_test_monitor(pa)) {
		panic("%s: page is not a PPL page, pa=%p", __func__, (void *)pa);
	}

	ppattr_pa_clear_monitor(pa);

	/* Ensure the kernel has write access to the physical aperture mapping. */
	pmap_set_xprr_perm(pai, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);

	pvh_unlock(pai);
}

/**
 * PPL helper function for giving a single page on the PPL page free list back
 * to the kernel.
 *
 * @note This function implements the logic that HAS to run within the PPL for
 *       the pmap_release_ppl_pages_to_kernel() call. This helper function
 *       shouldn't be called directly.
 *
 * @note A minimum number of pages (set by PMAP_MIN_FREE_PPL_PAGES) will always
 *       be kept on the PPL page free list to ensure that core operations can
 *       occur without having to refill the free list.
 *
 * @return The physical address of the page that's been returned to the kernel,
 *         or zero if no page was returned.
 */
MARK_AS_PMAP_TEXT pmap_paddr_t
pmap_release_ppl_pages_to_kernel_internal(void)
{
	pmap_paddr_t pa = 0;

	if (pmap_ppl_free_page_count <= PMAP_MIN_FREE_PPL_PAGES) {
		return 0;
	}

	pa = pmap_get_free_ppl_page();

	if (!pa) {
		return 0;
	}

	pmap_mark_page_as_kernel_page(pa);

	return pa;
}
#endif /* XNU_MONITOR */

/**
 * Add a queue of VM pages to the pmap's VM object. This informs the VM that
 * these pages are being used by the pmap and shouldn't be reused.
 *
 * This also means that the pmap_object can be used as a convenient way to loop
 * through every page currently being used by the pmap. For instance, this queue
 * of pages is exposed to the debugger through the Low Globals, where it's used
 * to ensure that all pmap data is saved in an active core dump.
 *
 * @param mem The head of the queue of VM pages to add to the pmap's VM object.
 */
void
pmap_enqueue_pages(vm_page_t mem)
{
	vm_page_t m_prev;
	vm_object_lock(pmap_object);
	while (mem != VM_PAGE_NULL) {
		const vm_object_offset_t offset =
		    (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(mem))) - gPhysBase);

		vm_page_insert_wired(mem, pmap_object, offset, VM_KERN_MEMORY_PTE);
		m_prev = mem;
		mem = NEXT_PAGE(m_prev);
		*(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
	}
	vm_object_unlock(pmap_object);
}
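
/*
 * Note on the offset computed above: it is simply the page's physical
 * address relative to the start of managed memory (gPhysBase), so
 * pmap_object effectively acts as a sparse 1:1 map of physical memory; the
 * page backing physical address pa lives at object offset pa - gPhysBase.
 */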

/**
 * Allocate a page for usage within the pmap and zero it out. If running on a
 * PPL-enabled system, this will allocate pages from the PPL page free list.
 * Otherwise pages are grabbed directly from the VM.
 *
 * @note On PPL-enabled systems, this function can ONLY be called from within
 *       the PPL. If a page needs to be allocated from outside of the PPL on
 *       these systems, then use pmap_alloc_page_for_kern().
 *
 * @param pa Output parameter to store the physical address of the allocated
 *           page if one was able to be allocated (zero otherwise).
 * @param size The amount of memory to allocate. This has to be PAGE_SIZE on
 *             PPL-enabled systems. On other systems it can be either PAGE_SIZE
 *             or 2*PAGE_SIZE, in which case the two pages are allocated
 *             physically contiguous.
 * @param options The following options can be specified:
 *     - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM or PPL page free list don't have
 *       any free pages available then don't wait for one, just return
 *       immediately without allocating a page. PPL-enabled systems must ALWAYS
 *       pass this flag since allocating memory from within the PPL can't spin
 *       or block due to preemption being disabled (would be a perf hit).
 *
 *     - PMAP_PAGE_RECLAIM_NOWAIT: If memory failed to get allocated the normal
 *       way (either by the PPL page free list on PPL-enabled systems, or
 *       through the VM on other systems), then fall back to attempting to
 *       reclaim a userspace page table. This should only be specified in paths
 *       that absolutely can't take the latency hit of waiting for the VM or
 *       jumping out of the PPL to allocate more pages.
 *
 * @return KERN_SUCCESS if a page was successfully allocated, or
 *         KERN_RESOURCE_SHORTAGE if a page failed to get allocated. This can
 *         also be returned on non-PPL devices if preemption is disabled after
 *         early boot since allocating memory from the VM requires grabbing a
 *         mutex.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_pages_alloc_zeroed(pmap_paddr_t *pa, unsigned size, unsigned options)
{
1284 	assert(pa != NULL);
1285 
1286 #if XNU_MONITOR
1287 	ASSERT_NOT_HIBERNATING();
1288 
1289 	/* The PPL page free list always operates on PAGE_SIZE chunks of memory. */
1290 	if (size != PAGE_SIZE) {
1291 		panic("%s: size != PAGE_SIZE, pa=%p, size=%u, options=%u",
1292 		    __func__, pa, size, options);
1293 	}
1294 
1295 	/* Allocating memory in the PPL can't wait since preemption is disabled. */
1296 	assert(options & PMAP_PAGES_ALLOCATE_NOWAIT);
1297 
1298 	*pa = pmap_get_free_ppl_page();
1299 
1300 	if ((*pa == 0) && (options & PMAP_PAGE_RECLAIM_NOWAIT)) {
1301 		*pa = pmap_page_reclaim();
1302 	}
1303 
1304 	if (*pa == 0) {
1305 		return KERN_RESOURCE_SHORTAGE;
1306 	} else {
1307 		bzero((void*)phystokv(*pa), size);
1308 		return KERN_SUCCESS;
1309 	}
1310 #else /* XNU_MONITOR */
1311 	vm_page_t mem = VM_PAGE_NULL;
1312 	thread_t self = current_thread();
1313 
1314 	/**
1315 	 * It's not possible to allocate memory from the VM in a preemption disabled
1316 	 * environment except during early boot (since the VM needs to grab a mutex).
1317 	 * In those cases just return a resource shortage error and let the caller
1318 	 * deal with it.
1319 	 */
1320 	if (!pmap_is_preemptible()) {
1321 		return KERN_RESOURCE_SHORTAGE;
1322 	}
1323 
1324 	/**
1325 	 * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
1326 	 * the VM of this.
1327 	 *
1328 	 * This field should only be modified by the local thread itself, so no lock
1329 	 * needs to be taken.
1330 	 */
1331 	uint16_t thread_options = self->options;
1332 	self->options |= TH_OPT_VMPRIV;
1333 
1334 	if (__probable(size == PAGE_SIZE)) {
1335 		/**
1336 		 * If we're only allocating a single page, just grab one off the VM's
1337 		 * global page free list.
1338 		 */
1339 		while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
1340 			if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1341 				break;
1342 			}
1343 
1344 			VM_PAGE_WAIT();
1345 		}
1346 
1347 		if (mem != VM_PAGE_NULL) {
1348 			vm_page_lock_queues();
1349 			vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
1350 			vm_page_unlock_queues();
1351 		}
1352 	} else if (size == (2 * PAGE_SIZE)) {
1353 		/**
1354 		 * Allocate two physically contiguous pages. Any random two pages
1355 		 * obtained from the VM's global page free list aren't guaranteed to be
1356 		 * contiguous so we need to use the cpm_allocate() API.
1357 		 */
1358 		while (cpm_allocate(size, &mem, 0, 1, TRUE, 0) != KERN_SUCCESS) {
1359 			if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1360 				break;
1361 			}
1362 
1363 			VM_PAGE_WAIT();
1364 		}
1365 	} else {
1366 		panic("%s: invalid size %u", __func__, size);
1367 	}
1368 
1369 	self->options = thread_options;
1370 
1371 	/**
1372 	 * If the normal method of allocating pages failed, then potentially fall
1373 	 * back to attempting to reclaim a userspace page table.
1374 	 */
1375 	if ((mem == VM_PAGE_NULL) && (options & PMAP_PAGE_RECLAIM_NOWAIT)) {
1376 		assert(size == PAGE_SIZE);
1377 		*pa = pmap_page_reclaim();
1378 		if (*pa != 0) {
1379 			bzero((void*)phystokv(*pa), size);
1380 			return KERN_SUCCESS;
1381 		}
1382 	}
1383 
1384 	if (mem == VM_PAGE_NULL) {
1385 		return KERN_RESOURCE_SHORTAGE;
1386 	}
1387 
1388 	*pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));
1389 
1390 	/* Add the allocated VM page(s) to the pmap's VM object. */
1391 	pmap_enqueue_pages(mem);
1392 
1393 	/* Pages are considered "in use" by the pmap until returned to the VM. */
1394 	OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count);
1395 	OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count);
1396 
1397 	bzero((void*)phystokv(*pa), size);
1398 	return KERN_SUCCESS;
1399 #endif /* XNU_MONITOR */
1400 }
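
/*
 * Illustrative caller sketch (editorial, not part of the original source): a
 * hypothetical pmap-internal path that tolerates allocation failure. On
 * PPL-enabled systems PMAP_PAGES_ALLOCATE_NOWAIT is mandatory, so the caller
 * must be prepared to see KERN_RESOURCE_SHORTAGE:
 *
 *     pmap_paddr_t new_pa = 0;
 *     kern_return_t kr = pmap_pages_alloc_zeroed(&new_pa, PAGE_SIZE,
 *         PMAP_PAGES_ALLOCATE_NOWAIT);
 *     if (kr == KERN_RESOURCE_SHORTAGE) {
 *         return kr; // let a non-PPL caller replenish pages and retry
 *     }
 *     assert(new_pa != 0); // the returned page is already zeroed
 */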
1401 
1402 #if XNU_MONITOR
1403 /**
1404  * Allocate a page from the VM. If no pages are available, this function can
1405  * potentially spin until a page is available (see the `options` parameter).
1406  *
1407  * @note This function CANNOT be called from the PPL since it calls into the VM.
1408  *       If the PPL needs memory, then it'll need to exit the PPL before
1409  *       allocating more (usually by returning KERN_RESOURCE_SHORTAGE, and then
1410  *       calling pmap_alloc_page_for_ppl() from outside of the PPL).
1411  *
1412  * @param options The following options can be specified:
1413  *     - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM doesn't have any free pages
1414  *       available then don't wait for one, just return immediately without
1415  *       allocating a page.
1416  *
1417  * @return The physical address of the page, if one was allocated. Zero,
1418  *         otherwise.
1419  */
1420 pmap_paddr_t
1421 pmap_alloc_page_for_kern(unsigned int options)
1422 {
1423 	pmap_paddr_t pa = 0;
1424 	vm_page_t mem = VM_PAGE_NULL;
1425 
1426 	/* It's not possible to take the VM page queue lock if not preemptible. */
1427 	if (!pmap_is_preemptible()) {
1428 		return 0;
1429 	}
1430 
1431 	while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
1432 		if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1433 			return 0;
1434 		}
1435 		VM_PAGE_WAIT();
1436 	}
1437 
1438 	/* Automatically wire any pages used by the pmap. */
1439 	vm_page_lock_queues();
1440 	vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
1441 	vm_page_unlock_queues();
1442 
1443 	pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));
1444 
1445 	if (__improbable(pa == 0)) {
1446 		panic("%s: physical address is 0", __func__);
1447 	}
1448 
1449 	/**
1450 	 * Add the acquired VM page to the pmap's VM object to notify the VM that
1451 	 * this page is being used.
1452 	 */
1453 	pmap_enqueue_pages(mem);
1454 
1455 	/* Pages are considered "in use" by the pmap until returned to the VM. */
1456 	OSAddAtomic(1, &inuse_pmap_pages_count);
1457 	OSAddAtomic64(1, &alloc_pmap_pages_count);
1458 
1459 	return pa;
1460 }
1461 
1462 /**
1463  * Allocate a page from the VM, mark it as being PPL-owned, and add it to the
1464  * PPL page free list.
1465  *
1466  * @note This function CANNOT be called from the PPL since it calls into the VM.
1467  *       If the PPL needs memory, then it'll need to exit the PPL before calling
1468  *       this function (usually by returning KERN_RESOURCE_SHORTAGE).
1469  *
1470  * @param options The following options can be specified:
1471  *     - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM doesn't have any free pages
1472  *       available then don't wait for one, just return immediately without
1473  *       allocating a page.
1474  */
1475 void
1476 pmap_alloc_page_for_ppl(unsigned int options)
1477 {
1478 	thread_t self = current_thread();
1479 
1480 	/**
1481 	 * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
1482 	 * the VM of this.
1483 	 *
1484 	 * This field should only be modified by the local thread itself, so no lock
1485 	 * needs to be taken.
1486 	 */
1487 	uint16_t thread_options = self->options;
1488 	self->options |= TH_OPT_VMPRIV;
1489 	pmap_paddr_t pa = pmap_alloc_page_for_kern(options);
1490 	self->options = thread_options;
1491 
1492 	if (pa != 0) {
1493 		pmap_mark_page_as_ppl_page(pa);
1494 	}
1495 }
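
/*
 * Editorial sketch of the exit-refill-retry pattern described in the headers
 * above, assuming a hypothetical PPL entry point pmap_do_work_ppl() that
 * returns KERN_RESOURCE_SHORTAGE when the PPL page free list runs dry:
 *
 *     kern_return_t kr;
 *     while ((kr = pmap_do_work_ppl()) == KERN_RESOURCE_SHORTAGE) {
 *         // Outside the PPL: grab a page from the VM, mark it PPL-owned,
 *         // and donate it to the PPL page free list before retrying.
 *         pmap_alloc_page_for_ppl(0);
 *     }
 */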
1496 #endif /* XNU_MONITOR */
1497 
1498 /**
1499  * Free memory previously allocated through pmap_pages_alloc_zeroed() or
1500  * pmap_alloc_page_for_kern().
1501  *
1502  * On PPL-enabled systems, this just adds the page back to the PPL page free
1503  * list. On other systems, this returns the page(s) back to the VM.
1504  *
1505  * @param pa Physical address of the page(s) to free.
1506  * @param size The size in bytes of the memory region being freed (must be
1507  *             PAGE_SIZE on PPL-enabled systems).
1508  */
1509 void
1510 pmap_pages_free(pmap_paddr_t pa, __assert_only unsigned size)
1511 {
1512 	/**
1513 	 * If the pmap is starved for memory to the point that pmap_page_reclaim()
1514 	 * starts getting invoked to allocate memory, then let's take the page being
1515 	 * freed and add it directly to pmap_page_reclaim()'s dedicated free list.
1516 	 * In that case, the page being freed is most likely a userspace page table
1517 	 * that was reclaimed.
1518 	 */
1519 	if (__improbable(pmap_pages_request_count != 0)) {
1520 		pmap_simple_lock(&pmap_page_reclaim_lock);
1521 
1522 		if (pmap_pages_request_count != 0) {
1523 			pmap_pages_request_count--;
1524 
1525 			/* Prepend the freed page to the pmap_page_reclaim() free list. */
1526 			page_free_entry_t *page_entry = (page_free_entry_t *)phystokv(pa);
1527 			page_entry->next = pmap_page_reclaim_list;
1528 			pmap_page_reclaim_list = page_entry;
1529 			pmap_simple_unlock(&pmap_page_reclaim_lock);
1530 
1531 			return;
1532 		}
1533 		pmap_simple_unlock(&pmap_page_reclaim_lock);
1534 	}
1535 
1536 #if XNU_MONITOR
1537 	/* The PPL page free list always operates on PAGE_SIZE chunks of memory. */
1538 	assert(size == PAGE_SIZE);
1539 
1540 	/* On PPL-enabled systems, just add the page back to the PPL page free list. */
1541 	pmap_give_free_ppl_page(pa);
1542 #else /* XNU_MONITOR */
1543 	vm_page_t mem = VM_PAGE_NULL;
1544 	const pmap_paddr_t pa_max = pa + size;
1545 
1546 	/* Pages are considered "in use" until given back to the VM. */
1547 	OSAddAtomic(-(size >> PAGE_SHIFT), &inuse_pmap_pages_count);
1548 
1549 	for (; pa < pa_max; pa += PAGE_SIZE) {
1550 		vm_object_lock(pmap_object);
1551 
1552 		/**
1553 		 * Remove the page from the pmap's VM object and return it back to the
1554 		 * VM's global free list of pages.
1555 		 */
1556 		mem = vm_page_lookup(pmap_object, (pa - gPhysBase));
1557 		assert(mem != VM_PAGE_NULL);
1558 		assert(VM_PAGE_WIRED(mem));
1559 		vm_page_lock_queues();
1560 		vm_page_free(mem);
1561 		vm_page_unlock_queues();
1562 		vm_object_unlock(pmap_object);
1563 	}
1564 #endif /* XNU_MONITOR */
1565 }
1566 
1567 /**
1568  * Called by the VM to reclaim pages that we can reclaim quickly and cheaply.
1569  * This will take pages in the pmap's VM object and add them back to the VM's
1570  * global list of free pages.
1571  *
1572  * @return The number of pages returned to the VM.
1573  */
1574 uint64_t
1575 pmap_release_pages_fast(void)
1576 {
1577 #if XNU_MONITOR
1578 	return pmap_release_ppl_pages_to_kernel();
1579 #else /* XNU_MONITOR */
1580 	return 0;
1581 #endif
1582 }
1583 
1584 /**
1585  * Allocates a batch (list) of pv_entry_t's from the global PV free array.
1586  *
1587  * @return A pointer to the head of the newly-allocated batch, or PV_ENTRY_NULL
1588  *         if the global array is empty.
1589  */
1590 MARK_AS_PMAP_TEXT static pv_entry_t *
1591 pv_free_array_get_batch(void)
1592 {
1593 	pv_entry_t *new_batch = PV_ENTRY_NULL;
1594 
1595 	pmap_simple_lock(&pv_free_array_lock);
1596 	if (pv_free_array_n_elems() > 0) {
1597 		/**
1598 		 * The global PV array acts as a ring buffer where each entry points to
1599 		 * a linked list of PVEs of length PV_BATCH_SIZE. Get the next free
1600 		 * batch.
1601 		 */
1602 		const size_t index = pv_free_read_idx++ & (PV_FREE_ARRAY_SIZE - 1);
1603 		pv_free_list_t *free_list = &pv_free_ring[index];
1604 
1605 		assert((free_list->count == PV_BATCH_SIZE) && (free_list->list != PV_ENTRY_NULL));
1606 		new_batch = free_list->list;
1607 	}
1608 	pmap_simple_unlock(&pv_free_array_lock);
1609 
1610 	return new_batch;
1611 }
1612 
1613 /**
1614  * Frees a batch (list) of pv_entry_t's into the global PV free array.
1615  *
1616  * @param batch_head Pointer to the first entry in the batch to be returned to
1617  *                   the array. This must be a linked list of pv_entry_t's of
1618  *                   length PV_BATCH_SIZE.
1619  *
1620  * @return KERN_SUCCESS, or KERN_FAILURE if the global array is full.
1621  */
1622 MARK_AS_PMAP_TEXT static kern_return_t
1623 pv_free_array_give_batch(pv_entry_t *batch_head)
1624 {
1625 	assert(batch_head != NULL);
1626 
1627 	pmap_simple_lock(&pv_free_array_lock);
1628 	if (pv_free_array_n_elems() == (PV_FREE_ARRAY_SIZE - 1)) {
1629 		pmap_simple_unlock(&pv_free_array_lock);
1630 		return KERN_FAILURE;
1631 	}
1632 
1633 	const size_t index = pv_free_write_idx++ & (PV_FREE_ARRAY_SIZE - 1);
1634 	pv_free_list_t *free_list = &pv_free_ring[index];
1635 	free_list->list = batch_head;
1636 	free_list->count = PV_BATCH_SIZE;
1637 	pmap_simple_unlock(&pv_free_array_lock);
1638 
1639 	return KERN_SUCCESS;
1640 }
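
/*
 * Editorial note on the ring indexing used by the two functions above: the
 * read/write indices increase monotonically and are reduced with
 * `& (PV_FREE_ARRAY_SIZE - 1)`, which matches `% PV_FREE_ARRAY_SIZE` only if
 * PV_FREE_ARRAY_SIZE is a power of two. For example, with a hypothetical
 * PV_FREE_ARRAY_SIZE of 8:
 *
 *     pv_free_read_idx = 9  ->  slot = 9 & 7 = 1 (wraps around the array)
 *
 * Keeping one slot unused (the `== PV_FREE_ARRAY_SIZE - 1` check above)
 * presumably lets pv_free_array_n_elems() distinguish a full ring from an
 * empty one.
 */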
1641 
1642 /**
1643  * Helper function for allocating a single PVE from an arbitrary free list.
1644  *
1645  * @param free_list The free list to allocate a node from.
1646  * @param pvepp Output parameter that will get updated with a pointer to the
1647  *              allocated node if the free list isn't empty, or a pointer to
1648  *              NULL if the list is empty.
1649  */
1650 MARK_AS_PMAP_TEXT static void
1651 pv_free_list_alloc(pv_free_list_t *free_list, pv_entry_t **pvepp)
1652 {
1653 	assert(pvepp != NULL);
1654 	assert(((free_list->list != NULL) && (free_list->count > 0)) ||
1655 	    ((free_list->list == NULL) && (free_list->count == 0)));
1656 
1657 	if ((*pvepp = free_list->list) != NULL) {
1658 		pv_entry_t *pvep = *pvepp;
1659 		free_list->list = pvep->pve_next;
1660 		pvep->pve_next = PV_ENTRY_NULL;
1661 		free_list->count--;
1662 	}
1663 }
1664 
1665 /**
1666  * Allocates a PVE from the kernel-dedicated list.
1667  *
1668  * @note This is only called when the global free list is empty, so don't bother
1669  *       trying to allocate more nodes from that list.
1670  *
1671  * @param pvepp Output parameter that will get updated with a pointer to the
1672  *              allocated node if the free list isn't empty, or a pointer to
1673  *              NULL if the list is empty. This pointer can't already be
1674  *              pointing to a valid entry before allocation.
1675  */
1676 MARK_AS_PMAP_TEXT static void
1677 pv_list_kern_alloc(pv_entry_t **pvepp)
1678 {
1679 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1680 	pmap_simple_lock(&pv_kern_free_list_lock);
1681 	if (pv_kern_free.count > 0) {
1682 		pmap_kern_reserve_alloc_stat++;
1683 	}
1684 	pv_free_list_alloc(&pv_kern_free, pvepp);
1685 	pmap_simple_unlock(&pv_kern_free_list_lock);
1686 }
1687 
1688 /**
1689  * Returns a list of PVEs to the kernel-dedicated free list.
1690  *
1691  * @param pve_head Head of the list to be returned.
1692  * @param pve_tail Tail of the list to be returned.
1693  * @param pv_cnt Number of elements in the list to be returned.
1694  */
1695 MARK_AS_PMAP_TEXT static void
1696 pv_list_kern_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
1697 {
1698 	assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
1699 
1700 	pmap_simple_lock(&pv_kern_free_list_lock);
1701 	pve_tail->pve_next = pv_kern_free.list;
1702 	pv_kern_free.list = pve_head;
1703 	pv_kern_free.count += pv_cnt;
1704 	pmap_simple_unlock(&pv_kern_free_list_lock);
1705 }
1706 
1707 /**
1708  * Attempts to allocate from the per-cpu free list of PVEs, and if that fails,
1709  * then replenish the per-cpu free list with a batch of PVEs from the global
1710  * PVE free list.
1711  *
1712  * @param pvepp Output parameter that will get updated with a pointer to the
1713  *              allocated node if the free lists aren't empty, or a pointer to
1714  *              NULL if both the per-cpu and global lists are empty. This
1715  *              pointer can't already be pointing to a valid entry before
1716  *              allocation.
1717  */
1718 MARK_AS_PMAP_TEXT static void
1719 pv_list_alloc(pv_entry_t **pvepp)
1720 {
1721 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1722 
1723 #if !XNU_MONITOR
1724 	/**
1725 	 * Preemption is always disabled in the PPL so it only needs to get disabled
1726 	 * on non-PPL systems. This needs to be disabled while working with per-cpu
1727 	 * data to prevent getting rescheduled onto a different CPU.
1728 	 */
1729 	mp_disable_preemption();
1730 #endif /* !XNU_MONITOR */
1731 
1732 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
1733 	pv_free_list_alloc(&pmap_cpu_data->pv_free, pvepp);
1734 
1735 	if (*pvepp != PV_ENTRY_NULL) {
1736 		goto pv_list_alloc_done;
1737 	}
1738 
1739 #if !XNU_MONITOR
1740 	if (pv_kern_free.count < pv_kern_low_water_mark) {
1741 		/**
1742 		 * If the kernel reserved pool is low, let non-kernel mappings wait for
1743 		 * a page from the VM.
1744 		 */
1745 		goto pv_list_alloc_done;
1746 	}
1747 #endif /* !XNU_MONITOR */
1748 
1749 	/**
1750 	 * Attempt to replenish the local list off the global one, and return the
1751 	 * first element. If the global list is empty, then the allocation failed.
1752 	 */
1753 	pv_entry_t *new_batch = pv_free_array_get_batch();
1754 
1755 	if (new_batch != PV_ENTRY_NULL) {
1756 		pmap_cpu_data->pv_free.count = PV_BATCH_SIZE - 1;
1757 		pmap_cpu_data->pv_free.list = new_batch->pve_next;
1758 		assert(pmap_cpu_data->pv_free.list != NULL);
1759 
1760 		new_batch->pve_next = PV_ENTRY_NULL;
1761 		*pvepp = new_batch;
1762 	}
1763 
1764 pv_list_alloc_done:
1765 #if !XNU_MONITOR
1766 	mp_enable_preemption();
1767 #endif /* !XNU_MONITOR */
1768 
1769 	return;
1770 }
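
/*
 * Editorial example of the refill path above, with a hypothetical
 * PV_BATCH_SIZE of 4: pv_free_array_get_batch() hands back the batch
 * A -> B -> C -> D, the head A is detached and returned through *pvepp, and
 * the per-cpu list becomes B -> C -> D with count = PV_BATCH_SIZE - 1 = 3.
 */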
1771 
1772 /**
1773  * Adds a list of PVEs to the per-CPU PVE free list. May spill out some entries
1774  * to the global or the kernel PVE free lists if the per-CPU list contains too
1775  * many PVEs.
1776  *
1777  * @param pve_head Head of the list to be returned.
1778  * @param pve_tail Tail of the list to be returned.
1779  * @param pv_cnt Number of elements in the list to be returned.
1780  */
1781 MARK_AS_PMAP_TEXT void
1782 pv_list_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
1783 {
1784 	assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
1785 
1786 #if !XNU_MONITOR
1787 	/**
1788 	 * Preemption is always disabled in the PPL so it only needs to get disabled
1789 	 * on non-PPL systems. This needs to be disabled while working with per-cpu
1790 	 * data to prevent getting rescheduled onto a different CPU.
1791 	 */
1792 	mp_disable_preemption();
1793 #endif /* !XNU_MONITOR */
1794 
1795 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
1796 
1797 	/**
1798 	 * How many more PVEs need to be added to the last allocated batch to get it
1799 	 * back up to a PV_BATCH_SIZE number of objects.
1800 	 * back up to PV_BATCH_SIZE objects.
1801 	const uint32_t available = PV_BATCH_SIZE - (pmap_cpu_data->pv_free.count % PV_BATCH_SIZE);
1802 
1803 	/**
1804 	 * The common case is that the number of PVEs to be freed fits within the current
1805 	 * PV_BATCH_SIZE boundary. If that is the case, quickly prepend the whole
1806 	 * list and return.
1807 	 */
1808 	if (__probable((pv_cnt <= available) &&
1809 	    ((pmap_cpu_data->pv_free.count % PV_BATCH_SIZE != 0) || (pmap_cpu_data->pv_free.count == 0)))) {
1810 		pve_tail->pve_next = pmap_cpu_data->pv_free.list;
1811 		pmap_cpu_data->pv_free.list = pve_head;
1812 		pmap_cpu_data->pv_free.count += pv_cnt;
1813 		goto pv_list_free_done;
1814 	}
1815 
1816 	/**
1817 	 * In the degenerate case, we need to process PVEs one by one, to make sure
1818 	 * we spill out to the global list, or update the spill marker as
1819 	 * appropriate.
1820 	 */
1821 	while (pv_cnt) {
1822 		/**
1823 		 * Take the node off the top of the passed in list and prepend it to the
1824 		 * per-cpu list.
1825 		 */
1826 		pv_entry_t *pv_next = pve_head->pve_next;
1827 		pve_head->pve_next = pmap_cpu_data->pv_free.list;
1828 		pmap_cpu_data->pv_free.list = pve_head;
1829 		pve_head = pv_next;
1830 		pmap_cpu_data->pv_free.count++;
1831 		pv_cnt--;
1832 
1833 		if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE + 1))) {
1834 			/**
1835 			 * A full batch of entries has been freed to the per-cpu list.
1836 			 * Update the spill marker, which is used to remember the end of a
1837 			 * batch (remember, we prepend nodes), so that the batch can
1838 			 * eventually be returned to the global list (we try to keep only
1839 			 * one PV_BATCH_SIZE worth of nodes in any single per-cpu list).
1840 			 */
1841 			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
1842 		} else if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE * 2) + 1)) {
1843 			/* Spill out excess PVEs to the global PVE array */
1844 			pv_entry_t *spill_head = pmap_cpu_data->pv_free.list->pve_next;
1845 			pv_entry_t *spill_tail = pmap_cpu_data->pv_free_spill_marker;
1846 			pmap_cpu_data->pv_free.list->pve_next = pmap_cpu_data->pv_free_spill_marker->pve_next;
1847 			spill_tail->pve_next = PV_ENTRY_NULL;
1848 			pmap_cpu_data->pv_free.count -= PV_BATCH_SIZE;
1849 			pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
1850 
1851 			if (__improbable(pv_free_array_give_batch(spill_head) != KERN_SUCCESS)) {
1852 				/**
1853 				 * This is extremely unlikely to happen, as it would imply that
1854 				 * we have (PV_FREE_ARRAY_SIZE * PV_BATCH_SIZE) PVEs sitting in
1855 				 * the global array. Just in case, push the excess down to the
1856 				 * kernel PVE free list.
1857 				 */
1858 				pv_list_kern_free(spill_head, spill_tail, PV_BATCH_SIZE);
1859 			}
1860 		}
1861 	}
1862 
1863 pv_list_free_done:
1864 #if !XNU_MONITOR
1865 	mp_enable_preemption();
1866 #endif /* !XNU_MONITOR */
1867 
1868 	return;
1869 }
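
/*
 * Editorial walkthrough of the spill logic above, with a hypothetical
 * PV_BATCH_SIZE of 4 (chosen only to keep the numbers small):
 *
 *     count 4 -> 5:  first entry of a second batch; pv_free_spill_marker is
 *                    pointed at the new list head to remember where this
 *                    batch will end (entries are prepended).
 *     count 8 -> 9:  (PV_BATCH_SIZE * 2) + 1; the four entries from
 *                    list->pve_next down to the spill marker form a complete
 *                    batch, which is unlinked and handed to
 *                    pv_free_array_give_batch(), dropping count back to 5.
 *
 * The steady state therefore keeps roughly one batch on each per-cpu list,
 * with whole batches exchanged with the global ring.
 */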
1870 
1871 /**
1872  * Adds a single page to the PVE allocation subsystem.
1873  *
1874  * @note This function operates under the assumption that PV_BATCH_SIZE PVEs
1875  *       can fit within a single page. One page is always allocated for one
1876  *       batch, so if there's empty space in the page after the batch of PVEs,
1877  *       it'll go unused (so it's best to keep the batch size at an amount
1878  *       that utilizes a whole page).
1879  *
1880  * @param alloc_flags Allocation flags passed to pmap_pages_alloc_zeroed(). See
1881  *                    the definition of that function for a detailed description
1882  *                    of the available flags.
1883  *
1884  * @return KERN_SUCCESS, or the value returned by pmap_pages_alloc_zeroed() upon
1885  *         failure.
1886  */
1887 MARK_AS_PMAP_TEXT static kern_return_t
1888 pve_feed_page(unsigned alloc_flags)
1889 {
1890 	kern_return_t kr = KERN_FAILURE;
1891 
1892 	pv_entry_t *pve_head = PV_ENTRY_NULL;
1893 	pv_entry_t *pve_tail = PV_ENTRY_NULL;
1894 	pmap_paddr_t pa = 0;
1895 
1896 	kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags);
1897 
1898 	if (kr != KERN_SUCCESS) {
1899 		return kr;
1900 	}
1901 
1902 	/* Update statistics globals. See the variables' definitions for more info. */
1903 	pv_page_count++;
1904 	pmap_reserve_replenish_stat += PV_BATCH_SIZE;
1905 
1906 	/* Prepare a new list by linking all of the entries in advance. */
1907 	pve_head = (pv_entry_t *)phystokv(pa);
1908 	pve_tail = &pve_head[PV_BATCH_SIZE - 1];
1909 
1910 	for (int i = 0; i < PV_BATCH_SIZE; i++) {
1911 		pve_head[i].pve_next = &pve_head[i + 1];
1912 	}
1913 	pve_head[PV_BATCH_SIZE - 1].pve_next = PV_ENTRY_NULL;
1914 
1915 	/**
1916 	 * Add the new list to the kernel PVE free list if we are running low on
1917 	 * kernel-dedicated entries or the global free array is full.
1918 	 */
1919 	if ((pv_kern_free.count < pv_kern_low_water_mark) ||
1920 	    (pv_free_array_give_batch(pve_head) != KERN_SUCCESS)) {
1921 		pv_list_kern_free(pve_head, pve_tail, PV_BATCH_SIZE);
1922 	}
1923 
1924 	return KERN_SUCCESS;
1925 }
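
/*
 * Editorial note: the loop above carves the freshly zeroed page into exactly
 * PV_BATCH_SIZE pv_entry_t objects linked head to tail, which only works if
 *
 *     PV_BATCH_SIZE <= PAGE_SIZE / sizeof(pv_entry_t)
 *
 * (the assumption called out in the function header); any remainder of the
 * page past the batch is left unused.
 */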
1926 
1927 /**
1928  * Allocate a PV node from one of many different free lists (per-cpu, global, or
1929  * kernel-specific).
1930  *
1931  * @note This function is very tightly coupled with pmap_enter_pv(). If
1932  *       modifying this code, please ensure that pmap_enter_pv() doesn't break.
1933  *
1934  * @note The pmap lock must already be held if the new mapping is a CPU mapping.
1935  *
1936  * @note The PVH lock for the physical page that is getting a new mapping
1937  *       registered must already be held.
1938  *
1939  * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
1940  *             an IOMMU translation.
1941  * @param pai The physical address index of the page that's getting a new
1942  *            mapping.
1943  * @param lock_mode Which state the pmap lock is being held in if the mapping is
1944  *                  owned by a pmap, otherwise this is a don't care.
1945  * @param options PMAP_OPTIONS_* family of options passed from the caller.
1946  * @param pvepp Output parameter that will get updated with a pointer to the
1947  *              allocated node if none of the free lists are empty, or a pointer
1948  *              to NULL otherwise. This pointer can't already be pointing to a
1949  *              valid entry before allocation.
1950  *
1951  * @return These are the possible return values:
1952  *     PV_ALLOC_SUCCESS: A PVE object was successfully allocated.
1953  *     PV_ALLOC_FAIL: No objects were available for allocation, and
1954  *                    allocating a new page failed. On PPL-enabled systems,
1955  *                    a fresh page needs to be added to the PPL page list
1956  *                    before retrying this operation.
1957  *     PV_ALLOC_RETRY: No objects were available on the free lists, so a new
1958  *                     page of PVE objects needed to be allocated. To do that,
1959  *                     the pmap and PVH locks were dropped. The caller may have
1960  *                     depended on these locks for consistency, so return and
1961  *                     let the caller retry the PVE allocation with the locks
1962  *                     held. Note that the locks have already been re-acquired
1963  *                     before this function exits.
1964  */
1965 MARK_AS_PMAP_TEXT pv_alloc_return_t
1966 pv_alloc(
1967 	pmap_t pmap,
1968 	unsigned int pai,
1969 	pmap_lock_mode_t lock_mode,
1970 	unsigned int options,
1971 	pv_entry_t **pvepp)
1972 {
1973 	assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1974 
1975 	if (pmap != NULL) {
1976 		pmap_assert_locked(pmap, lock_mode);
1977 	}
1978 	pvh_assert_locked(pai);
1979 
1980 	pv_list_alloc(pvepp);
1981 	if (PV_ENTRY_NULL != *pvepp) {
1982 		return PV_ALLOC_SUCCESS;
1983 	}
1984 
1985 #if XNU_MONITOR
1986 	/* PPL can't block so this flag is always required. */
1987 	unsigned alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
1988 #else /* XNU_MONITOR */
1989 	unsigned alloc_flags = 0;
1990 #endif /* XNU_MONITOR */
1991 
1992 	/**
1993 	 * We got here because both the per-CPU and the global lists are empty. If
1994 	 * this allocation is for the kernel pmap or an IOMMU kernel driver, we try
1995 	 * to get an entry from the kernel list next.
1996 	 */
1997 	if ((pmap == NULL) || (kernel_pmap == pmap)) {
1998 		pv_list_kern_alloc(pvepp);
1999 		if (PV_ENTRY_NULL != *pvepp) {
2000 			return PV_ALLOC_SUCCESS;
2001 		}
2002 		/**
2003 		 * If the pmap is NULL, this is an allocation outside the normal pmap path,
2004 		 * most likely an IOMMU allocation. We therefore don't know what other locks
2005 		 * this path may hold or what timing constraints it may have, so we should avoid
2006 		 * a potentially expensive call to pmap_page_reclaim() on this path.
2007 		 */
2008 		if (pmap == NULL) {
2009 			alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
2010 		} else {
2011 			alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT | PMAP_PAGE_RECLAIM_NOWAIT;
2012 		}
2013 	}
2014 
2015 	/**
2016 	 * Make sure we have PMAP_PAGES_ALLOCATE_NOWAIT set in alloc_flags when the
2017 	 * input options argument has PMAP_OPTIONS_NOWAIT set.
2018 	 */
2019 	alloc_flags |= (options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
2020 
2021 	/**
2022 	 * We ran out of PV entries all across the board, or this allocation is not
2023 	 * for the kernel. Let's make sure that the kernel list is not too full
2024 	 * (very unlikely), in which case we can rebalance here.
2025 	 */
2026 	if (__improbable(pv_kern_free.count > (PV_BATCH_SIZE * 2))) {
2027 		pmap_simple_lock(&pv_kern_free_list_lock);
2028 		/* Re-check, now that the lock is held. */
2029 		if (pv_kern_free.count > (PV_BATCH_SIZE * 2)) {
2030 			pv_entry_t *pve_head = pv_kern_free.list;
2031 			pv_entry_t *pve_tail = pve_head;
2032 
2033 			for (int i = 0; i < (PV_BATCH_SIZE - 1); i++) {
2034 				pve_tail = pve_tail->pve_next;
2035 			}
2036 
2037 			pv_kern_free.list = pve_tail->pve_next;
2038 			pv_kern_free.count -= PV_BATCH_SIZE;
2039 			pve_tail->pve_next = PV_ENTRY_NULL;
2040 			pmap_simple_unlock(&pv_kern_free_list_lock);
2041 
2042 			/* Return back every node except the first one to the free lists. */
2043 			pv_list_free(pve_head->pve_next, pve_tail, PV_BATCH_SIZE - 1);
2044 			pve_head->pve_next = PV_ENTRY_NULL;
2045 			*pvepp = pve_head;
2046 			return PV_ALLOC_SUCCESS;
2047 		}
2048 		pmap_simple_unlock(&pv_kern_free_list_lock);
2049 	}
2050 
2051 	/**
2052 	 * If all else fails, try to get a new pmap page so that the allocation
2053 	 * succeeds once the caller retries it.
2054 	 */
2055 	kern_return_t kr = KERN_FAILURE;
2056 	pv_alloc_return_t pv_status = PV_ALLOC_FAIL;
2057 
2058 	/* Drop the lock during page allocation since that can take a while. */
2059 	pvh_unlock(pai);
2060 	if (pmap != NULL) {
2061 		pmap_unlock(pmap, lock_mode);
2062 	}
2063 
2064 	if ((kr = pve_feed_page(alloc_flags)) == KERN_SUCCESS) {
2065 		/**
2066 		 * Since the lock was dropped, even though we successfully allocated a
2067 		 * new page to be used for PVE nodes, the code that relies on this
2068 		 * function might have depended on the lock being held for consistency,
2069 		 * so return out early and let them retry the allocation with the lock
2070 		 * re-held.
2071 		 */
2072 		pv_status = PV_ALLOC_RETRY;
2073 	} else {
2074 		pv_status = PV_ALLOC_FAIL;
2075 	}
2076 
2077 	if (pmap != NULL) {
2078 		pmap_lock(pmap, lock_mode);
2079 	}
2080 	pvh_lock(pai);
2081 
2082 	/* Ensure that no node was created if we're not returning successfully. */
2083 	assert(*pvepp == PV_ENTRY_NULL);
2084 
2085 	return pv_status;
2086 }
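
/*
 * Editorial sketch of the retry contract described in the header above, for a
 * hypothetical caller that holds the pmap and PVH locks and can safely
 * revalidate its state:
 *
 *     pv_entry_t *pvep = PV_ENTRY_NULL;
 *     pv_alloc_return_t ret;
 *     while ((ret = pv_alloc(pmap, pai, lock_mode, options, &pvep)) ==
 *         PV_ALLOC_RETRY) {
 *         // The locks were dropped and re-acquired inside pv_alloc(), so
 *         // revalidate anything derived from them before retrying.
 *     }
 *     if (ret == PV_ALLOC_FAIL) {
 *         // Out of memory; on PPL systems, exit the PPL and refill first.
 *     }
 */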
2087 
2088 /**
2089  * Utility function for freeing a single PVE object back to the free lists.
2090  *
2091  * @param pvep Pointer to the PVE object to free.
2092  */
2093 MARK_AS_PMAP_TEXT void
2094 pv_free(pv_entry_t *pvep)
2095 {
2096 	assert(pvep != PV_ENTRY_NULL);
2097 
2098 	pv_list_free(pvep, pvep, 1);
2099 }
2100 
2101 /**
2102  * This function provides a mechanism for the device tree to override the
2103  * default PV allocation amounts and the watermark level which determines how
2104  * many PVE objects are kept in the kernel-dedicated free list.
2105  */
2106 MARK_AS_PMAP_TEXT void
2107 pmap_compute_pv_targets(void)
2108 {
2109 	DTEntry entry = NULL;
2110 	void const *prop = NULL;
2111 	int err = 0;
2112 	unsigned int prop_size = 0;
2113 
2114 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2115 	assert(err == kSuccess);
2116 
2117 	if (kSuccess == SecureDTGetProperty(entry, "pmap-pv-count", &prop, &prop_size)) {
2118 		if (prop_size != sizeof(pv_alloc_initial_target)) {
2119 			panic("pmap-pv-count property is not a 32-bit integer");
2120 		}
2121 		pv_alloc_initial_target = *((uint32_t const *)prop);
2122 	}
2123 
2124 	if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-count", &prop, &prop_size)) {
2125 		if (prop_size != sizeof(pv_kern_alloc_initial_target)) {
2126 			panic("pmap-kern-pv-count property is not a 32-bit integer");
2127 		}
2128 		pv_kern_alloc_initial_target = *((uint32_t const *)prop);
2129 	}
2130 
2131 	if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-min", &prop, &prop_size)) {
2132 		if (prop_size != sizeof(pv_kern_low_water_mark)) {
2133 			panic("pmap-kern-pv-min property is not a 32-bit integer");
2134 		}
2135 		pv_kern_low_water_mark = *((uint32_t const *)prop);
2136 	}
2137 }
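
/*
 * Editorial example: the overrides above come from 32-bit properties on the
 * /defaults device tree node, conceptually something like (values invented
 * for illustration; absent properties leave the compiled-in defaults alone):
 *
 *     /defaults {
 *         pmap-pv-count      = <0x00040000>;  // initial general PVE target
 *         pmap-kern-pv-count = <0x00002000>;  // initial kernel PVE target
 *         pmap-kern-pv-min   = <0x00000100>;  // kernel free-list low water mark
 *     };
 */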
2138 
2139 /**
2140  * This would normally be used to adjust the number of PVE objects available in
2141  * the system, but we do that dynamically at runtime anyway, so this is unneeded.
2142  */
2143 void
2144 mapping_adjust(void)
2145 {
2146 	/* Not implemented for arm/arm64. */
2147 }
2148 
2149 /**
2150  * Creates a target number of free pv_entry_t objects for the kernel free list
2151  * and the general free list.
2152  *
2153  * @note This function is called once during early boot, in kernel_bootstrap().
2154  *
2155  * @return KERN_SUCCESS if the objects were successfully allocated, or the
2156  *         return value from pve_feed_page() on failure (could be caused by not
2157  *         being able to allocate a page).
2158  */
2159 MARK_AS_PMAP_TEXT kern_return_t
2160 mapping_free_prime_internal(void)
2161 {
2162 	kern_return_t kr = KERN_FAILURE;
2163 
2164 #if XNU_MONITOR
2165 	/* PPL can't block so this flag is always required. */
2166 	unsigned alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
2167 #else /* XNU_MONITOR */
2168 	unsigned alloc_flags = 0;
2169 #endif /* XNU_MONITOR */
2170 
2171 	/*
2172 	 * We do not need to hold the pv_free_array lock to calculate the number of
2173 	 * elements in it because no other core is running at this point.
2174 	 */
2175 	while (((pv_free_array_n_elems() * PV_BATCH_SIZE) < pv_alloc_initial_target) ||
2176 	    (pv_kern_free.count < pv_kern_alloc_initial_target)) {
2177 		if ((kr = pve_feed_page(alloc_flags)) != KERN_SUCCESS) {
2178 			return kr;
2179 		}
2180 	}
2181 
2182 	return KERN_SUCCESS;
2183 }
2184 
2185 /**
2186  * Helper function for pmap_enter_pv() (hereby shortened to "pepv") that
2187  * converts a PVH entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP, transforming the
2188  * entry into a linked list of mappings.
2189  *
2190  * @note This should only be called from pmap_enter_pv().
2191  *
2192  * @note The PVH lock for the passed in page must already be held and the type
2193  *       must be PVH_TYPE_PTEP (wouldn't make sense to call this otherwise).
2194  *
2195  * @param pmap Either the pmap that owns the mapping being registered in
2196  *             pmap_enter_pv(), or NULL if this is an IOMMU mapping.
2197  * @param pai The physical address index of the page that's getting a second
2198  *            mapping and needs to be converted from PVH_TYPE_PTEP to
2199  *            PVH_TYPE_PVEP.
2200  * @param lock_mode Which state the pmap lock is being held in if the mapping is
2201  *                  owned by a pmap, otherwise this is a don't care.
2202  * @param options PMAP_OPTIONS_* family of options.
2203  *
2204  * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully converted
2205  *         into PVH_TYPE_PVEP, or the return value of pv_alloc() otherwise. See
2206  *         pv_alloc()'s function header for a detailed explanation of the
2207  *         possible return values.
2208  */
2209 MARK_AS_PMAP_TEXT static pv_alloc_return_t
2210 pepv_convert_ptep_to_pvep(
2211 	pmap_t pmap,
2212 	unsigned int pai,
2213 	pmap_lock_mode_t lock_mode,
2214 	unsigned int options)
2215 {
2216 	pvh_assert_locked(pai);
2217 
2218 	pv_entry_t **pvh = pai_to_pvh(pai);
2219 	assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
2220 
2221 	pv_entry_t *pvep = PV_ENTRY_NULL;
2222 	pv_alloc_return_t ret = pv_alloc(pmap, pai, lock_mode, options, &pvep);
2223 	if (ret != PV_ALLOC_SUCCESS) {
2224 		return ret;
2225 	}
2226 
2227 	/* If we've gotten this far then a node should've been allocated. */
2228 	assert(pvep != PV_ENTRY_NULL);
2229 
2230 	/* The new PVE should have the same PTE pointer as the previous PVH entry. */
2231 	pve_init(pvep);
2232 	pve_set_ptep(pvep, 0, pvh_ptep(pvh));
2233 
2234 	assert(!pve_get_internal(pvep, 0));
2235 	assert(!pve_get_altacct(pvep, 0));
2236 	if (ppattr_is_internal(pai)) {
2237 		/**
2238 		 * Transfer "internal" status from pp_attr to this pve. See the comment
2239 		 * above PP_ATTR_INTERNAL for more information on this.
2240 		 */
2241 		ppattr_clear_internal(pai);
2242 		pve_set_internal(pvep, 0);
2243 	}
2244 	if (ppattr_is_altacct(pai)) {
2245 		/**
2246 		 * Transfer "altacct" status from pp_attr to this pve. See the comment
2247 		 * above PP_ATTR_ALTACCT for more information on this.
2248 		 */
2249 		ppattr_clear_altacct(pai);
2250 		pve_set_altacct(pvep, 0);
2251 	}
2252 
2253 	pvh_update_head(pvh, pvep, PVH_TYPE_PVEP);
2254 
2255 	return PV_ALLOC_SUCCESS;
2256 }
2257 
2258 /**
2259  * Register a new mapping into the pv_head_table. This is the main data
2260  * structure used for performing a reverse physical to virtual translation and
2261  * finding all mappings to a physical page. Whenever a new page table mapping is
2262  * created (regardless of whether it's for a CPU or an IOMMU), it should be
2263  * registered with a call to this function.
2264  *
2265  * @note The pmap lock must already be held if the new mapping is a CPU mapping.
2266  *
2267  * @note The PVH lock for the physical page that is getting a new mapping
2268  *       registered must already be held.
2269  *
2270  * @note This function cannot be called during the hibernation process because
2271  *       it modifies critical pmap data structures that need to be dumped into
2272  *       the hibernation image in a consistent state.
2273  *
2274  * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
2275  *             an IOMMU translation.
2276  * @param ptep The new mapping to register.
2277  * @param pai The physical address index of the physical page being mapped by
2278  *            `ptep`.
2279  * @param options Flags that can potentially be set on a per-page basis:
2280  *                PMAP_OPTIONS_INTERNAL: If this is the first CPU mapping, then
2281  *                    mark the page as being "internal". See the definition of
2282  *                    PP_ATTR_INTERNAL for more info.
2283  *                PMAP_OPTIONS_REUSABLE: If this is the first CPU mapping, and
2284  *                    this page is also marked internal, then mark the page as
2285  *                    being "reusable". See the definition of PP_ATTR_REUSABLE
2286  *                    for more info.
2287  * @param lock_mode Which state the pmap lock is being held in if the mapping is
2288  *                  owned by a pmap, otherwise this is a don't care.
2289  * @param new_pvepp An output parameter that is updated with a pointer to the
2290  *                  PVE object where the PTEP was allocated into. In the event
2291  *                  of failure, or if the pointer passed in is NULL,
2292  *                  it's not modified.
2293  * @param new_pve_ptep_idx An output parameter that is updated with the index
2294  *                  into the PVE object where the PTEP was allocated into.
2295  *                  In the event of failure, or if new_pvepp in is NULL,
2296  *                  it's not modified.
2297  *
2298  * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully updated with
2299  *         the new mapping, or the return value of pv_alloc() otherwise. See
2300  *         pv_alloc()'s function header for a detailed explanation of the
2301  *         possible return values.
2302  */
2303 MARK_AS_PMAP_TEXT pv_alloc_return_t
2304 pmap_enter_pv(
2305 	pmap_t pmap,
2306 	pt_entry_t *ptep,
2307 	int pai,
2308 	unsigned int options,
2309 	pmap_lock_mode_t lock_mode,
2310 	pv_entry_t **new_pvepp,
2311 	int *new_pve_ptep_idx)
2312 {
2313 	assert(ptep != PT_ENTRY_NULL);
2314 
2315 	pv_entry_t **pvh = pai_to_pvh(pai);
2316 	bool first_cpu_mapping = false;
2317 
2318 	ASSERT_NOT_HIBERNATING();
2319 	pvh_assert_locked(pai);
2320 
2321 	if (pmap != NULL) {
2322 		pmap_assert_locked(pmap, lock_mode);
2323 	}
2324 
2325 	vm_offset_t pvh_flags = pvh_get_flags(pvh);
2326 
2327 #if XNU_MONITOR
2328 	if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2329 		panic("%d is locked down (%#lx), cannot enter", pai, pvh_flags);
2330 	}
2331 #endif /* XNU_MONITOR */
2332 
2333 
2334 #ifdef PVH_FLAG_CPU
2335 	/**
2336 	 * An IOMMU mapping may already be present for a page that hasn't yet had a
2337 	 * CPU mapping established, so we use PVH_FLAG_CPU to determine if this is
2338 	 * the first CPU mapping. We base internal/reusable accounting on the
2339 	 * options specified for the first CPU mapping. PVH_FLAG_CPU, and thus this
2340 	 * accounting, will then persist as long as there are *any* mappings of the
2341 	 * page. The accounting for a page should not need to change until the page
2342 	 * is recycled by the VM layer, and we assert that there are no mappings
2343 	 * when a page is recycled. An IOMMU mapping of a freed/recycled page is
2344 	 * considered a security violation & potential DMA corruption path.
2345 	 */
2346 	first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU));
2347 	if (first_cpu_mapping) {
2348 		pvh_flags |= PVH_FLAG_CPU;
2349 	}
2350 #else /* PVH_FLAG_CPU */
2351 	first_cpu_mapping = pvh_test_type(pvh, PVH_TYPE_NULL);
2352 #endif /* PVH_FLAG_CPU */
2353 
2354 	/**
2355 	 * Internal/reusable flags are based on the first CPU mapping made to a
2356 	 * page. These will persist until all mappings to the page are removed.
2357 	 */
2358 	if (first_cpu_mapping) {
2359 		if ((options & PMAP_OPTIONS_INTERNAL) &&
2360 		    (options & PMAP_OPTIONS_REUSABLE)) {
2361 			ppattr_set_reusable(pai);
2362 		} else {
2363 			ppattr_clear_reusable(pai);
2364 		}
2365 	}
2366 
2367 	/* Visit the definitions for the PVH_TYPEs to learn more about each one. */
2368 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
2369 		/* If this is the first mapping, upgrade the type to store a single PTEP. */
2370 		pvh_update_head(pvh, ptep, PVH_TYPE_PTEP);
2371 	} else {
2372 		pv_alloc_return_t ret = PV_ALLOC_FAIL;
2373 
2374 		if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2375 			/**
2376 			 * There was already a single mapping to the page. Convert the PVH
2377 			 * entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP so that multiple
2378 			 * mappings can be tracked. If PVEs cannot hold more than a single
2379 			 * mapping, a second PVE will be added farther down.
2380 			 *
2381 			 * Also, ensure that the PVH flags (which can possibly contain
2382 			 * PVH_FLAG_CPU) are set before potentially returning or dropping
2383 			 * the locks. We use that flag to lock in the internal/reusable
2384 			 * attributes and we don't want another mapping to jump in while the
2385 			 * locks are dropped, think it's the first CPU mapping, and decide
2386 			 * to clobber those attributes.
2387 			 */
2388 			pvh_set_flags(pvh, pvh_flags);
2389 			if ((ret = pepv_convert_ptep_to_pvep(pmap, pai, lock_mode, options)) != PV_ALLOC_SUCCESS) {
2390 				return ret;
2391 			}
2392 
2393 			/**
2394 			 * At this point, the PVH flags have been clobbered due to updating
2395 			 * PTEP->PVEP, but that's ok because the locks are being held and
2396 			 * the flags will get set again below before pv_alloc() is called
2397 			 * and the locks are potentially dropped again.
2398 			 */
2399 		} else if (!pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2400 			panic("%s: unexpected PV head %p, ptep=%p pmap=%p pvh=%p",
2401 			    __func__, *pvh, ptep, pmap, pvh);
2402 		}
2403 
2404 		/**
2405 		 * Check if we have room for one more mapping in this PVE.
2406 		 */
2407 		pv_entry_t *pvep = pvh_pve_list(pvh);
2408 		assert(pvep != PV_ENTRY_NULL);
2409 
2410 		int pve_ptep_idx = pve_find_ptep_index(pvep, PT_ENTRY_NULL);
2411 
2412 		if (pve_ptep_idx == -1) {
2413 			/**
2414 			 * Set up the pv_entry for this new mapping and then add it to the list
2415 			 * for this physical page.
2416 			 */
2417 			pve_ptep_idx = 0;
2418 			pvh_set_flags(pvh, pvh_flags);
2419 			pvep = PV_ENTRY_NULL;
2420 			if ((ret = pv_alloc(pmap, pai, lock_mode, options, &pvep)) != PV_ALLOC_SUCCESS) {
2421 				return ret;
2422 			}
2423 
2424 			/* If we've gotten this far then a node should've been allocated. */
2425 			assert(pvep != PV_ENTRY_NULL);
2426 			pve_init(pvep);
2427 			pve_add(pvh, pvep);
2428 		}
2429 
2430 		pve_set_ptep(pvep, pve_ptep_idx, ptep);
2431 
2432 		/*
2433 		 * The PTEP was successfully entered into the PVE object.
2434 		 * If the caller requests it, set new_pvepp and new_pve_ptep_idx
2435 		 * appropriately.
2436 		 */
2437 		if (new_pvepp != NULL) {
2438 			*new_pvepp = pvep;
2439 			*new_pve_ptep_idx = pve_ptep_idx;
2440 		}
2441 	}
2442 
2443 	pvh_set_flags(pvh, pvh_flags);
2444 
2445 	return PV_ALLOC_SUCCESS;
2446 }
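
/*
 * Editorial sketch: how a hypothetical pmap_enter()-style path might consume
 * the return value of pmap_enter_pv(), given the retry semantics inherited
 * from pv_alloc():
 *
 *     pv_entry_t *new_pvep = PV_ENTRY_NULL;
 *     int new_pve_ptep_idx = 0;
 *     pv_alloc_return_t ret = pmap_enter_pv(pmap, ptep, pai, options,
 *         lock_mode, &new_pvep, &new_pve_ptep_idx);
 *     if (ret == PV_ALLOC_RETRY) {
 *         // Locks were dropped and re-taken: other mappings may have come or
 *         // gone, so restart the enter operation from the top.
 *     } else if (ret == PV_ALLOC_FAIL) {
 *         // Resource shortage; on PPL systems, refill the PPL page free
 *         // list outside the PPL before trying again.
 *     }
 */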
2447 
2448 /**
2449  * Remove a mapping that was registered with the pv_head_table. This needs to be
2450  * done for every mapping that was previously registered using pmap_enter_pv()
2451  * when the mapping is removed.
2452  *
2453  * @note The PVH lock for the physical page whose mapping is being removed
2454  *       must already be held.
2455  *
2456  * @note This function cannot be called during the hibernation process because
2457  *       it modifies critical pmap data structures that need to be dumped into
2458  *       the hibernation image in a consistent state.
2459  *
2460  * @param pmap The pmap that owns the mapping being removed, or NULL if this
2461  *             is tracking an IOMMU translation.
2462  * @param ptep The mapping that's getting removed.
2463  * @param pai The physical address index of the physical page being mapped by
2464  *            `ptep`.
2465  * @param flush_tlb_async On some systems, removing the last mapping to a page
2466  *                        that used to be mapped executable will require
2467  *                        updating the physical aperture mapping of the page.
2468  *                        This parameter specifies whether the TLB invalidate
2469  *                        should be synchronized or not if that update occurs.
2470  * @param is_internal_p The internal bit of the PTE that was removed.
2471  * @param is_altacct_p The altacct bit of the PTE that was removed.
2472  */
2473 void
2474 pmap_remove_pv(
2475 	pmap_t pmap,
2476 	pt_entry_t *ptep,
2477 	int pai,
2478 	bool flush_tlb_async __unused,
2479 	bool *is_internal_p,
2480 	bool *is_altacct_p)
2481 {
2482 	ASSERT_NOT_HIBERNATING();
2483 	pvh_assert_locked(pai);
2484 
2485 	bool is_internal = false;
2486 	bool is_altacct = false;
2487 	pv_entry_t **pvh = pai_to_pvh(pai);
2488 	const vm_offset_t pvh_flags = pvh_get_flags(pvh);
2489 
2490 #if XNU_MONITOR
2491 	if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2492 		panic("%s: PVH entry at pai %d is locked down (%#lx), cannot remove",
2493 		    __func__, pai, pvh_flags);
2494 	}
2495 #endif /* XNU_MONITOR */
2496 
2497 
2498 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2499 		if (__improbable((ptep != pvh_ptep(pvh)))) {
2500 			/**
2501 			 * The only mapping that exists for this page isn't the one we're
2502 			 * unmapping, weird.
2503 			 */
2504 			panic("%s: ptep=%p does not match pvh=%p (%p), pai=0x%x",
2505 			    __func__, ptep, pvh, pvh_ptep(pvh), pai);
2506 		}
2507 
2508 		pvh_update_head(pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
2509 		is_internal = ppattr_is_internal(pai);
2510 		is_altacct = ppattr_is_altacct(pai);
2511 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2512 		pv_entry_t **pvepp = pvh;
2513 		pv_entry_t *pvep = pvh_pve_list(pvh);
2514 		assert(pvep != PV_ENTRY_NULL);
2515 		int pve_pte_idx = 0;
2516 		/* Find the PVE that represents the mapping we're removing. */
2517 		while ((pvep != PV_ENTRY_NULL) && ((pve_pte_idx = pve_find_ptep_index(pvep, ptep)) == -1)) {
2518 			pvepp = pve_next_ptr(pvep);
2519 			pvep = pve_next(pvep);
2520 		}
2521 
2522 		if (__improbable((pvep == PV_ENTRY_NULL))) {
2523 			panic("%s: ptep=%p (pai=0x%x) not in pvh=%p", __func__, ptep, pai, pvh);
2524 		}
2525 
2526 		is_internal = pve_get_internal(pvep, pve_pte_idx);
2527 		is_altacct = pve_get_altacct(pvep, pve_pte_idx);
2528 		pve_set_ptep(pvep, pve_pte_idx, PT_ENTRY_NULL);
2529 
2530 #if MACH_ASSERT
2531 		/**
2532 		 * Ensure that the mapping didn't accidentally have multiple PVEs
2533 		 * associated with it (there should only be one PVE per mapping). This
2534 		 * checking only occurs on configurations that can accept the perf hit
2535 		 * that walking the PVE chain on every unmap entails.
2536 		 *
2537 		 * This is skipped for IOMMU mappings because some IOMMUs don't use
2538 		 * normal page tables (e.g., NVMe) to map pages, so the `ptep` field in
2539 		 * the associated PVE won't actually point to a real page table (see the
2540 		 * definition of PVH_FLAG_IOMMU_TABLE for more info). Because of that,
2541 		 * it's perfectly possible for duplicate IOMMU PVEs to exist.
2542 		 */
2543 		if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
2544 			pv_entry_t *check_pvep = pvep;
2545 
2546 			do {
2547 				if (pve_find_ptep_index(check_pvep, ptep) != -1) {
2548 					panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
2549 					    "pvep=%p, pai=0x%x", __func__, ptep, pmap, pvh, pvep, pai);
2550 				}
2551 			} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
2552 		}
2553 #endif /* MACH_ASSERT */
2554 
2555 		const bool pve_is_first = (pvepp == pvh);
2556 		const bool pve_is_last = (pve_next(pvep) == PV_ENTRY_NULL);
2557 		const int other_pte_idx = !pve_pte_idx;
2558 
2559 		if (pve_is_empty(pvep)) {
2560 			/*
2561 			 * This PVE doesn't contain any mappings. We can get rid of it.
2562 			 */
2563 			pve_remove(pvh, pvepp, pvep);
2564 			pv_free(pvep);
2565 		} else if (!pve_is_first) {
2566 			/*
2567 			 * This PVE contains a single mapping. See if we can coalesce it with the one
2568 			 * at the top of the list.
2569 			 */
2570 			pv_entry_t *head_pvep = pvh_pve_list(pvh);
2571 			int head_pve_pte_empty_idx;
2572 			if ((head_pve_pte_empty_idx = pve_find_ptep_index(head_pvep, PT_ENTRY_NULL)) != -1) {
2573 				pve_set_ptep(head_pvep, head_pve_pte_empty_idx, pve_get_ptep(pvep, other_pte_idx));
2574 				if (pve_get_internal(pvep, other_pte_idx)) {
2575 					pve_set_internal(head_pvep, head_pve_pte_empty_idx);
2576 				}
2577 				if (pve_get_altacct(pvep, other_pte_idx)) {
2578 					pve_set_altacct(head_pvep, head_pve_pte_empty_idx);
2579 				}
2580 				pve_remove(pvh, pvepp, pvep);
2581 				pv_free(pvep);
2582 			} else {
2583 				/*
2584 				 * We could not coalesce it. Move it to the start of the list, so that it
2585 				 * can be coalesced against in the future.
2586 				 */
2587 				*pvepp = pve_next(pvep);
2588 				pve_add(pvh, pvep);
2589 			}
2590 		} else if (pve_is_first && pve_is_last) {
2591 			/*
2592 			 * This PVE contains a single mapping, and it's the last mapping for this PAI.
2593 			 * Collapse this list back into the head, turning it into a PVH_TYPE_PTEP entry.
2594 			 */
2595 			pve_remove(pvh, pvepp, pvep);
2596 			pvh_update_head(pvh, pve_get_ptep(pvep, other_pte_idx), PVH_TYPE_PTEP);
2597 			if (pve_get_internal(pvep, other_pte_idx)) {
2598 				ppattr_set_internal(pai);
2599 			}
2600 			if (pve_get_altacct(pvep, other_pte_idx)) {
2601 				ppattr_set_altacct(pai);
2602 			}
2603 			pv_free(pvep);
2604 		}
2605 
2606 		/**
2607 		 * Removing a PVE entry can clobber the PVH flags if the head itself is
2608 		 * updated (when removing the first PVE in the list) so let's re-set the
2609 		 * flags back to what they should be.
2610 		 */
2611 		if (!pvh_test_type(pvh, PVH_TYPE_NULL)) {
2612 			pvh_set_flags(pvh, pvh_flags);
2613 		}
2614 	} else {
2615 		panic("%s: unexpected PV head %p, ptep=%p pmap=%p pvh=%p pai=0x%x",
2616 		    __func__, *pvh, ptep, pmap, pvh, pai);
2617 	}
2618 
2619 #ifdef PVH_FLAG_EXEC
2620 	/**
2621 	 * If we're on a system that has extra protections around executable pages,
2622 	 * then removing the last mapping to an executable page means we need to
2623 	 * give write-access back to the physical aperture mapping of this page
2624 	 * (write access is removed when a page is executable for security reasons).
2625 	 */
2626 	if ((pvh_flags & PVH_FLAG_EXEC) && pvh_test_type(pvh, PVH_TYPE_NULL)) {
2627 		pmap_set_ptov_ap(pai, AP_RWNA, flush_tlb_async);
2628 	}
2629 #endif /* PVH_FLAG_EXEC */
2630 	if (__improbable((pvh_flags & PVH_FLAG_FLUSH_NEEDED) && pvh_test_type(pvh, PVH_TYPE_NULL))) {
2631 		pmap_flush_noncoherent_page((pmap_paddr_t)ptoa(pai) + vm_first_phys);
2632 	}
2633 
2634 	*is_internal_p = is_internal;
2635 	*is_altacct_p = is_altacct;
2636 }
2637 
2638 /**
2639  * Bootstrap the initial Page Table Descriptor (PTD) node free list.
2640  *
2641  * @note It's not safe to allocate PTD nodes until after this function is
2642  *       invoked.
2643  *
2644  * @note The maximum number of PTD objects that can reside within one page
2645  *       (`ptd_per_page`) must have already been calculated before calling this
2646  *       function.
2647  *
2648  * @param ptdp Pointer to the virtually-contiguous memory used for the initial
2649  *             free list.
2650  * @param num_pages The number of virtually-contiguous pages pointed to by
2651  *                  `ptdp` that will be used to prime the PTD allocator.
2652  */
2653 MARK_AS_PMAP_TEXT void
2654 ptd_bootstrap(pt_desc_t *ptdp, unsigned int num_pages)
2655 {
2656 	assert(ptd_per_page > 0);
2657 	assert((ptdp != NULL) && (((uintptr_t)ptdp & PAGE_MASK) == 0) && (num_pages > 0));
2658 
2659 	queue_init(&pt_page_list);
2660 
2661 	/**
2662 	 * The region represented by ptdp should be cleared by pmap_bootstrap().
2663 	 *
2664 	 * Only part of each page is used for PTD objects (the rest is used for
2665 	 * each PTD's associated ptd_info_t objects), so link the last PTD
2666 	 * element of each page to the first element of the previous page.
2667 	 */
2668 	for (int i = 0; i < num_pages; i++) {
2669 		*((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
2670 		ptd_free_list = ptdp;
2671 		ptdp = (void *)(((uint8_t *)ptdp) + PAGE_SIZE);
2672 	}
2673 
2674 	ptd_free_count = num_pages * ptd_per_page;
2675 	simple_lock_init(&ptd_free_list_lock, 0);
2676 }
2677 
2678 /**
2679  * Allocate a page table descriptor (PTD) object from the PTD free list, but
2680  * don't add it to the list of reclaimable userspace page table pages just yet
2681  * and don't associate the PTD with a specific pmap (that's what "unlinked"
2682  * means here).
2683  *
2684  * @note Until a page table's descriptor object is added to the page table list,
2685  *       that table won't be eligible for reclaiming by pmap_page_reclaim().
2686  *
2687  * @return The page table descriptor object if the allocation was successful, or
2688  *         NULL otherwise (which indicates that a page failed to be allocated
2689  *         for new nodes).
2690  */
2691 MARK_AS_PMAP_TEXT pt_desc_t*
2692 ptd_alloc_unlinked(void)
2693 {
2694 	pt_desc_t *ptdp = PTD_ENTRY_NULL;
2695 
2696 	pmap_simple_lock(&ptd_free_list_lock);
2697 
2698 	assert(ptd_per_page != 0);
2699 
2700 	/**
2701 	 * Ensure that we either have a free list with nodes available, or a
2702 	 * completely empty list to allocate and prepend new nodes to.
2703 	 */
2704 	assert(((ptd_free_list != NULL) && (ptd_free_count > 0)) ||
2705 	    ((ptd_free_list == NULL) && (ptd_free_count == 0)));
2706 
2707 	if (__improbable(ptd_free_count == 0)) {
2708 		pmap_paddr_t pa = 0;
2709 
2710 		/* Drop the lock while allocating pages since that can take a while. */
2711 		pmap_simple_unlock(&ptd_free_list_lock);
2712 
2713 		if (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT) != KERN_SUCCESS) {
2714 			return NULL;
2715 		}
2716 		ptdp = (pt_desc_t *)phystokv(pa);
2717 
2718 		pmap_simple_lock(&ptd_free_list_lock);
2719 
2720 		/**
2721 		 * Since the lock was dropped while allocating, it's possible another
2722 		 * CPU already allocated a page. To be safe, prepend the current free
2723 		 * list (which may or may not be empty now) to the page of nodes just
2724 		 * allocated and update the head to point to these new nodes.
2725 		 */
2726 		*((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
2727 		ptd_free_list = ptdp;
2728 		ptd_free_count += ptd_per_page;
2729 	}
2730 
2731 	/* There should be available nodes at this point. */
2732 	if (__improbable((ptd_free_count == 0) || (ptd_free_list == PTD_ENTRY_NULL))) {
2733 		panic_plain("%s: out of PTD entries and for some reason didn't "
2734 		    "allocate more %d %p", __func__, ptd_free_count, ptd_free_list);
2735 	}
2736 
2737 	/* Grab the top node off of the free list to return later. */
2738 	ptdp = ptd_free_list;
2739 
2740 	/**
2741 	 * Advance the free list to the next node.
2742 	 *
2743 	 * Each free pt_desc_t-sized object in this free list uses the first few
2744 	 * bytes of the object to point to the next object in the list. When an
2745 	 * object is deallocated (in ptd_deallocate()) the object is prepended onto
2746 	 * the free list by setting its first few bytes to point to the current free
2747 	 * list head. Then the head is updated to point to that object.
2748 	 *
2749 	 * When a new page is allocated for PTD nodes, it's left zeroed out. Once we
2750 	 * use up all of the previously deallocated nodes, the list will point
2751 	 * somewhere into the last allocated, empty page. We know we're pointing at
2752 	 * this page because the first few bytes of the object will be NULL. In
2753 	 * that case just set the head to this empty object.
2754 	 *
2755 	 * This empty page can be thought of as a "reserve" of empty nodes for the
2756 	 * case where more nodes are being allocated than there are nodes being
2757 	 * deallocated.
2758 	 */
2759 	pt_desc_t *const next_node = (pt_desc_t *)(*(void **)ptd_free_list);
2760 
2761 	/**
2762 	 * If the next node in the list is NULL but there are supposed to still be
2763 	 * nodes left, then we've hit the previously allocated empty page of nodes.
2764 	 * Go ahead and advance the free list to the next free node in that page.
2765 	 */
2766 	if ((next_node == PTD_ENTRY_NULL) && (ptd_free_count > 1)) {
2767 		ptd_free_list = ptd_free_list + 1;
2768 	} else {
2769 		ptd_free_list = next_node;
2770 	}
2771 
2772 	ptd_free_count--;
2773 
2774 	pmap_simple_unlock(&ptd_free_list_lock);
2775 
2776 	ptdp->pt_page.next = NULL;
2777 	ptdp->pt_page.prev = NULL;
2778 	ptdp->pmap = NULL;
2779 
2780 	/**
2781 	 * Calculate and stash the address of the ptd_info_t associated with this
2782 	 * PTD. This can be done easily because both structures co-exist in the same
2783 	 * page, with ptd_info_t's starting at a given offset from the start of the
2784 	 * page.
2785 	 *
2786 	 * Each PTD is associated with a ptd_info_t of the same index. For example,
2787 	 * the 15th PTD will use the 15th ptd_info_t in the same page.
2788 	 */
2789 	const unsigned ptd_index = ((uintptr_t)ptdp & PAGE_MASK) / sizeof(pt_desc_t);
2790 	assert(ptd_index < ptd_per_page);
2791 
2792 	const uintptr_t start_of_page = (uintptr_t)ptdp & ~PAGE_MASK;
2793 	ptd_info_t *first_ptd_info = (ptd_info_t *)(start_of_page + ptd_info_offset);
2794 	ptdp->ptd_info = &first_ptd_info[ptd_index * PT_INDEX_MAX];
2795 
2796 	/**
2797 	 * On systems where the VM page size doesn't match the hardware page size,
2798 	 * one PTD might have to manage multiple page tables.
2799 	 */
2800 	for (unsigned int i = 0; i < PT_INDEX_MAX; i++) {
2801 		ptdp->va[i] = (vm_offset_t)-1;
2802 		ptdp->ptd_info[i].refcnt = 0;
2803 		ptdp->ptd_info[i].wiredcnt = 0;
2804 	}
2805 
2806 	return ptdp;
2807 }
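/**
 * Illustrative sketch (not part of the build): the intrusive free-list
 * linkage used by ptd_alloc_unlinked() and ptd_deallocate(). Each free
 * pt_desc_t-sized slot stores the address of the next free slot in its first
 * pointer-sized bytes, so no separate list nodes are needed:
 *
 *     // Pop the head (ptd_free_list_lock held by the caller).
 *     void *head = free_list;
 *     free_list = *(void **)head;
 *
 *     // Push a slot back (ptd_free_list_lock held by the caller).
 *     *(void **)slot = free_list;
 *     free_list = slot;
 *
 * The one twist above is the zeroed "reserve" page: a NULL next pointer while
 * ptd_free_count > 1 means the remaining free slots are the contiguous,
 * still-zeroed tail of the most recently allocated page, so the head simply
 * advances to the adjacent slot instead of following a pointer.
 */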
2808 
2809 /**
2810  * Allocate a single page table descriptor (PTD) object, and if it's meant to
2811  * keep track of a userspace page table, then add that descriptor object to the
2812  * list of PTDs that can be reclaimed in pmap_page_reclaim().
2813  *
2814  * @param pmap The pmap object that will be owning the page table(s) that this
2815  *             descriptor object represents.
2816  *
2817  * @return The allocated PTD object, or NULL if the allocation failed
2818  *         (which indicates that no memory was available).
2819  */
2820 MARK_AS_PMAP_TEXT pt_desc_t*
2821 ptd_alloc(pmap_t pmap)
2822 {
2823 	pt_desc_t *ptdp = ptd_alloc_unlinked();
2824 
2825 	if (ptdp == NULL) {
2826 		return NULL;
2827 	}
2828 
2829 	ptdp->pmap = pmap;
2830 	if (pmap != kernel_pmap) {
2831 		/**
2832 		 * We should never try to reclaim kernel pagetable pages in
2833 		 * pmap_page_reclaim(), so don't enter them into the list.
2834 		 */
2835 		pmap_simple_lock(&pt_pages_lock);
2836 		queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page);
2837 		pmap_simple_unlock(&pt_pages_lock);
2838 	}
2839 
2840 	pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
2841 	return ptdp;
2842 }
2843 
2844 /**
2845  * Deallocate a single page table descriptor (PTD) object.
2846  *
2847  * @note Ledger statistics are tracked on a per-pmap basis, so for those pages
2848  *       which are not associated with any specific pmap (e.g., IOMMU pages),
2849  *       the caller must ensure that the pmap/iommu field in the PTD object is
2850  *       NULL before calling this function.
2851  *
2852  * @param ptdp Pointer to the PTD object to deallocate.
2853  */
2854 MARK_AS_PMAP_TEXT void
2855 ptd_deallocate(pt_desc_t *ptdp)
2856 {
2857 	pmap_t pmap = ptdp->pmap;
2858 
2859 	/**
2860 	 * If this PTD was put onto the reclaimable page table list, then remove it
2861 	 * from that list before deallocating.
2862 	 */
2863 	if (ptdp->pt_page.next != NULL) {
2864 		pmap_simple_lock(&pt_pages_lock);
2865 		queue_remove(&pt_page_list, ptdp, pt_desc_t *, pt_page);
2866 		pmap_simple_unlock(&pt_pages_lock);
2867 	}
2868 
2869 	/* Prepend the deallocated node to the free list. */
2870 	pmap_simple_lock(&ptd_free_list_lock);
2871 	(*(void **)ptdp) = (void *)ptd_free_list;
2872 	ptd_free_list = (pt_desc_t *)ptdp;
2873 	ptd_free_count++;
2874 	pmap_simple_unlock(&ptd_free_list_lock);
2875 
2876 	/**
2877 	 * If this PTD was being used to represent an IOMMU page then there won't be
2878 	 * an associated pmap, and therefore no ledger statistics to update.
2879 	 */
2880 	if (pmap != NULL) {
2881 		pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
2882 	}
2883 }
2884 
2885 /**
2886  * In address spaces where the VM page size is larger than the underlying
2887  * hardware page size, one page table descriptor (PTD) object can represent
2888  * multiple page tables. Some fields (like the reference counts) still need to
2889  * be tracked on a per-page-table basis. Because of this, those values are
2890  * stored in a separate array of ptd_info_t objects within the PTD where there's
2891  * one ptd_info_t for every page table a single PTD can manage.
2892  *
2893  * This function initializes the correct ptd_info_t field within a PTD based on
2894  * the page table it's representing.
2895  *
2896  * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to
2897  *             update. Must match up with the `pmap` and `ptep` parameters.
2898  * @param pmap The pmap that owns the page table managed by the passed in PTD.
2899  * @param va Any virtual address that resides within the virtual address space
2900  *           being mapped by the page table pointed to by `ptep`.
2901  * @param level The level in the page table hierarchy at which the table resides.
2902  * @param ptep A pointer into a page table that the passed in PTD manages. The
2903  *             table must be owned by `pmap`, and `ptep` must be the PTE that maps `va`.
2904  */
2905 MARK_AS_PMAP_TEXT void
2906 ptd_info_init(
2907 	pt_desc_t *ptdp,
2908 	pmap_t pmap,
2909 	vm_map_address_t va,
2910 	unsigned int level,
2911 	pt_entry_t *ptep)
2912 {
2913 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2914 
2915 	if (ptdp->pmap != pmap) {
2916 		panic("%s: pmap mismatch, ptdp=%p, pmap=%p, va=%p, level=%u, ptep=%p",
2917 		    __func__, ptdp, pmap, (void*)va, level, ptep);
2918 	}
2919 
2920 	/**
2921 	 * Root tables are managed separately, and can be accessed through the
2922 	 * pmap structure itself (there's only one root table per address space).
2923 	 */
2924 	assert(level > pt_attr_root_level(pt_attr));
2925 
2926 	/**
2927 	 * Each PTD can represent multiple page tables. Get the correct index to use
2928 	 * with the per-page-table properties.
2929 	 */
2930 	const unsigned pt_index = ptd_get_index(ptdp, ptep);
2931 
2932 	/**
2933 	 * The "va" field represents the first virtual address that this page table
2934 	 * is translating for. Naturally, this is dependent on the level the page
2935 	 * table resides at since more VA space is mapped the closer the page
2936 	 * table's level is to the root.
2937 	 */
2938 	ptdp->va[pt_index] = (vm_offset_t) va & ~pt_attr_ln_offmask(pt_attr, level - 1);
2939 
2940 	/**
2941 	 * Reference counts are only tracked on CPU leaf tables because those are
2942 	 * the only tables that can be opportunistically deallocated.
2943 	 */
2944 	if (level < pt_attr_leaf_level(pt_attr)) {
2945 		ptdp->ptd_info[pt_index].refcnt = PT_DESC_REFCOUNT;
2946 	}
2947 }
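/**
 * Worked example (illustrative; exact mask widths depend on pt_attr): on a
 * 4K-granule configuration where each leaf (level 3) table maps 2MB of VA,
 * a call with va = 0x12345678 and level = 3 would store
 *
 *     ptdp->va[pt_index] = 0x12345678 & ~pt_attr_ln_offmask(pt_attr, 2)
 *                        = 0x12345678 & ~0x1FFFFF
 *                        = 0x12200000
 *
 * i.e., the base of the 2MB region that the leaf table translates.
 */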
2948 
2949 #if XNU_MONITOR
2950 
2951 /**
2952  * Validate that a pointer passed into the PPL is indeed an actual ledger object
2953  * that was allocated from within the PPL.
2954  *
2955  * If this is truly a real PPL-allocated ledger object then the object will have
2956  * an index into the ledger pointer array located right after it. That index
2957  * into the ledger pointer array should contain the exact same pointer that
2958  * we're validating. This works because the ledger array is PPL-owned data, so
2959  * even if the index was fabricated to try and point to a different ledger
2960  * object, the pointer inside the array won't match up with the passed in
2961  * pointer and validation will fail.
2962  *
2963  * @note This validation does not need to occur on non-PPL systems because on
2964  *       those systems the ledger objects are allocated using a zone allocator.
2965  *
2966  * @param ledger Pointer to the supposed ledger object that we need to validate.
2967  *
2968  * @return The index into the ledger pointer array used to validate the passed
2969  *         in ledger pointer. If the pointer failed to validate, then the system
2970  *         will panic.
2971  */
2972 MARK_AS_PMAP_TEXT uint64_t
2973 pmap_ledger_validate(const volatile void *ledger)
2974 {
2975 	assert(ledger != NULL);
2976 
2977 	uint64_t array_index = ((const volatile pmap_ledger_t*)ledger)->array_index;
2978 
2979 	if (__improbable(array_index >= pmap_ledger_ptr_array_count)) {
2980 		panic("%s: ledger %p array index invalid, index was %#llx", __func__,
2981 		    ledger, array_index);
2982 	}
2983 
2984 	if (__improbable(pmap_ledger_ptr_array[array_index] != ledger)) {
2985 		panic("%s: ledger pointer mismatch, %p != %p", __func__, ledger,
2986 		    pmap_ledger_ptr_array[array_index]);
2987 	}
2988 
2989 	return array_index;
2990 }
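/**
 * Minimal sketch of the back-pointer check performed above, assuming a
 * hypothetical PPL-owned table `array` with `count` entries:
 *
 *     uint64_t i = obj->array_index;        // field is kernel-writable
 *     if (i >= count) panic(...);           // bounds-check the index
 *     if (array[i] != obj) panic(...);      // PPL-owned array must agree
 *
 * Because `array` is only ever populated by the PPL allocator, a fabricated
 * object cannot satisfy both checks no matter what index it claims.
 */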
2991 
2992 /**
2993  * The size of the ledgers being allocated by the PPL needs to be large enough
2994  * to handle ledgers produced by the task_ledgers ledger template. That template
2995  * is dynamically created at runtime so this function is used to verify that the
2996  * real size of a ledger based on the task_ledgers template matches up with the
2997  * amount of space the PPL calculated is required for a single ledger.
2998  *
2999  * @note See the definition of PMAP_LEDGER_DATA_BYTES for more information.
3000  *
3001  * @note This function needs to be called before any ledgers can be allocated.
3002  *
3003  * @param size The actual size that each pmap ledger should be. This is
3004  *             calculated based on the task_ledgers template which should match
3005  *             up with PMAP_LEDGER_DATA_BYTES.
3006  */
3007 MARK_AS_PMAP_TEXT void
3008 pmap_ledger_verify_size_internal(size_t size)
3009 {
3010 	pmap_simple_lock(&pmap_ledger_lock);
3011 
3012 	if (pmap_ledger_size_verified) {
3013 		panic("%s: ledger size already verified, size=%lu", __func__, size);
3014 	}
3015 
3016 	if ((size == 0) || (size > sizeof(pmap_ledger_data_t)) ||
3017 	    ((sizeof(pmap_ledger_data_t) - size) % sizeof(struct ledger_entry))) {
3018 		panic("%s: size mismatch, expected %lu, size=%lu", __func__,
3019 		    PMAP_LEDGER_DATA_BYTES, size);
3020 	}
3021 
3022 	pmap_ledger_size_verified = true;
3023 
3024 	pmap_simple_unlock(&pmap_ledger_lock);
3025 }
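/**
 * Illustrative note on the check above: a template size is accepted iff it is
 * nonzero, no larger than sizeof(pmap_ledger_data_t), and smaller by a whole
 * number of ledger entries. For example, with a hypothetical 16-byte
 * struct ledger_entry and a 256-byte pmap_ledger_data_t:
 *
 *     size == 256, 240, 224, ...  -> accepted
 *     size == 250                 -> panics (not entry-aligned)
 *     size == 0 or size > 256     -> panics
 *
 * The concrete sizes are made up; only the divisibility rule matters.
 */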
3026 
3027 /**
3028  * Allocate a ledger object from the pmap ledger free list and associate it with
3029  * the ledger pointer array so it can be validated when passed into the PPL.
3030  *
3031  * @return Pointer to the successfully allocated ledger object, or NULL if we're
3032  *         out of PPL pages.
3033  */
3034 MARK_AS_PMAP_TEXT ledger_t
3035 pmap_ledger_alloc_internal(void)
3036 {
3037 	/**
3038 	 * Ensure that we've double checked the size of the ledger objects we're
3039 	 * allocating before we allocate anything.
3040 	 */
3041 	if (!pmap_ledger_size_verified) {
3042 		panic_plain("%s: Attempted to allocate a pmap ledger before verifying "
3043 		    "the ledger size", __func__);
3044 	}
3045 
3046 	pmap_simple_lock(&pmap_ledger_lock);
3047 	if (pmap_ledger_free_list == NULL) {
3048 		/* The free list is empty, so allocate a page's worth of objects. */
3049 		const pmap_paddr_t paddr = pmap_get_free_ppl_page();
3050 
3051 		if (paddr == 0) {
3052 			pmap_simple_unlock(&pmap_ledger_lock);
3053 			return NULL;
3054 		}
3055 
3056 		const vm_map_address_t vstart = phystokv(paddr);
3057 		const uint32_t ledgers_per_page = PAGE_SIZE / sizeof(pmap_ledger_t);
3058 		const vm_map_address_t vend = vstart + (ledgers_per_page * sizeof(pmap_ledger_t));
3059 		assert(vend > vstart);
3060 
3061 		/**
3062 		 * Loop through every pmap ledger object within the recently allocated
3063 		 * page and add it to both the ledger free list and the ledger pointer
3064 		 * array (which will be used to validate these objects in the future).
3065 		 */
3066 		for (vm_map_address_t vaddr = vstart; vaddr < vend; vaddr += sizeof(pmap_ledger_t)) {
3067 			/* Get the next free entry in the ledger pointer array. */
3068 			const uint64_t index = pmap_ledger_ptr_array_free_index++;
3069 
3070 			if (index >= pmap_ledger_ptr_array_count) {
3071 				panic("%s: pmap_ledger_ptr_array is full, index=%llu",
3072 				    __func__, index);
3073 			}
3074 
3075 			pmap_ledger_t *free_ledger = (pmap_ledger_t*)vaddr;
3076 
3077 			/**
3078 			 * This association between the just allocated ledger and the
3079 			 * pointer array is what allows this object to be validated in the
3080 			 * future that it's indeed a ledger allocated by this code.
3081 			 */
3082 			pmap_ledger_ptr_array[index] = free_ledger;
3083 			free_ledger->array_index = index;
3084 
3085 			/* Prepend this new ledger object to the free list. */
3086 			free_ledger->next = pmap_ledger_free_list;
3087 			pmap_ledger_free_list = free_ledger;
3088 		}
3089 
3090 		/**
3091 		 * In an effort to reduce the amount of ledger code that needs to be
3092 		 * called from within the PPL, the ledger objects themselves are made
3093 		 * kernel writable. This way, all of the initialization and checking of
3094 		 * the ledgers can occur outside of the PPL.
3095 		 *
3096 		 * The only modification to these ledger objects that should occur from
3097 		 * within the PPL is when debiting/crediting the ledgers. And those
3098 		 * operations should only occur on validated ledger objects that are
3099 		 * validated using the ledger pointer array (which is wholly contained
3100 		 * in PPL-owned memory).
3101 		 */
3102 		pa_set_range_xprr_perm(paddr, paddr + PAGE_SIZE, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
3103 	}
3104 
3105 	ledger_t new_ledger = (ledger_t)pmap_ledger_free_list;
3106 	pmap_ledger_free_list = pmap_ledger_free_list->next;
3107 
3108 	/**
3109 	 * Double check that the array index of the recently allocated object wasn't
3110 	 * tampered with while the object was sitting on the free list.
3111 	 */
3112 	const uint64_t array_index = pmap_ledger_validate(new_ledger);
3113 	os_ref_init(&pmap_ledger_refcnt[array_index], NULL);
3114 
3115 	pmap_simple_unlock(&pmap_ledger_lock);
3116 
3117 	return new_ledger;
3118 }
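/**
 * Hypothetical usage sketch for the allocation path above (error handling
 * elided; `template_size` stands in for however the caller derives the size
 * of a ledger built from the task_ledgers template):
 *
 *     pmap_ledger_verify_size_internal(template_size);  // once, during boot
 *     ledger_t l = pmap_ledger_alloc_internal();        // NULL if out of PPL pages
 *     if (l != NULL) {
 *         // ... initialize and use the ledger from outside the PPL ...
 *         pmap_ledger_free_internal(l);
 *     }
 */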
3119 
3120 /**
3121  * Free a ledger that was previously allocated by the PPL.
3122  *
3123  * @param ledger The ledger to put back onto the pmap ledger free list.
3124  */
3125 MARK_AS_PMAP_TEXT void
3126 pmap_ledger_free_internal(ledger_t ledger)
3127 {
3128 	/**
3129 	 * A pmap_ledger_t wholly contains a ledger_t as its first member, but also
3130 	 * includes an index into the ledger pointer array used for validation
3131 	 * purposes.
3132 	 */
3133 	pmap_ledger_t *free_ledger = (pmap_ledger_t*)ledger;
3134 
3135 	pmap_simple_lock(&pmap_ledger_lock);
3136 
3137 	/* Ensure that what we're putting onto the free list is a real ledger. */
3138 	const uint64_t array_index = pmap_ledger_validate(ledger);
3139 
3140 	/* Ensure no pmap objects are still using this ledger. */
3141 	os_ref_release_last(&pmap_ledger_refcnt[array_index]);
3142 
3143 	/* Prepend the ledger to the free list. */
3144 	free_ledger->next = pmap_ledger_free_list;
3145 	pmap_ledger_free_list = free_ledger;
3146 
3147 	pmap_simple_unlock(&pmap_ledger_lock);
3148 }
3149 
3150 /**
3151  * Bump the reference count on a ledger object to denote that it is currently in
3152  * use by a pmap object.
3153  *
3154  * @param ledger The ledger whose refcnt to increment.
3155  */
3156 MARK_AS_PMAP_TEXT void
3157 pmap_ledger_retain(ledger_t ledger)
3158 {
3159 	pmap_simple_lock(&pmap_ledger_lock);
3160 	const uint64_t array_index = pmap_ledger_validate(ledger);
3161 	os_ref_retain(&pmap_ledger_refcnt[array_index]);
3162 	pmap_simple_unlock(&pmap_ledger_lock);
3163 }
3164 
3165 /**
3166  * Decrement the reference count on a ledger object to denote that a pmap
3167  * object which previously used it no longer does.
3168  *
3169  * @param ledger The ledger whose refcnt to decrement.
3170  */
3171 MARK_AS_PMAP_TEXT void
3172 pmap_ledger_release(ledger_t ledger)
3173 {
3174 	pmap_simple_lock(&pmap_ledger_lock);
3175 	const uint64_t array_index = pmap_ledger_validate(ledger);
3176 	os_ref_release_live(&pmap_ledger_refcnt[array_index]);
3177 	pmap_simple_unlock(&pmap_ledger_lock);
3178 }
3179 
3180 /**
3181  * This function is used to check a ledger that was recently updated (usually
3182  * from within the PPL) and potentially take actions based on the new ledger
3183  * balances (e.g., set an AST).
3184  *
3185  * @note On non-PPL systems this checking occurs automatically every time a
3186  *       ledger is credited/debited. Due to that, this function only needs to
3187  *       get called on PPL-enabled systems.
3188  *
3189  * @note This function can ONLY be called from *outside* of the PPL due to its
3190  *       usage of current_thread(). The TPIDR register is kernel-modifiable, and
3191  *       hence can't be trusted. This also means we don't need to pull all of
3192  *       the logic used to check ledger balances into the PPL.
3193  *
3194  * @param pmap The pmap whose ledger should be checked.
3195  */
3196 void
3197 pmap_ledger_check_balance(pmap_t pmap)
3198 {
3199 	/* This function should only be called from outside of the PPL. */
3200 	assert((pmap != NULL) && !pmap_in_ppl());
3201 
3202 	ledger_t ledger = pmap->ledger;
3203 
3204 	if (ledger == NULL) {
3205 		return;
3206 	}
3207 
3208 	thread_t cur_thread = current_thread();
3209 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting);
3210 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting_compressed);
3211 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal);
3212 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal_compressed);
3213 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.page_table);
3214 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_footprint);
3215 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_mem);
3216 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.tkm_private);
3217 	ledger_check_new_balance(cur_thread, ledger, task_ledgers.wired_mem);
3218 }
3219 
3220 #endif /* XNU_MONITOR */
3221 
3222 /**
3223  * Credit a specific ledger entry within the passed in pmap's ledger object.
3224  *
3225  * @note On PPL-enabled systems this operation will not automatically check the
3226  *       ledger balances after updating. A call to pmap_ledger_check_balance()
3227  *       will need to occur outside of the PPL to handle this.
3228  *
3229  * @param pmap The pmap whose ledger should be updated.
3230  * @param entry The specific ledger entry to update. This needs to be one of the
3231  *              task_ledger entries.
3232  * @param amount The amount to credit to the ledger.
3233  *
3234  * @return The return value from the credit operation.
3235  */
3236 kern_return_t
3237 pmap_ledger_credit(pmap_t pmap, int entry, ledger_amount_t amount)
3238 {
3239 	assert(pmap != NULL);
3240 
3241 #if XNU_MONITOR
3242 	/**
3243 	 * On PPL-enabled systems the "nocheck" variant MUST be called to ensure
3244 	 * that the ledger balance doesn't automatically get checked after being
3245 	 * updated.
3246 	 *
3247 	 * That checking process is unsafe to perform within the PPL due to its
3248 	 * reliance on current_thread().
3249 	 */
3250 	return ledger_credit_nocheck(pmap->ledger, entry, amount);
3251 #else /* XNU_MONITOR */
3252 	return ledger_credit(pmap->ledger, entry, amount);
3253 #endif /* XNU_MONITOR */
3254 }
3255 
3256 /**
3257  * Debit a specific ledger entry within the passed in pmap's ledger object.
3258  *
3259  * @note On PPL-enabled systems this operation will not automatically check the
3260  *       ledger balances after updating. A call to pmap_ledger_check_balance()
3261  *       will need to occur outside of the PPL to handle this.
3262  *
3263  * @param pmap The pmap whose ledger should be updated.
3264  * @param entry The specific ledger entry to update. This needs to be one of the
3265  *              task_ledger entries.
3266  * @param amount The amount to debit from the ledger.
3267  *
3268  * @return The return value from the debit operation.
3269  */
3270 kern_return_t
3271 pmap_ledger_debit(pmap_t pmap, int entry, ledger_amount_t amount)
3272 {
3273 	assert(pmap != NULL);
3274 
3275 #if XNU_MONITOR
3276 	/**
3277 	 * On PPL-enabled systems the "nocheck" variant MUST be called to ensure
3278 	 * that the ledger balance doesn't automatically get checked after being
3279 	 * updated.
3280 	 *
3281 	 * That checking process is unsafe to perform within the PPL due to its
3282 	 * reliance on current_thread().
3283 	 */
3284 	return ledger_debit_nocheck(pmap->ledger, entry, amount);
3285 #else /* XNU_MONITOR */
3286 	return ledger_debit(pmap->ledger, entry, amount);
3287 #endif /* XNU_MONITOR */
3288 }
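/**
 * Hypothetical usage sketch for the credit/debit wrappers above on a
 * PPL-enabled system; the balance check is deliberately deferred until
 * execution has returned to the kernel proper:
 *
 *     // Inside a PPL call:
 *     pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
 *
 *     // After the PPL call returns to the kernel:
 *     pmap_ledger_check_balance(pmap);
 */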
3289 
3290 #if XNU_MONITOR
3291 
3292 /**
3293  * Allocate a pmap object from the pmap object free list and associate it with
3294  * the pmap pointer array so it can be validated when passed into the PPL.
3295  *
3296  * @param pmap Output parameter that holds the newly allocated pmap object if
3297  *             the operation was successful, or NULL otherwise. The return value
3298  *             must be checked to determine whether this parameter is valid.
3299  *
3300  * @return KERN_SUCCESS if the allocation was successful, KERN_RESOURCE_SHORTAGE
3301  *         if out of free PPL pages, or KERN_NO_SPACE if allocating another pmap
3302  *         object would exceed the capacity of the pmap pointer array. On
3303  *         KERN_SUCCESS, the `pmap` output parameter will point to the newly
3304  *         allocated object.
3305  */
3306 MARK_AS_PMAP_TEXT kern_return_t
3307 pmap_alloc_pmap(pmap_t *pmap)
3308 {
3309 	pmap_t new_pmap = PMAP_NULL;
3310 	kern_return_t kr = KERN_SUCCESS;
3311 
3312 	pmap_simple_lock(&pmap_free_list_lock);
3313 
3314 	if (pmap_free_list == NULL) {
3315 		/* If the pmap pointer array is full, then no more objects can be allocated. */
3316 		if (__improbable(pmap_ptr_array_free_index == pmap_ptr_array_count)) {
3317 			kr = KERN_NO_SPACE;
3318 			goto pmap_alloc_cleanup;
3319 		}
3320 
3321 		/* The free list is empty, so allocate a page's worth of objects. */
3322 		const pmap_paddr_t paddr = pmap_get_free_ppl_page();
3323 
3324 		if (paddr == 0) {
3325 			kr = KERN_RESOURCE_SHORTAGE;
3326 			goto pmap_alloc_cleanup;
3327 		}
3328 
3329 		const vm_map_address_t vstart = phystokv(paddr);
3330 		const uint32_t pmaps_per_page = PAGE_SIZE / sizeof(pmap_list_entry_t);
3331 		const vm_map_address_t vend = vstart + (pmaps_per_page * sizeof(pmap_list_entry_t));
3332 		assert(vend > vstart);
3333 
3334 		/**
3335 		 * Loop through every pmap object within the recently allocated page and
3336 		 * add it to both the pmap free list and the pmap pointer array (which
3337 		 * will be used to validate these objects in the future).
3338 		 */
3339 		for (vm_map_address_t vaddr = vstart; vaddr < vend; vaddr += sizeof(pmap_list_entry_t)) {
3340 			/* Get the next free entry in the pmap pointer array. */
3341 			const unsigned long index = pmap_ptr_array_free_index++;
3342 
3343 			if (__improbable(index >= pmap_ptr_array_count)) {
3344 				panic("%s: pmap array index %lu >= limit %lu; corruption?",
3345 				    __func__, index, pmap_ptr_array_count);
3346 			}
3347 			pmap_list_entry_t *free_pmap = (pmap_list_entry_t*)vaddr;
3348 			os_atomic_init(&free_pmap->pmap.ref_count, 0);
3349 
3350 			/**
3351 			 * This association between the just allocated pmap object and the
3352 			 * pointer array is what allows this object to be validated in the
3353 			 * future that it's indeed a pmap object allocated by this code.
3354 			 */
3355 			pmap_ptr_array[index] = free_pmap;
3356 			free_pmap->array_index = index;
3357 
3358 			/* Prepend this new pmap object to the free list. */
3359 			free_pmap->next = pmap_free_list;
3360 			pmap_free_list = free_pmap;
3361 
3362 			/* Check if we've reached the maximum number of pmap objects. */
3363 			if (__improbable(pmap_ptr_array_free_index == pmap_ptr_array_count)) {
3364 				break;
3365 			}
3366 		}
3367 	}
3368 
3369 	new_pmap = &pmap_free_list->pmap;
3370 	pmap_free_list = pmap_free_list->next;
3371 
3372 pmap_alloc_cleanup:
3373 	pmap_simple_unlock(&pmap_free_list_lock);
3374 	*pmap = new_pmap;
3375 	return kr;
3376 }
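/**
 * Hypothetical caller sketch for pmap_alloc_pmap(): the return code, not the
 * output pointer alone, distinguishes the two failure modes.
 *
 *     pmap_t p = PMAP_NULL;
 *     kern_return_t kr = pmap_alloc_pmap(&p);
 *     if (kr == KERN_RESOURCE_SHORTAGE) {
 *         // out of free PPL pages; give the PPL more pages and retry
 *     } else if (kr == KERN_NO_SPACE) {
 *         // pmap_ptr_array is full; the system-wide pmap limit was hit
 *     } else {
 *         assert(kr == KERN_SUCCESS && p != PMAP_NULL);
 *     }
 */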
3377 
3378 /**
3379  * Free a pmap object that was previously allocated by the PPL.
3380  *
3381  * @note This should only be called on pmap objects that have already been
3382  *       validated to be real pmap objects.
3383  *
3384  * @param pmap The pmap object to put back onto the pmap free list.
3385  */
3386 MARK_AS_PMAP_TEXT void
3387 pmap_free_pmap(pmap_t pmap)
3388 {
3389 	/**
3390 	 * A pmap_list_entry_t wholly contains a struct pmap as its first member,
3391 	 * but also includes an index into the pmap pointer array used for
3392 	 * validation purposes.
3393 	 */
3394 	pmap_list_entry_t *free_pmap = (pmap_list_entry_t*)pmap;
3395 	if (__improbable(free_pmap->array_index >= pmap_ptr_array_count)) {
3396 		panic("%s: pmap %p has index %lu >= limit %lu", __func__, pmap,
3397 		    free_pmap->array_index, pmap_ptr_array_count);
3398 	}
3399 
3400 	pmap_simple_lock(&pmap_free_list_lock);
3401 
3402 	/* Prepend the pmap object to the free list. */
3403 	free_pmap->next = pmap_free_list;
3404 	pmap_free_list = free_pmap;
3405 
3406 	pmap_simple_unlock(&pmap_free_list_lock);
3407 }
3408 
3409 #endif /* XNU_MONITOR */
3410 
3411 #if XNU_MONITOR
3412 
3413 /**
3414  * Helper function to validate that the pointer passed into this method is truly
3415  * a userspace pmap object that was allocated through the pmap_alloc_pmap() API.
3416  * This function will panic if the validation fails.
3417  *
3418  * @param pmap The pointer to validate.
3419  * @param func The stringized function name of the caller that will be printed
3420  *             in the case that the validation fails.
3421  */
3422 static void
3423 validate_user_pmap(const volatile struct pmap *pmap, const char *func)
3424 {
3425 	/**
3426 	 * Ensure the array index isn't corrupted. This could happen if an attacker
3427 	 * is trying to pass off random memory as a pmap object.
3428 	 */
3429 	const unsigned long array_index = ((const volatile pmap_list_entry_t*)pmap)->array_index;
3430 	if (__improbable(array_index >= pmap_ptr_array_count)) {
3431 		panic("%s: pmap array index %lu >= limit %lu", func, array_index, pmap_ptr_array_count);
3432 	}
3433 
3434 	/**
3435 	 * If the array index is valid, then ensure that the passed in object
3436 	 * matches up with the object in the pmap pointer array for this index. Even
3437 	 * if an attacker passed in random memory with a valid index, there's no way
3438 	 * the pmap pointer array will ever point to anything but the objects
3439 	 * allocated by the pmap free list (it's PPL-owned memory).
3440 	 */
3441 	if (__improbable(pmap_ptr_array[array_index] != (const volatile pmap_list_entry_t*)pmap)) {
3442 		panic("%s: pmap %p does not match array element %p at index %lu", func, pmap,
3443 		    pmap_ptr_array[array_index], array_index);
3444 	}
3445 
3446 	/**
3447 	 * Ensure that this isn't just an object sitting on the free list waiting to
3448 	 * be allocated. This also helps protect against a race between validating
3449 	 * and deleting a pmap object.
3450 	 */
3451 	if (__improbable(os_atomic_load(&pmap->ref_count, seq_cst) <= 0)) {
3452 		panic("%s: pmap %p is not in use", func, pmap);
3453 	}
3454 }
3455 
3456 #endif /* XNU_MONITOR */
3457 
3458 /**
3459  * Validate that the pointer passed into this method is a valid pmap object and
3460  * is safe to read from and base PPL decisions off of. This function will panic
3461  * if the validation fails.
3462  *
3463  * @note On non-PPL systems this only checks that the pmap object isn't NULL.
3464  *
3465  * @note This validation should only be used on objects that won't be written to
3466  *       for the duration of the PPL call. If the object is going to be modified
3467  *       then you must use validate_pmap_mutable().
3468  *
3469  * @param pmap The pointer to validate.
3470  * @param func The stringized function name of the caller that will be printed
3471  *             in the case that the validation fails.
3472  */
3473 void
3474 validate_pmap_internal(const volatile struct pmap *pmap, const char *func)
3475 {
3476 #if !XNU_MONITOR
3477 	#pragma unused(pmap, func)
3478 	assert(pmap != NULL);
3479 #else /* !XNU_MONITOR */
3480 	if (pmap != kernel_pmap) {
3481 		validate_user_pmap(pmap, func);
3482 	}
3483 #endif /* !XNU_MONITOR */
3484 }
3485 
3486 /**
3487  * Validate that the pointer passed into this method is a valid pmap object and
3488  * is safe to both read and write to from within the PPL. This function will
3489  * panic if the validation fails.
3490  *
3491  * @note On non-PPL systems this only checks that the pmap object isn't NULL.
3492  *
3493  * @note If you're only going to be reading from the pmap object for the
3494  *       duration of the PPL call, it'll be faster to use the immutable version
3495  *       of this validation: validate_pmap().
3496  *
3497  * @param pmap The pointer to validate.
3498  * @param func The stringized function name of the caller that will be printed
3499  *             in the case that the validation fails.
3500  */
3501 void
3502 validate_pmap_mutable_internal(const volatile struct pmap *pmap, const char *func)
3503 {
3504 #if !XNU_MONITOR
3505 	#pragma unused(pmap, func)
3506 	assert(pmap != NULL);
3507 #else /* !XNU_MONITOR */
3508 	if (pmap != kernel_pmap) {
3509 		/**
3510 		 * Every time a pmap object is validated to be mutable, we mark it down
3511 		 * as an "inflight" pmap on this CPU. The inflight pmap for this CPU
3512 		 * will be set to NULL automatically when the PPL is exited. The
3513 		 * pmap_destroy() path will ensure that no "inflight" pmaps (on any CPU)
3514 		 * are ever destroyed so as to prevent racy use-after-free attacks.
3515 		 */
3516 		pmap_cpu_data_t *cpu_data = pmap_get_cpu_data();
3517 
3518 		/**
3519 		 * As a sanity check (since the inflight pmap should be cleared when
3520 		 * exiting the PPL), ensure that the previous inflight pmap is NULL, or
3521 		 * is the same as the one being validated here (which allows for
3522 		 * validating the same object twice).
3523 		 */
3524 		__assert_only const volatile struct pmap *prev_inflight_pmap =
3525 		    os_atomic_load(&cpu_data->inflight_pmap, relaxed);
3526 		assert((prev_inflight_pmap == NULL) || (prev_inflight_pmap == pmap));
3527 
3528 		/**
3529 		 * The release barrier here is intended to pair with the seq_cst load of
3530 		 * ref_count in validate_user_pmap() to ensure that if a pmap is
3531 		 * concurrently destroyed, either this path will observe that it was
3532 		 * destroyed after marking it in-flight and panic, or pmap_destroy will
3533 		 * observe the pmap as in-flight after decrementing ref_count and panic.
3534 		 */
3535 		os_atomic_store(&cpu_data->inflight_pmap, pmap, release);
3536 
3537 		validate_user_pmap(pmap, func);
3538 	}
3539 #endif /* !XNU_MONITOR */
3540 }
3541 
3542 /**
3543  * Validate that the passed in pmap pointer is a pmap object that was allocated
3544  * by the pmap and not just random memory. On PPL-enabled systems, the
3545  * allocation is done through the pmap_alloc_pmap() API. On all other systems
3546  * it's allocated through a zone allocator.
3547  *
3548  * This function will panic if the validation fails.
3549  *
3550  * @param pmap The object to validate.
3551  */
3552 void
3553 pmap_require(pmap_t pmap)
3554 {
3555 #if XNU_MONITOR
3556 	validate_pmap(pmap);
3557 #else /* XNU_MONITOR */
3558 	if (pmap != kernel_pmap) {
3559 		zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
3560 	}
3561 #endif /* XNU_MONITOR */
3562 }
3563 
3564 /**
3565  * Parse the device tree and determine how many pmap-io-ranges there are and
3566  * how much memory is needed to store all of that data.
3567  *
3568  * @note See the definition of pmap_io_range_t for more information on what a
3569  *       "pmap-io-range" actually represents.
3570  *
3571  * @return The number of bytes needed to store metadata for all PPL-owned I/O
3572  *         regions.
3573  */
3574 vm_size_t
3575 pmap_compute_io_rgns(void)
3576 {
3577 	DTEntry entry = NULL;
3578 	__assert_only int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3579 	assert(err == kSuccess);
3580 
3581 	void const *prop = NULL;
3582 	unsigned int prop_size = 0;
3583 	if (kSuccess != SecureDTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size)) {
3584 		return 0;
3585 	}
3586 
3587 	/**
3588 	 * The device tree node for pmap-io-ranges maps directly onto an array of
3589 	 * pmap_io_range_t structures.
3590 	 */
3591 	pmap_io_range_t const *ranges = prop;
3592 
3593 	/* Determine the number of regions and validate the fields. */
3594 	for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
3595 		if (ranges[i].addr & PAGE_MASK) {
3596 			panic("%s: %u addr 0x%llx is not page-aligned",
3597 			    __func__, i, ranges[i].addr);
3598 		}
3599 
3600 		if (ranges[i].len & PAGE_MASK) {
3601 			panic("%s: %u length 0x%llx is not page-aligned",
3602 			    __func__, i, ranges[i].len);
3603 		}
3604 
3605 		uint64_t rgn_end = 0;
3606 		if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end)) {
3607 			panic("%s: %u addr 0x%llx length 0x%llx wraps around",
3608 			    __func__, i, ranges[i].addr, ranges[i].len);
3609 		}
3610 
3611 		if (!(ranges[i].wimg & PMAP_IO_RANGE_NOT_IO) &&
3612 		    !(ranges[i].addr >= avail_end || rgn_end <= gPhysBase)) {
3613 			panic("%s: I/O %u addr 0x%llx length 0x%llx overlaps physical memory",
3614 			    __func__, i, ranges[i].addr, ranges[i].len);
3615 		}
3616 
3617 		++num_io_rgns;
3618 	}
3619 
3620 	return num_io_rgns * sizeof(*ranges);
3621 }
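/**
 * Illustrative note: because the property is consumed as a packed array of
 * pmap_io_range_t records, the entry count falls directly out of the
 * property size reported by the device tree:
 *
 *     unsigned int nranges = prop_size / sizeof(pmap_io_range_t);
 *
 * Any trailing partial record (prop_size % sizeof(pmap_io_range_t) != 0) is
 * silently ignored by the validation loop above.
 */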
3622 
3623 /**
3624  * Helper function used when sorting and searching PPL I/O ranges.
3625  *
3626  * @param a The first PPL I/O range to compare.
3627  * @param b The second PPL I/O range to compare.
3628  *
3629  * @return < 0 for a < b
3630  *           0 for a == b
3631  *         > 0 for a > b
3632  */
3633 static int
3634 cmp_io_rgns(const void *a, const void *b)
3635 {
3636 	const pmap_io_range_t *range_a = a;
3637 	const pmap_io_range_t *range_b = b;
3638 
3639 	if ((range_b->addr + range_b->len) <= range_a->addr) {
3640 		return 1;
3641 	} else if ((range_a->addr + range_a->len) <= range_b->addr) {
3642 		return -1;
3643 	} else {
3644 		return 0;
3645 	}
3646 }
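/**
 * Illustrative note: because cmp_io_rgns() treats any overlap as equality, a
 * page-sized probe can be compared directly against table entries during a
 * binary search, e.g.:
 *
 *     pmap_io_range_t probe = { .addr = paddr & ~PAGE_MASK, .len = PAGE_SIZE };
 *     int cmp = cmp_io_rgns(&probe, &io_attr_table[middle]);  // 0 => overlap
 *
 * This is only sound because pmap-io-ranges are validated to be page-aligned
 * and are assumed not to overlap one another.
 */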
3647 
3648 /**
3649  * Now that enough memory has been allocated to store all of the pmap-io-ranges
3650  * device tree nodes in memory, go ahead and do that copy and then sort the
3651  * resulting array by address for quicker lookup later.
3652  *
3653  * @note This function assumes that the amount of memory required to store the
3654  *       entire pmap-io-ranges device tree node has already been calculated (via
3655  *       pmap_compute_io_rgns()) and allocated in io_attr_table.
3656  *
3657  * @note This function will leave io_attr_table sorted by address to allow for
3658  *       performing a binary search when doing future range lookups.
3659  */
3660 void
3661 pmap_load_io_rgns(void)
3662 {
3663 	if (num_io_rgns == 0) {
3664 		return;
3665 	}
3666 
3667 	DTEntry entry = NULL;
3668 	int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3669 	assert(err == kSuccess);
3670 
3671 	void const *prop = NULL;
3672 	unsigned int prop_size;
3673 	err = SecureDTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size);
3674 	assert(err == kSuccess);
3675 
3676 	pmap_io_range_t const *ranges = prop;
3677 	for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
3678 		io_attr_table[i] = ranges[i];
3679 	}
3680 
3681 	qsort(io_attr_table, num_io_rgns, sizeof(*ranges), cmp_io_rgns);
3682 }
3683 
3684 /**
3685  * Checks if a pmap-io-range is exempted from being enforced under certain
3686  * conditions.
3687  *
3688  * @param io_range The pmap-io-range to be checked
3689  *
3690  * @return NULL if the pmap-io-range should be exempted. Otherwise, returns
3691  *         the passed in pmap-io-range.
3692  */
3693 static pmap_io_range_t*
3694 pmap_exempt_io_range(pmap_io_range_t *io_range)
3695 {
3696 #if DEBUG || DEVELOPMENT
3697 	if (__improbable(io_range->signature == 'RVBR')) {
3698 		return NULL;
3699 	}
3700 #endif /* DEBUG || DEVELOPMENT */
3701 
3702 	return io_range;
3703 }
3704 
3705 /**
3706  * Find and return the PPL I/O range that contains the passed in physical
3707  * address.
3708  *
3709  * @note This function performs a binary search on the already sorted
3710  *       io_attr_table, so it should be reasonably fast.
3711  *
3712  * @param paddr The physical address to query a specific I/O range for.
3713  *
3714  * @return A pointer to the pmap_io_range_t structure if one of the ranges
3715  *         contains the passed in physical address. Otherwise, NULL.
3716  */
3717 pmap_io_range_t*
3718 pmap_find_io_attr(pmap_paddr_t paddr)
3719 {
3720 	unsigned int begin = 0;
3721 	unsigned int end = num_io_rgns - 1;
3722 
3723 	/**
3724 	 * If there are no I/O ranges, or the wanted address is below the lowest
3725 	 * range or above the highest range, then there's no point in searching
3726 	 * since it won't be here.
3727 	 */
3728 	if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) ||
3729 	    (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) {
3730 		return NULL;
3731 	}
3732 
3733 	/**
3734 	 * A dummy I/O range to compare against when searching for a range that
3735 	 * includes `paddr`.
3736 	 */
3737 	const pmap_io_range_t wanted_range = {
3738 		.addr = paddr & ~PAGE_MASK,
3739 		.len = PAGE_SIZE
3740 	};
3741 
3742 	/* Perform a binary search to find the wanted I/O range. */
3743 	for (;;) {
3744 		const unsigned int middle = (begin + end) / 2;
3745 		const int cmp = cmp_io_rgns(&wanted_range, &io_attr_table[middle]);
3746 
3747 		if (cmp == 0) {
3748 			/* Success! Found the wanted I/O range. */
3749 			return pmap_exempt_io_range(&io_attr_table[middle]);
3750 		} else if (begin == end) {
3751 			/* We've checked every range and didn't find a match. */
3752 			break;
3753 		} else if (cmp > 0) {
3754 			/* The wanted range is above the middle. */
3755 			begin = middle + 1;
3756 		} else {
3757 			/* The wanted range is below the middle. */
3758 			end = middle;
3759 		}
3760 	}
3761 
3762 	return NULL;
3763 }
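/**
 * Hypothetical usage sketch for pmap_find_io_attr(); the flag test is
 * illustrative, and PMAP_IO_RANGE_NOT_IO is the only wimg flag referenced in
 * this file:
 *
 *     const pmap_io_range_t *rgn = pmap_find_io_attr(paddr);
 *     if ((rgn != NULL) && !(rgn->wimg & PMAP_IO_RANGE_NOT_IO)) {
 *         // paddr falls within a PPL-owned I/O range
 *     }
 */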
3764 
3765 #if HAS_GUARDED_IO_FILTER
3766 /**
3767  * Parse the device tree and determine how many pmap-io-filters there are and
3768  * how much memory is needed to store all of that data.
3769  *
3770  * @note See the definition of pmap_io_filter_entry_t for more information on what a
3771  *       "pmap-io-filter" actually represents.
3772  *
3773  * @return The number of bytes needed to store metadata for all I/O filter
3774  *         entries.
3775  */
3776 vm_size_t
3777 pmap_compute_io_filters(void)
3778 {
3779 	DTEntry entry = NULL;
3780 	__assert_only int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3781 	assert(err == kSuccess);
3782 
3783 	void const *prop = NULL;
3784 	unsigned int prop_size = 0;
3785 	if (kSuccess != SecureDTGetProperty(entry, "pmap-io-filters", &prop, &prop_size)) {
3786 		return 0;
3787 	}
3788 
3789 	pmap_io_filter_entry_t const *entries = prop;
3790 
3791 	/* Determine the number of entries. */
3792 	for (unsigned int i = 0; i < (prop_size / sizeof(*entries)); ++i) {
3793 		if (entries[i].offset + entries[i].length > ARM_PGMASK) {
3794 			panic("%s: io filter entry %u offset 0x%hx length 0x%hx crosses page boundary",
3795 			    __func__, i, entries[i].offset, entries[i].length);
3796 		}
3797 
3798 		++num_io_filter_entries;
3799 	}
3800 
3801 	return num_io_filter_entries * sizeof(*entries);
3802 }
3803 
3804 /**
3805  * Compares two I/O filter entries by signature.
3806  *
3807  * @note The numerical comparison of signatures does not carry any meaning
3808  *       but it does give us a way to order and binary search the entries.
3809  *
3810  * @param a The first I/O filter entry to compare.
3811  * @param b The second I/O filter entry to compare.
3812  *
3813  * @return < 0 for a < b
3814  *           0 for a == b
3815  *         > 0 for a > b
3816  */
3817 static int
3818 cmp_io_filter_entries_by_signature(const void *a, const void *b)
3819 {
3820 	const pmap_io_filter_entry_t *entry_a = a;
3821 	const pmap_io_filter_entry_t *entry_b = b;
3822 
3823 	if (entry_b->signature < entry_a->signature) {
3824 		return 1;
3825 	} else if (entry_a->signature < entry_b->signature) {
3826 		return -1;
3827 	} else {
3828 		return 0;
3829 	}
3830 }
3831 
3832 /**
3833  * Compares two I/O filter entries by address range.
3834  *
3835  * @note The function returns 0 as long as the ranges overlap. This lets
3836  *       callers not only detect overlaps across a list of entries, but also
3837  *       check a unit-length address against a range for inclusion.
3839  *
3840  * @param a The first I/O filter entry to compare.
3841  * @param b The second I/O filter entry to compare.
3842  *
3843  * @return < 0 for a < b
3844  *           0 for a == b
3845  *         > 0 for a > b
3846  */
3847 static int
3848 cmp_io_filter_entries_by_addr(const void *a, const void *b)
3849 {
3850 	const pmap_io_filter_entry_t *entry_a = a;
3851 	const pmap_io_filter_entry_t *entry_b = b;
3852 
3853 	if ((entry_b->offset + entry_b->length) <= entry_a->offset) {
3854 		return 1;
3855 	} else if ((entry_a->offset + entry_a->length) <= entry_b->offset) {
3856 		return -1;
3857 	} else {
3858 		return 0;
3859 	}
3860 }
3861 
3862 /**
3863  * Compares two I/O filter entries by signature, then by address range.
3864  *
3865  * @param a The first I/O filter entry to compare.
3866  * @param b The second I/O filter entry to compare.
3867  *
3868  * @return < 0 for a < b
3869  *           0 for a == b
3870  *         > 0 for a > b
3871  */
3872 static int
3873 cmp_io_filter_entries(const void *a, const void *b)
3874 {
3875 	const int cmp_signature_result = cmp_io_filter_entries_by_signature(a, b);
3876 	return (cmp_signature_result != 0) ? cmp_signature_result : cmp_io_filter_entries_by_addr(a, b);
3877 }
3878 
3879 /**
3880  * Now that enough memory has been allocated to store all of the pmap-io-filters
3881  * device tree nodes in memory, go ahead and do that copy and then sort the
3882  * resulting array by signature and address for quicker lookup later.
3883  *
3884  * @note This function assumes that the amount of memory required to store the
3885  *       entire pmap-io-filters device tree node has already been calculated (via
3886  *       pmap_compute_io_filters()) and allocated in io_filter_table.
3887  *
3888  * @note This function will leave io_filter_table sorted by signature and address
3889  *       to allow for performing a binary search when doing future lookups.
3890  */
3891 void
pmap_load_io_filters(void)3892 pmap_load_io_filters(void)
3893 {
3894 	if (num_io_filter_entries == 0) {
3895 		return;
3896 	}
3897 
3898 	DTEntry entry = NULL;
3899 	int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3900 	assert(err == kSuccess);
3901 
3902 	void const *prop = NULL;
3903 	unsigned int prop_size;
3904 	err = SecureDTGetProperty(entry, "pmap-io-filters", &prop, &prop_size);
3905 	assert(err == kSuccess);
3906 
3907 	pmap_io_filter_entry_t const *entries = prop;
3908 	for (unsigned int i = 0; i < (prop_size / sizeof(*entries)); ++i) {
3909 		io_filter_table[i] = entries[i];
3910 	}
3911 
3912 	qsort(io_filter_table, num_io_filter_entries, sizeof(*entries), cmp_io_filter_entries);
3913 
3914 	for (unsigned int i = 0; i < num_io_filter_entries - 1; i++) {
3915 		if (io_filter_table[i].signature == io_filter_table[i + 1].signature) {
3916 			if (io_filter_table[i].offset + io_filter_table[i].length > io_filter_table[i + 1].offset) {
3917 				panic("%s: io filter entry %u and %u overlap.",
3918 				    __func__, i, i + 1);
3919 			}
3920 		}
3921 	}
3922 }
3923 
3924 /**
3925  * Find and return the I/O filter entry that contains the passed in physical
3926  * address.
3927  *
3928  * @note This function performs a binary search on the already sorted
3929  *       io_filter_table, so it should be reasonably fast.
3930  *
3931  * @param paddr The physical address to query a specific I/O filter for.
3932  * @param width The width of the I/O register at paddr, at most 8 bytes.
3933  * @param io_range_outp If not NULL, this argument is set to the io_attr_table
3934  *        entry containing paddr.
3935  *
3936  * @return A pointer to the pmap_io_filter_entry_t structure if one of the
3937  *         entries contains the passed in I/O register described by paddr and
3938  *         width. Otherwise, NULL.
3939  */
3940 pmap_io_filter_entry_t*
3941 pmap_find_io_filter_entry(pmap_paddr_t paddr, uint64_t width, const pmap_io_range_t **io_range_outp)
3942 {
3943 	/* Don't bother looking for it when we don't have any entries. */
3944 	if (__improbable(num_io_filter_entries == 0)) {
3945 		return NULL;
3946 	}
3947 
3948 	if (__improbable(width > 8)) {
3949 		return NULL;
3950 	}
3951 
3952 	/* Check if paddr is owned by PPL (Guarded mode SW). */
3953 	const pmap_io_range_t *io_range = pmap_find_io_attr(paddr);
3954 
3955 	/**
3956 	 * Just return NULL if paddr is not owned by PPL.
3957 	 */
3958 	if (io_range == NULL) {
3959 		return NULL;
3960 	}
3961 
3962 	const uint32_t signature = io_range->signature;
3963 	unsigned int begin = 0;
3964 	unsigned int end = num_io_filter_entries - 1;
3965 
3966 	/**
3967 	 * A dummy I/O filter entry to compare against when searching for a range that
3968 	 * includes `paddr`.
3969 	 */
3970 	const pmap_io_filter_entry_t wanted_filter = {
3971 		.signature = signature,
3972 		.offset = (uint16_t) ((paddr & ~0b11) & PAGE_MASK),
3973 		.length = (uint16_t) width // This downcast is safe because width is validated.
3974 	};
3975 
3976 	/* Perform a binary search to find the wanted filter entry. */
3977 	for (;;) {
3978 		const unsigned int middle = (begin + end) / 2;
3979 		const int cmp = cmp_io_filter_entries(&wanted_filter, &io_filter_table[middle]);
3980 
3981 		if (cmp == 0) {
3982 			/**
3983 			 * We have found a "match" by the definition of cmp_io_filter_entries,
3984 			 * meaning the dummy range and the io_filter_entry are overlapping. Make
3985 			 * sure the dummy range is contained entirely by the entry.
3986 			 */
3987 			const pmap_io_filter_entry_t entry_found = io_filter_table[middle];
3988 			if ((wanted_filter.offset >= entry_found.offset) &&
3989 			    ((wanted_filter.offset + wanted_filter.length) <= (entry_found.offset + entry_found.length))) {
3990 			if (io_range_outp != NULL) {
3991 					*io_range_outp = io_range;
3992 				}
3993 
3994 				return &io_filter_table[middle];
3995 			} else {
3996 				/**
3997 				 * Under the assumption that there is no overlapping io_filter_entry,
3998 				 * if the dummy range is found overlapping but not contained by an
3999 				 * io_filter_entry, there cannot be another io_filter_entry containing
4000 				 * the dummy range, so return NULL here.
4001 				 */
4002 				return NULL;
4003 			}
4004 		} else if (begin == end) {
4005 			/* We've checked every range and didn't find a match. */
4006 			break;
4007 		} else if (cmp > 0) {
4008 			/* The wanted range is above the middle. */
4009 			begin = middle + 1;
4010 		} else {
4011 			/* The wanted range is below the middle. */
4012 			end = middle;
4013 		}
4014 	}
4015 
4016 	return NULL;
4017 }
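/**
 * Hypothetical usage sketch for pmap_find_io_filter_entry(): a guarded write
 * to a PPL-protected I/O register is only permitted when the register's page,
 * offset, and width land entirely inside a whitelisted filter entry.
 *
 *     const pmap_io_range_t *rgn = NULL;
 *     if (pmap_find_io_filter_entry(paddr, sizeof(uint32_t), &rgn) != NULL) {
 *         // ... perform the 4-byte access on behalf of the kernel ...
 *     }
 */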
4018 #endif /* HAS_GUARDED_IO_FILTER */
4019 
4020 /**
4021  * Initialize the pmap per-CPU data structure for a single CPU. This is called
4022  * once for each CPU in the system, on the CPU whose per-cpu data needs to be
4023  * initialized.
4024  *
4025  * In reality, many of the per-cpu data fields will have either already been
4026  * initialized or will rely on the fact that the per-cpu data is either zeroed
4027  * out during allocation (on non-PPL systems), or the data itself is a global
4028  * variable which will be zeroed by default (on PPL systems).
4029  *
4030  * @param cpu_number The number of the CPU whose pmap per-cpu data should be
4031  *                   initialized. This number should correspond to the CPU
4032  *                   executing this code.
4033  */
4034 MARK_AS_PMAP_TEXT void
4035 pmap_cpu_data_init_internal(unsigned int cpu_number)
4036 {
4037 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
4038 
4039 #if XNU_MONITOR
4040 	/* Verify the per-cpu data is cacheline-aligned. */
4041 	assert(((vm_offset_t)pmap_cpu_data & (MAX_L2_CLINE_BYTES - 1)) == 0);
4042 
4043 	/**
4044 	 * The CPU number should already have been initialized to
4045 	 * PMAP_INVALID_CPU_NUM when initializing the boot CPU data.
4046 	 */
4047 	if (pmap_cpu_data->cpu_number != PMAP_INVALID_CPU_NUM) {
4048 		panic("%s: pmap_cpu_data->cpu_number=%u, cpu_number=%u",
4049 		    __func__, pmap_cpu_data->cpu_number, cpu_number);
4050 	}
4051 #endif /* XNU_MONITOR */
4052 
4053 	/**
4054 	 * At least when operating in the PPL, it's important to duplicate the CPU
4055 	 * number into a PPL-owned location. If we relied strictly on the CPU number
4056 	 * located in the general machine-specific per-cpu data, it could be
4057 	 * modified in a way to affect PPL operation.
4058 	 */
4059 	pmap_cpu_data->cpu_number = cpu_number;
4060 #if __ARM_MIXED_PAGE_SIZE__
4061 	pmap_cpu_data->commpage_page_shift = PAGE_SHIFT;
4062 #endif
4063 }
4064 
4065 /**
4066  * Initialize the pmap per-cpu data for the bootstrap CPU (the other CPUs should
4067  * just call pmap_cpu_data_init() directly). This code does one of two things
4068  * depending on whether this is a PPL-enabled system.
4069  *
4070  * PPL-enabled: This function will setup the PPL-specific per-cpu data like the
4071  *              PPL stacks and register save area. This performs the
4072  *              functionality usually done by cpu_data_init() to setup the pmap
4073  *              per-cpu data fields. In reality, most fields are not initialized
4074  *              and are assumed to be zero thanks to this data being global.
4075  *
4076  * Non-PPL: Just calls pmap_cpu_data_init() to initialize the bootstrap CPU's
4077  *          pmap per-cpu data (non-boot CPUs will call that function once they
4078  *          come out of reset).
4079  *
4080  * @note This function will carve out physical pages for the PPL stacks and PPL
4081  *       register save area from avail_start. It's assumed that avail_start is
4082  *       on a page boundary before executing this function on PPL-enabled
4083  *       systems.
4084  */
4085 void
4086 pmap_cpu_data_array_init(void)
4087 {
4088 #if XNU_MONITOR
4089 	/**
4090 	 * Enough virtual address space to cover all PPL stacks for every CPU should
4091 	 * have already been allocated by arm_vm_init() before pmap_bootstrap() is
4092 	 * called.
4093 	 */
4094 	assert((pmap_stacks_start != NULL) && (pmap_stacks_end != NULL));
4095 	assert(((uintptr_t)pmap_stacks_end - (uintptr_t)pmap_stacks_start) == PPL_STACK_REGION_SIZE);
4096 
4097 	/**
4098 	 * Ensure avail_start is aligned to a page boundary before allocating the
4099 	 * stacks and register save area.
4100 	 */
4101 	assert(avail_start == round_page(avail_start));
4102 
4103 	/* Each PPL stack contains guard pages before and after. */
4104 	vm_offset_t stack_va = (vm_offset_t)pmap_stacks_start + ARM_PGBYTES;
4105 
4106 	/**
4107 	 * Globally save off the beginning of the PPL stacks physical space so that
4108 	 * we can update its physical aperture mappings later in the bootstrap
4109 	 * process.
4110 	 */
4111 	pmap_stacks_start_pa = avail_start;
4112 
4113 	/* Map the PPL stacks for each CPU. */
4114 	for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4115 		/**
4116 		 * The PPL stack size is based off of the VM page size, which may differ
4117 		 * from the underlying hardware page size.
4118 		 *
4119 		 * Map all of the PPL stack into the kernel's address space.
4120 		 */
4121 		for (vm_offset_t cur_va = stack_va; cur_va < (stack_va + PPL_STACK_SIZE); cur_va += ARM_PGBYTES) {
4122 			assert(cur_va < (vm_offset_t)pmap_stacks_end);
4123 
4124 			pt_entry_t *ptep = pmap_pte(kernel_pmap, cur_va);
4125 			assert(*ptep == ARM_PTE_EMPTY);
4126 
4127 			pt_entry_t template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) |
4128 			    ARM_PTE_TYPE_VALID | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM);
4129 
4130 #if __ARM_KERNEL_PROTECT__
4131 			/**
4132 			 * On systems with software based spectre/meltdown mitigations,
4133 			 * kernel mappings are explicitly not made global because the kernel
4134 			 * is unmapped when executing in EL0 (this ensures that kernel TLB
4135 			 * entries won't accidentally be valid in EL0).
4136 			 */
4137 			template |= ARM_PTE_NG;
4138 #endif /* __ARM_KERNEL_PROTECT__ */
4139 
4140 			write_pte(ptep, template);
4141 			__builtin_arm_isb(ISB_SY);
4142 
4143 			avail_start += ARM_PGBYTES;
4144 		}
4145 
4146 #if KASAN
4147 		kasan_map_shadow(stack_va, PPL_STACK_SIZE, false);
4148 #endif /* KASAN */
4149 
4150 		/**
4151 		 * Setup non-zero pmap per-cpu data fields. If the default value should
4152 		 * be zero, then you can assume the field is already set to that.
4153 		 */
4154 		pmap_cpu_data_array[cpu_num].cpu_data.cpu_number = PMAP_INVALID_CPU_NUM;
4155 		pmap_cpu_data_array[cpu_num].cpu_data.ppl_state = PPL_STATE_KERNEL;
4156 		pmap_cpu_data_array[cpu_num].cpu_data.ppl_stack = (void*)(stack_va + PPL_STACK_SIZE);
4157 
4158 		/**
4159 		 * Get the first VA of the next CPU's PPL stack. Need to skip the guard
4160 		 * page after the stack.
4161 		 */
4162 		stack_va += (PPL_STACK_SIZE + ARM_PGBYTES);
4163 	}
4164 
4165 	pmap_stacks_end_pa = avail_start;
4166 
4167 	/**
4168 	 * The PPL register save area location is saved into global variables so
4169 	 * that they can be made writable if DTrace support is needed. This is
4170 	 * needed because DTrace will try to update the register state.
4171 	 */
4172 	ppl_cpu_save_area_start = avail_start;
4173 	ppl_cpu_save_area_end = ppl_cpu_save_area_start;
4174 	pmap_paddr_t ppl_cpu_save_area_cur = ppl_cpu_save_area_start;
4175 
4176 	/* Carve out space for the PPL register save area for each CPU. */
4177 	for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4178 		/* Allocate enough space to cover at least one arm_context_t object. */
4179 		while ((ppl_cpu_save_area_end - ppl_cpu_save_area_cur) < sizeof(arm_context_t)) {
4180 			avail_start += PAGE_SIZE;
4181 			ppl_cpu_save_area_end = avail_start;
4182 		}
4183 
4184 		pmap_cpu_data_array[cpu_num].cpu_data.save_area = (arm_context_t *)phystokv(ppl_cpu_save_area_cur);
4185 		ppl_cpu_save_area_cur += sizeof(arm_context_t);
4186 	}

#if HAS_GUARDED_IO_FILTER
	/**
	 * Enough virtual address space to cover the I/O filter stacks of every
	 * CPU should have already been allocated by arm_vm_init() before
	 * pmap_bootstrap() is called.
	 */
	assert((iofilter_stacks_start != NULL) && (iofilter_stacks_end != NULL));
	assert(((uintptr_t)iofilter_stacks_end - (uintptr_t)iofilter_stacks_start) == IOFILTER_STACK_REGION_SIZE);

	/* Each I/O filter stack contains guard pages before and after. */
	vm_offset_t iofilter_stack_va = (vm_offset_t)iofilter_stacks_start + ARM_PGBYTES;

	/**
	 * Globally save off the beginning of the I/O filter stacks' physical
	 * space so that we can update its physical aperture mappings later in
	 * the bootstrap process.
	 */
	iofilter_stacks_start_pa = avail_start;

	/* Map the I/O filter stacks for each CPU. */
	for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
		/**
		 * Map all of the I/O filter stack into the kernel's address space.
		 */
		for (vm_offset_t cur_va = iofilter_stack_va; cur_va < (iofilter_stack_va + IOFILTER_STACK_SIZE); cur_va += ARM_PGBYTES) {
			assert(cur_va < (vm_offset_t)iofilter_stacks_end);

			pt_entry_t *ptep = pmap_pte(kernel_pmap, cur_va);
			assert(*ptep == ARM_PTE_EMPTY);

			pt_entry_t template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) |
			    ARM_PTE_TYPE_VALID | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM);

#if __ARM_KERNEL_PROTECT__
			template |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

			write_pte(ptep, template);
			__builtin_arm_isb(ISB_SY);

			avail_start += ARM_PGBYTES;
		}

#if KASAN
		kasan_map_shadow(iofilter_stack_va, IOFILTER_STACK_SIZE, false);
#endif /* KASAN */

		/**
		 * Set up the non-zero pmap per-cpu data fields. Fields whose default
		 * value is zero are already zero-initialized.
		 */
		pmap_cpu_data_array[cpu_num].cpu_data.iofilter_stack = (void*)(iofilter_stack_va + IOFILTER_STACK_SIZE);

		/**
		 * Advance to the first VA of the next CPU's I/O filter stack,
		 * skipping the guard page after the current stack.
		 */
		iofilter_stack_va += (IOFILTER_STACK_SIZE + ARM_PGBYTES);
	}

	iofilter_stacks_end_pa = avail_start;
#endif /* HAS_GUARDED_IO_FILTER */

	/* Carve out a scratch page for each CPU. */
	for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
		pmap_cpu_data_array[cpu_num].cpu_data.scratch_page = (void*)phystokv(avail_start);
		avail_start += PAGE_SIZE;
	}
#endif /* XNU_MONITOR */

	pmap_cpu_data_init();
}
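
/*
 * Illustrative sketch (not part of the original source): both stack-mapping
 * loops above (PPL stacks and, where present, I/O filter stacks) build the
 * same PPL-RW PTE template, differing only in the physical page mapped. A
 * hypothetical helper capturing that shared logic could look like this.
 */
#if 0 /* example only */
static pt_entry_t
ppl_stack_pte_template(pmap_paddr_t pa)
{
	pt_entry_t template = pa_to_pte(pa) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) |
	    ARM_PTE_TYPE_VALID | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) |
	    xprr_perm_to_pte(XPRR_PPL_RW_PERM);

#if __ARM_KERNEL_PROTECT__
	/* Keep the mapping non-global; the kernel is unmapped while in EL0. */
	template |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	return template;
}
#endif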

/**
 * Retrieve the pmap per-cpu data for the current CPU. On PPL-enabled systems
 * this data is managed separately from the general machine-specific per-cpu
 * data to handle the requirement that it must only be PPL-writable.
 *
 * @return The per-cpu pmap data for the current CPU.
 */
pmap_cpu_data_t *
pmap_get_cpu_data(void)
{
	pmap_cpu_data_t *pmap_cpu_data = NULL;

#if XNU_MONITOR
	extern pmap_cpu_data_t* ml_get_ppl_cpu_data(void);
	pmap_cpu_data = ml_get_ppl_cpu_data();
#else /* XNU_MONITOR */
	/**
	 * On non-PPL systems, the pmap per-cpu data is stored in the general
	 * machine-specific per-cpu data.
	 */
	pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data;
#endif /* XNU_MONITOR */

	return pmap_cpu_data;
}
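
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * fetches the per-cpu data and reads a field such as cpu_number. Whether a
 * caller must first disable preemption depends on its context, so this
 * hypothetical helper only shows the shape of a call.
 */
#if 0 /* example only */
static unsigned int
pmap_cpu_number_example(void)
{
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	assert(pmap_cpu_data != NULL);

	/* The pmap layer's notion of this CPU's number. */
	return pmap_cpu_data->cpu_number;
}
#endif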

/**
 * Retrieve the pmap per-cpu data for the specified CPU index.
 *
 * @param cpu The index of the CPU whose pmap per-cpu data should be returned.
 *
 * @return The per-cpu pmap data for that CPU, or NULL if the CPU's data has
 *         not been initialized yet (possible only on non-PPL systems).
 */
pmap_cpu_data_t *
pmap_get_remote_cpu_data(unsigned int cpu)
{
#if XNU_MONITOR
	assert(cpu < MAX_CPUS);
	return &pmap_cpu_data_array[cpu].cpu_data;
#else /* XNU_MONITOR */
	cpu_data_t *cpu_data = cpu_datap((int)cpu);
	if (cpu_data == NULL) {
		return NULL;
	} else {
		return &cpu_data->cpu_pmap_cpu_data;
	}
#endif /* XNU_MONITOR */
}
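
/*
 * Illustrative sketch (not part of the original source): code walking every
 * CPU must tolerate a NULL return, since on non-PPL systems a CPU's data may
 * not have been initialized yet. Hypothetical example:
 */
#if 0 /* example only */
static void
pmap_for_each_cpu_data_example(void)
{
	for (unsigned int cpu = 0; cpu < MAX_CPUS; cpu++) {
		pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(cpu);
		if (cpu_data == NULL) {
			continue;
		}
		/* ...inspect or update cpu_data here... */
	}
}
#endif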

/**
 * Mark a physical page as needing a cache flush by setting
 * PVH_FLAG_FLUSH_NEEDED in its pv_head_table entry. Pages that are not
 * kernel-managed are silently ignored.
 *
 * @param pa The physical address of the page to mark.
 */
void
pmap_mark_page_for_cache_flush(pmap_paddr_t pa)
{
	if (!pa_valid(pa)) {
		return;
	}
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);
	pvh_lock(pai);
	pvh_set_flags(pvh, pvh_get_flags(pvh) | PVH_FLAG_FLUSH_NEEDED);
	pvh_unlock(pai);
}
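
/*
 * Illustrative sketch (not part of the original source): a consumer of
 * PVH_FLAG_FLUSH_NEEDED would test (and possibly clear) the flag under the
 * same PVH lock that pmap_mark_page_for_cache_flush() takes. The helper
 * below is hypothetical and only demonstrates the locking discipline.
 */
#if 0 /* example only */
static bool
pmap_page_needs_cache_flush_example(pmap_paddr_t pa)
{
	if (!pa_valid(pa)) {
		return false;
	}
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	pvh_lock(pai);
	const bool needs_flush = (pvh_get_flags(pvh) & PVH_FLAG_FLUSH_NEEDED) != 0;
	pvh_unlock(pai);

	return needs_flush;
}
#endif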

/**
 * Perform cache maintenance by physical address on one page of non-coherent
 * memory. This is only supported on configurations that provide the DC INCPA
 * operation; on all other configurations the function panics (and is
 * accordingly marked noreturn).
 *
 * @param paddr The page-aligned physical address of the page to flush.
 */
#if HAS_DC_INCPA
void
#else
void __attribute__((noreturn))
#endif
pmap_flush_noncoherent_page(pmap_paddr_t paddr __unused)
{
	assertf((paddr & PAGE_MASK) == 0, "%s: paddr 0x%llx not page-aligned",
	    __func__, (unsigned long long)paddr);

#if HAS_DC_INCPA
	for (unsigned int i = 0; i < (PAGE_SIZE >> 12); ++i) {
		const register uint64_t dc_arg asm("x8") = paddr + (i << 12);
		/**
		 * rdar://problem/106067403
		 * __asm__ __volatile__("dc incpa4k, %0" : : "r"(dc_arg));
		 */
		__asm__ __volatile__ (".long 0x201308" : : "r"(dc_arg));
	}
	__builtin_arm_dsb(DSB_OSH);
#else
	panic("%s called on unsupported configuration", __func__);
#endif /* HAS_DC_INCPA */
}
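
/*
 * Illustrative sketch (not part of the original source): the HAS_DC_INCPA
 * path above issues one cache-maintenance operation per 4 KiB granule of the
 * VM page; with 16 KiB pages, PAGE_SIZE >> 12 == 4, so operations are issued
 * at paddr + 0x0000, 0x1000, 0x2000, and 0x3000. Hypothetical helper:
 */
#if 0 /* example only */
static unsigned int
dc_incpa_ops_per_vm_page(void)
{
	/* One 4 KiB cache-maintenance operation per granule of a VM page. */
	return (unsigned int)(PAGE_SIZE >> 12);
}
#endif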

#if DEBUG || DEVELOPMENT
/**
 * Get the value of the WC/RT on non-DRAM mapping request counter.
 *
 * @return The value of the counter.
 */
unsigned int
pmap_wcrt_on_non_dram_count_get(void)
{
	return os_atomic_load(&pmap_wcrt_on_non_dram_count, relaxed);
}

/**
 * Atomically increment the WC/RT on non-DRAM mapping request counter.
 */
void
pmap_wcrt_on_non_dram_count_increment_atomic(void)
{
	os_atomic_inc(&pmap_wcrt_on_non_dram_count, relaxed);
}
#endif /* DEBUG || DEVELOPMENT */
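
/*
 * Illustrative sketch (not part of the original source): a mapping path that
 * detects a WC/RT attribute request targeting non-DRAM memory might record it
 * as follows; the surrounding condition and helper name are hypothetical.
 */
#if 0 /* example only */
static void
pmap_note_wcrt_on_non_dram(void)
{
#if DEBUG || DEVELOPMENT
	pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
}
#endif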