1 /*
2 * Copyright (c) 2020-2021, 2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <arm/cpu_data_internal.h>
29 #include <kern/queue.h>
30 #include <libkern/OSAtomic.h>
31 #include <libkern/section_keywords.h>
32 #include <pexpert/device_tree.h>
33 #include <os/atomic_private.h>
34 #include <vm/cpm.h>
35 #include <vm/vm_kern.h>
36 #include <vm/vm_protos.h>
37 #include <vm/vm_object.h>
38 #include <vm/vm_page.h>
39 #include <vm/vm_pageout.h>
40
41 #include <arm/pmap/pmap_internal.h>
42
43 /**
44 * Physical Page Attribute Table.
45 *
46 * Array that contains a set of flags for each kernel-managed physical VM page.
47 *
48 * @note There can be a disparity between the VM page size and the underlying
49 * hardware page size for a specific address space. In those cases, it's
50 * possible that multiple hardware pages will share the same set of
51 * attributes. The VM operates on memory in units of the VM page size
52 * and is aware that all hardware pages within each VM page share
53 * attributes.
54 */
55 SECURITY_READ_ONLY_LATE(volatile pp_attr_t*) pp_attr_table = (volatile pp_attr_t*)NULL;
56
57 /**
58 * Physical to Virtual Table.
59 *
60 * Data structure that contains a list of virtual mappings for each kernel-
61 * managed physical page. Other flags and metadata are also stored in this
62 * structure on a per-physical-page basis.
63 *
64 * This structure is arranged as an array of pointers, where each pointer can
65 * point to one of three different types of data (single mapping, multiple
66 * mappings, or page table descriptor). Metadata about each page (including the
67 * type of pointer) are located in the lower and upper bits of the pointer.
68 * type of pointer) is located in the lower and upper bits of the pointer.
69 * so it's recommended to use the provided API in pmap_data.h to access the
70 * pv_head_table since it handles these details for you.
71 */
72 SECURITY_READ_ONLY_LATE(pv_entry_t * *) pv_head_table = (pv_entry_t**)NULL;
73
74 /**
75 * Queue chain of userspace page table pages that can be quickly reclaimed by
76 * pmap_page_reclaim() in cases where a page can't easily be allocated
77 * the normal way, but the caller needs a page quickly.
78 */
79 static queue_head_t pt_page_list MARK_AS_PMAP_DATA;
80
81 /* Lock for pt_page_list. */
82 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pt_pages_lock, 0);
83
84 /* Simple linked-list structure used in various page free lists. */
85 typedef struct page_free_entry {
86 /**
87 * The first word in an empty page on a free list is used as a pointer to
88 * the next free page in the list.
89 */
90 struct page_free_entry *next;
91 } page_free_entry_t;
92
93 /* Represents a NULL entry in various page free lists. */
94 #define PAGE_FREE_ENTRY_NULL ((page_free_entry_t *) 0)
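/*
 * Free pages are chained through their first word: pushing a page prepends it
 * to a list head, and popping takes the head entry. See, e.g.,
 * pmap_page_reclaim() and (on PPL systems) pmap_give_free_ppl_page() /
 * pmap_get_free_ppl_page() below for this idiom in use.
 */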
95
96 /**
97 * pmap_page_reclaim() is called in critical, latency-sensitive code paths when
98 * either the VM doesn't have any pages available (on non-PPL systems), or the
99 * PPL page free lists are empty (on PPL systems). Before it attempts to reclaim
100 * a userspace page table page (which will have performance penalties), it will
101 * first try allocating a page from this high-priority free list.
102 *
103 * When the pmap is starved for memory and starts relying on
104 * pmap_page_reclaim() to allocate memory, the next page being freed will
105 * be placed onto this list for use only by pmap_page_reclaim(). Typically
106 * that page will be a userspace page table that was just reclaimed.
107 */
108 static page_free_entry_t *pmap_page_reclaim_list MARK_AS_PMAP_DATA = PAGE_FREE_ENTRY_NULL;
109
110 /**
111 * Current number of pending requests to reclaim a page table page. This is used
112 * as an indicator to pmap_pages_free() to place any freed pages into the high
113 * priority pmap_page_reclaim() free list so that the next invocations of
114 * pmap_page_reclaim() can use them. Typically this will be a userspace page
115 * table that was just reclaimed.
116 */
117 static unsigned int pmap_pages_request_count MARK_AS_PMAP_DATA = 0;
118
119 /**
120 * Total number of pages that have been requested from pmap_page_reclaim() since
121 * cold boot.
122 */
123 static unsigned long long pmap_pages_request_acum MARK_AS_PMAP_DATA = 0;
124
125 /* Lock for the pmap_page_reclaim() high-priority free list. */
126 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_page_reclaim_lock, 0);
127
128 #if XNU_MONITOR
129 /**
130 * The PPL cannot invoke the VM in order to allocate memory, so we must maintain
131 * a list of free pages that the PPL owns. The kernel can give the PPL
132 * additional pages by grabbing pages from the VM and marking them as PPL-owned.
133 * See pmap_alloc_page_for_ppl() for more information.
134 */
135 static page_free_entry_t *pmap_ppl_free_page_list MARK_AS_PMAP_DATA = PAGE_FREE_ENTRY_NULL;
136
137 /* The current number of pages in the PPL page free list. */
138 uint64_t pmap_ppl_free_page_count MARK_AS_PMAP_DATA = 0;
139
140 /* Lock for the PPL page free list. */
141 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_ppl_free_page_lock, 0);
142 #endif /* XNU_MONITOR */
143
144 /**
145 * This VM object will contain every VM page being used by the pmap. This acts
146 * as a convenient place to put pmap pages to keep the VM from reusing them, as
147 * well as providing a way to loop over every page being used by the pmap.
148 */
149 struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED;
150
151 /* Pointer to the pmap's VM object that can't be modified after machine_lockdown(). */
152 SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store;
153
154 /**
155 * Global variables strictly used for debugging purposes. These variables keep
156 * track of the total number of pages that have been allocated from the VM for
157 * pmap usage since cold boot, as well as how many are currently in use by the
158 * pmap. Once a page is given back to the VM, then the inuse_pmap_pages_count
159 * will be decremented.
160 *
161 * Even if a page is sitting in one of the pmap's various free lists and hasn't
162 * been allocated for use, it is still considered "used" by the pmap from
163 * the perspective of the VM.
164 */
165 static uint64_t alloc_pmap_pages_count __attribute__((aligned(8))) = 0LL;
166 unsigned int inuse_pmap_pages_count = 0;
167
168 /**
169 * Default watermark values used to keep a healthy supply of physical-to-virtual
170 * entries (PVEs) always available. These values can be overridden by the device
171 * tree (see pmap_compute_pv_targets() for more info).
172 */
173 #if XNU_MONITOR
174 /*
175 * Increase the padding for PPL devices to accommodate increased mapping
176 * pressure from IOMMUs. This isn't strictly necessary, but will reduce the need
177 * to retry mappings due to PV allocation failure.
178 */
179 #define PV_KERN_LOW_WATER_MARK_DEFAULT (0x400)
180 #define PV_ALLOC_CHUNK_INITIAL (0x400)
181 #define PV_KERN_ALLOC_CHUNK_INITIAL (0x400)
182 #else /* XNU_MONITOR */
183 #define PV_KERN_LOW_WATER_MARK_DEFAULT (0x200)
184 #define PV_ALLOC_CHUNK_INITIAL (0x200)
185 #define PV_KERN_ALLOC_CHUNK_INITIAL (0x200)
186 #endif /* XNU_MONITOR */
187
188 /**
189 * The pv_free array acts as a ring buffer where each entry points to a linked
190 * list of PVEs whose length is set by this define.
191 */
192 #define PV_BATCH_SIZE (PAGE_SIZE / sizeof(pv_entry_t))
193
194 /* The batch allocation code assumes that a batch can fit within a single page. */
195 #if defined(__arm64__) && __ARM_16K_PG__
196 /**
197 * PAGE_SIZE is a variable on arm64 systems with 4K VM pages, so no static
198 * assert on those systems.
199 */
200 static_assert((PV_BATCH_SIZE * sizeof(pv_entry_t)) <= PAGE_SIZE);
201 #endif /* defined(__arm64__) && __ARM_16K_PG__ */
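/*
 * Illustrative sizing only (the actual value depends on the configured page
 * size and the pv_entry_t layout): with 16 KB pages and a hypothetical
 * 16-byte pv_entry_t, PV_BATCH_SIZE would come out to 16384 / 16 = 1024 PVEs
 * per batch.
 */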
202
203 /**
204 * The number of PVEs to attempt to keep in the kernel-dedicated free list. If
205 * the number of entries is below this value, then allocate more.
206 */
207 static uint32_t pv_kern_low_water_mark MARK_AS_PMAP_DATA = PV_KERN_LOW_WATER_MARK_DEFAULT;
208
209 /**
210 * The initial number of PVEs to allocate during bootstrap (can be overridden in
211 * the device tree, see pmap_compute_pv_targets() for more info).
212 */
213 uint32_t pv_alloc_initial_target MARK_AS_PMAP_DATA = PV_ALLOC_CHUNK_INITIAL * MAX_CPUS;
214 uint32_t pv_kern_alloc_initial_target MARK_AS_PMAP_DATA = PV_KERN_ALLOC_CHUNK_INITIAL;
215
216 /**
217 * Global variables strictly used for debugging purposes. These variables keep
218 * track of the number of pages being used for PVE objects, and the total number
219 * of PVEs that have been added to the global or kernel-dedicated free lists
220 * respectively.
221 */
222 static uint32_t pv_page_count MARK_AS_PMAP_DATA = 0;
223 static unsigned pmap_reserve_replenish_stat MARK_AS_PMAP_DATA = 0;
224 static unsigned pmap_kern_reserve_alloc_stat MARK_AS_PMAP_DATA = 0;
225
226 /**
227 * Number of linked lists of PVEs ("batches") in the global PV free ring buffer.
228 * This must be a power of two for the pv_free_array_n_elems() logic to work.
229 */
230 #define PV_FREE_ARRAY_SIZE (256U)
231
232 /**
233 * A ring buffer where each entry in the buffer is a linked list of PV entries
234 * (called "batches"). Allocations out of this array always operate on
235 * PV_BATCH_SIZE entries at a time.
236 */
237 static pv_free_list_t pv_free_ring[PV_FREE_ARRAY_SIZE] MARK_AS_PMAP_DATA = {0};
238
239 /* Read and write indices for the pv_free ring buffer. */
240 static uint16_t pv_free_read_idx MARK_AS_PMAP_DATA = 0;
241 static uint16_t pv_free_write_idx MARK_AS_PMAP_DATA = 0;
242
243 /**
244 * Make sure the PV free array is small enough so that all elements can be
245 * properly indexed by pv_free_[read/write]_idx.
246 */
247 static_assert(PV_FREE_ARRAY_SIZE <= (1 << (sizeof(pv_free_read_idx) * 8)));
248
249 /**
250 * Return the number of free batches available for allocation out of the PV free
251 * ring buffer. Each batch is a linked list of PVEs with length PV_BATCH_SIZE.
252 *
253 * @note This function requires that PV_FREE_ARRAY_SIZE is a power of two.
254 */
255 static inline uint16_t
256 pv_free_array_n_elems(void)
257 {
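/*
 * Note: the read/write indices are uint16_t counters, so the subtraction
 * below may wrap; masking with (PV_FREE_ARRAY_SIZE - 1) keeps the result
 * correct. Illustrative example: with a write index of 5 and a read index of
 * 250, (5 - 250) underflows to 65291, and 65291 & 255 == 11 occupied batches.
 */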
258 return (pv_free_write_idx - pv_free_read_idx) & (PV_FREE_ARRAY_SIZE - 1);
259 }
260
261 /* Free list of PV entries dedicated for usage by the kernel. */
262 static pv_free_list_t pv_kern_free MARK_AS_PMAP_DATA = {0};
263
264 /* Locks for the global and kernel-dedicated PV free lists. */
265 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_free_array_lock, 0);
266 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pv_kern_free_list_lock, 0);
267
268 /* Represents a null page table descriptor (PTD). */
269 #define PTD_ENTRY_NULL ((pt_desc_t *) 0)
270
271 /* Running free list of PTD nodes. */
272 static pt_desc_t *ptd_free_list MARK_AS_PMAP_DATA = PTD_ENTRY_NULL;
273
274 /* The number of free PTD nodes available in the free list. */
275 static unsigned int ptd_free_count MARK_AS_PMAP_DATA = 0;
276
277 /**
278 * The number of PTD objects located in each page being used by the PTD
279 * allocator. The PTD objects share each page with their associated ptd_info_t
280 * objects (with cache-line alignment padding between them). The maximum number
281 * of PTDs that can be placed into a single page is calculated once at boot.
282 */
283 static SECURITY_READ_ONLY_LATE(unsigned) ptd_per_page = 0;
284
285 /**
286 * The offset in bytes from the beginning of a page of PTD objects where you
287 * start seeing the associated ptd_info_t objects. This is calculated once
288 * during boot to maximize the number of PTD and ptd_info_t objects that can
289 * reside within a page without sharing a cache-line.
290 */
291 static SECURITY_READ_ONLY_LATE(unsigned) ptd_info_offset = 0;
292
293 /* Lock to protect accesses to the PTD free list. */
294 static decl_simple_lock_data(, ptd_free_list_lock MARK_AS_PMAP_DATA);
295
296 /**
297 * Dummy _internal() prototypes so Clang doesn't complain about missing
298 * prototypes on a non-static function. These functions can't be marked as
299 * static because they need to be called from pmap_ppl_interface.c where the
300 * PMAP_SUPPORT_PROTOTYPES() macro auto-generates a prototype for them.
301 */
302 kern_return_t mapping_free_prime_internal(void);
303
304 #if XNU_MONITOR
305
306 /**
307 * These types and variables only exist on PPL-enabled systems because those are
308 * the only systems that need to allocate and manage ledger/pmap objects
309 * themselves. On non-PPL systems, those objects are allocated using a standard
310 * zone allocator.
311 */
312
313 /**
314 * Tie the maximum number of ledgers and pmap objects to the maximum number
315 * of tasks allowed on the system (at most,
316 * we'll have one pmap object per task). For ledger objects, give a small amount
317 * of extra padding to account for allocation differences between pmap objects
318 * and ledgers (i.e. ~10% of total number of iOS tasks = 200).
319 *
320 * These defines are only valid once `pmap_max_asids` is initialized in
321 * pmap_bootstrap() (the value can change depending on the device tree).
322 */
323 #define LEDGER_PTR_ARRAY_SIZE (pmap_max_asids + 200)
324 #define PMAP_PTR_ARRAY_SIZE (pmap_max_asids)
325
326 /**
327 * Each ledger object consists of a variable number of ledger entries that is
328 * determined by the template it's based on. The template used for pmap ledger
329 * objects is the task_ledgers template.
330 *
331 * This define attempts to calculate how large each pmap ledger needs to be
332 * based on how many ledger entries exist in the task_ledgers template. This is
333 * found by counting how many integers exist in the task_ledgers structure (each
334 * integer represents the index for a ledger_entry) and multiplying by the size
335 * of a single ledger entry. That value is then added to the other fields in a
336 * ledger structure to get the total size of a single pmap ledger.
337 *
338 * Some of the task ledger's entries use a smaller struct format. TASK_LEDGER_NUM_SMALL_INDICES
339 * is used to determine how much memory we need for those entries.
340 *
341 * This assumed size will get validated when the task_ledgers template is
342 * created and the system will panic if this calculation wasn't correct.
343 *
344 */
345 #define PMAP_LEDGER_DATA_BYTES \
346 (((sizeof(task_ledgers) / sizeof(int) - TASK_LEDGER_NUM_SMALL_INDICES) * sizeof(struct ledger_entry) \
347 + TASK_LEDGER_NUM_SMALL_INDICES * sizeof(struct ledger_entry_small)) \
348 + sizeof(struct ledger))
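/*
 * Purely illustrative example of the calculation above (the real counts come
 * from the task_ledgers template and are validated when the template is
 * created): if task_ledgers held 40 int indices and
 * TASK_LEDGER_NUM_SMALL_INDICES were 4, the total would be
 * 36 * sizeof(struct ledger_entry) + 4 * sizeof(struct ledger_entry_small)
 * + sizeof(struct ledger).
 */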
349
350 /**
351 * Opaque data structure that contains the exact number of bytes required to
352 * hold a single ledger object based off of the task_ledgers template.
353 */
354 typedef struct pmap_ledger_data {
355 uint8_t pld_data[PMAP_LEDGER_DATA_BYTES];
356 } pmap_ledger_data_t;
357
358 /**
359 * This struct contains the memory needed to hold a single ledger object used by
360 * the pmap as well as an index into the pmap_ledger_ptr_array used for
361 * validating ledger objects passed into the PPL.
362 */
363 typedef struct pmap_ledger {
364 /**
365 * Either contains the memory needed for a ledger object based on the
366 * task_ledgers template (if already allocated) or a pointer to the next
367 * ledger object in the free list if the object hasn't been allocated yet.
368 *
369 * This union has to be the first member of this struct so that the memory
370 * used by this struct can be correctly cast to a ledger_t and used
371 * as a normal ledger object by the standard ledger API.
372 */
373 union {
374 struct pmap_ledger_data pld_data;
375 struct pmap_ledger *next;
376 };
377
378 /**
379 * This extra piece of information (not normally associated with generic
380 * ledger_t objects) is used to validate that a ledger passed into the PPL
381 * is indeed a ledger that was allocated by the PPL, and not just random
382 * memory being passed off as a ledger object. See pmap_ledger_validate()
383 * for more information on validating ledger objects.
384 */
385 unsigned long array_index;
386 } pmap_ledger_t;
387
388 /**
389 * This variable is used to ensure that the size of the ledger objects being
390 * allocated by the PPL matches the actual size of a ledger object before
391 * any objects start being allocated.
392 */
393 static SECURITY_READ_ONLY_LATE(bool) pmap_ledger_size_verified = false;
394
395 /* Ledger free list lock. */
396 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_ledger_lock, 0);
397
398 /*
399 * The pmap_ledger_t contents are allowed to be written outside the PPL,
400 * so refcounts must be in a separate PPL-controlled array.
401 */
402 static SECURITY_READ_ONLY_LATE(os_refcnt_t *) pmap_ledger_refcnt = NULL;
403
404 /**
405 * The number of entries in the pmap ledger pointer and ledger refcnt arrays.
406 * This determines the maximum number of pmap ledger objects that can be
407 * allocated.
408 *
409 * This value might be slightly higher than LEDGER_PTR_ARRAY_SIZE because the
410 * memory used for the array is rounded up to the nearest page boundary.
411 */
412 static SECURITY_READ_ONLY_LATE(unsigned long) pmap_ledger_ptr_array_count = 0;
413
414 /**
415 * This array is used to validate that ledger objects passed into the PPL were
416 * allocated by the PPL and aren't just random memory being passed off as a
417 * ledger object. It does this by associating each ledger object allocated by
418 * the PPL with an index into this array. The value at that index will be a
419 * pointer to the ledger object itself.
420 *
421 * Even though the ledger object is kernel-writable, this array is only
422 * modifiable by the PPL. If a ledger object is passed into the PPL that has an
423 * index into this array that doesn't match up, then the validation will fail.
424 */
425 static SECURITY_READ_ONLY_LATE(pmap_ledger_t * *) pmap_ledger_ptr_array = NULL;
426
427 /**
428 * The next free index into pmap_ledger_ptr_array to be given to the next
429 * allocated ledger object.
430 */
431 static uint64_t pmap_ledger_ptr_array_free_index MARK_AS_PMAP_DATA = 0;
432
433 /* Free list of pmap ledger objects. */
434 static pmap_ledger_t *pmap_ledger_free_list MARK_AS_PMAP_DATA = NULL;
435
436 /**
437 * This struct contains the memory needed to hold a single pmap object as well
438 * as an index into the pmap_ptr_array used for validating pmap objects passed
439 * into the PPL.
440 */
441 typedef struct pmap_list_entry {
442 /**
443 * Either contains the memory needed for a single pmap object or a pointer to
444 * the next pmap object in the free list if the object hasn't been allocated
445 * yet.
446 *
447 * This union has to be the first member of this struct so that the memory
448 * used by this struct can be correctly cast as either a pmap_list_entry_t
449 * or a pmap_t (depending on whether the array_index is needed).
450 */
451 union {
452 struct pmap pmap;
453 struct pmap_list_entry *next;
454 };
455
456 /**
457 * This extra piece of information (not normally associated with generic
458 * pmap objects) is used to validate that a pmap object passed into the PPL
459 * is indeed a pmap object that was allocated by the PPL, and not just random
460 * memory being passed off as a pmap object. See validate_pmap()
461 * for more information on validating pmap objects.
462 */
463 unsigned long array_index;
464 } pmap_list_entry_t;
465
466 /* Lock for the pmap free list. */
467 static MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_free_list_lock, 0);
468
469 /**
470 * The number of entries in the pmap pointer array. This determines the maximum
471 * number of pmap objects that can be allocated.
472 *
473 * This value might be slightly higher than PMAP_PTR_ARRAY_SIZE because the
474 * memory used for the array is rounded up to the nearest page boundary.
475 */
476 static SECURITY_READ_ONLY_LATE(unsigned long) pmap_ptr_array_count = 0;
477
478 /**
479 * This array is used to validate that pmap objects passed into the PPL were
480 * allocated by the PPL and aren't just random memory being passed off as a pmap
481 * object. It does this by associating each pmap object allocated by the PPL
482 * with an index into this array. The value at that index will be a pointer to
483 * the pmap object itself.
484 *
485 * If a pmap object is passed into the PPL that has an index into this array
486 * that doesn't match up, then the validation will fail.
487 */
488 static SECURITY_READ_ONLY_LATE(pmap_list_entry_t * *) pmap_ptr_array = NULL;
489
490 /**
491 * The next free index into pmap_ptr_array to be given to the next
492 * allocated pmap object.
493 */
494 static unsigned long pmap_ptr_array_free_index MARK_AS_PMAP_DATA = 0;
495
496 /* Free list of pmap objects. */
497 static pmap_list_entry_t *pmap_free_list MARK_AS_PMAP_DATA = NULL;
498
499 #endif /* XNU_MONITOR */
500
501 /**
502 * Sorted representation of the pmap-io-ranges nodes in the device tree. These
503 * nodes describe all of the PPL-owned I/O ranges.
504 */
505 SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table = (pmap_io_range_t*)0;
506
507 /* The number of ranges described by io_attr_table. */
508 SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0;
509
510 /**
511 * Sorted representation of the pmap-io-filter entries in the device tree.
512 * The entries are sorted and queried by {signature, range}.
513 */
514 SECURITY_READ_ONLY_LATE(pmap_io_filter_entry_t*) io_filter_table = (pmap_io_filter_entry_t*)0;
515
516 /* Number of total pmap-io-filter entries. */
517 SECURITY_READ_ONLY_LATE(unsigned int) num_io_filter_entries = 0;
518
519 #if XNU_MONITOR
520
521 /**
522 * Per-cpu pmap data. On PPL-enabled systems, this memory is only modifiable by
523 * the PPL itself and because of that, needs to be managed separately from the
524 * generic per-cpu data. The per-cpu pmap data exists on non-PPL systems as
525 * well; it's just located within the general machine-specific per-cpu data.
526 */
527 struct pmap_cpu_data_array_entry pmap_cpu_data_array[MAX_CPUS] MARK_AS_PMAP_DATA;
528
529 /**
530 * The physical address ranges being used for the PPL stacks and PPL register
531 * save area are stored in global variables so that their permissions can be
532 * updated in pmap_static_allocations_done(). These regions are initialized by
533 * pmap_cpu_data_array_init().
534 */
535 SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_start_pa = 0;
536 SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_end_pa = 0;
537 SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_start = 0;
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_end = 0;
539
540 #if HAS_GUARDED_IO_FILTER
541 SECURITY_READ_ONLY_LATE(pmap_paddr_t) iofilter_stacks_start_pa = 0;
542 SECURITY_READ_ONLY_LATE(pmap_paddr_t) iofilter_stacks_end_pa = 0;
543 #endif /* HAS_GUARDED_IO_FILTER */
544
545 #endif /* XNU_MONITOR */
546
547 /* Prototypes used by pmap_data_bootstrap(). */
548 vm_size_t pmap_compute_io_rgns(void);
549 void pmap_load_io_rgns(void);
550 void pmap_cpu_data_array_init(void);
551
552 #if HAS_GUARDED_IO_FILTER
553 vm_size_t pmap_compute_io_filters(void);
554 void pmap_load_io_filters(void);
555 #endif /* HAS_GUARDED_IO_FILTER */
556
557 /**
558 * This function is called once during pmap_bootstrap() to allocate and
559 * initialize many of the core data structures that are implemented in this
560 * file.
561 *
562 * Memory for these data structures is carved out of `avail_start`, which is a
563 * global set up by arm_vm_init() that points to a physically contiguous region
564 * used for bootstrap allocations.
565 *
566 * @note There is no guaranteed alignment of `avail_start` when this function
567 * returns. If avail_start needs to be aligned to a specific value, the
568 * caller must align it before using it for more allocations.
569 */
570 void
571 pmap_data_bootstrap(void)
572 {
573 /**
574 * Set ptd_per_page to the maximum number of (pt_desc_t + ptd_info_t) we can
575 * fit in a single page. We need to allow for some padding between the two,
576 * so that no ptd_info_t shares a cache line with a pt_desc_t.
577 */
578 const unsigned ptd_info_size = sizeof(ptd_info_t) * PT_INDEX_MAX;
579 const unsigned l2_cline_bytes = 1 << MAX_L2_CLINE;
580 ptd_per_page = (PAGE_SIZE - (l2_cline_bytes - 1)) / (sizeof(pt_desc_t) + ptd_info_size);
581 unsigned increment = 0;
582 bool try_next = true;
583
584 /**
585 * The current ptd_per_page calculation was done assuming the worst-case
586 * scenario in terms of padding between the two object arrays that reside in
587 * the same page. The following loop attempts to optimize this further by
588 * finding the smallest possible amount of padding while still ensuring that
589 * the two object arrays don't share a cache line.
590 */
591 while (try_next) {
592 increment++;
593 const unsigned pt_desc_total_size =
594 PMAP_ALIGN((ptd_per_page + increment) * sizeof(pt_desc_t), l2_cline_bytes);
595 const unsigned ptd_info_total_size = (ptd_per_page + increment) * ptd_info_size;
596 try_next = (pt_desc_total_size + ptd_info_total_size) <= PAGE_SIZE;
597 }
598 ptd_per_page += increment - 1;
599 assert(ptd_per_page > 0);
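/*
 * Illustrative example with hypothetical sizes: with a 16 KB page, a 128-byte
 * L2 cache line, a 64-byte pt_desc_t, and a 16-byte ptd_info_t array per PTD,
 * the conservative estimate above gives (16384 - 127) / 80 = 203 PTDs, and the
 * loop then grows the count while the cache-line-aligned pt_desc_t array plus
 * the ptd_info_t array still fit within the page (204 in this example).
 */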
600
601 /**
602 * ptd_info objects reside after the ptd descriptor objects, with some
603 * padding in between if necessary to ensure that they don't co-exist in the
604 * same cache line.
605 */
606 const unsigned pt_desc_bytes = ptd_per_page * sizeof(pt_desc_t);
607 ptd_info_offset = PMAP_ALIGN(pt_desc_bytes, l2_cline_bytes);
608
609 /* The maximum amount of padding should be (l2_cline_bytes - 1). */
610 assert((ptd_info_offset - pt_desc_bytes) < l2_cline_bytes);
611
612 /**
613 * Allocate enough initial PTDs to map twice the available physical memory.
614 *
615 * To do this, start by calculating the number of leaf page tables that are
616 * needed to cover all of kernel-managed physical memory.
617 */
618 const uint32_t num_leaf_page_tables =
619 (uint32_t)(mem_size / ((PAGE_SIZE / sizeof(pt_entry_t)) * ARM_PGBYTES));
620
621 /**
622 * There should be one PTD per page table (times 2 since we want twice the
623 * number of required PTDs), plus round the number of PTDs up to the next
624 * `ptd_per_page` value so there's no wasted space.
625 */
626 const uint32_t ptd_root_table_n_ptds =
627 (ptd_per_page * ((num_leaf_page_tables * 2) / ptd_per_page)) + ptd_per_page;
628
629 /* Lastly, calculate the number of VM pages and bytes these PTDs take up. */
630 const uint32_t num_ptd_pages = ptd_root_table_n_ptds / ptd_per_page;
631 vm_size_t ptd_root_table_size = num_ptd_pages * PAGE_SIZE;
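/*
 * Illustrative example with a hypothetical configuration: with mem_size of
 * 8 GB, 16 KB pages, and 8-byte PTEs, each leaf table maps
 * (16384 / 8) * 16384 bytes = 32 MB, so num_leaf_page_tables is 256 and the
 * expression above sizes the pool for 2 * 256 = 512 PTDs, rounded up to a
 * whole number of ptd_per_page chunks.
 */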
632
633 /* Number of VM pages that span all of kernel-managed memory. */
634 const unsigned int npages = (unsigned int)atop(mem_size);
635
636 /* The pv_head_table and pp_attr_table both have one entry per VM page. */
637 const vm_size_t pp_attr_table_size = npages * sizeof(pp_attr_t);
638 const vm_size_t pv_head_size = round_page(npages * sizeof(pv_entry_t *));
639
640 /* Scan the device tree and override heuristics in the PV entry management code. */
641 pmap_compute_pv_targets();
642
643 /* Scan the device tree and figure out how many PPL-owned I/O regions there are. */
644 const vm_size_t io_attr_table_size = pmap_compute_io_rgns();
645
646 #if HAS_GUARDED_IO_FILTER
647 /* Scan the device tree for the size of pmap-io-filter entries. */
648 const vm_size_t io_filter_table_size = pmap_compute_io_filters();
649 #endif /* HAS_GUARDED_IO_FILTER */
650
651 /**
652 * Don't make any assumptions about the alignment of avail_start before
653 * execution of this function. Always re-align it to ensure the first
654 * allocated data structure is aligned correctly.
655 */
656 avail_start = PMAP_ALIGN(avail_start, __alignof(pp_attr_t));
657
658 /**
659 * Keep track of where the data structures start so we can clear this memory
660 * later.
661 */
662 const pmap_paddr_t pmap_struct_start = avail_start;
663
664 pp_attr_table = (pp_attr_t *)phystokv(avail_start);
665 avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pmap_io_range_t));
666
667 io_attr_table = (pmap_io_range_t *)phystokv(avail_start);
668
669 #if HAS_GUARDED_IO_FILTER
670 /* Align avail_start to size of I/O filter entry. */
671 avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pmap_io_filter_entry_t));
672
673 /* Allocate memory for io_filter_table. */
674 if (num_io_filter_entries != 0) {
675 io_filter_table = (pmap_io_filter_entry_t *)phystokv(avail_start);
676 }
677
678 /* Align avail_start for the next structure to be allocated. */
679 avail_start = PMAP_ALIGN(avail_start + io_filter_table_size, __alignof(pv_entry_t *));
680 #else /* !HAS_GUARDED_IO_FILTER */
681 avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pv_entry_t *));
682 #endif /* HAS_GUARDED_IO_FILTER */
683
684 pv_head_table = (pv_entry_t **)phystokv(avail_start);
685
686 /**
687 * ptd_root_table must start on a page boundary because all of the math for
688 * associating pt_desc_t objects with ptd_info objects assumes the first
689 * pt_desc_t in a page starts at the beginning of the page it resides in.
690 */
691 avail_start = round_page(avail_start + pv_head_size);
692
693 pt_desc_t *ptd_root_table = (pt_desc_t *)phystokv(avail_start);
694 avail_start = round_page(avail_start + ptd_root_table_size);
695
696 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
697
698 /* This function assumes that ptd_root_table has been zeroed out already. */
699 ptd_bootstrap(ptd_root_table, num_ptd_pages);
700
701 /* Load data about the PPL-owned I/O regions into io_attr_table and sort it. */
702 pmap_load_io_rgns();
703
704 #if HAS_GUARDED_IO_FILTER
705 /* Load the I/O filters into io_filter_table and sort them. */
706 pmap_load_io_filters();
707 #endif /* HAS_GUARDED_IO_FILTER */
708
709 #if XNU_MONITOR
710 /**
711 * Each of these PPL-only data structures are rounded to the nearest page
712 * beyond their predefined size so as to provide a small extra buffer of
713 * objects and to make it easy to perform page-sized operations on them if
714 * the need ever arises.
715 */
716 const vm_map_address_t pmap_ptr_array_begin = phystokv(avail_start);
717 pmap_ptr_array = (pmap_list_entry_t**)pmap_ptr_array_begin;
718 avail_start += round_page(PMAP_PTR_ARRAY_SIZE * sizeof(*pmap_ptr_array));
719 const vm_map_address_t pmap_ptr_array_end = phystokv(avail_start);
720
721 pmap_ptr_array_count = ((pmap_ptr_array_end - pmap_ptr_array_begin) / sizeof(*pmap_ptr_array));
722
723 const vm_map_address_t pmap_ledger_ptr_array_begin = phystokv(avail_start);
724 pmap_ledger_ptr_array = (pmap_ledger_t**)pmap_ledger_ptr_array_begin;
725 avail_start += round_page(LEDGER_PTR_ARRAY_SIZE * sizeof(*pmap_ledger_ptr_array));
726 const vm_map_address_t pmap_ledger_ptr_array_end = phystokv(avail_start);
727 pmap_ledger_ptr_array_count = ((pmap_ledger_ptr_array_end - pmap_ledger_ptr_array_begin) / sizeof(*pmap_ledger_ptr_array));
728
729 pmap_ledger_refcnt = (os_refcnt_t*)phystokv(avail_start);
730 avail_start += round_page(pmap_ledger_ptr_array_count * sizeof(*pmap_ledger_refcnt));
731 #endif /* XNU_MONITOR */
732
733 /**
734 * Setup the pmap per-cpu data structures (includes the PPL stacks, and PPL
735 * register save area). The pmap per-cpu data is managed separately from the
736 * general machine-specific per-cpu data on PPL systems so it can be made
737 * only writable by the PPL.
738 */
739 pmap_cpu_data_array_init();
740 }
741
742 /**
743 * Helper function for pmap_page_reclaim (hereafter shortened to "ppr") which scans
744 * the list of userspace page table pages for one(s) that can be reclaimed. To
745 * be eligible, a page table must not have any wired PTEs, must contain at least
746 * one valid PTE, can't be nested, and the pmap that owns that page table must
747 * not already be locked.
748 *
749 * @note This should only be called from pmap_page_reclaim().
750 *
751 * @note If an eligible page table was found, then the pmap which contains that
752 * page table will be locked exclusively.
753 *
754 * @note On systems where multiple page tables exist within one page, all page
755 * tables within a page have to be eligible for that page to be considered
756 * reclaimable.
757 *
758 * @param ptdpp Output parameter which will contain a pointer to the page table
759 * descriptor for the page table(s) that can be reclaimed (if any
760 * were found). If no page table was found, this will be set to
761 * NULL.
762 *
763 * @return True if an eligible table was found, false otherwise. In the case
764 * that a page table was found, ptdpp will be a pointer to the page
765 * table descriptor for the table(s) that can be reclaimed. Otherwise
766 * it'll be set to NULL.
767 */
768 MARK_AS_PMAP_TEXT static bool
769 ppr_find_eligible_pt_page(pt_desc_t **ptdpp)
770 {
771 assert(ptdpp != NULL);
772
773 pmap_simple_lock(&pt_pages_lock);
774 pt_desc_t *ptdp = (pt_desc_t *)queue_first(&pt_page_list);
775
776 while (!queue_end(&pt_page_list, (queue_entry_t)ptdp)) {
777 /* Skip this pmap if it's nested or already locked. */
778 if ((ptdp->pmap->type != PMAP_TYPE_USER) ||
779 (!pmap_try_lock(ptdp->pmap, PMAP_LOCK_EXCLUSIVE))) {
780 ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
781 continue;
782 }
783
784 assert(ptdp->pmap != kernel_pmap);
785
786 unsigned refcnt_acc = 0;
787 unsigned wiredcnt_acc = 0;
788 const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
789
790 /**
791 * On systems where the VM page size differs from the hardware
792 * page size, then multiple page tables can exist within one VM page.
793 */
794 for (unsigned i = 0; i < (PAGE_SIZE / pt_attr_page_size(pt_attr)); i++) {
795 /* Do not attempt to free a page that contains an L2 table. */
796 if (ptdp->ptd_info[i].refcnt == PT_DESC_REFCOUNT) {
797 refcnt_acc = 0;
798 break;
799 }
800
801 refcnt_acc += ptdp->ptd_info[i].refcnt;
802 wiredcnt_acc += ptdp->ptd_info[i].wiredcnt;
803 }
804
805 /**
806 * If we've found a page with no wired entries but at least one valid
807 * PTE, choose it for reclamation.
808 */
809 if ((wiredcnt_acc == 0) && (refcnt_acc != 0)) {
810 *ptdpp = ptdp;
811 pmap_simple_unlock(&pt_pages_lock);
812
813 /**
814 * Leave ptdp->pmap locked here. We're about to reclaim a page table
815 * from it, so we don't want anyone else messing with it while we do
816 * that.
817 */
818 return true;
819 }
820
821 /**
822 * This page table/PTD wasn't eligible, unlock its pmap and move to the
823 * next one in the queue.
824 */
825 pmap_unlock(ptdp->pmap, PMAP_LOCK_EXCLUSIVE);
826 ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
827 }
828
829 pmap_simple_unlock(&pt_pages_lock);
830 *ptdpp = NULL;
831
832 return false;
833 }
834
835 /**
836 * Helper function for pmap_page_reclaim (hereafter shortened to "ppr") which frees
837 * every page table within a page so that that page can get reclaimed.
838 *
839 * @note This should only be called from pmap_page_reclaim() and is only meant
840 * to delete page tables deemed eligible for reclaiming by
841 * ppr_find_eligible_pt_page().
842 *
843 * @param ptdp The page table descriptor whose page table(s) will get freed.
844 *
845 * @return KERN_SUCCESS on success, or KERN_ABORTED if the page was not
846 * removed because preemption is pending.
847 */
848 MARK_AS_PMAP_TEXT static kern_return_t
849 ppr_remove_pt_page(pt_desc_t *ptdp)
850 {
851 assert(ptdp != NULL);
852
853 bool need_strong_sync = false;
854 tt_entry_t *ttep = TT_ENTRY_NULL;
855 pt_entry_t *ptep = PT_ENTRY_NULL;
856 pt_entry_t *begin_pte = PT_ENTRY_NULL;
857 pt_entry_t *end_pte = PT_ENTRY_NULL;
858 pmap_t pmap = ptdp->pmap;
859
860 /**
861 * The pmap's exclusive lock should have been acquired when the eligible page
862 * table was found in ppr_find_eligible_pt_page().
863 */
864 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
865
866 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
867 const uint64_t hw_page_size = pt_attr_page_size(pt_attr);
868
869 /**
870 * On some systems, one page table descriptor can represent multiple page
871 * tables. In that case, remove every table within the wanted page so we
872 * can reclaim it.
873 */
874 for (unsigned i = 0; i < (PAGE_SIZE / hw_page_size); i++) {
875 const vm_map_address_t va = ptdp->va[i];
876
877 /**
878 * If the VA is bogus, this may represent an unallocated region or one
879 * which is in transition (already being freed or expanded). Don't try
880 * to remove mappings here.
881 */
882 if (va == (vm_offset_t)-1) {
883 continue;
884 }
885
886 /* Get the twig table entry that points to the table to reclaim. */
887 ttep = pmap_tte(pmap, va);
888
889 /* If the twig entry is either invalid or a block mapping, skip it. */
890 if ((ttep == TT_ENTRY_NULL) ||
891 ((*ttep & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) {
892 continue;
893 }
894
895 ptep = (pt_entry_t *)ttetokv(*ttep);
896 begin_pte = &ptep[pte_index(pt_attr, va)];
897 end_pte = begin_pte + (hw_page_size / sizeof(pt_entry_t));
898 vm_map_address_t eva = 0;
899
900 /**
901 * Remove all mappings in the page table being reclaimed.
902 *
903 * Use PMAP_OPTIONS_REMOVE to clear any "compressed" markers and
904 * update the "compressed" counter in the ledger. This means that
905 * we lose accounting for any compressed pages in this range but the
906 * alternative is to not be able to account for their future
907 * decompression, which could cause the counter to drift more and
908 * more.
909 */
910 int pte_changed = pmap_remove_range_options(
911 pmap, va, begin_pte, end_pte, &eva, &need_strong_sync, PMAP_OPTIONS_REMOVE);
912
913 const vm_offset_t expected_va_end = va + (size_t)pt_attr_leaf_table_size(pt_attr);
914
915 if (eva == expected_va_end) {
916 /**
917 * Free the page table now that all of its mappings have been removed.
918 * Once all page tables within a page have been deallocated, then the
919 * page that contains the table(s) will be freed and made available for
920 * reuse.
921 */
922 pmap_tte_deallocate(pmap, va, expected_va_end, need_strong_sync, ttep, pt_attr_twig_level(pt_attr));
923 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE); /* pmap_tte_deallocate() dropped the lock */
924 } else {
925 /**
926 * pmap_remove_range_options() returned earlier than expected,
927 * indicating that urgent preemption is pending. We should bail
928 * out, even though some of the mappings were removed in vain.
929 * They will have to take the penalty of page faults to be brought
930 * back, but that's better than missing the preemption deadline
931 * and panicking.
932 */
933 assert(eva < expected_va_end);
934
935 /**
936 * In the normal path, we expect pmap_tte_deallocate() to flush
937 * the TLB for us. On this abort path, however, we need to handle
938 * it explicitly: if any mapping was updated, flush the TLB.
939 */
940 if (pte_changed > 0) {
941 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (size_t) (eva - va), pmap, false, need_strong_sync);
942 arm64_sync_tlb(need_strong_sync);
943 }
944
945 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
946 return KERN_ABORTED;
947 }
948 }
949
950 /**
951 * We're done modifying page tables, so undo the lock that was grabbed when
952 * we found the table(s) to reclaim in ppr_find_eligible_pt_page().
953 */
954 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
955 return KERN_SUCCESS;
956 }
957
958 /**
959 * Attempt to return a page by freeing an active page-table page. To be eligible
960 * for reclaiming, a page-table page must be assigned to a non-kernel pmap, it
961 * must not have any wired PTEs and must contain at least one valid PTE.
962 *
963 * @note This function is potentially invoked when PMAP_PAGE_RECLAIM_NOWAIT is
964 * passed as an option to pmap_pages_alloc_zeroed().
965 *
966 * @note Invocations of this function are only meant to occur in critical paths
967 * that absolutely can't take the latency hit of waiting for the VM or
968 * jumping out of the PPL to allocate more pages. Reclaiming a page table
969 * page can cause a performance hit when one of the removed mappings is
970 * next accessed (forcing the VM to fault and re-insert the mapping).
971 *
972 * @return The physical address of the page that was allocated, or zero if no
973 * suitable page was found on the page-table list.
974 */
975 MARK_AS_PMAP_TEXT static pmap_paddr_t
976 pmap_page_reclaim(void)
977 {
978 pmap_simple_lock(&pmap_page_reclaim_lock);
979 pmap_pages_request_count++;
980 pmap_pages_request_acum++;
981
982 /* This loop never breaks out; the function just returns from within it. */
983 while (1) {
984 /**
985 * Attempt to allocate a page from the page free list reserved for this
986 * function. This free list is managed in tandem with pmap_pages_free()
987 * which will add a page to this list for each call to
988 * pmap_page_reclaim(). Most likely that page will come from a reclaimed
989 * userspace page table, but if there aren't any page tables to reclaim,
990 * then whatever the next freed page is will show up on this list for
991 * the next invocation of pmap_page_reclaim() to use.
992 */
993 if (pmap_page_reclaim_list != PAGE_FREE_ENTRY_NULL) {
994 page_free_entry_t *page_entry = pmap_page_reclaim_list;
995 pmap_page_reclaim_list = pmap_page_reclaim_list->next;
996 pmap_simple_unlock(&pmap_page_reclaim_lock);
997
998 return ml_static_vtop((vm_offset_t)page_entry);
999 }
1000
1001 /* Drop the lock to allow pmap_pages_free() to add pages to the list. */
1002 pmap_simple_unlock(&pmap_page_reclaim_lock);
1003
1004 /* Attempt to find an eligible page table page to reclaim. */
1005 pt_desc_t *ptdp = NULL;
1006 bool found_page = ppr_find_eligible_pt_page(&ptdp);
1007
1008 if (!found_page) {
1009 /**
1010 * No eligible page table was found. pmap_pages_free() will still
1011 * add the next freed page to the reclaim free list, so the next
1012 * invocation of this function should have better luck.
1013 */
1014 return (pmap_paddr_t)0;
1015 }
1016
1017 /**
1018 * If we found a page table to reclaim, then ptdp should point to the
1019 * descriptor for that table. Go ahead and remove it.
1020 */
1021 if (ppr_remove_pt_page(ptdp) != KERN_SUCCESS) {
1022 /* Take the page not found path to bail out on pending preemption. */
1023 return (pmap_paddr_t)0;
1024 }
1025
1026 /**
1027 * Now that a page has hopefully been freed (and added to the reclaim
1028 * page list), the next iteration of the loop will re-check the reclaim
1029 * free list.
1030 */
1031 pmap_simple_lock(&pmap_page_reclaim_lock);
1032 }
1033 }
1034
1035 #if XNU_MONITOR
1036 /**
1037 * Helper function for returning a PPL page back to the PPL page free list.
1038 *
1039 * @param pa Physical address of the page to add to the PPL page free list.
1040 * This address must be aligned to the VM page size.
1041 */
1042 MARK_AS_PMAP_TEXT static void
1043 pmap_give_free_ppl_page(pmap_paddr_t pa)
1044 {
1045 if ((pa & PAGE_MASK) != 0) {
1046 panic("%s: Unaligned address passed in, pa=0x%llx",
1047 __func__, pa);
1048 }
1049
1050 page_free_entry_t *page_entry = (page_free_entry_t *)phystokv(pa);
1051 pmap_simple_lock(&pmap_ppl_free_page_lock);
1052
1053 /* Prepend the passed in page to the PPL page free list. */
1054 page_entry->next = pmap_ppl_free_page_list;
1055 pmap_ppl_free_page_list = page_entry;
1056 pmap_ppl_free_page_count++;
1057
1058 pmap_simple_unlock(&pmap_ppl_free_page_lock);
1059 }
1060
1061 /**
1062 * Helper function for getting a PPL page from the PPL page free list.
1063 *
1064 * @return The physical address of the page taken from the PPL page free list,
1065 * or zero if there are no pages left in the free list.
1066 */
1067 MARK_AS_PMAP_TEXT static pmap_paddr_t
1068 pmap_get_free_ppl_page(void)
1069 {
1070 pmap_paddr_t pa = 0;
1071
1072 pmap_simple_lock(&pmap_ppl_free_page_lock);
1073
1074 if (pmap_ppl_free_page_list != PAGE_FREE_ENTRY_NULL) {
1075 /**
1076 * Pop a page off the front of the list. The second item in the list
1077 * will become the new head.
1078 */
1079 page_free_entry_t *page_entry = pmap_ppl_free_page_list;
1080 pmap_ppl_free_page_list = pmap_ppl_free_page_list->next;
1081 pa = kvtophys_nofail((vm_offset_t)page_entry);
1082 pmap_ppl_free_page_count--;
1083 } else {
1084 pa = 0L;
1085 }
1086
1087 pmap_simple_unlock(&pmap_ppl_free_page_lock);
1088 assert((pa & PAGE_MASK) == 0);
1089
1090 return pa;
1091 }
1092
1093 /**
1094 * Claim a page on behalf of the PPL by marking it as PPL-owned and only
1095 * allowing the PPL to write to it. It can also add the page to the
1096 * PPL page free list (see the initially_free parameter).
1097 *
1098 * @note The page cannot have any mappings outside of the physical aperture.
1099 *
1100 * @param pa The physical address of the page to mark as PPL-owned.
1101 * @param initially_free Should the page be added to the PPL page free list.
1102 * This is typically "true" if a brand new page was just
1103 * allocated for the PPL's usage, and "false" if this is a
1104 * page already being used by other agents (e.g., IOMMUs).
1105 */
1106 MARK_AS_PMAP_TEXT void
1107 pmap_mark_page_as_ppl_page_internal(pmap_paddr_t pa, bool initially_free)
1108 {
1109 pp_attr_t attr = 0;
1110
1111 if (!pa_valid(pa)) {
1112 panic("%s: Non-kernel-managed (maybe I/O) address passed in, pa=0x%llx",
1113 __func__, pa);
1114 }
1115
1116 const unsigned int pai = pa_index(pa);
1117 pvh_lock(pai);
1118
1119 /* A page that the PPL already owns can't be given to the PPL. */
1120 if (__improbable(ppattr_pa_test_monitor(pa))) {
1121 panic("%s: page already belongs to PPL, pa=0x%llx", __func__, pa);
1122 }
1123
1124 if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
1125 panic("%s: page locked down, pa=0x%llx", __func__, pa);
1126 }
1127
1128 /* The page cannot be mapped outside of the physical aperture. */
1129 if (__improbable(!pmap_verify_free((ppnum_t)atop(pa)))) {
1130 panic("%s: page still has mappings, pa=0x%llx", __func__, pa);
1131 }
1132
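/*
 * Atomically set PP_ATTR_MONITOR on the page, retrying if the attribute word
 * changes underneath us, and panic if the page has been explicitly excluded
 * from PPL ownership.
 */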
1133 do {
1134 attr = pp_attr_table[pai];
1135 if (__improbable(attr & PP_ATTR_NO_MONITOR)) {
1136 panic("%s: page excluded from PPL, pa=0x%llx", __func__, pa);
1137 }
1138 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_MONITOR, &pp_attr_table[pai]));
1139
1140 /* Ensure only the PPL has write access to the physical aperture mapping. */
1141 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
1142
1143 pvh_unlock(pai);
1144
1145 if (initially_free) {
1146 pmap_give_free_ppl_page(pa);
1147 }
1148 }
1149
1150 /**
1151 * Helper function for converting a PPL page back into a kernel-writable page.
1152 * This removes the PPL-ownership for that page and updates the physical
1153 * aperture mapping of that page so it's kernel-writable again.
1154 *
1155 * @param pa The physical address of the PPL page to be made kernel-writable.
1156 */
1157 MARK_AS_PMAP_TEXT void
1158 pmap_mark_page_as_kernel_page(pmap_paddr_t pa)
1159 {
1160 const unsigned int pai = pa_index(pa);
1161 pvh_lock(pai);
1162
1163 if (!ppattr_pa_test_monitor(pa)) {
1164 panic("%s: page is not a PPL page, pa=%p", __func__, (void *)pa);
1165 }
1166
1167 ppattr_pa_clear_monitor(pa);
1168
1169 /* Ensure the kernel has write access to the physical aperture mapping. */
1170 pmap_set_xprr_perm(pai, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
1171
1172 pvh_unlock(pai);
1173 }
1174
1175 /**
1176 * PPL helper function for giving a single page from the PPL page free list back
1177 * to the kernel.
1178 *
1179 * @note This function implements the logic that HAS to run within the PPL for
1180 * the pmap_release_ppl_pages_to_kernel() call. This helper function
1181 * shouldn't be called directly.
1182 *
1183 * @note A minimum number of pages (set by PMAP_MIN_FREE_PPL_PAGES) will always
1184 * be kept on the PPL page free list to ensure that core operations can
1185 * occur without having to refill the free list.
1186 *
1187 * @return The physical address of the page that's been returned to the kernel,
1188 * or zero if no page was returned.
1189 */
1190 MARK_AS_PMAP_TEXT pmap_paddr_t
1191 pmap_release_ppl_pages_to_kernel_internal(void)
1192 {
1193 pmap_paddr_t pa = 0;
1194
1195 if (pmap_ppl_free_page_count <= PMAP_MIN_FREE_PPL_PAGES) {
1196 return 0;
1197 }
1198
1199 pa = pmap_get_free_ppl_page();
1200
1201 if (!pa) {
1202 return 0;
1203 }
1204
1205 pmap_mark_page_as_kernel_page(pa);
1206
1207 return pa;
1208 }
1209 #endif /* XNU_MONITOR */
1210
1211 /**
1212 * Add a queue of VM pages to the pmap's VM object. This informs the VM that
1213 * these pages are being used by the pmap and shouldn't be reused.
1214 *
1215 * This also means that the pmap_object can be used as a convenient way to loop
1216 * through every page currently being used by the pmap. For instance, this queue
1217 * of pages is exposed to the debugger through the Low Globals, where it's used
1218 * to ensure that all pmap data is saved in an active core dump.
1219 *
1220 * @param mem The head of the queue of VM pages to add to the pmap's VM object.
1221 */
1222 void
1223 pmap_enqueue_pages(vm_page_t mem)
1224 {
1225 vm_page_t m_prev;
1226 vm_object_lock(pmap_object);
1227 while (mem != VM_PAGE_NULL) {
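/*
 * Pages are inserted into pmap_object at an offset equal to the page's
 * physical offset from the start of kernel-managed physical memory
 * (gPhysBase), so the object is effectively indexed by physical address.
 */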
1228 const vm_object_offset_t offset =
1229 (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(mem))) - gPhysBase);
1230
1231 vm_page_insert_wired(mem, pmap_object, offset, VM_KERN_MEMORY_PTE);
1232 m_prev = mem;
1233 mem = NEXT_PAGE(m_prev);
1234 *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL;
1235 }
1236 vm_object_unlock(pmap_object);
1237 }
1238
1239 static inline boolean_t
1240 pmap_is_preemptible(void)
1241 {
1242 return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT);
1243 }
1244
1245 /**
1246 * Allocate a page for usage within the pmap and zero it out. If running on a
1247 * PPL-enabled system, this will allocate pages from the PPL page free list.
1248 * Otherwise pages are grabbed directly from the VM.
1249 *
1250 * @note On PPL-enabled systems, this function can ONLY be called from within
1251 * the PPL. If a page needs to be allocated from outside of the PPL on
1252 * these systems, then use pmap_alloc_page_for_kern().
1253 *
1254 * @param pa Output parameter to store the physical address of the allocated
1255 * page if one was able to be allocated (zero otherwise).
1256 * @param size The amount of memory to allocate. This has to be PAGE_SIZE on
1257 * PPL-enabled systems. On other systems it can be either PAGE_SIZE
1258 * or 2*PAGE_SIZE, in which case the two pages are allocated
1259 * physically contiguous.
1260 * @param options The following options can be specified:
1261 * - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM or PPL page free list don't have
1262 * any free pages available then don't wait for one, just return
1263 * immediately without allocating a page. PPL-enabled systems must ALWAYS
1264 * pass this flag since code running in the PPL has preemption disabled
1265 * and therefore can't spin or block waiting for memory.
1266 *
1267 * - PMAP_PAGE_RECLAIM_NOWAIT: If memory failed to get allocated the normal
1268 * way (either by the PPL page free list on PPL-enabled systems, or
1269 * through the VM on other systems), then fall back to attempting to
1270 * reclaim a userspace page table. This should only be specified in paths
1271 * that absolutely can't take the latency hit of waiting for the VM or
1272 * jumping out of the PPL to allocate more pages.
1273 *
1274 * @return KERN_SUCCESS if a page was successfully allocated, or
1275 * KERN_RESOURCE_SHORTAGE if a page failed to get allocated. This can
1276 * also be returned on non-PPL devices if preemption is disabled after
1277 * early boot since allocating memory from the VM requires grabbing a
1278 * mutex.
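 *
 * Illustrative (hypothetical) call pattern for a caller that can't block:
 *
 *     pmap_paddr_t pa = 0;
 *     kern_return_t kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE,
 *         PMAP_PAGES_ALLOCATE_NOWAIT | PMAP_PAGE_RECLAIM_NOWAIT);
 *     if (kr != KERN_SUCCESS) {
 *         // No page available; the caller must back off and retry later
 *         // (on PPL systems, typically by exiting the PPL to refill the
 *         // free list via pmap_alloc_page_for_ppl()).
 *     }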
1279 */
1280 MARK_AS_PMAP_TEXT kern_return_t
1281 pmap_pages_alloc_zeroed(pmap_paddr_t *pa, unsigned size, unsigned options)
1282 {
1283 assert(pa != NULL);
1284
1285 #if XNU_MONITOR
1286 ASSERT_NOT_HIBERNATING();
1287
1288 /* The PPL page free list always operates on PAGE_SIZE chunks of memory. */
1289 if (size != PAGE_SIZE) {
1290 panic("%s: size != PAGE_SIZE, pa=%p, size=%u, options=%u",
1291 __func__, pa, size, options);
1292 }
1293
1294 /* Allocating memory in the PPL can't wait since preemption is disabled. */
1295 assert(options & PMAP_PAGES_ALLOCATE_NOWAIT);
1296
1297 *pa = pmap_get_free_ppl_page();
1298
1299 if ((*pa == 0) && (options & PMAP_PAGE_RECLAIM_NOWAIT)) {
1300 *pa = pmap_page_reclaim();
1301 }
1302
1303 if (*pa == 0) {
1304 return KERN_RESOURCE_SHORTAGE;
1305 } else {
1306 bzero((void*)phystokv(*pa), size);
1307 return KERN_SUCCESS;
1308 }
1309 #else /* XNU_MONITOR */
1310 vm_page_t mem = VM_PAGE_NULL;
1311 thread_t self = current_thread();
1312
1313 /**
1314 * It's not possible to allocate memory from the VM in a preemption disabled
1315 * environment except during early boot (since the VM needs to grab a mutex).
1316 * In those cases just return a resource shortage error and let the caller
1317 * deal with it.
1318 */
1319 if (!pmap_is_preemptible()) {
1320 return KERN_RESOURCE_SHORTAGE;
1321 }
1322
1323 /**
1324 * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
1325 * the VM of this.
1326 *
1327 * This field should only be modified by the local thread itself, so no lock
1328 * needs to be taken.
1329 */
1330 uint16_t thread_options = self->options;
1331 self->options |= TH_OPT_VMPRIV;
1332
1333 if (__probable(size == PAGE_SIZE)) {
1334 /**
1335 * If we're only allocating a single page, just grab one off the VM's
1336 * global page free list.
1337 */
1338 while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
1339 if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1340 break;
1341 }
1342
1343 VM_PAGE_WAIT();
1344 }
1345
1346 if (mem != VM_PAGE_NULL) {
1347 vm_page_lock_queues();
1348 vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
1349 vm_page_unlock_queues();
1350 }
1351 } else if (size == (2 * PAGE_SIZE)) {
1352 /**
1353 * Allocate two physically contiguous pages. Two arbitrary pages
1354 * obtained from the VM's global page free list aren't guaranteed to be
1355 * contiguous, so we need to use the cpm_allocate() API.
1356 */
1357 while (cpm_allocate(size, &mem, 0, 1, TRUE, 0) != KERN_SUCCESS) {
1358 if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1359 break;
1360 }
1361
1362 VM_PAGE_WAIT();
1363 }
1364 } else {
1365 panic("%s: invalid size %u", __func__, size);
1366 }
1367
1368 self->options = thread_options;
1369
1370 /**
1371 * If the normal method of allocating pages failed, then potentially fall
1372 * back to attempting to reclaim a userspace page table.
1373 */
1374 if ((mem == VM_PAGE_NULL) && (options & PMAP_PAGE_RECLAIM_NOWAIT)) {
1375 assert(size == PAGE_SIZE);
1376 *pa = pmap_page_reclaim();
1377 if (*pa != 0) {
1378 bzero((void*)phystokv(*pa), size);
1379 return KERN_SUCCESS;
1380 }
1381 }
1382
1383 if (mem == VM_PAGE_NULL) {
1384 return KERN_RESOURCE_SHORTAGE;
1385 }
1386
1387 *pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));
1388
1389 /* Add the allocated VM page(s) to the pmap's VM object. */
1390 pmap_enqueue_pages(mem);
1391
1392 /* Pages are considered "in use" by the pmap until returned to the VM. */
1393 OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count);
1394 OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count);
1395
1396 bzero((void*)phystokv(*pa), size);
1397 return KERN_SUCCESS;
1398 #endif /* XNU_MONITOR */
1399 }
1400
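/*
 * Illustrative caller sketch (hypothetical, not part of the pmap): how a
 * caller that cannot block is expected to drive the allocator above. Only
 * pmap_pages_alloc_zeroed(), pmap_pages_free(), phystokv() and the
 * PMAP_PAGES_ALLOCATE_NOWAIT flag are real; the surrounding logic is an
 * assumption for illustration.
 *
 *	pmap_paddr_t pa = 0;
 *	kern_return_t kr;
 *
 *	// Preemption may be disabled here, so the allocation must not wait.
 *	kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
 *	if (kr == KERN_RESOURCE_SHORTAGE) {
 *		// Back out, return to a context that may block (or exit the PPL),
 *		// replenish memory, and retry the whole operation.
 *	} else {
 *		// ...use the zeroed page at phystokv(pa)...
 *		pmap_pages_free(pa, PAGE_SIZE);
 *	}
 */
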
1401 #if XNU_MONITOR
1402 /**
1403 * Allocate a page from the VM. If no pages are available, this function can
1404 * potentially block until a page becomes available (see the `options` parameter).
1405 *
1406 * @note This function CANNOT be called from the PPL since it calls into the VM.
1407 * If the PPL needs memory, then it'll need to exit the PPL before
1408 * allocating more (usually by returning KERN_RESOURCE_SHORTAGE, and then
1409 * calling pmap_alloc_page_for_ppl() from outside of the PPL).
1410 *
1411 * @param options The following options can be specified:
1412 * - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM doesn't have any free pages
1413 * available then don't wait for one, just return immediately without
1414 * allocating a page.
1415 *
1416 * @return The physical address of the page, if one was allocated. Zero,
1417 * otherwise.
1418 */
1419 pmap_paddr_t
1420 pmap_alloc_page_for_kern(unsigned int options)
1421 {
1422 pmap_paddr_t pa = 0;
1423 vm_page_t mem = VM_PAGE_NULL;
1424
1425 /* It's not possible to take the VM page queue lock if not preemptible. */
1426 if (!pmap_is_preemptible()) {
1427 return 0;
1428 }
1429
1430 while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
1431 if (options & PMAP_PAGES_ALLOCATE_NOWAIT) {
1432 return 0;
1433 }
1434 VM_PAGE_WAIT();
1435 }
1436
1437 /* Automatically wire any pages used by the pmap. */
1438 vm_page_lock_queues();
1439 vm_page_wire(mem, VM_KERN_MEMORY_PTE, TRUE);
1440 vm_page_unlock_queues();
1441
1442 pa = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(mem));
1443
1444 if (__improbable(pa == 0)) {
1445 panic("%s: physical address is 0", __func__);
1446 }
1447
1448 /**
1449 * Add the acquired VM page to the pmap's VM object to notify the VM that
1450 * this page is being used.
1451 */
1452 pmap_enqueue_pages(mem);
1453
1454 /* Pages are considered "in use" by the pmap until returned to the VM. */
1455 OSAddAtomic(1, &inuse_pmap_pages_count);
1456 OSAddAtomic64(1, &alloc_pmap_pages_count);
1457
1458 return pa;
1459 }
1460
1461 /**
1462 * Allocate a page from the VM, mark it as being PPL-owned, and add it to the
1463 * PPL page free list.
1464 *
1465 * @note This function CANNOT be called from the PPL since it calls into the VM.
1466 * If the PPL needs memory, then it'll need to exit the PPL before calling
1467 * this function (usually by returning KERN_RESOURCE_SHORTAGE).
1468 *
1469 * @param options The following options can be specified:
1470 * - PMAP_PAGES_ALLOCATE_NOWAIT: If the VM doesn't have any free pages
1471 * available then don't wait for one, just return immediately without
1472 * allocating a page.
1473 */
1474 void
1475 pmap_alloc_page_for_ppl(unsigned int options)
1476 {
1477 thread_t self = current_thread();
1478
1479 /**
1480 * We qualify for allocating reserved memory so set TH_OPT_VMPRIV to inform
1481 * the VM of this.
1482 *
1483 * This field should only be modified by the local thread itself, so no lock
1484 * needs to be taken.
1485 */
1486 uint16_t thread_options = self->options;
1487 self->options |= TH_OPT_VMPRIV;
1488 pmap_paddr_t pa = pmap_alloc_page_for_kern(options);
1489 self->options = thread_options;
1490
1491 if (pa != 0) {
1492 pmap_mark_page_as_ppl_page(pa);
1493 }
1494 }
1495 #endif /* XNU_MONITOR */
1496
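/*
 * Illustrative sketch of the replenish protocol described above (the PPL entry
 * point name is a placeholder; only pmap_alloc_page_for_ppl() is real): PPL
 * code that runs out of pages returns KERN_RESOURCE_SHORTAGE, and the
 * kernel-side caller donates a page to the PPL free list and retries.
 *
 *	kern_return_t kr;
 *	while ((kr = some_ppl_entry_point(args)) == KERN_RESOURCE_SHORTAGE) {
 *		// Outside the PPL: may block while grabbing a page from the VM.
 *		pmap_alloc_page_for_ppl(0);
 *	}
 */
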
1497 /**
1498 * Free memory previously allocated through pmap_pages_alloc_zeroed() or
1499 * pmap_alloc_page_for_kern().
1500 *
1501 * On PPL-enabled systems, this just adds the page back to the PPL page free
1502 * list. On other systems, this returns the page(s) back to the VM.
1503 *
1504 * @param pa Physical address of the page(s) to free.
1505 * @param size The size in bytes of the memory region being freed (must be
1506 * PAGE_SIZE on PPL-enabled systems).
1507 */
1508 void
1509 pmap_pages_free(pmap_paddr_t pa, __assert_only unsigned size)
1510 {
1511 /**
1512 * If the pmap is starved for memory to the point that pmap_page_reclaim()
1513 * starts getting invoked to allocate memory, then let's take the page being
1514 * freed and add it directly to pmap_page_reclaim()'s dedicated free list.
1515 * In that case, the page being freed is most likely a userspace page table
1516 * that was reclaimed.
1517 */
1518 if (__improbable(pmap_pages_request_count != 0)) {
1519 pmap_simple_lock(&pmap_page_reclaim_lock);
1520
1521 if (pmap_pages_request_count != 0) {
1522 pmap_pages_request_count--;
1523
1524 /* Prepend the freed page to the pmap_page_reclaim() free list. */
1525 page_free_entry_t *page_entry = (page_free_entry_t *)phystokv(pa);
1526 page_entry->next = pmap_page_reclaim_list;
1527 pmap_page_reclaim_list = page_entry;
1528 pmap_simple_unlock(&pmap_page_reclaim_lock);
1529
1530 return;
1531 }
1532 pmap_simple_unlock(&pmap_page_reclaim_lock);
1533 }
1534
1535 #if XNU_MONITOR
1536 /* The PPL page free list always operates on PAGE_SIZE chunks of memory. */
1537 assert(size == PAGE_SIZE);
1538
1539 /* On PPL-enabled systems, just add the page back to the PPL page free list. */
1540 pmap_give_free_ppl_page(pa);
1541 #else /* XNU_MONITOR */
1542 vm_page_t mem = VM_PAGE_NULL;
1543 const pmap_paddr_t pa_max = pa + size;
1544
1545 /* Pages are considered "in use" until given back to the VM. */
1546 OSAddAtomic(-(size >> PAGE_SHIFT), &inuse_pmap_pages_count);
1547
1548 for (; pa < pa_max; pa += PAGE_SIZE) {
1549 vm_object_lock(pmap_object);
1550
1551 /**
1552 * Remove the page from the pmap's VM object and return it back to the
1553 * VM's global free list of pages.
1554 */
1555 mem = vm_page_lookup(pmap_object, (pa - gPhysBase));
1556 assert(mem != VM_PAGE_NULL);
1557 assert(VM_PAGE_WIRED(mem));
1558 vm_page_lock_queues();
1559 vm_page_free(mem);
1560 vm_page_unlock_queues();
1561 vm_object_unlock(pmap_object);
1562 }
1563 #endif /* XNU_MONITOR */
1564 }
1565
1566 /**
1567 * Called by the VM to reclaim pages that the pmap can give back quickly and cheaply.
1568 * This will take pages in the pmap's VM object and add them back to the VM's
1569 * global list of free pages.
1570 *
1571 * @return The number of pages returned to the VM.
1572 */
1573 uint64_t
1574 pmap_release_pages_fast(void)
1575 {
1576 #if XNU_MONITOR
1577 return pmap_release_ppl_pages_to_kernel();
1578 #else /* XNU_MONITOR */
1579 return 0;
1580 #endif
1581 }
1582
1583 /**
1584 * Allocates a batch (list) of pv_entry_t's from the global PV free array.
1585 *
1586 * @return A pointer to the head of the newly-allocated batch, or PV_ENTRY_NULL
1587 * if empty.
1588 */
1589 MARK_AS_PMAP_TEXT static pv_entry_t *
1590 pv_free_array_get_batch(void)
1591 {
1592 pv_entry_t *new_batch = PV_ENTRY_NULL;
1593
1594 pmap_simple_lock(&pv_free_array_lock);
1595 if (pv_free_array_n_elems() > 0) {
1596 /**
1597 * The global PV array acts as a ring buffer where each entry points to
1598 * a linked list of PVEs of length PV_BATCH_SIZE. Get the next free
1599 * batch.
1600 */
1601 const size_t index = pv_free_read_idx++ & (PV_FREE_ARRAY_SIZE - 1);
1602 pv_free_list_t *free_list = &pv_free_ring[index];
1603
1604 assert((free_list->count == PV_BATCH_SIZE) && (free_list->list != PV_ENTRY_NULL));
1605 new_batch = free_list->list;
1606 }
1607 pmap_simple_unlock(&pv_free_array_lock);
1608
1609 return new_batch;
1610 }
1611
1612 /**
1613 * Frees a batch (list) of pv_entry_t's into the global PV free array.
1614 *
1615 * @param batch_head Pointer to the first entry in the batch to be returned to
1616 * the array. This must be a linked list of pv_entry_t's of
1617 * length PV_BATCH_SIZE.
1618 *
1619 * @return KERN_SUCCESS, or KERN_FAILURE if the global array is full.
1620 */
1621 MARK_AS_PMAP_TEXT static kern_return_t
1622 pv_free_array_give_batch(pv_entry_t *batch_head)
1623 {
1624 assert(batch_head != NULL);
1625
1626 pmap_simple_lock(&pv_free_array_lock);
1627 if (pv_free_array_n_elems() == (PV_FREE_ARRAY_SIZE - 1)) {
1628 pmap_simple_unlock(&pv_free_array_lock);
1629 return KERN_FAILURE;
1630 }
1631
1632 const size_t index = pv_free_write_idx++ & (PV_FREE_ARRAY_SIZE - 1);
1633 pv_free_list_t *free_list = &pv_free_ring[index];
1634 free_list->list = batch_head;
1635 free_list->count = PV_BATCH_SIZE;
1636 pmap_simple_unlock(&pv_free_array_lock);
1637
1638 return KERN_SUCCESS;
1639 }
1640
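/*
 * A note on the ring arithmetic used by the two functions above: the global PV
 * array is indexed by pv_free_read_idx/pv_free_write_idx, which only ever
 * increase and are masked with (PV_FREE_ARRAY_SIZE - 1) when used, so
 * PV_FREE_ARRAY_SIZE must be a power of two. The element count is presumably
 * just the difference of the two counters (sketch below; the real
 * pv_free_array_n_elems() is defined elsewhere in the pmap), and the
 * KERN_FAILURE check above caps the array at PV_FREE_ARRAY_SIZE - 1 stored
 * batches.
 *
 *	static inline size_t
 *	pv_free_array_n_elems_sketch(void)
 *	{
 *		return pv_free_write_idx - pv_free_read_idx;
 *	}
 */
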
1641 /**
1642 * Helper function for allocating a single PVE from an arbitrary free list.
1643 *
1644 * @param free_list The free list to allocate a node from.
1645 * @param pvepp Output parameter that will get updated with a pointer to the
1646 * allocated node if the free list isn't empty, or a pointer to
1647 * NULL if the list is empty.
1648 */
1649 MARK_AS_PMAP_TEXT static void
1650 pv_free_list_alloc(pv_free_list_t *free_list, pv_entry_t **pvepp)
1651 {
1652 assert(pvepp != NULL);
1653 assert(((free_list->list != NULL) && (free_list->count > 0)) ||
1654 ((free_list->list == NULL) && (free_list->count == 0)));
1655
1656 if ((*pvepp = free_list->list) != NULL) {
1657 pv_entry_t *pvep = *pvepp;
1658 free_list->list = pvep->pve_next;
1659 pvep->pve_next = PV_ENTRY_NULL;
1660 free_list->count--;
1661 }
1662 }
1663
1664 /**
1665 * Allocates a PVE from the kernel-dedicated list.
1666 *
1667 * @note This is only called when the global free list is empty, so don't bother
1668 * trying to allocate more nodes from that list.
1669 *
1670 * @param pvepp Output parameter that will get updated with a pointer to the
1671 * allocated node if the free list isn't empty, or a pointer to
1672 * NULL if the list is empty. This pointer can't already be
1673 * pointing to a valid entry before allocation.
1674 */
1675 MARK_AS_PMAP_TEXT static void
1676 pv_list_kern_alloc(pv_entry_t **pvepp)
1677 {
1678 assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1679 pmap_simple_lock(&pv_kern_free_list_lock);
1680 if (pv_kern_free.count > 0) {
1681 pmap_kern_reserve_alloc_stat++;
1682 }
1683 pv_free_list_alloc(&pv_kern_free, pvepp);
1684 pmap_simple_unlock(&pv_kern_free_list_lock);
1685 }
1686
1687 /**
1688 * Returns a list of PVEs to the kernel-dedicated free list.
1689 *
1690 * @param pve_head Head of the list to be returned.
1691 * @param pve_tail Tail of the list to be returned.
1692 * @param pv_cnt Number of elements in the list to be returned.
1693 */
1694 MARK_AS_PMAP_TEXT static void
1695 pv_list_kern_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
1696 {
1697 assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
1698
1699 pmap_simple_lock(&pv_kern_free_list_lock);
1700 pve_tail->pve_next = pv_kern_free.list;
1701 pv_kern_free.list = pve_head;
1702 pv_kern_free.count += pv_cnt;
1703 pmap_simple_unlock(&pv_kern_free_list_lock);
1704 }
1705
1706 /**
1707 * Attempts to allocate from the per-cpu free list of PVEs, and if that fails,
1708 * then replenish the per-cpu free list with a batch of PVEs from the global
1709 * PVE free list.
1710 *
1711 * @param pvepp Output parameter that will get updated with a pointer to the
1712 * allocated node if the free lists aren't empty, or a pointer to
1713 * NULL if both the per-cpu and global lists are empty. This
1714 * pointer can't already be pointing to a valid entry before
1715 * allocation.
1716 */
1717 MARK_AS_PMAP_TEXT static void
1718 pv_list_alloc(pv_entry_t **pvepp)
1719 {
1720 assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1721
1722 #if !XNU_MONITOR
1723 /**
1724 * Preemption is always disabled in the PPL so it only needs to get disabled
1725 * on non-PPL systems. This needs to be disabled while working with per-cpu
1726 * data to prevent getting rescheduled onto a different CPU.
1727 */
1728 mp_disable_preemption();
1729 #endif /* !XNU_MONITOR */
1730
1731 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
1732 pv_free_list_alloc(&pmap_cpu_data->pv_free, pvepp);
1733
1734 if (*pvepp != PV_ENTRY_NULL) {
1735 goto pv_list_alloc_done;
1736 }
1737
1738 #if !XNU_MONITOR
1739 if (pv_kern_free.count < pv_kern_low_water_mark) {
1740 /**
1741 * If the kernel reserved pool is low, let non-kernel mappings wait for
1742 * a page from the VM.
1743 */
1744 goto pv_list_alloc_done;
1745 }
1746 #endif /* !XNU_MONITOR */
1747
1748 /**
1749 * Attempt to replenish the local list off the global one, and return the
1750 * first element. If the global list is empty, then the allocation failed.
1751 */
1752 pv_entry_t *new_batch = pv_free_array_get_batch();
1753
1754 if (new_batch != PV_ENTRY_NULL) {
1755 pmap_cpu_data->pv_free.count = PV_BATCH_SIZE - 1;
1756 pmap_cpu_data->pv_free.list = new_batch->pve_next;
1757 assert(pmap_cpu_data->pv_free.list != NULL);
1758
1759 new_batch->pve_next = PV_ENTRY_NULL;
1760 *pvepp = new_batch;
1761 }
1762
1763 pv_list_alloc_done:
1764 #if !XNU_MONITOR
1765 mp_enable_preemption();
1766 #endif /* !XNU_MONITOR */
1767
1768 return;
1769 }
1770
1771 /**
1772 * Adds a list of PVEs to the per-CPU PVE free list. May spill out some entries
1773 * to the global or the kernel PVE free lists if the per-CPU list contains too
1774 * many PVEs.
1775 *
1776 * @param pve_head Head of the list to be returned.
1777 * @param pve_tail Tail of the list to be returned.
1778 * @param pv_cnt Number of elements in the list to be returned.
1779 */
1780 MARK_AS_PMAP_TEXT void
1781 pv_list_free(pv_entry_t *pve_head, pv_entry_t *pve_tail, int pv_cnt)
1782 {
1783 assert((pve_head != PV_ENTRY_NULL) && (pve_tail != PV_ENTRY_NULL));
1784
1785 #if !XNU_MONITOR
1786 /**
1787 * Preemption is always disabled in the PPL so it only needs to get disabled
1788 * on non-PPL systems. This needs to be disabled while working with per-cpu
1789 * data to prevent getting rescheduled onto a different CPU.
1790 */
1791 mp_disable_preemption();
1792 #endif /* !XNU_MONITOR */
1793
1794 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
1795
1796 /**
1797 * How many more PVEs need to be added to the last allocated batch to bring it
1798 * back up to PV_BATCH_SIZE objects.
1799 */
1800 const uint32_t available = PV_BATCH_SIZE - (pmap_cpu_data->pv_free.count % PV_BATCH_SIZE);
1801
1802 /**
1803 * The common case is that the number of PVEs to be freed fits within the
1804 * current PV_BATCH_SIZE boundary. If that is the case, quickly prepend the
1805 * whole list and return.
1806 */
1807 if (__probable((pv_cnt <= available) &&
1808 ((pmap_cpu_data->pv_free.count % PV_BATCH_SIZE != 0) || (pmap_cpu_data->pv_free.count == 0)))) {
1809 pve_tail->pve_next = pmap_cpu_data->pv_free.list;
1810 pmap_cpu_data->pv_free.list = pve_head;
1811 pmap_cpu_data->pv_free.count += pv_cnt;
1812 goto pv_list_free_done;
1813 }
1814
1815 /**
1816 * In the degenerate case, we need to process PVEs one by one, to make sure
1817 * we spill out to the global list, or update the spill marker as
1818 * appropriate.
1819 */
1820 while (pv_cnt) {
1821 /**
1822 * Take the node off the top of the passed in list and prepend it to the
1823 * per-cpu list.
1824 */
1825 pv_entry_t *pv_next = pve_head->pve_next;
1826 pve_head->pve_next = pmap_cpu_data->pv_free.list;
1827 pmap_cpu_data->pv_free.list = pve_head;
1828 pve_head = pv_next;
1829 pmap_cpu_data->pv_free.count++;
1830 pv_cnt--;
1831
1832 if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE + 1))) {
1833 /**
1834 * A full batch of entries have been freed to the per-cpu list.
1835 * Update the spill marker which is used to remember the end of a
1836 * batch (remember, we prepend nodes) to eventually return back to
1837 * the global list (we try to only keep one PV_BATCH_SIZE worth of
1838 * nodes in any single per-cpu list).
1839 */
1840 pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
1841 } else if (__improbable(pmap_cpu_data->pv_free.count == (PV_BATCH_SIZE * 2) + 1)) {
1842 /* Spill out excess PVEs to the global PVE array */
1843 pv_entry_t *spill_head = pmap_cpu_data->pv_free.list->pve_next;
1844 pv_entry_t *spill_tail = pmap_cpu_data->pv_free_spill_marker;
1845 pmap_cpu_data->pv_free.list->pve_next = pmap_cpu_data->pv_free_spill_marker->pve_next;
1846 spill_tail->pve_next = PV_ENTRY_NULL;
1847 pmap_cpu_data->pv_free.count -= PV_BATCH_SIZE;
1848 pmap_cpu_data->pv_free_spill_marker = pmap_cpu_data->pv_free.list;
1849
1850 if (__improbable(pv_free_array_give_batch(spill_head) != KERN_SUCCESS)) {
1851 /**
1852 * This is extremely unlikely to happen, as it would imply that
1853 * we have (PV_FREE_ARRAY_SIZE * PV_BATCH_SIZE) PVEs sitting in
1854 * the global array. Just in case, push the excess down to the
1855 * kernel PVE free list.
1856 */
1857 pv_list_kern_free(spill_head, spill_tail, PV_BATCH_SIZE);
1858 }
1859 }
1860 }
1861
1862 pv_list_free_done:
1863 #if !XNU_MONITOR
1864 mp_enable_preemption();
1865 #endif /* !XNU_MONITOR */
1866
1867 return;
1868 }
1869
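/*
 * Worked example of the spill bookkeeping above (the batch size is chosen
 * purely for illustration): suppose PV_BATCH_SIZE were 8. As PVEs are freed
 * one at a time, the per-CPU count climbs 1, 2, ... 8. The free that makes the
 * count 9 (PV_BATCH_SIZE + 1) records the just-prepended node in
 * pv_free_spill_marker; since nodes are prepended, that marker ends up being
 * the tail of the batch accumulated after it. When the count later reaches 17
 * (2 * PV_BATCH_SIZE + 1), the 8 nodes from the head's successor down to the
 * spill marker are unlinked as one full batch and handed to
 * pv_free_array_give_batch(), and the per-CPU count drops back to 9.
 */
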
1870 /**
1871 * Adds a single page to the PVE allocation subsystem.
1872 *
1873 * @note This function operates under the assumption that PV_BATCH_SIZE PVEs
1874 * fit within a single page. One page is always allocated for
1875 * one batch, so if there's empty space in the page after the batch of
1876 * PVEs, it'll go unused (so it's best to keep the batch size at a value
1877 * that utilizes the whole page).
1878 *
1879 * @param alloc_flags Allocation flags passed to pmap_pages_alloc_zeroed(). See
1880 * the definition of that function for a detailed description
1881 * of the available flags.
1882 *
1883 * @return KERN_SUCCESS, or the value returned by pmap_pages_alloc_zeroed() upon
1884 * failure.
1885 */
1886 MARK_AS_PMAP_TEXT static kern_return_t
1887 pve_feed_page(unsigned alloc_flags)
1888 {
1889 kern_return_t kr = KERN_FAILURE;
1890
1891 pv_entry_t *pve_head = PV_ENTRY_NULL;
1892 pv_entry_t *pve_tail = PV_ENTRY_NULL;
1893 pmap_paddr_t pa = 0;
1894
1895 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags);
1896
1897 if (kr != KERN_SUCCESS) {
1898 return kr;
1899 }
1900
1901 /* Update statistics globals. See the variables' definitions for more info. */
1902 pv_page_count++;
1903 pmap_reserve_replenish_stat += PV_BATCH_SIZE;
1904
1905 /* Prepare a new list by linking all of the entries in advance. */
1906 pve_head = (pv_entry_t *)phystokv(pa);
1907 pve_tail = &pve_head[PV_BATCH_SIZE - 1];
1908
1909 for (int i = 0; i < PV_BATCH_SIZE; i++) {
1910 pve_head[i].pve_next = &pve_head[i + 1];
1911 }
1912 pve_head[PV_BATCH_SIZE - 1].pve_next = PV_ENTRY_NULL;
1913
1914 /**
1915 * Add the new list to the kernel PVE free list if we are running low on
1916 * kernel-dedicated entries or the global free array is full.
1917 */
1918 if ((pv_kern_free.count < pv_kern_low_water_mark) ||
1919 (pv_free_array_give_batch(pve_head) != KERN_SUCCESS)) {
1920 pv_list_kern_free(pve_head, pve_tail, PV_BATCH_SIZE);
1921 }
1922
1923 return KERN_SUCCESS;
1924 }
1925
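/*
 * Worked example of the sizing assumption above (numbers are illustrative, not
 * the actual configuration): if the VM page size were 16 KB and
 * sizeof(pv_entry_t) were 16 bytes, one page could hold 16384 / 16 = 1024
 * entries, so a PV_BATCH_SIZE of 1024 would use the whole page while
 * preserving the one-page-per-batch invariant.
 */
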
1926 /**
1927 * Allocate a PV node from one of many different free lists (per-cpu, global, or
1928 * kernel-specific).
1929 *
1930 * @note This function is very tightly coupled with pmap_enter_pv(). If
1931 * modifying this code, please ensure that pmap_enter_pv() doesn't break.
1932 *
1933 * @note The pmap lock must already be held if the new mapping is a CPU mapping.
1934 *
1935 * @note The PVH lock for the physical page that is getting a new mapping
1936 * registered must already be held.
1937 *
1938 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
1939 * an IOMMU translation.
1940 * @param pai The physical address index of the page that's getting a new
1941 * mapping.
1942 * @param lock_mode Which state the pmap lock is being held in if the mapping is
1943 * owned by a pmap, otherwise this is a don't care.
1944 * @param options PMAP_OPTIONS_* family of options passed from the caller.
1945 * @param pvepp Output parameter that will get updated with a pointer to the
1946 * allocated node if none of the free lists are empty, or a pointer
1947 * to NULL otherwise. This pointer can't already be pointing to a
1948 * valid entry before allocation.
1949 *
1950 * @return These are the possible return values:
1951 * PV_ALLOC_SUCCESS: A PVE object was successfully allocated.
1952 * PV_ALLOC_FAIL: No objects were available for allocation, and
1953 * allocating a new page failed. On PPL-enabled systems,
1954 * a fresh page needs to be added to the PPL page list
1955 * before retrying this operation.
1956 * PV_ALLOC_RETRY: No objects were available on the free lists, so a new
1957 * page of PVE objects needed to be allocated. To do that,
1958 * the pmap and PVH locks were dropped. The caller may have
1959 * depended on these locks for consistency, so return and
1960 * let the caller retry the PVE allocation with the locks
1961 * held. Note that the locks have already been re-acquired
1962 * before this function exits.
1963 */
1964 MARK_AS_PMAP_TEXT pv_alloc_return_t
1965 pv_alloc(
1966 pmap_t pmap,
1967 unsigned int pai,
1968 pmap_lock_mode_t lock_mode,
1969 unsigned int options,
1970 pv_entry_t **pvepp)
1971 {
1972 assert((pvepp != NULL) && (*pvepp == PV_ENTRY_NULL));
1973
1974 if (pmap != NULL) {
1975 pmap_assert_locked(pmap, lock_mode);
1976 }
1977 pvh_assert_locked(pai);
1978
1979 pv_list_alloc(pvepp);
1980 if (PV_ENTRY_NULL != *pvepp) {
1981 return PV_ALLOC_SUCCESS;
1982 }
1983
1984 #if XNU_MONITOR
1985 /* PPL can't block so this flag is always required. */
1986 unsigned alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
1987 #else /* XNU_MONITOR */
1988 unsigned alloc_flags = 0;
1989 #endif /* XNU_MONITOR */
1990
1991 /**
1992 * We got here because both the per-CPU and the global lists are empty. If
1993 * this allocation is for the kernel pmap or an IOMMU kernel driver, we try
1994 * to get an entry from the kernel list next.
1995 */
1996 if ((pmap == NULL) || (kernel_pmap == pmap)) {
1997 pv_list_kern_alloc(pvepp);
1998 if (PV_ENTRY_NULL != *pvepp) {
1999 return PV_ALLOC_SUCCESS;
2000 }
2001 /**
2002 * If the pmap is NULL, this is an allocation outside the normal pmap path,
2003 * most likely an IOMMU allocation. We therefore don't know what other locks
2004 * this path may hold or timing constraints it may have, so we should avoid
2005 * a potentially expensive call to pmap_page_reclaim() on this path.
2006 */
2007 if (pmap == NULL) {
2008 alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
2009 } else {
2010 alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT | PMAP_PAGE_RECLAIM_NOWAIT;
2011 }
2012 }
2013
2014 /**
2015 * Make sure we have PMAP_PAGES_ALLOCATE_NOWAIT set in alloc_flags when the
2016 * input options argument has PMAP_OPTIONS_NOWAIT set.
2017 */
2018 alloc_flags |= (options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
2019
2020 /**
2021 * We ran out of PV entries all across the board, or this allocation is not
2022 * for the kernel. Let's make sure that the kernel list is not too full
2023 * (very unlikely), in which case we can rebalance here.
2024 */
2025 if (__improbable(pv_kern_free.count > (PV_BATCH_SIZE * 2))) {
2026 pmap_simple_lock(&pv_kern_free_list_lock);
2027 /* Re-check, now that the lock is held. */
2028 if (pv_kern_free.count > (PV_BATCH_SIZE * 2)) {
2029 pv_entry_t *pve_head = pv_kern_free.list;
2030 pv_entry_t *pve_tail = pve_head;
2031
2032 for (int i = 0; i < (PV_BATCH_SIZE - 1); i++) {
2033 pve_tail = pve_tail->pve_next;
2034 }
2035
2036 pv_kern_free.list = pve_tail->pve_next;
2037 pv_kern_free.count -= PV_BATCH_SIZE;
2038 pve_tail->pve_next = PV_ENTRY_NULL;
2039 pmap_simple_unlock(&pv_kern_free_list_lock);
2040
2041 /* Return back every node except the first one to the free lists. */
2042 pv_list_free(pve_head->pve_next, pve_tail, PV_BATCH_SIZE - 1);
2043 pve_head->pve_next = PV_ENTRY_NULL;
2044 *pvepp = pve_head;
2045 return PV_ALLOC_SUCCESS;
2046 }
2047 pmap_simple_unlock(&pv_kern_free_list_lock);
2048 }
2049
2050 /**
2051 * If all else fails, try to get a new pmap page so that the allocation
2052 * succeeds once the caller retries it.
2053 */
2054 kern_return_t kr = KERN_FAILURE;
2055 pv_alloc_return_t pv_status = PV_ALLOC_FAIL;
2056
2057 /* Drop the lock during page allocation since that can take a while. */
2058 pvh_unlock(pai);
2059 if (pmap != NULL) {
2060 pmap_unlock(pmap, lock_mode);
2061 }
2062
2063 if ((kr = pve_feed_page(alloc_flags)) == KERN_SUCCESS) {
2064 /**
2065 * Since the lock was dropped, even though we successfully allocated a
2066 * new page to be used for PVE nodes, the code that relies on this
2067 * function might have depended on the lock being held for consistency,
2068 * so return out early and let them retry the allocation with the lock
2069 * re-held.
2070 */
2071 pv_status = PV_ALLOC_RETRY;
2072 } else {
2073 pv_status = PV_ALLOC_FAIL;
2074 }
2075
2076 if (pmap != NULL) {
2077 pmap_lock(pmap, lock_mode);
2078 }
2079 pvh_lock(pai);
2080
2081 /* Ensure that no node was created if we're not returning successfully. */
2082 assert(*pvepp == PV_ENTRY_NULL);
2083
2084 return pv_status;
2085 }
2086
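/*
 * Illustrative caller sketch (hypothetical; see pmap_enter_pv() below for the
 * real caller): PV_ALLOC_RETRY means the PVH/pmap locks were dropped and
 * re-acquired while a fresh page was allocated, so any state read under those
 * locks must be re-validated before retrying; PV_ALLOC_FAIL means no memory
 * could be obtained at all.
 *
 *	pv_entry_t *pvep = PV_ENTRY_NULL;
 *	pv_alloc_return_t ret;
 *
 *	do {
 *		// Re-validate anything observed under the PVH/pmap locks here,
 *		// since a PV_ALLOC_RETRY indicates they were temporarily dropped.
 *		ret = pv_alloc(pmap, pai, lock_mode, options, &pvep);
 *	} while (ret == PV_ALLOC_RETRY);
 *
 *	if (ret == PV_ALLOC_FAIL) {
 *		// Propagate the failure; on PPL-enabled systems this typically
 *		// surfaces as KERN_RESOURCE_SHORTAGE so a page can be donated
 *		// from outside the PPL and the operation retried.
 *	}
 */
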
2087 /**
2088 * Utility function for freeing a single PVE object back to the free lists.
2089 *
2090 * @param pvep Pointer to the PVE object to free.
2091 */
2092 MARK_AS_PMAP_TEXT void
2093 pv_free(pv_entry_t *pvep)
2094 {
2095 assert(pvep != PV_ENTRY_NULL);
2096
2097 pv_list_free(pvep, pvep, 1);
2098 }
2099
2100 /**
2101 * This function provides a mechanism for the device tree to override the
2102 * default PV allocation amounts and the watermark level which determines how
2103 * many PVE objects are kept in the kernel-dedicated free list.
2104 */
2105 MARK_AS_PMAP_TEXT void
2106 pmap_compute_pv_targets(void)
2107 {
2108 DTEntry entry = NULL;
2109 void const *prop = NULL;
2110 int err = 0;
2111 unsigned int prop_size = 0;
2112
2113 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2114 assert(err == kSuccess);
2115
2116 if (kSuccess == SecureDTGetProperty(entry, "pmap-pv-count", &prop, &prop_size)) {
2117 if (prop_size != sizeof(pv_alloc_initial_target)) {
2118 panic("pmap-pv-count property is not a 32-bit integer");
2119 }
2120 pv_alloc_initial_target = *((uint32_t const *)prop);
2121 }
2122
2123 if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-count", &prop, &prop_size)) {
2124 if (prop_size != sizeof(pv_kern_alloc_initial_target)) {
2125 panic("pmap-kern-pv-count property is not a 32-bit integer");
2126 }
2127 pv_kern_alloc_initial_target = *((uint32_t const *)prop);
2128 }
2129
2130 if (kSuccess == SecureDTGetProperty(entry, "pmap-kern-pv-min", &prop, &prop_size)) {
2131 if (prop_size != sizeof(pv_kern_low_water_mark)) {
2132 panic("pmap-kern-pv-min property is not a 32-bit integer");
2133 }
2134 pv_kern_low_water_mark = *((uint32_t const *)prop);
2135 }
2136 }
2137
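/*
 * For example (the value is purely illustrative): a device tree that provides
 * pmap-kern-pv-min = 0x100 under /defaults would raise pv_kern_low_water_mark
 * to 256, keeping a larger reserve of PVEs available for kernel mappings.
 */
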
2138 /**
2139 * This would normally be used to adjust the number of PVE objects available in
2140 * the system, but we do that dynamically at runtime anyway, so this is unneeded.
2141 */
2142 void
2143 mapping_adjust(void)
2144 {
2145 /* Not implemented for arm/arm64. */
2146 }
2147
2148 /**
2149 * Creates a target number of free pv_entry_t objects for the kernel free list
2150 * and the general free list.
2151 *
2152 * @note This function is called once during early boot, in kernel_bootstrap().
2153 *
2154 * @return KERN_SUCCESS if the objects were successfully allocated, or the
2155 * return value from pve_feed_page() on failure (could be caused by not
2156 * being able to allocate a page).
2157 */
2158 MARK_AS_PMAP_TEXT kern_return_t
2159 mapping_free_prime_internal(void)
2160 {
2161 kern_return_t kr = KERN_FAILURE;
2162
2163 #if XNU_MONITOR
2164 /* PPL can't block so this flag is always required. */
2165 unsigned alloc_flags = PMAP_PAGES_ALLOCATE_NOWAIT;
2166 #else /* XNU_MONITOR */
2167 unsigned alloc_flags = 0;
2168 #endif /* XNU_MONITOR */
2169
2170 /*
2171 * We do not need to hold the pv_free_array lock to calculate the number of
2172 * elements in it because no other core is running at this point.
2173 */
2174 while (((pv_free_array_n_elems() * PV_BATCH_SIZE) < pv_alloc_initial_target) ||
2175 (pv_kern_free.count < pv_kern_alloc_initial_target)) {
2176 if ((kr = pve_feed_page(alloc_flags)) != KERN_SUCCESS) {
2177 return kr;
2178 }
2179 }
2180
2181 return KERN_SUCCESS;
2182 }
2183
2184 /**
2185 * Helper function for pmap_enter_pv (hereby shortened to "pepv") which converts
2186 * a PVH entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP, transforming the entry into
2187 * a linked list of mappings.
2188 *
2189 * @note This should only be called from pmap_enter_pv().
2190 *
2191 * @note The PVH lock for the passed in page must already be held and the type
2192 * must be PVH_TYPE_PTEP (wouldn't make sense to call this otherwise).
2193 *
2194 * @param pmap Either the pmap that owns the mapping being registered in
2195 * pmap_enter_pv(), or NULL if this is an IOMMU mapping.
2196 * @param pai The physical address index of the page that's getting a second
2197 * mapping and needs to be converted from PVH_TYPE_PTEP to
2198 * PVH_TYPE_PVEP.
2199 * @param lock_mode Which state the pmap lock is being held in if the mapping is
2200 * owned by a pmap, otherwise this is a don't care.
2201 * @param options PMAP_OPTIONS_* family of options.
2202 *
2203 * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully converted
2204 * into PVH_TYPE_PVEP, or the return value of pv_alloc() otherwise. See
2205 * pv_alloc()'s function header for a detailed explanation of the
2206 * possible return values.
2207 */
2208 MARK_AS_PMAP_TEXT static pv_alloc_return_t
2209 pepv_convert_ptep_to_pvep(
2210 pmap_t pmap,
2211 unsigned int pai,
2212 pmap_lock_mode_t lock_mode,
2213 unsigned int options)
2214 {
2215 pvh_assert_locked(pai);
2216
2217 pv_entry_t **pvh = pai_to_pvh(pai);
2218 assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
2219
2220 pv_entry_t *pvep = PV_ENTRY_NULL;
2221 pv_alloc_return_t ret = pv_alloc(pmap, pai, lock_mode, options, &pvep);
2222 if (ret != PV_ALLOC_SUCCESS) {
2223 return ret;
2224 }
2225
2226 /* If we've gotten this far then a node should've been allocated. */
2227 assert(pvep != PV_ENTRY_NULL);
2228
2229 /* The new PVE should have the same PTE pointer as the previous PVH entry. */
2230 pve_init(pvep);
2231 pve_set_ptep(pvep, 0, pvh_ptep(pvh));
2232
2233 assert(!pve_get_internal(pvep, 0));
2234 assert(!pve_get_altacct(pvep, 0));
2235 if (ppattr_is_internal(pai)) {
2236 /**
2237 * Transfer "internal" status from pp_attr to this pve. See the comment
2238 * above PP_ATTR_INTERNAL for more information on this.
2239 */
2240 ppattr_clear_internal(pai);
2241 pve_set_internal(pvep, 0);
2242 }
2243 if (ppattr_is_altacct(pai)) {
2244 /**
2245 * Transfer "altacct" status from pp_attr to this pve. See the comment
2246 * above PP_ATTR_ALTACCT for more information on this.
2247 */
2248 ppattr_clear_altacct(pai);
2249 pve_set_altacct(pvep, 0);
2250 }
2251
2252 pvh_update_head(pvh, pvep, PVH_TYPE_PVEP);
2253
2254 return PV_ALLOC_SUCCESS;
2255 }
2256
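/*
 * Summary of the pv_head_table state transitions driven by the helper above
 * and by pmap_enter_pv()/pmap_remove_pv() below (illustrative; see the
 * PVH_TYPE_* definitions for the authoritative encoding):
 *
 *	PVH_TYPE_NULL --first mapping entered-->  PVH_TYPE_PTEP
 *	PVH_TYPE_PTEP --second mapping entered--> PVH_TYPE_PVEP (PVE list)
 *	PVH_TYPE_PTEP --only mapping removed-->   PVH_TYPE_NULL
 *	PVH_TYPE_PVEP --last mapping removed-->   PVH_TYPE_NULL
 *	PVH_TYPE_PVEP --one mapping left-->       PVH_TYPE_PTEP (list collapsed)
 *
 * The flag bits stored in the head pointer are clobbered whenever the head is
 * rewritten, which is why the callers re-apply pvh_set_flags() around these
 * transitions.
 */
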
2257 /**
2258 * Register a new mapping into the pv_head_table. This is the main data
2259 * structure used for performing a reverse physical to virtual translation and
2260 * finding all mappings to a physical page. Whenever a new page table mapping is
2261 * created (regardless of whether it's for a CPU or an IOMMU), it should be
2262 * registered with a call to this function.
2263 *
2264 * @note The pmap lock must already be held if the new mapping is a CPU mapping.
2265 *
2266 * @note The PVH lock for the physical page that is getting a new mapping
2267 * registered must already be held.
2268 *
2269 * @note This function cannot be called during the hibernation process because
2270 * it modifies critical pmap data structures that need to be dumped into
2271 * the hibernation image in a consistent state.
2272 *
2273 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
2274 * an IOMMU translation.
2275 * @param ptep The new mapping to register.
2276 * @param pai The physical address index of the physical page being mapped by
2277 * `ptep`.
2278 * @param options Flags that can potentially be set on a per-page basis:
2279 * PMAP_OPTIONS_INTERNAL: If this is the first CPU mapping, then
2280 * mark the page as being "internal". See the definition of
2281 * PP_ATTR_INTERNAL for more info.
2282 * PMAP_OPTIONS_REUSABLE: If this is the first CPU mapping, and
2283 * this page is also marked internal, then mark the page as
2284 * being "reusable". See the definition of PP_ATTR_REUSABLE
2285 * for more info.
2286 * @param lock_mode Which state the pmap lock is being held in if the mapping is
2287 * owned by a pmap, otherwise this is a don't care.
2288 * @param new_pvepp An output parameter that is updated with a pointer to the
2289 * PVE object into which the PTEP was inserted. In the event
2290 * of failure, or if the pointer passed in is NULL,
2291 * it's not modified.
2292 * @param new_pve_ptep_idx An output parameter that is updated with the index
2293 * into the PVE object where the PTEP was inserted.
2294 * In the event of failure, or if new_pvepp is NULL,
2295 * it's not modified.
2296 *
2297 * @return PV_ALLOC_SUCCESS if the entry at `pai` was successfully updated with
2298 * the new mapping, or the return value of pv_alloc() otherwise. See
2299 * pv_alloc()'s function header for a detailed explanation of the
2300 * possible return values.
2301 */
2302 MARK_AS_PMAP_TEXT pv_alloc_return_t
2303 pmap_enter_pv(
2304 pmap_t pmap,
2305 pt_entry_t *ptep,
2306 int pai,
2307 unsigned int options,
2308 pmap_lock_mode_t lock_mode,
2309 pv_entry_t **new_pvepp,
2310 int *new_pve_ptep_idx)
2311 {
2312 assert(ptep != PT_ENTRY_NULL);
2313
2314 pv_entry_t **pvh = pai_to_pvh(pai);
2315 bool first_cpu_mapping = false;
2316
2317 ASSERT_NOT_HIBERNATING();
2318 pvh_assert_locked(pai);
2319
2320 if (pmap != NULL) {
2321 pmap_assert_locked(pmap, lock_mode);
2322 }
2323
2324 vm_offset_t pvh_flags = pvh_get_flags(pvh);
2325
2326 #if XNU_MONITOR
2327 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2328 panic("%d is locked down (%#lx), cannot enter", pai, pvh_flags);
2329 }
2330 #endif /* XNU_MONITOR */
2331
2332
2333 #ifdef PVH_FLAG_CPU
2334 /**
2335 * An IOMMU mapping may already be present for a page that hasn't yet had a
2336 * CPU mapping established, so we use PVH_FLAG_CPU to determine if this is
2337 * the first CPU mapping. We base internal/reusable accounting on the
2338 * options specified for the first CPU mapping. PVH_FLAG_CPU, and thus this
2339 * accounting, will then persist as long as there are *any* mappings of the
2340 * page. The accounting for a page should not need to change until the page
2341 * is recycled by the VM layer, and we assert that there are no mappings
2342 * when a page is recycled. An IOMMU mapping of a freed/recycled page is
2343 * considered a security violation & potential DMA corruption path.
2344 */
2345 first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU));
2346 if (first_cpu_mapping) {
2347 pvh_flags |= PVH_FLAG_CPU;
2348 }
2349 #else /* PVH_FLAG_CPU */
2350 first_cpu_mapping = pvh_test_type(pvh, PVH_TYPE_NULL);
2351 #endif /* PVH_FLAG_CPU */
2352
2353 /**
2354 * Internal/reusable flags are based on the first CPU mapping made to a
2355 * page. These will persist until all mappings to the page are removed.
2356 */
2357 if (first_cpu_mapping) {
2358 if ((options & PMAP_OPTIONS_INTERNAL) &&
2359 (options & PMAP_OPTIONS_REUSABLE)) {
2360 ppattr_set_reusable(pai);
2361 } else {
2362 ppattr_clear_reusable(pai);
2363 }
2364 }
2365
2366 /* Visit the definitions for the PVH_TYPEs to learn more about each one. */
2367 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
2368 /* If this is the first mapping, upgrade the type to store a single PTEP. */
2369 pvh_update_head(pvh, ptep, PVH_TYPE_PTEP);
2370 } else {
2371 pv_alloc_return_t ret = PV_ALLOC_FAIL;
2372
2373 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2374 /**
2375 * There was already a single mapping to the page. Convert the PVH
2376 * entry from PVH_TYPE_PTEP to PVH_TYPE_PVEP so that multiple
2377 * mappings can be tracked. If PVEs cannot hold more than a single
2378 * mapping, a second PVE will be added farther down.
2379 *
2380 * Also, ensure that the PVH flags (which can possibly contain
2381 * PVH_FLAG_CPU) are set before potentially returning or dropping
2382 * the locks. We use that flag to lock in the internal/reusable
2383 * attributes and we don't want another mapping to jump in while the
2384 * locks are dropped, think it's the first CPU mapping, and decide
2385 * to clobber those attributes.
2386 */
2387 pvh_set_flags(pvh, pvh_flags);
2388 if ((ret = pepv_convert_ptep_to_pvep(pmap, pai, lock_mode, options)) != PV_ALLOC_SUCCESS) {
2389 return ret;
2390 }
2391
2392 /**
2393 * At this point, the PVH flags have been clobbered due to updating
2394 * PTEP->PVEP, but that's ok because the locks are being held and
2395 * the flags will get set again below before pv_alloc() is called
2396 * and the locks are potentially dropped again.
2397 */
2398 } else if (!pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2399 panic("%s: unexpected PV head %p, ptep=%p pmap=%p pvh=%p",
2400 __func__, *pvh, ptep, pmap, pvh);
2401 }
2402
2403 /**
2404 * Check if we have room for one more mapping in this PVE
2405 */
2406 pv_entry_t *pvep = pvh_pve_list(pvh);
2407 assert(pvep != PV_ENTRY_NULL);
2408
2409 int pve_ptep_idx = pve_find_ptep_index(pvep, PT_ENTRY_NULL);
2410
2411 if (pve_ptep_idx == -1) {
2412 /**
2413 * Set up the pv_entry for this new mapping and then add it to the list
2414 * for this physical page.
2415 */
2416 pve_ptep_idx = 0;
2417 pvh_set_flags(pvh, pvh_flags);
2418 pvep = PV_ENTRY_NULL;
2419 if ((ret = pv_alloc(pmap, pai, lock_mode, options, &pvep)) != PV_ALLOC_SUCCESS) {
2420 return ret;
2421 }
2422
2423 /* If we've gotten this far then a node should've been allocated. */
2424 assert(pvep != PV_ENTRY_NULL);
2425 pve_init(pvep);
2426 pve_add(pvh, pvep);
2427 }
2428
2429 pve_set_ptep(pvep, pve_ptep_idx, ptep);
2430
2431 /*
2432 * The PTEP was successfully entered into the PVE object.
2433 * If the caller requests it, set new_pvepp and new_pve_ptep_idx
2434 * appropriately.
2435 */
2436 if (new_pvepp != NULL) {
2437 *new_pvepp = pvep;
2438 *new_pve_ptep_idx = pve_ptep_idx;
2439 }
2440 }
2441
2442 pvh_set_flags(pvh, pvh_flags);
2443
2444 return PV_ALLOC_SUCCESS;
2445 }
2446
2447 /**
2448 * Remove a mapping that was registered with the pv_head_table. This needs to be
2449 * done for every mapping that was previously registered using pmap_enter_pv()
2450 * when the mapping is removed.
2451 *
2452 * @note The PVH lock for the physical page that is getting a new mapping
2453 * registered must already be held.
2454 *
2455 * @note This function cannot be called during the hibernation process because
2456 * it modifies critical pmap data structures that need to be dumped into
2457 * the hibernation image in a consistent state.
2458 *
2459 * @param pmap The pmap that owns the new mapping, or NULL if this is tracking
2460 * an IOMMU translation.
2461 * @param ptep The mapping that's getting removed.
2462 * @param pai The physical address index of the physical page being mapped by
2463 * `ptep`.
2464 * @param flush_tlb_async On some systems, removing the last mapping to a page
2465 * that used to be mapped executable will require
2466 * updating the physical aperture mapping of the page.
2467 * This parameter specifies whether the TLB invalidate
2468 * should be synchronized or not if that update occurs.
2469 * @param is_internal_p The internal bit of the PTE that was removed.
2470 * @param is_altacct_p The altacct bit of the PTE that was removed.
2471 */
2472 void
2473 pmap_remove_pv(
2474 pmap_t pmap,
2475 pt_entry_t *ptep,
2476 int pai,
2477 bool flush_tlb_async __unused,
2478 bool *is_internal_p,
2479 bool *is_altacct_p)
2480 {
2481 ASSERT_NOT_HIBERNATING();
2482 pvh_assert_locked(pai);
2483
2484 bool is_internal = false;
2485 bool is_altacct = false;
2486 pv_entry_t **pvh = pai_to_pvh(pai);
2487 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
2488
2489 #if XNU_MONITOR
2490 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2491 panic("%s: PVH entry at pai %d is locked down (%#lx), cannot remove",
2492 __func__, pai, pvh_flags);
2493 }
2494 #endif /* XNU_MONITOR */
2495
2496
2497 if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2498 if (__improbable((ptep != pvh_ptep(pvh)))) {
2499 /**
2500 * The only mapping that exists for this page isn't the one we're
2501 * unmapping, weird.
2502 */
2503 panic("%s: ptep=%p does not match pvh=%p (%p), pai=0x%x",
2504 __func__, ptep, pvh, pvh_ptep(pvh), pai);
2505 }
2506
2507 pvh_update_head(pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
2508 is_internal = ppattr_is_internal(pai);
2509 is_altacct = ppattr_is_altacct(pai);
2510 } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2511 pv_entry_t **pvepp = pvh;
2512 pv_entry_t *pvep = pvh_pve_list(pvh);
2513 assert(pvep != PV_ENTRY_NULL);
2514 int pve_pte_idx = 0;
2515 /* Find the PVE that represents the mapping we're removing. */
2516 while ((pvep != PV_ENTRY_NULL) && ((pve_pte_idx = pve_find_ptep_index(pvep, ptep)) == -1)) {
2517 pvepp = pve_next_ptr(pvep);
2518 pvep = pve_next(pvep);
2519 }
2520
2521 if (__improbable((pvep == PV_ENTRY_NULL))) {
2522 panic("%s: ptep=%p (pai=0x%x) not in pvh=%p", __func__, ptep, pai, pvh);
2523 }
2524
2525 is_internal = pve_get_internal(pvep, pve_pte_idx);
2526 is_altacct = pve_get_altacct(pvep, pve_pte_idx);
2527 pve_set_ptep(pvep, pve_pte_idx, PT_ENTRY_NULL);
2528
2529 #if MACH_ASSERT
2530 /**
2531 * Ensure that the mapping didn't accidentally have multiple PVEs
2532 * associated with it (there should only be one PVE per mapping). This
2533 * checking only occurs on configurations that can accept the perf hit
2534 * that walking the PVE chain on every unmap entails.
2535 *
2536 * This is skipped for IOMMU mappings because some IOMMUs don't use
2537 * normal page tables (e.g., NVMe) to map pages, so the `ptep` field in
2538 * the associated PVE won't actually point to a real page table (see the
2539 * definition of PVH_FLAG_IOMMU_TABLE for more info). Because of that,
2540 * it's perfectly possible for duplicate IOMMU PVEs to exist.
2541 */
2542 if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
2543 pv_entry_t *check_pvep = pvep;
2544
2545 do {
2546 if (pve_find_ptep_index(check_pvep, ptep) != -1) {
2547 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
2548 "pvep=%p, pai=0x%x", __func__, ptep, pmap, pvh, pvep, pai);
2549 }
2550 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
2551 }
2552 #endif /* MACH_ASSERT */
2553
2554 const bool pve_is_first = (pvepp == pvh);
2555 const bool pve_is_last = (pve_next(pvep) == PV_ENTRY_NULL);
2556 const int other_pte_idx = !pve_pte_idx;
2557
2558 if (pve_is_empty(pvep)) {
2559 /*
2560 * This PVE doesn't contain any mappings. We can get rid of it.
2561 */
2562 pve_remove(pvh, pvepp, pvep);
2563 pv_free(pvep);
2564 } else if (!pve_is_first) {
2565 /*
2566 * This PVE contains a single mapping. See if we can coalesce it with the one
2567 * at the top of the list.
2568 */
2569 pv_entry_t *head_pvep = pvh_pve_list(pvh);
2570 int head_pve_pte_empty_idx;
2571 if ((head_pve_pte_empty_idx = pve_find_ptep_index(head_pvep, PT_ENTRY_NULL)) != -1) {
2572 pve_set_ptep(head_pvep, head_pve_pte_empty_idx, pve_get_ptep(pvep, other_pte_idx));
2573 if (pve_get_internal(pvep, other_pte_idx)) {
2574 pve_set_internal(head_pvep, head_pve_pte_empty_idx);
2575 }
2576 if (pve_get_altacct(pvep, other_pte_idx)) {
2577 pve_set_altacct(head_pvep, head_pve_pte_empty_idx);
2578 }
2579 pve_remove(pvh, pvepp, pvep);
2580 pv_free(pvep);
2581 } else {
2582 /*
2583 * We could not coalesce it. Move it to the start of the list, so that it
2584 * can be coalesced against in the future.
2585 */
2586 *pvepp = pve_next(pvep);
2587 pve_add(pvh, pvep);
2588 }
2589 } else if (pve_is_first && pve_is_last) {
2590 /*
2591 * This PVE contains a single mapping, and it's the last mapping for this PAI.
2592 * Collapse this list back into the head, turning it into a PVH_TYPE_PTEP entry.
2593 */
2594 pve_remove(pvh, pvepp, pvep);
2595 pvh_update_head(pvh, pve_get_ptep(pvep, other_pte_idx), PVH_TYPE_PTEP);
2596 if (pve_get_internal(pvep, other_pte_idx)) {
2597 ppattr_set_internal(pai);
2598 }
2599 if (pve_get_altacct(pvep, other_pte_idx)) {
2600 ppattr_set_altacct(pai);
2601 }
2602 pv_free(pvep);
2603 }
2604
2605 /**
2606 * Removing a PVE entry can clobber the PVH flags if the head itself is
2607 * updated (when removing the first PVE in the list) so let's re-set the
2608 * flags back to what they should be.
2609 */
2610 if (!pvh_test_type(pvh, PVH_TYPE_NULL)) {
2611 pvh_set_flags(pvh, pvh_flags);
2612 }
2613 } else {
2614 panic("%s: unexpected PV head %p, ptep=%p pmap=%p pvh=%p pai=0x%x",
2615 __func__, *pvh, ptep, pmap, pvh, pai);
2616 }
2617
2618 #ifdef PVH_FLAG_EXEC
2619 /**
2620 * If we're on a system that has extra protections around executable pages,
2621 * then removing the last mapping to an executable page means we need to
2622 * give write-access back to the physical aperture mapping of this page
2623 * (write access is removed when a page is executable for security reasons).
2624 */
2625 if ((pvh_flags & PVH_FLAG_EXEC) && pvh_test_type(pvh, PVH_TYPE_NULL)) {
2626 pmap_set_ptov_ap(pai, AP_RWNA, flush_tlb_async);
2627 }
2628 #endif /* PVH_FLAG_EXEC */
2629 if (__improbable((pvh_flags & PVH_FLAG_FLUSH_NEEDED) && pvh_test_type(pvh, PVH_TYPE_NULL))) {
2630 pmap_flush_noncoherent_page((pmap_paddr_t)ptoa(pai) + vm_first_phys);
2631 }
2632
2633 *is_internal_p = is_internal;
2634 *is_altacct_p = is_altacct;
2635 }
2636
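/*
 * Worked example of the PVE coalescing above (the two-slot PVE layout matches
 * the pve_pte_idx/!pve_pte_idx usage in the code; names are illustrative):
 * suppose the list for a page is A (head) -> B -> C and the mapping being
 * removed lives in B, leaving B with one surviving mapping.
 *
 *	- If A still has an empty slot, B's surviving mapping (along with its
 *	  internal/altacct bits) migrates into A, and B is unlinked and freed.
 *	- If A is full, B is unlinked from the middle of the list and re-added
 *	  at the head so a future removal can coalesce into its empty slot.
 *	- If B had been the only PVE and now holds a single mapping, the list
 *	  collapses back into a PVH_TYPE_PTEP head entry.
 */
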
2637 /**
2638 * Bootstrap the initial Page Table Descriptor (PTD) node free list.
2639 *
2640 * @note It's not safe to allocate PTD nodes until after this function is
2641 * invoked.
2642 *
2643 * @note The maximum number of PTD objects that can reside within one page
2644 * (`ptd_per_page`) must have already been calculated before calling this
2645 * function.
2646 *
2647 * @param ptdp Pointer to the virtually-contiguous memory used for the initial
2648 * free list.
2649 * @param num_pages The number of virtually-contiguous pages pointed to by
2650 * `ptdp` that will be used to prime the PTD allocator.
2651 */
2652 MARK_AS_PMAP_TEXT void
2653 ptd_bootstrap(pt_desc_t *ptdp, unsigned int num_pages)
2654 {
2655 assert(ptd_per_page > 0);
2656 assert((ptdp != NULL) && (((uintptr_t)ptdp & PAGE_MASK) == 0) && (num_pages > 0));
2657
2658 queue_init(&pt_page_list);
2659
2660 /**
2661 * Region represented by ptdp should be cleared by pmap_bootstrap().
2662 *
2663 * Only part of each page is being used for PTD objects (the rest is used
2664 * for each PTD's associated ptd_info_t object) so link together the last
2665 * PTD element of each page to the first element of the previous page.
2666 */
2667 for (int i = 0; i < num_pages; i++) {
2668 *((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
2669 ptd_free_list = ptdp;
2670 ptdp = (void *)(((uint8_t *)ptdp) + PAGE_SIZE);
2671 }
2672
2673 ptd_free_count = num_pages * ptd_per_page;
2674 simple_lock_init(&ptd_free_list_lock, 0);
2675 }
2676
2677 /**
2678 * Allocate a page table descriptor (PTD) object from the PTD free list, but
2679 * don't add it to the list of reclaimable userspace page table pages just yet
2680 * and don't associate the PTD with a specific pmap (that's what "unlinked"
2681 * means here).
2682 *
2683 * @note Until a page table's descriptor object is added to the page table list,
2684 * that table won't be eligible for reclaiming by pmap_page_reclaim().
2685 *
2686 * @return The page table descriptor object if the allocation was successful, or
2687 * NULL otherwise (which indicates that a page failed to be allocated
2688 * for new nodes).
2689 */
2690 MARK_AS_PMAP_TEXT pt_desc_t*
2691 ptd_alloc_unlinked(void)
2692 {
2693 pt_desc_t *ptdp = PTD_ENTRY_NULL;
2694
2695 pmap_simple_lock(&ptd_free_list_lock);
2696
2697 assert(ptd_per_page != 0);
2698
2699 /**
2700 * Ensure that we either have a free list with nodes available, or a
2701 * completely empty list to allocate and prepend new nodes to.
2702 */
2703 assert(((ptd_free_list != NULL) && (ptd_free_count > 0)) ||
2704 ((ptd_free_list == NULL) && (ptd_free_count == 0)));
2705
2706 if (__improbable(ptd_free_count == 0)) {
2707 pmap_paddr_t pa = 0;
2708
2709 /* Drop the lock while allocating pages since that can take a while. */
2710 pmap_simple_unlock(&ptd_free_list_lock);
2711
2712 if (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT) != KERN_SUCCESS) {
2713 return NULL;
2714 }
2715 ptdp = (pt_desc_t *)phystokv(pa);
2716
2717 pmap_simple_lock(&ptd_free_list_lock);
2718
2719 /**
2720 * Since the lock was dropped while allocating, it's possible another
2721 * CPU already allocated a page. To be safe, prepend the current free
2722 * list (which may or may not be empty now) to the page of nodes just
2723 * allocated and update the head to point to these new nodes.
2724 */
2725 *((void**)(&ptdp[ptd_per_page - 1])) = (void*)ptd_free_list;
2726 ptd_free_list = ptdp;
2727 ptd_free_count += ptd_per_page;
2728 }
2729
2730 /* There should be available nodes at this point. */
2731 if (__improbable((ptd_free_count == 0) || (ptd_free_list == PTD_ENTRY_NULL))) {
2732 panic_plain("%s: out of PTD entries and for some reason didn't "
2733 "allocate more %d %p", __func__, ptd_free_count, ptd_free_list);
2734 }
2735
2736 /* Grab the top node off of the free list to return later. */
2737 ptdp = ptd_free_list;
2738
2739 /**
2740 * Advance the free list to the next node.
2741 *
2742 * Each free pt_desc_t-sized object in this free list uses the first few
2743 * bytes of the object to point to the next object in the list. When an
2744 * object is deallocated (in ptd_deallocate()) the object is prepended onto
2745 * the free list by setting its first few bytes to point to the current free
2746 * list head. Then the head is updated to point to that object.
2747 *
2748 * When a new page is allocated for PTD nodes, it's left zeroed out. Once we
2749 * use up all of the previously deallocated nodes, the list will point
2750 * somewhere into the last allocated, empty page. We know we're pointing at
2751 * this page because the first few bytes of the object will be NULL. In
2752 * that case just set the head to this empty object.
2753 *
2754 * This empty page can be thought of as a "reserve" of empty nodes for the
2755 * case where more nodes are being allocated than there are nodes being
2756 * deallocated.
2757 */
2758 pt_desc_t *const next_node = (pt_desc_t *)(*(void **)ptd_free_list);
2759
2760 /**
2761 * If the next node in the list is NULL but there are supposed to still be
2762 * nodes left, then we've hit the previously allocated empty page of nodes.
2763 * Go ahead and advance the free list to the next free node in that page.
2764 */
2765 if ((next_node == PTD_ENTRY_NULL) && (ptd_free_count > 1)) {
2766 ptd_free_list = ptd_free_list + 1;
2767 } else {
2768 ptd_free_list = next_node;
2769 }
2770
2771 ptd_free_count--;
2772
2773 pmap_simple_unlock(&ptd_free_list_lock);
2774
2775 ptdp->pt_page.next = NULL;
2776 ptdp->pt_page.prev = NULL;
2777 ptdp->pmap = NULL;
2778
2779 /**
2780 * Calculate and stash the address of the ptd_info_t associated with this
2781 * PTD. This can be done easily because both structures co-exist in the same
2782 * page, with ptd_info_t's starting at a given offset from the start of the
2783 * page.
2784 *
2785 * Each PTD is associated with a ptd_info_t of the same index. For example,
2786 * the 15th PTD will use the 15th ptd_info_t in the same page.
2787 */
2788 const unsigned ptd_index = ((uintptr_t)ptdp & PAGE_MASK) / sizeof(pt_desc_t);
2789 assert(ptd_index < ptd_per_page);
2790
2791 const uintptr_t start_of_page = (uintptr_t)ptdp & ~PAGE_MASK;
2792 ptd_info_t *first_ptd_info = (ptd_info_t *)(start_of_page + ptd_info_offset);
2793 ptdp->ptd_info = &first_ptd_info[ptd_index * PT_INDEX_MAX];
2794
2795 /**
2796 * On systems where the VM page size doesn't match the hardware page size,
2797 * one PTD might have to manage multiple page tables.
2798 */
2799 for (unsigned int i = 0; i < PT_INDEX_MAX; i++) {
2800 ptdp->va[i] = (vm_offset_t)-1;
2801 ptdp->ptd_info[i].refcnt = 0;
2802 ptdp->ptd_info[i].wiredcnt = 0;
2803 }
2804
2805 return ptdp;
2806 }
2807
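/*
 * Sketch of the per-page layout assumed by the index math above (the offsets
 * and counts are computed elsewhere during bootstrap; the shape shown here is
 * illustrative):
 *
 *	page start
 *	+------------------------------+  <- pt_desc_t[0]
 *	| ptd_per_page pt_desc_t's     |     ptd_index = (ptr & PAGE_MASK) /
 *	+------------------------------+                 sizeof(pt_desc_t)
 *	| (padding, if any)            |
 *	+------------------------------+  <- page start + ptd_info_offset
 *	| ptd_per_page groups of       |     PTD i owns ptd_info_t entries
 *	| PT_INDEX_MAX ptd_info_t's    |     [i * PT_INDEX_MAX .. (i + 1) * PT_INDEX_MAX - 1]
 *	+------------------------------+
 *	page end
 */
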
2808 /**
2809 * Allocate a single page table descriptor (PTD) object, and if it's meant to
2810 * keep track of a userspace page table, then add that descriptor object to the
2811 * list of PTDs that can be reclaimed in pmap_page_reclaim().
2812 *
2813 * @param pmap The pmap object that will be owning the page table(s) that this
2814 * descriptor object represents.
2815 *
2816 * @return The allocated PTD object, or NULL if one failed to get allocated
2817 * (which indicates that memory wasn't able to get allocated).
2818 */
2819 MARK_AS_PMAP_TEXT pt_desc_t*
2820 ptd_alloc(pmap_t pmap)
2821 {
2822 pt_desc_t *ptdp = ptd_alloc_unlinked();
2823
2824 if (ptdp == NULL) {
2825 return NULL;
2826 }
2827
2828 ptdp->pmap = pmap;
2829 if (pmap != kernel_pmap) {
2830 /**
2831 * We should never try to reclaim kernel pagetable pages in
2832 * pmap_page_reclaim(), so don't enter them into the list.
2833 */
2834 pmap_simple_lock(&pt_pages_lock);
2835 queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page);
2836 pmap_simple_unlock(&pt_pages_lock);
2837 }
2838
2839 pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
2840 return ptdp;
2841 }
2842
2843 /**
2844 * Deallocate a single page table descriptor (PTD) object.
2845 *
2846 * @note Ledger statistics are tracked on a per-pmap basis, so for those pages
2847 * which are not associated with any specific pmap (e.g., IOMMU pages),
2848 * the caller must ensure that the pmap/iommu field in the PTD object is
2849 * NULL before calling this function.
2850 *
2851 * @param ptdp Pointer to the PTD object to deallocate.
2852 */
2853 MARK_AS_PMAP_TEXT void
2854 ptd_deallocate(pt_desc_t *ptdp)
2855 {
2856 pmap_t pmap = ptdp->pmap;
2857
2858 /**
2859 * If this PTD was put onto the reclaimable page table list, then remove it
2860 * from that list before deallocating.
2861 */
2862 if (ptdp->pt_page.next != NULL) {
2863 pmap_simple_lock(&pt_pages_lock);
2864 queue_remove(&pt_page_list, ptdp, pt_desc_t *, pt_page);
2865 pmap_simple_unlock(&pt_pages_lock);
2866 }
2867
2868 /* Prepend the deallocated node to the free list. */
2869 pmap_simple_lock(&ptd_free_list_lock);
2870 (*(void **)ptdp) = (void *)ptd_free_list;
2871 ptd_free_list = (pt_desc_t *)ptdp;
2872 ptd_free_count++;
2873 pmap_simple_unlock(&ptd_free_list_lock);
2874
2875 /**
2876 * If this PTD was being used to represent an IOMMU page then there won't be
2877 * an associated pmap, and therefore no ledger statistics to update.
2878 */
2879 if (pmap != NULL) {
2880 pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
2881 }
2882 }
2883
2884 /**
2885 * In address spaces where the VM page size is larger than the underlying
2886 * hardware page size, one page table descriptor (PTD) object can represent
2887 * multiple page tables. Some fields (like the reference counts) still need to
2888 * be tracked on a per-page-table basis. Because of this, those values are
2889 * stored in a separate array of ptd_info_t objects within the PTD where there's
2890 * one ptd_info_t for every page table a single PTD can manage.
2891 *
2892 * This function initializes the correct ptd_info_t field within a PTD based on
2893 * the page table it's representing.
2894 *
2895 * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to
2896 * update. Must match up with the `pmap` and `ptep` parameters.
2897 * @param pmap The pmap that owns the page table managed by the passed in PTD.
2898 * @param va Any virtual address that resides within the virtual address space
2899 * being mapped by the page table pointed to by `ptep`.
2900 * @param level The level in the page table hierarchy that the table resides.
2901 * @param ptep A pointer into a page table that the passed in PTD manages. This
2902 * page table must be owned by `pmap` and be the PTE that maps `va`.
2903 */
2904 MARK_AS_PMAP_TEXT void
2905 ptd_info_init(
2906 pt_desc_t *ptdp,
2907 pmap_t pmap,
2908 vm_map_address_t va,
2909 unsigned int level,
2910 pt_entry_t *ptep)
2911 {
2912 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2913
2914 if (ptdp->pmap != pmap) {
2915 panic("%s: pmap mismatch, ptdp=%p, pmap=%p, va=%p, level=%u, ptep=%p",
2916 __func__, ptdp, pmap, (void*)va, level, ptep);
2917 }
2918
2919 /**
2920 * Root tables are managed separately, and can be accessed through the
2921 * pmap structure itself (there's only one root table per address space).
2922 */
2923 assert(level > pt_attr_root_level(pt_attr));
2924
2925 /**
2926 * Each PTD can represent multiple page tables. Get the correct index to use
2927 * with the per-page-table properties.
2928 */
2929 const unsigned pt_index = ptd_get_index(ptdp, ptep);
2930
2931 /**
2932 * The "va" field represents the first virtual address that this page table
2933 * is translating for. Naturally, this is dependent on the level the page
2934 * table resides at since more VA space is mapped the closer the page
2935 * table's level is to the root.
2936 */
2937 ptdp->va[pt_index] = (vm_offset_t) va & ~pt_attr_ln_offmask(pt_attr, level - 1);
2938
2939 /**
2940 * Reference counts are only tracked on CPU leaf tables because those are
2941 * the only tables that can be opportunistically deallocated.
2942 */
2943 if (level < pt_attr_leaf_level(pt_attr)) {
2944 ptdp->ptd_info[pt_index].refcnt = PT_DESC_REFCOUNT;
2945 }
2946 }
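
/*
 * A minimal sketch of how ptd_info_init() is typically reached (the local
 * variables and ordering here are hypothetical): after a new leaf table is
 * allocated for `pmap` to translate `va`, its descriptor's per-table slot is
 * initialized before any PTEs are entered into the table.
 *
 *     const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 *     pt_desc_t *ptdp = ptd_alloc(pmap);
 *     pt_entry_t *new_table = ...;   // freshly allocated leaf page table
 *     ptd_info_init(ptdp, pmap, va, pt_attr_leaf_level(pt_attr), new_table);
 *
 * With 16K VM pages backed by 4K hardware page tables, a single PTD would
 * carry four ptd_info_t slots, and ptd_get_index() selects the slot that
 * corresponds to `new_table`.
 */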
2947
2948 #if XNU_MONITOR
2949
2950 /**
2951 * Validate that a pointer passed into the PPL is indeed an actual ledger object
2952 * that was allocated from within the PPL.
2953 *
2954 * If this is truly a real PPL-allocated ledger object then the object will have
2955 * an index into the ledger pointer array located right after it. That index
2956 * into the ledger pointer array should contain the exact same pointer that
2957 * we're validating. This works because the ledger array is PPL-owned data, so
2958 * even if the index was fabricated to try and point to a different ledger
2959 * object, the pointer inside the array won't match up with the passed in
2960 * pointer and validation will fail.
2961 *
2962 * @note This validation does not need to occur on non-PPL systems because on
2963 * those systems the ledger objects are allocated using a zone allocator.
2964 *
2965 * @param ledger Pointer to the supposed ledger object that we need to validate.
2966 *
2967 * @return The index into the ledger pointer array used to validate the passed
2968 * in ledger pointer. If the pointer failed to validate, then the system
2969 * will panic.
2970 */
2971 MARK_AS_PMAP_TEXT uint64_t
2972 pmap_ledger_validate(const volatile void *ledger)
2973 {
2974 assert(ledger != NULL);
2975
2976 uint64_t array_index = ((const volatile pmap_ledger_t*)ledger)->array_index;
2977
2978 if (__improbable(array_index >= pmap_ledger_ptr_array_count)) {
2979 panic("%s: ledger %p array index invalid, index was %#llx", __func__,
2980 ledger, array_index);
2981 }
2982
2983 if (__improbable(pmap_ledger_ptr_array[array_index] != ledger)) {
2984 panic("%s: ledger pointer mismatch, %p != %p", __func__, ledger,
2985 pmap_ledger_ptr_array[array_index]);
2986 }
2987
2988 return array_index;
2989 }
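
/*
 * The check above boils down to a round trip through PPL-owned memory; as a
 * sketch, for any pointer handed to the PPL the following must hold:
 *
 *     pmap_ledger_ptr_array[((const volatile pmap_ledger_t *)ledger)->array_index] == ledger
 *
 * Because pmap_ledger_ptr_array itself lives in PPL-owned memory, a forged
 * array_index can never make the array point back at attacker-controlled
 * memory, so only pointers allocated by pmap_ledger_alloc_internal() pass.
 */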
2990
2991 /**
2992 * The size of the ledgers being allocated by the PPL need to be large enough
2993 * to handle ledgers produced by the task_ledgers ledger template. That template
2994 * is dynamically created at runtime so this function is used to verify that the
2995 * real size of a ledger based on the task_ledgers template matches up with the
2996 * amount of space the PPL calculated is required for a single ledger.
2997 *
2998 * @note See the definition of PMAP_LEDGER_DATA_BYTES for more information.
2999 *
3000 * @note This function needs to be called before any ledgers can be allocated.
3001 *
3002 * @param size The actual size that each pmap ledger should be. This is
3003 * calculated based on the task_ledgers template which should match
3004 * up with PMAP_LEDGER_DATA_BYTES.
3005 */
3006 MARK_AS_PMAP_TEXT void
3007 pmap_ledger_verify_size_internal(size_t size)
3008 {
3009 pmap_simple_lock(&pmap_ledger_lock);
3010
3011 if (pmap_ledger_size_verified) {
3012 panic("%s: ledger size already verified, size=%lu", __func__, size);
3013 }
3014
3015 if ((size == 0) || (size > sizeof(pmap_ledger_data_t)) ||
3016 ((sizeof(pmap_ledger_data_t) - size) % sizeof(struct ledger_entry))) {
3017 panic("%s: size mismatch, expected %lu, size=%lu", __func__,
3018 PMAP_LEDGER_DATA_BYTES, size);
3019 }
3020
3021 pmap_ledger_size_verified = true;
3022
3023 pmap_simple_unlock(&pmap_ledger_lock);
3024 }
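
/*
 * Put differently, the accepted sizes are exactly those of the form
 * sizeof(pmap_ledger_data_t) - k * sizeof(struct ledger_entry) for some
 * integer k >= 0, with size > 0. A sketch of the same predicate:
 *
 *     bool ok = (size != 0) &&
 *         (size <= sizeof(pmap_ledger_data_t)) &&
 *         (((sizeof(pmap_ledger_data_t) - size) % sizeof(struct ledger_entry)) == 0);
 *
 * i.e., the task_ledgers template may contain fewer entries than the PPL
 * budgeted for, but never more, and never a fraction of an entry.
 */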
3025
3026 /**
3027 * Allocate a ledger object from the pmap ledger free list and associate it with
3028 * the ledger pointer array so it can be validated when passed into the PPL.
3029 *
3030 * @return Pointer to the successfully allocated ledger object, or NULL if we're
3031 * out of PPL pages.
3032 */
3033 MARK_AS_PMAP_TEXT ledger_t
3034 pmap_ledger_alloc_internal(void)
3035 {
3036 /**
3037 * Ensure that we've double checked the size of the ledger objects we're
3038 * allocating before we allocate anything.
3039 */
3040 if (!pmap_ledger_size_verified) {
3041 panic_plain("%s: Attempted to allocate a pmap ledger before verifying "
3042 "the ledger size", __func__);
3043 }
3044
3045 pmap_simple_lock(&pmap_ledger_lock);
3046 if (pmap_ledger_free_list == NULL) {
3047 /* The free list is empty, so allocate a page's worth of objects. */
3048 const pmap_paddr_t paddr = pmap_get_free_ppl_page();
3049
3050 if (paddr == 0) {
3051 pmap_simple_unlock(&pmap_ledger_lock);
3052 return NULL;
3053 }
3054
3055 const vm_map_address_t vstart = phystokv(paddr);
3056 const uint32_t ledgers_per_page = PAGE_SIZE / sizeof(pmap_ledger_t);
3057 const vm_map_address_t vend = vstart + (ledgers_per_page * sizeof(pmap_ledger_t));
3058 assert(vend > vstart);
3059
3060 /**
3061 * Loop through every pmap ledger object within the recently allocated
3062 * page and add it to both the ledger free list and the ledger pointer
3063 * array (which will be used to validate these objects in the future).
3064 */
3065 for (vm_map_address_t vaddr = vstart; vaddr < vend; vaddr += sizeof(pmap_ledger_t)) {
3066 /* Get the next free entry in the ledger pointer array. */
3067 const uint64_t index = pmap_ledger_ptr_array_free_index++;
3068
3069 if (index >= pmap_ledger_ptr_array_count) {
3070 panic("%s: pmap_ledger_ptr_array is full, index=%llu",
3071 __func__, index);
3072 }
3073
3074 pmap_ledger_t *free_ledger = (pmap_ledger_t*)vaddr;
3075
3076 /**
3077 * This association between the just allocated ledger and the
3078 * pointer array is what allows this object to be validated in the
3079 * future that it's indeed a ledger allocated by this code.
3080 */
3081 pmap_ledger_ptr_array[index] = free_ledger;
3082 free_ledger->array_index = index;
3083
3084 /* Prepend this new ledger object to the free list. */
3085 free_ledger->next = pmap_ledger_free_list;
3086 pmap_ledger_free_list = free_ledger;
3087 }
3088
3089 /**
3090 * In an effort to reduce the amount of ledger code that needs to be
3091 * called from within the PPL, the ledger objects themselves are made
3092 * kernel writable. This way, all of the initialization and checking of
3093 * the ledgers can occur outside of the PPL.
3094 *
3095 * The only modification to these ledger objects that should occur from
3096 * within the PPL is when debiting/crediting the ledgers. And those
3097 * operations should only occur on validated ledger objects that are
3098 * validated using the ledger pointer array (which is wholly contained
3099 * in PPL-owned memory).
3100 */
3101 pa_set_range_xprr_perm(paddr, paddr + PAGE_SIZE, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
3102 }
3103
3104 ledger_t new_ledger = (ledger_t)pmap_ledger_free_list;
3105 pmap_ledger_free_list = pmap_ledger_free_list->next;
3106
3107 /**
3108 * Double check that the array index of the recently allocated object wasn't
3109 * tampered with while the object was sitting on the free list.
3110 */
3111 const uint64_t array_index = pmap_ledger_validate(new_ledger);
3112 os_ref_init(&pmap_ledger_refcnt[array_index], NULL);
3113
3114 pmap_simple_unlock(&pmap_ledger_lock);
3115
3116 return new_ledger;
3117 }
3118
3119 /**
3120 * Free a ledger that was previously allocated by the PPL.
3121 *
3122 * @param ledger The ledger to put back onto the pmap ledger free list.
3123 */
3124 MARK_AS_PMAP_TEXT void
3125 pmap_ledger_free_internal(ledger_t ledger)
3126 {
3127 /**
3128 * A pmap_ledger_t wholly contains a ledger_t as its first member, but also
3129 * includes an index into the ledger pointer array used for validation
3130 * purposes.
3131 */
3132 pmap_ledger_t *free_ledger = (pmap_ledger_t*)ledger;
3133
3134 pmap_simple_lock(&pmap_ledger_lock);
3135
3136 /* Ensure that what we're putting onto the free list is a real ledger. */
3137 const uint64_t array_index = pmap_ledger_validate(ledger);
3138
3139 /* Ensure no pmap objects are still using this ledger. */
3140 if (os_ref_release(&pmap_ledger_refcnt[array_index]) != 0) {
3141 panic("%s: ledger still referenced, ledger=%p", __func__, ledger);
3142 }
3143
3144 /* Prepend the ledger to the free list. */
3145 free_ledger->next = pmap_ledger_free_list;
3146 pmap_ledger_free_list = free_ledger;
3147
3148 pmap_simple_unlock(&pmap_ledger_lock);
3149 }
3150
3151 /**
3152 * Bump the reference count on a ledger object to denote that it is currently in
3153 * use by a pmap object.
3154 *
3155 * @param ledger The ledger whose refcnt to increment.
3156 */
3157 MARK_AS_PMAP_TEXT void
3158 pmap_ledger_retain(ledger_t ledger)
3159 {
3160 pmap_simple_lock(&pmap_ledger_lock);
3161 const uint64_t array_index = pmap_ledger_validate(ledger);
3162 os_ref_retain(&pmap_ledger_refcnt[array_index]);
3163 pmap_simple_unlock(&pmap_ledger_lock);
3164 }
3165
3166 /**
3167 * Decrement the reference count on a ledger object to denote that a pmap object
3168 * that previously used it no longer does.
3169 *
3170 * @param ledger The ledger whose refcnt to decrement.
3171 */
3172 MARK_AS_PMAP_TEXT void
3173 pmap_ledger_release(ledger_t ledger)
3174 {
3175 pmap_simple_lock(&pmap_ledger_lock);
3176 const uint64_t array_index = pmap_ledger_validate(ledger);
3177 os_ref_release_live(&pmap_ledger_refcnt[array_index]);
3178 pmap_simple_unlock(&pmap_ledger_lock);
3179 }
3180
3181 /**
3182 * This function is used to check a ledger that was recently updated (usually
3183 * from within the PPL) and potentially take actions based on the new ledger
3184 * balances (e.g., set an AST).
3185 *
3186 * @note On non-PPL systems this checking occurs automatically every time a
3187 * ledger is credited/debited. Due to that, this function only needs to
3188 * get called on PPL-enabled systems.
3189 *
3190 * @note This function can ONLY be called from *outside* of the PPL due to its
3191 * usage of current_thread(). The TPIDR register is kernel-modifiable, and
3192 * hence can't be trusted. This also means we don't need to pull all of
3193 * the logic used to check ledger balances into the PPL.
3194 *
3195 * @param pmap The pmap whose ledger should be checked.
3196 */
3197 void
3198 pmap_ledger_check_balance(pmap_t pmap)
3199 {
3200 /* This function should only be called from outside of the PPL. */
3201 assert((pmap != NULL) && !pmap_in_ppl());
3202
3203 ledger_t ledger = pmap->ledger;
3204
3205 if (ledger == NULL) {
3206 return;
3207 }
3208
3209 thread_t cur_thread = current_thread();
3210 ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting);
3211 ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting_compressed);
3212 ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal);
3213 ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal_compressed);
3214 ledger_check_new_balance(cur_thread, ledger, task_ledgers.page_table);
3215 ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_footprint);
3216 ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_mem);
3217 ledger_check_new_balance(cur_thread, ledger, task_ledgers.tkm_private);
3218 ledger_check_new_balance(cur_thread, ledger, task_ledgers.wired_mem);
3219 }
3220
3221 #endif /* XNU_MONITOR */
3222
3223 /**
3224 * Credit a specific ledger entry within the passed in pmap's ledger object.
3225 *
3226 * @note On PPL-enabled systems this operation will not automatically check the
3227 * ledger balances after updating. A call to pmap_ledger_check_balance()
3228 * will need to occur outside of the PPL to handle this.
3229 *
3230 * @param pmap The pmap whose ledger should be updated.
3231 * @param entry The specific ledger entry to update. This needs to be one of the
3232 * task_ledger entries.
3233 * @param amount The amount to credit to the ledger.
3234 *
3235 * @return The return value from the credit operation.
3236 */
3237 kern_return_t
3238 pmap_ledger_credit(pmap_t pmap, int entry, ledger_amount_t amount)
3239 {
3240 assert(pmap != NULL);
3241
3242 #if XNU_MONITOR
3243 /**
3244 * On PPL-enabled systems the "nocheck" variant MUST be called to ensure
3245 * that the ledger balance doesn't automatically get checked after being
3246 * updated.
3247 *
3248 * That checking process is unsafe to perform within the PPL due to its
3249 * reliance on current_thread().
3250 */
3251 return ledger_credit_nocheck(pmap->ledger, entry, amount);
3252 #else /* XNU_MONITOR */
3253 return ledger_credit(pmap->ledger, entry, amount);
3254 #endif /* XNU_MONITOR */
3255 }
3256
3257 /**
3258 * Debit a specific ledger entry within the passed in pmap's ledger object.
3259 *
3260 * @note On PPL-enabled systems this operation will not automatically check the
3261 * ledger balances after updating. A call to pmap_ledger_check_balance()
3262 * will need to occur outside of the PPL to handle this.
3263 *
3264 * @param pmap The pmap whose ledger should be updated.
3265 * @param entry The specific ledger entry to update. This needs to be one of the
3266 * task_ledger entries.
3267 * @param amount The amount to debit from the ledger.
3268 *
3269 * @return The return value from the debit operation.
3270 */
3271 kern_return_t
3272 pmap_ledger_debit(pmap_t pmap, int entry, ledger_amount_t amount)
3273 {
3274 assert(pmap != NULL);
3275
3276 #if XNU_MONITOR
3277 /**
3278 * On PPL-enabled systems the "nocheck" variant MUST be called to ensure
3279 * that the ledger balance doesn't automatically get checked after being
3280 * updated.
3281 *
3282 * That checking process is unsafe to perform within the PPL due to its
3283 * reliance on current_thread().
3284 */
3285 return ledger_debit_nocheck(pmap->ledger, entry, amount);
3286 #else /* XNU_MONITOR */
3287 return ledger_debit(pmap->ledger, entry, amount);
3288 #endif /* XNU_MONITOR */
3289 }
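
/*
 * Sketch of the intended split on PPL-enabled systems (the placement of the
 * calls is illustrative): the PPL-side code updates the ledger without
 * checking it, and the kernel-side caller performs the check once it is back
 * outside the PPL, where current_thread() can be trusted.
 *
 *     // Inside the PPL:
 *     pmap_ledger_credit(pmap, task_ledgers.page_table, PAGE_SIZE);
 *
 *     // After returning to the kernel:
 *     pmap_ledger_check_balance(pmap);
 */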
3290
3291 #if XNU_MONITOR
3292
3293 /**
3294 * Allocate a pmap object from the pmap object free list and associate it with
3295 * the pmap pointer array so it can be validated when passed into the PPL.
3296 *
3297 * @param pmap Output parameter that holds the newly allocated pmap object if
3298 * the operation was successful, or NULL otherwise. The return value
3299 * must be checked to know what this parameter should return.
3300 *
3301 * @return KERN_SUCCESS if the allocation was successful, KERN_RESOURCE_SHORTAGE
3302 * if out of free PPL pages, or KERN_NO_SPACE if the pmap pointer array
3303 * cannot track any more pmap objects. On
3304 * KERN_SUCCESS, the `pmap` output parameter will point to the newly
3305 * allocated object.
3306 */
3307 MARK_AS_PMAP_TEXT kern_return_t
3308 pmap_alloc_pmap(pmap_t *pmap)
3309 {
3310 pmap_t new_pmap = PMAP_NULL;
3311 kern_return_t kr = KERN_SUCCESS;
3312
3313 pmap_simple_lock(&pmap_free_list_lock);
3314
3315 if (pmap_free_list == NULL) {
3316 /* If the pmap pointer array is full, then no more objects can be allocated. */
3317 if (__improbable(pmap_ptr_array_free_index == pmap_ptr_array_count)) {
3318 kr = KERN_NO_SPACE;
3319 goto pmap_alloc_cleanup;
3320 }
3321
3322 /* The free list is empty, so allocate a page's worth of objects. */
3323 const pmap_paddr_t paddr = pmap_get_free_ppl_page();
3324
3325 if (paddr == 0) {
3326 kr = KERN_RESOURCE_SHORTAGE;
3327 goto pmap_alloc_cleanup;
3328 }
3329
3330 const vm_map_address_t vstart = phystokv(paddr);
3331 const uint32_t pmaps_per_page = PAGE_SIZE / sizeof(pmap_list_entry_t);
3332 const vm_map_address_t vend = vstart + (pmaps_per_page * sizeof(pmap_list_entry_t));
3333 assert(vend > vstart);
3334
3335 /**
3336 * Loop through every pmap object within the recently allocated page and
3337 * add it to both the pmap free list and the pmap pointer array (which
3338 * will be used to validate these objects in the future).
3339 */
3340 for (vm_map_address_t vaddr = vstart; vaddr < vend; vaddr += sizeof(pmap_list_entry_t)) {
3341 /* Get the next free entry in the pmap pointer array. */
3342 const unsigned long index = pmap_ptr_array_free_index++;
3343
3344 if (__improbable(index >= pmap_ptr_array_count)) {
3345 panic("%s: pmap array index %lu >= limit %lu; corruption?",
3346 __func__, index, pmap_ptr_array_count);
3347 }
3348 pmap_list_entry_t *free_pmap = (pmap_list_entry_t*)vaddr;
3349 os_atomic_init(&free_pmap->pmap.ref_count, 0);
3350
3351 /**
3352 * This association between the just allocated pmap object and the
3353 * pointer array is what allows this object to be validated in the
3354 * future that it's indeed a pmap object allocated by this code.
3355 */
3356 pmap_ptr_array[index] = free_pmap;
3357 free_pmap->array_index = index;
3358
3359 /* Prepend this new pmap object to the free list. */
3360 free_pmap->next = pmap_free_list;
3361 pmap_free_list = free_pmap;
3362
3363 /* Check if we've reached the maximum number of pmap objects. */
3364 if (__improbable(pmap_ptr_array_free_index == pmap_ptr_array_count)) {
3365 break;
3366 }
3367 }
3368 }
3369
3370 new_pmap = &pmap_free_list->pmap;
3371 pmap_free_list = pmap_free_list->next;
3372
3373 pmap_alloc_cleanup:
3374 pmap_simple_unlock(&pmap_free_list_lock);
3375 *pmap = new_pmap;
3376 return kr;
3377 }
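
/*
 * Sketch of how a caller consumes the output parameter (the error handling
 * shown is hypothetical); the pointer is only meaningful on KERN_SUCCESS:
 *
 *     pmap_t new_pmap = PMAP_NULL;
 *     kern_return_t kr = pmap_alloc_pmap(&new_pmap);
 *     if (kr == KERN_RESOURCE_SHORTAGE) {
 *         ...give the PPL more free pages and retry...
 *     } else if (kr == KERN_SUCCESS) {
 *         ...initialize and use new_pmap...
 *     }
 */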
3378
3379 /**
3380 * Free a pmap object that was previously allocated by the PPL.
3381 *
3382 * @note This should only be called on pmap objects that have already been
3383 * validated to be real pmap objects.
3384 *
3385 * @param pmap The pmap object to put back onto the pmap free list.
3386 */
3387 MARK_AS_PMAP_TEXT void
3388 pmap_free_pmap(pmap_t pmap)
3389 {
3390 /**
3391 * A pmap_list_entry_t wholly contains a struct pmap as its first member,
3392 * but also includes an index into the pmap pointer array used for
3393 * validation purposes.
3394 */
3395 pmap_list_entry_t *free_pmap = (pmap_list_entry_t*)pmap;
3396 if (__improbable(free_pmap->array_index >= pmap_ptr_array_count)) {
3397 panic("%s: pmap %p has index %lu >= limit %lu", __func__, pmap,
3398 free_pmap->array_index, pmap_ptr_array_count);
3399 }
3400
3401 pmap_simple_lock(&pmap_free_list_lock);
3402
3403 /* Prepend the pmap object to the free list. */
3404 free_pmap->next = pmap_free_list;
3405 pmap_free_list = free_pmap;
3406
3407 pmap_simple_unlock(&pmap_free_list_lock);
3408 }
3409
3410 #endif /* XNU_MONITOR */
3411
3412 #if XNU_MONITOR
3413
3414 /**
3415 * Helper function to validate that the pointer passed into this method is truly
3416 * a userspace pmap object that was allocated through the pmap_alloc_pmap() API.
3417 * This function will panic if the validation fails.
3418 *
3419 * @param pmap The pointer to validate.
3420 * @param func The stringized function name of the caller that will be printed
3421 * in the case that the validation fails.
3422 */
3423 static void
3424 validate_user_pmap(const volatile struct pmap *pmap, const char *func)
3425 {
3426 /**
3427 * Ensure the array index isn't corrupted. This could happen if an attacker
3428 * is trying to pass off random memory as a pmap object.
3429 */
3430 const unsigned long array_index = ((const volatile pmap_list_entry_t*)pmap)->array_index;
3431 if (__improbable(array_index >= pmap_ptr_array_count)) {
3432 panic("%s: pmap array index %lu >= limit %lu", func, array_index, pmap_ptr_array_count);
3433 }
3434
3435 /**
3436 * If the array index is valid, then ensure that the passed in object
3437 * matches up with the object in the pmap pointer array for this index. Even
3438 * if an attacker passed in random memory with a valid index, there's no way
3439 * the pmap pointer array will ever point to anything but the objects
3440 * allocated by the pmap free list (it's PPL-owned memory).
3441 */
3442 if (__improbable(pmap_ptr_array[array_index] != (const volatile pmap_list_entry_t*)pmap)) {
3443 panic("%s: pmap %p does not match array element %p at index %lu", func, pmap,
3444 pmap_ptr_array[array_index], array_index);
3445 }
3446
3447 /**
3448 * Ensure that this isn't just an object sitting on the free list waiting to
3449 * be allocated. This also helps protect against a race between validating
3450 * and deleting a pmap object.
3451 */
3452 if (__improbable(os_atomic_load(&pmap->ref_count, seq_cst) <= 0)) {
3453 panic("%s: pmap %p is not in use", func, pmap);
3454 }
3455 }
3456
3457 #endif /* XNU_MONITOR */
3458
3459 /**
3460 * Validate that the pointer passed into this method is a valid pmap object and
3461 * is safe to read from and base PPL decisions off of. This function will panic
3462 * if the validation fails.
3463 *
3464 * @note On non-PPL systems this only checks that the pmap object isn't NULL.
3465 *
3466 * @note This validation should only be used on objects that won't be written to
3467 * for the duration of the PPL call. If the object is going to be modified
3468 * then you must use validate_pmap_mutable().
3469 *
3470 * @param pmap The pointer to validate.
3471 * @param func The stringized function name of the caller that will be printed
3472 * in the case that the validation fails.
3473 */
3474 void
3475 validate_pmap_internal(const volatile struct pmap *pmap, const char *func)
3476 {
3477 #if !XNU_MONITOR
3478 #pragma unused(pmap, func)
3479 assert(pmap != NULL);
3480 #else /* !XNU_MONITOR */
3481 if (pmap != kernel_pmap) {
3482 validate_user_pmap(pmap, func);
3483 }
3484 #endif /* !XNU_MONITOR */
3485 }
3486
3487 /**
3488 * Validate that the pointer passed into this method is a valid pmap object and
3489 * is safe to both read and write to from within the PPL. This function will
3490 * panic if the validation fails.
3491 *
3492 * @note On non-PPL systems this only checks that the pmap object isn't NULL.
3493 *
3494 * @note If you're only going to be reading from the pmap object for the
3495 * duration of the PPL call, it'll be faster to use the immutable version
3496 * of this validation: validate_pmap().
3497 *
3498 * @param pmap The pointer to validate.
3499 * @param func The stringized function name of the caller that will be printed
3500 * in the case that the validation fails.
3501 */
3502 void
3503 validate_pmap_mutable_internal(const volatile struct pmap *pmap, const char *func)
3504 {
3505 #if !XNU_MONITOR
3506 #pragma unused(pmap, func)
3507 assert(pmap != NULL);
3508 #else /* !XNU_MONITOR */
3509 if (pmap != kernel_pmap) {
3510 /**
3511 * Every time a pmap object is validated to be mutable, we mark it down
3512 * as an "inflight" pmap on this CPU. The inflight pmap for this CPU
3513 * will be set to NULL automatically when the PPL is exited. The
3514 * pmap_destroy() path will ensure that no "inflight" pmaps (on any CPU)
3515 * are ever destroyed so as to prevent racy use-after-free attacks.
3516 */
3517 pmap_cpu_data_t *cpu_data = pmap_get_cpu_data();
3518
3519 /**
3520 * As a sanity check (since the inflight pmap should be cleared when
3521 * exiting the PPL), ensure that the previous inflight pmap is NULL, or
3522 * is the same as the one being validated here (which allows for
3523 * validating the same object twice).
3524 */
3525 __assert_only const volatile struct pmap *prev_inflight_pmap =
3526 os_atomic_load(&cpu_data->inflight_pmap, relaxed);
3527 assert((prev_inflight_pmap == NULL) || (prev_inflight_pmap == pmap));
3528
3529 /**
3530 * The release barrier here is intended to pair with the seq_cst load of
3531 * ref_count in validate_user_pmap() to ensure that if a pmap is
3532 * concurrently destroyed, either this path will observe that it was
3533 * destroyed after marking it in-flight and panic, or pmap_destroy will
3534 * observe the pmap as in-flight after decrementing ref_count and panic.
3535 */
3536 os_atomic_store(&cpu_data->inflight_pmap, pmap, release);
3537
3538 validate_user_pmap(pmap, func);
3539 }
3540 #endif /* !XNU_MONITOR */
3541 }
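
/*
 * Sketch of the expected pattern at a PPL entry point (the function name is
 * hypothetical): validate before trusting anything in the pmap, and pick the
 * mutable variant only when the pmap will actually be written.
 *
 *     MARK_AS_PMAP_TEXT void
 *     pmap_example_op_internal(pmap_t pmap)
 *     {
 *         validate_pmap_mutable(pmap);   // pmap is modified below
 *         ...modify pmap state...
 *     }
 *
 * A read-only entry point would call validate_pmap(pmap) instead, which skips
 * the inflight-pmap bookkeeping performed above.
 */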
3542
3543 /**
3544 * Validate that the passed in pmap pointer is a pmap object that was allocated
3545 * by the pmap and not just random memory. On PPL-enabled systems, the
3546 * allocation is done through the pmap_alloc_pmap() API. On all other systems
3547 * it's allocated through a zone allocator.
3548 *
3549 * This function will panic if the validation fails.
3550 *
3551 * @param pmap The object to validate.
3552 */
3553 void
3554 pmap_require(pmap_t pmap)
3555 {
3556 #if XNU_MONITOR
3557 validate_pmap(pmap);
3558 #else /* XNU_MONITOR */
3559 if (pmap != kernel_pmap) {
3560 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
3561 }
3562 #endif /* XNU_MONITOR */
3563 }
3564
3565 /**
3566 * Parse the device tree and determine how many pmap-io-ranges there are and
3567 * how much memory is needed to store all of that data.
3568 *
3569 * @note See the definition of pmap_io_range_t for more information on what a
3570 * "pmap-io-range" actually represents.
3571 *
3572 * @return The number of bytes needed to store metadata for all PPL-owned I/O
3573 * regions.
3574 */
3575 vm_size_t
3576 pmap_compute_io_rgns(void)
3577 {
3578 DTEntry entry = NULL;
3579 __assert_only int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3580 assert(err == kSuccess);
3581
3582 void const *prop = NULL;
3583 unsigned int prop_size = 0;
3584 if (kSuccess != SecureDTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size)) {
3585 return 0;
3586 }
3587
3588 /**
3589 * The device tree node for pmap-io-ranges maps directly onto an array of
3590 * pmap_io_range_t structures.
3591 */
3592 pmap_io_range_t const *ranges = prop;
3593
3594 /* Determine the number of regions and validate the fields. */
3595 for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
3596 if (ranges[i].addr & PAGE_MASK) {
3597 panic("%s: %u addr 0x%llx is not page-aligned",
3598 __func__, i, ranges[i].addr);
3599 }
3600
3601 if (ranges[i].len & PAGE_MASK) {
3602 panic("%s: %u length 0x%llx is not page-aligned",
3603 __func__, i, ranges[i].len);
3604 }
3605
3606 uint64_t rgn_end = 0;
3607 if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end)) {
3608 panic("%s: %u addr 0x%llx length 0x%llx wraps around",
3609 __func__, i, ranges[i].addr, ranges[i].len);
3610 }
3611
3612 if (((ranges[i].addr <= gPhysBase) && (rgn_end > gPhysBase)) ||
3613 ((ranges[i].addr < avail_end) && (rgn_end >= avail_end)) ||
3614 ((ranges[i].addr > gPhysBase) && (rgn_end < avail_end))) {
3615 panic("%s: %u addr 0x%llx length 0x%llx overlaps physical memory",
3616 __func__, i, ranges[i].addr, ranges[i].len);
3617 }
3618
3619 ++num_io_rgns;
3620 }
3621
3622 return num_io_rgns * sizeof(*ranges);
3623 }
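
/*
 * For illustration only, a single hypothetical pmap-io-ranges element
 * expressed as the structure this code overlays onto the property data (the
 * values are made up; real entries come from the device tree):
 *
 *     static const pmap_io_range_t example_rgn = {
 *         .addr = 0x200000000ULL,   // page-aligned, outside [gPhysBase, avail_end)
 *         .len  = 0x4000,           // page-aligned length
 *         .signature = 'XMPL',
 *     };
 *
 * pmap_compute_io_rgns() would count this as one region and reserve
 * sizeof(pmap_io_range_t) bytes for it in io_attr_table.
 */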
3624
3625 /**
3626 * Helper function used when sorting and searching PPL I/O ranges.
3627 *
3628 * @param a The first PPL I/O range to compare.
3629 * @param b The second PPL I/O range to compare.
3630 *
3631 * @return < 0 for a < b
3632 * 0 for a == b
3633 * > 0 for a > b
3634 */
3635 static int
3636 cmp_io_rgns(const void *a, const void *b)
3637 {
3638 const pmap_io_range_t *range_a = a;
3639 const pmap_io_range_t *range_b = b;
3640
3641 if ((range_b->addr + range_b->len) <= range_a->addr) {
3642 return 1;
3643 } else if ((range_a->addr + range_a->len) <= range_b->addr) {
3644 return -1;
3645 } else {
3646 return 0;
3647 }
3648 }
3649
3650 /**
3651 * Now that enough memory has been allocated to store all of the pmap-io-ranges
3652 * device tree nodes in memory, go ahead and do that copy and then sort the
3653 * resulting array by address for quicker lookup later.
3654 *
3655 * @note This function assumes that the amount of memory required to store the
3656 * entire pmap-io-ranges device tree node has already been calculated (via
3657 * pmap_compute_io_rgns()) and allocated in io_attr_table.
3658 *
3659 * @note This function will leave io_attr_table sorted by address to allow for
3660 * performing a binary search when doing future range lookups.
3661 */
3662 void
3663 pmap_load_io_rgns(void)
3664 {
3665 if (num_io_rgns == 0) {
3666 return;
3667 }
3668
3669 DTEntry entry = NULL;
3670 int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3671 assert(err == kSuccess);
3672
3673 void const *prop = NULL;
3674 unsigned int prop_size;
3675 err = SecureDTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size);
3676 assert(err == kSuccess);
3677
3678 pmap_io_range_t const *ranges = prop;
3679 for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
3680 io_attr_table[i] = ranges[i];
3681 }
3682
3683 qsort(io_attr_table, num_io_rgns, sizeof(*ranges), cmp_io_rgns);
3684 }
3685
3686 /**
3687 * Checks if a pmap-io-range is exempted from being enforced under certain
3688 * conditions.
3689 *
3690 * @param io_range The pmap-io-range to be checked
3691 *
3692 * @return NULL if the pmap-io-range should be exempted. Otherwise, returns
3693 * the passed in pmap-io-range.
3694 */
3695 static pmap_io_range_t*
3696 pmap_exempt_io_range(pmap_io_range_t *io_range)
3697 {
3698 #if DEBUG || DEVELOPMENT
3699 if (__improbable(io_range->signature == 'RVBR')) {
3700 return NULL;
3701 }
3702 #endif /* DEBUG || DEVELOPMENT */
3703
3704 return io_range;
3705 }
3706
3707 /**
3708 * Find and return the PPL I/O range that contains the passed in physical
3709 * address.
3710 *
3711 * @note This function performs a binary search on the already sorted
3712 * io_attr_table, so it should be reasonably fast.
3713 *
3714 * @param paddr The physical address to query a specific I/O range for.
3715 *
3716 * @return A pointer to the pmap_io_range_t structure if one of the ranges
3717 * contains the passed in physical address. Otherwise, NULL.
3718 */
3719 pmap_io_range_t*
3720 pmap_find_io_attr(pmap_paddr_t paddr)
3721 {
3722 unsigned int begin = 0;
3723 unsigned int end = num_io_rgns - 1;
3724
3725 /**
3726 * If there are no I/O ranges, or the wanted address is below the lowest
3727 * range or above the highest range, then there's no point in searching
3728 * since it won't be here.
3729 */
3730 if ((num_io_rgns == 0) || (paddr < io_attr_table[begin].addr) ||
3731 (paddr >= (io_attr_table[end].addr + io_attr_table[end].len))) {
3732 return NULL;
3733 }
3734
3735 /**
3736 * A dummy I/O range to compare against when searching for a range that
3737 * includes `paddr`.
3738 */
3739 const pmap_io_range_t wanted_range = {
3740 .addr = paddr & ~PAGE_MASK,
3741 .len = PAGE_SIZE
3742 };
3743
3744 /* Perform a binary search to find the wanted I/O range. */
3745 for (;;) {
3746 const unsigned int middle = (begin + end) / 2;
3747 const int cmp = cmp_io_rgns(&wanted_range, &io_attr_table[middle]);
3748
3749 if (cmp == 0) {
3750 /* Success! Found the wanted I/O range. */
3751 return pmap_exempt_io_range(&io_attr_table[middle]);
3752 } else if (begin == end) {
3753 /* We've checked every range and didn't find a match. */
3754 break;
3755 } else if (cmp > 0) {
3756 /* The wanted range is above the middle. */
3757 begin = middle + 1;
3758 } else {
3759 /* The wanted range is below the middle. */
3760 end = middle;
3761 }
3762 }
3763
3764 return NULL;
3765 }
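
/*
 * Sketch of a typical lookup (the signature value is hypothetical): callers
 * use the returned range, if any, to decide how the physical page may be
 * mapped or accessed.
 *
 *     const pmap_io_range_t *rgn = pmap_find_io_attr(paddr);
 *     if ((rgn != NULL) && (rgn->signature == 'XMPL')) {
 *         ...apply the policy associated with this PPL-owned I/O region...
 *     }
 */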
3766
3767 #if HAS_GUARDED_IO_FILTER
3768 /**
3769 * Parse the device tree and determine how many pmap-io-filters there are and
3770 * how much memory is needed to store all of that data.
3771 *
3772 * @note See the definition of pmap_io_filter_entry_t for more information on what a
3773 * "pmap-io-filter" actually represents.
3774 *
3775 * @return The number of bytes needed to store metadata for all I/O filter
3776 * entries.
3777 */
3778 vm_size_t
3779 pmap_compute_io_filters(void)
3780 {
3781 DTEntry entry = NULL;
3782 __assert_only int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3783 assert(err == kSuccess);
3784
3785 void const *prop = NULL;
3786 unsigned int prop_size = 0;
3787 if (kSuccess != SecureDTGetProperty(entry, "pmap-io-filters", &prop, &prop_size)) {
3788 return 0;
3789 }
3790
3791 pmap_io_filter_entry_t const *entries = prop;
3792
3793 /* Determine the number of entries. */
3794 for (unsigned int i = 0; i < (prop_size / sizeof(*entries)); ++i) {
3795 if (entries[i].offset + entries[i].length > ARM_PGMASK) {
3796 panic("%s: io filter entry %u offset 0x%hx length 0x%hx crosses page boundary",
3797 __func__, i, entries[i].offset, entries[i].length);
3798 }
3799
3800 ++num_io_filter_entries;
3801 }
3802
3803 return num_io_filter_entries * sizeof(*entries);
3804 }
3805
3806 /**
3807 * Compares two I/O filter entries by signature.
3808 *
3809 * @note The numerical comparison of signatures does not carry any meaning
3810 * but it does give us a way to order and binary search the entries.
3811 *
3812 * @param a The first I/O filter entry to compare.
3813 * @param b The second I/O filter entry to compare.
3814 *
3815 * @return < 0 for a < b
3816 * 0 for a == b
3817 * > 0 for a > b
3818 */
3819 static int
3820 cmp_io_filter_entries_by_signature(const void *a, const void *b)
3821 {
3822 const pmap_io_filter_entry_t *entry_a = a;
3823 const pmap_io_filter_entry_t *entry_b = b;
3824
3825 if (entry_b->signature < entry_a->signature) {
3826 return 1;
3827 } else if (entry_a->signature < entry_b->signature) {
3828 return -1;
3829 } else {
3830 return 0;
3831 }
3832 }
3833
3834 /**
3835 * Compares two I/O filter entries by address range.
3836 *
3837 * @note The function returns 0 as long as the ranges overlap. It allows
3838 * the user not only to detect overlaps across a list of entries,
3839 * but also to feed it an address with unit length and a range
3840 * to check for inclusion.
3841 *
3842 * @param a The first I/O filter entry to compare.
3843 * @param b The second I/O filter entry to compare.
3844 *
3845 * @return < 0 for a < b
3846 * 0 for a == b
3847 * > 0 for a > b
3848 */
3849 static int
3850 cmp_io_filter_entries_by_addr(const void *a, const void *b)
3851 {
3852 const pmap_io_filter_entry_t *entry_a = a;
3853 const pmap_io_filter_entry_t *entry_b = b;
3854
3855 if ((entry_b->offset + entry_b->length) <= entry_a->offset) {
3856 return 1;
3857 } else if ((entry_a->offset + entry_a->length) <= entry_b->offset) {
3858 return -1;
3859 } else {
3860 return 0;
3861 }
3862 }
3863
3864 /**
3865 * Compares two I/O filter entries by signature, then by address range.
3866 *
3867 * @param a The first I/O filter entry to compare.
3868 * @param b The second I/O filter entry to compare.
3869 *
3870 * @return < 0 for a < b
3871 * 0 for a == b
3872 * > 0 for a > b
3873 */
3874 static int
3875 cmp_io_filter_entries(const void *a, const void *b)
3876 {
3877 const int cmp_signature_result = cmp_io_filter_entries_by_signature(a, b);
3878 return (cmp_signature_result != 0) ? cmp_signature_result : cmp_io_filter_entries_by_addr(a, b);
3879 }
3880
3881 /**
3882 * Now that enough memory has been allocated to store all of the pmap-io-filters
3883 * device tree nodes in memory, go ahead and do that copy and then sort the
3884 * resulting array by signature and address for quicker lookup later.
3885 *
3886 * @note This function assumes that the amount of memory required to store the
3887 * entire pmap-io-filters device tree node has already been calculated (via
3888 * pmap_compute_io_filters()) and allocated in io_filter_table.
3889 *
3890 * @note This function will leave io_filter_table sorted by signature and address to
3891 * allow for performing a binary search when doing future lookups.
3892 */
3893 void
3894 pmap_load_io_filters(void)
3895 {
3896 if (num_io_filter_entries == 0) {
3897 return;
3898 }
3899
3900 DTEntry entry = NULL;
3901 int err = SecureDTLookupEntry(NULL, "/defaults", &entry);
3902 assert(err == kSuccess);
3903
3904 void const *prop = NULL;
3905 unsigned int prop_size;
3906 err = SecureDTGetProperty(entry, "pmap-io-filters", &prop, &prop_size);
3907 assert(err == kSuccess);
3908
3909 pmap_io_filter_entry_t const *entries = prop;
3910 for (unsigned int i = 0; i < (prop_size / sizeof(*entries)); ++i) {
3911 io_filter_table[i] = entries[i];
3912 }
3913
3914 qsort(io_filter_table, num_io_filter_entries, sizeof(*entries), cmp_io_filter_entries);
3915
3916 for (unsigned int i = 0; i < num_io_filter_entries - 1; i++) {
3917 if (io_filter_table[i].signature == io_filter_table[i + 1].signature) {
3918 if (io_filter_table[i].offset + io_filter_table[i].length > io_filter_table[i + 1].offset) {
3919 panic("%s: io filter entry %u and %u overlap.",
3920 __func__, i, i + 1);
3921 }
3922 }
3923 }
3924 }
3925
3926 /**
3927 * Find and return the I/O filter entry that contains the passed in physical
3928 * address.
3929 *
3930 * @note This function performs a binary search on the already sorted
3931 * io_filter_table, so it should be reasonably fast.
3932 *
3933 * @param paddr The physical address to query a specific I/O filter for.
3934 * @param width The width of the I/O register at paddr, at most 8 bytes.
3935 * @param io_range_outp If not NULL, this argument is set to the io_attr_table
3936 * entry containing paddr.
3937 *
3938 * @return A pointer to the pmap_io_filter_entry_t structure if one of the filter
3939 * entries contains the passed in I/O register described by paddr and width.
3940 * Otherwise, NULL.
3941 */
3942 pmap_io_filter_entry_t*
3943 pmap_find_io_filter_entry(pmap_paddr_t paddr, uint64_t width, const pmap_io_range_t **io_range_outp)
3944 {
3945 /* Don't bother looking for it when we don't have any entries. */
3946 if (__improbable(num_io_filter_entries == 0)) {
3947 return NULL;
3948 }
3949
3950 if (__improbable(width > 8)) {
3951 return NULL;
3952 }
3953
3954 /* Check if paddr is owned by PPL (Guarded mode SW). */
3955 const pmap_io_range_t *io_range = pmap_find_io_attr(paddr);
3956
3957 /**
3958 * Just return NULL if paddr is not owned by PPL.
3959 */
3960 if (io_range == NULL) {
3961 return NULL;
3962 }
3963
3964 const uint32_t signature = io_range->signature;
3965 unsigned int begin = 0;
3966 unsigned int end = num_io_filter_entries - 1;
3967
3968 /**
3969 * A dummy I/O filter entry to compare against when searching for a range that
3970 * includes `paddr`.
3971 */
3972 const pmap_io_filter_entry_t wanted_filter = {
3973 .signature = signature,
3974 .offset = (uint16_t) ((paddr & ~0b11) & PAGE_MASK),
3975 .length = (uint16_t) width // This downcast is safe because width is validated.
3976 };
3977
3978 /* Perform a binary search to find the wanted filter entry. */
3979 for (;;) {
3980 const unsigned int middle = (begin + end) / 2;
3981 const int cmp = cmp_io_filter_entries(&wanted_filter, &io_filter_table[middle]);
3982
3983 if (cmp == 0) {
3984 /**
3985 * We have found a "match" by the definition of cmp_io_filter_entries,
3986 * meaning the dummy range and the io_filter_entry are overlapping. Make
3987 * sure the dummy range is contained entirely by the entry.
3988 */
3989 const pmap_io_filter_entry_t entry_found = io_filter_table[middle];
3990 if ((wanted_filter.offset >= entry_found.offset) &&
3991 ((wanted_filter.offset + wanted_filter.length) <= (entry_found.offset + entry_found.length))) {
3992 if (io_range_outp != NULL) {
3993 *io_range_outp = io_range;
3994 }
3995
3996 return &io_filter_table[middle];
3997 } else {
3998 /**
3999 * Under the assumption that there is no overlapping io_filter_entry,
4000 * if the dummy range is found overlapping but not contained by an
4001 * io_filter_entry, there cannot be another io_filter_entry containing
4002 * the dummy range, so return NULL here.
4003 */
4004 return NULL;
4005 }
4006 } else if (begin == end) {
4007 /* We've checked every range and didn't find a match. */
4008 break;
4009 } else if (cmp > 0) {
4010 /* The wanted range is above the middle. */
4011 begin = middle + 1;
4012 } else {
4013 /* The wanted range is below the middle. */
4014 end = middle;
4015 }
4016 }
4017
4018 return NULL;
4019 }
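
/*
 * Sketch of a containment query (the width and the handling are
 * illustrative): a 4-byte register access at `paddr` is only considered
 * covered if it falls entirely inside a filter window registered for the
 * owning I/O range's signature.
 *
 *     const pmap_io_range_t *rgn = NULL;
 *     if (pmap_find_io_filter_entry(paddr, 4, &rgn) != NULL) {
 *         ...the access is covered by a filter entry; rgn describes the range...
 *     }
 */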
4020 #endif /* HAS_GUARDED_IO_FILTER */
4021
4022 /**
4023 * Initialize the pmap per-CPU data structure for a single CPU. This is called
4024 * once for each CPU in the system, on the CPU whose per-cpu data needs to be
4025 * initialized.
4026 *
4027 * In reality, many of the per-cpu data fields will have either already been
4028 * initialized or will rely on the fact that the per-cpu data is either zeroed
4029 * out during allocation (on non-PPL systems), or the data itself is a global
4030 * variable which will be zeroed by default (on PPL systems).
4031 *
4032 * @param cpu_number The number of the CPU whose pmap per-cpu data should be
4033 * initialized. This number should correspond to the CPU
4034 * executing this code.
4035 */
4036 MARK_AS_PMAP_TEXT void
4037 pmap_cpu_data_init_internal(unsigned int cpu_number)
4038 {
4039 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
4040
4041 #if XNU_MONITOR
4042 /* Verify the per-cpu data is cacheline-aligned. */
4043 assert(((vm_offset_t)pmap_cpu_data & (MAX_L2_CLINE_BYTES - 1)) == 0);
4044
4045 /**
4046 * The CPU number should already have been initialized to
4047 * PMAP_INVALID_CPU_NUM when initializing the boot CPU data.
4048 */
4049 if (pmap_cpu_data->cpu_number != PMAP_INVALID_CPU_NUM) {
4050 panic("%s: pmap_cpu_data->cpu_number=%u, cpu_number=%u",
4051 __func__, pmap_cpu_data->cpu_number, cpu_number);
4052 }
4053 #endif /* XNU_MONITOR */
4054
4055 /**
4056 * At least when operating in the PPL, it's important to duplicate the CPU
4057 * number into a PPL-owned location. If we relied strictly on the CPU number
4058 * located in the general machine-specific per-cpu data, it could be
4059 * modified in a way to affect PPL operation.
4060 */
4061 pmap_cpu_data->cpu_number = cpu_number;
4062 #if __ARM_MIXED_PAGE_SIZE__
4063 pmap_cpu_data->commpage_page_shift = PAGE_SHIFT;
4064 #endif
4065 }
4066
4067 /**
4068 * Initialize the pmap per-cpu data for the bootstrap CPU (the other CPUs should
4069 * just call pmap_cpu_data_init() directly). This code does one of two things
4070 * depending on whether this is a PPL-enabled system.
4071 *
4072 * PPL-enabled: This function will setup the PPL-specific per-cpu data like the
4073 * PPL stacks and register save area. This performs the
4074 * functionality usually done by cpu_data_init() to setup the pmap
4075 * per-cpu data fields. In reality, most fields are not initialized
4076 * and are assumed to be zero thanks to this data being global.
4077 *
4078 * Non-PPL: Just calls pmap_cpu_data_init() to initialize the bootstrap CPU's
4079 * pmap per-cpu data (non-boot CPUs will call that function once they
4080 * come out of reset).
4081 *
4082 * @note This function will carve out physical pages for the PPL stacks and PPL
4083 * register save area from avail_start. It's assumed that avail_start is
4084 * on a page boundary before executing this function on PPL-enabled
4085 * systems.
4086 */
4087 void
4088 pmap_cpu_data_array_init(void)
4089 {
4090 #if XNU_MONITOR
4091 /**
4092 * Enough virtual address space to cover all PPL stacks for every CPU should
4093 * have already been allocated by arm_vm_init() before pmap_bootstrap() is
4094 * called.
4095 */
4096 assert((pmap_stacks_start != NULL) && (pmap_stacks_end != NULL));
4097 assert(((uintptr_t)pmap_stacks_end - (uintptr_t)pmap_stacks_start) == PPL_STACK_REGION_SIZE);
4098
4099 /**
4100 * Ensure avail_start is aligned to a page boundary before allocating the
4101 * stacks and register save area.
4102 */
4103 assert(avail_start == round_page(avail_start));
4104
4105 /* Each PPL stack contains guard pages before and after. */
4106 vm_offset_t stack_va = (vm_offset_t)pmap_stacks_start + ARM_PGBYTES;
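
/*
 * The resulting virtual layout is roughly the following (sketch; guard cells
 * are ARM_PGBYTES wide, stack cells are PPL_STACK_SIZE wide):
 *
 *   pmap_stacks_start
 *   | guard | CPU0 stack | guard | CPU1 stack | guard | ... |
 *                                                 pmap_stacks_end
 *
 * stack_va starts just past the first guard page and is advanced by
 * PPL_STACK_SIZE + ARM_PGBYTES per CPU in the loop below.
 */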
4107
4108 /**
4109 * Globally save off the beginning of the PPL stacks physical space so that
4110 * we can update its physical aperture mappings later in the bootstrap
4111 * process.
4112 */
4113 pmap_stacks_start_pa = avail_start;
4114
4115 /* Map the PPL stacks for each CPU. */
4116 for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4117 /**
4118 * The PPL stack size is based off of the VM page size, which may differ
4119 * from the underlying hardware page size.
4120 *
4121 * Map all of the PPL stack into the kernel's address space.
4122 */
4123 for (vm_offset_t cur_va = stack_va; cur_va < (stack_va + PPL_STACK_SIZE); cur_va += ARM_PGBYTES) {
4124 assert(cur_va < (vm_offset_t)pmap_stacks_end);
4125
4126 pt_entry_t *ptep = pmap_pte(kernel_pmap, cur_va);
4127 assert(*ptep == ARM_PTE_EMPTY);
4128
4129 pt_entry_t template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) |
4130 ARM_PTE_TYPE | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM);
4131
4132 #if __ARM_KERNEL_PROTECT__
4133 /**
4134 * On systems with software based spectre/meltdown mitigations,
4135 * kernel mappings are explicitly not made global because the kernel
4136 * is unmapped when executing in EL0 (this ensures that kernel TLB
4137 * entries won't accidentally be valid in EL0).
4138 */
4139 template |= ARM_PTE_NG;
4140 #endif /* __ARM_KERNEL_PROTECT__ */
4141
4142 write_pte(ptep, template);
4143 __builtin_arm_isb(ISB_SY);
4144
4145 avail_start += ARM_PGBYTES;
4146 }
4147
4148 #if KASAN
4149 kasan_map_shadow(stack_va, PPL_STACK_SIZE, false);
4150 #endif /* KASAN */
4151
4152 /**
4153 * Setup non-zero pmap per-cpu data fields. If the default value should
4154 * be zero, then you can assume the field is already set to that.
4155 */
4156 pmap_cpu_data_array[cpu_num].cpu_data.cpu_number = PMAP_INVALID_CPU_NUM;
4157 pmap_cpu_data_array[cpu_num].cpu_data.ppl_state = PPL_STATE_KERNEL;
4158 pmap_cpu_data_array[cpu_num].cpu_data.ppl_stack = (void*)(stack_va + PPL_STACK_SIZE);
4159
4160 /**
4161 * Get the first VA of the next CPU's PPL stack. Need to skip the guard
4162 * page after the stack.
4163 */
4164 stack_va += (PPL_STACK_SIZE + ARM_PGBYTES);
4165 }
4166
4167 pmap_stacks_end_pa = avail_start;
4168
4169 /**
4170 * The PPL register save area location is saved into global variables so
4171 * that they can be made writable if DTrace support is needed. This is
4172 * needed because DTrace will try to update the register state.
4173 */
4174 ppl_cpu_save_area_start = avail_start;
4175 ppl_cpu_save_area_end = ppl_cpu_save_area_start;
4176 pmap_paddr_t ppl_cpu_save_area_cur = ppl_cpu_save_area_start;
4177
4178 /* Carve out space for the PPL register save area for each CPU. */
4179 for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4180 /* Allocate enough space to cover at least one arm_context_t object. */
4181 while ((ppl_cpu_save_area_end - ppl_cpu_save_area_cur) < sizeof(arm_context_t)) {
4182 avail_start += PAGE_SIZE;
4183 ppl_cpu_save_area_end = avail_start;
4184 }
4185
4186 pmap_cpu_data_array[cpu_num].cpu_data.save_area = (arm_context_t *)phystokv(ppl_cpu_save_area_cur);
4187 ppl_cpu_save_area_cur += sizeof(arm_context_t);
4188 }
4189
4190 #if HAS_GUARDED_IO_FILTER
4191 /**
4192 * Enough virtual address space to cover all I/O filter stacks for every CPU should
4193 * have already been allocated by arm_vm_init() before pmap_bootstrap() is
4194 * called.
4195 */
4196 assert((iofilter_stacks_start != NULL) && (iofilter_stacks_end != NULL));
4197 assert(((uintptr_t)iofilter_stacks_end - (uintptr_t)iofilter_stacks_start) == IOFILTER_STACK_REGION_SIZE);
4198
4199 /* Each I/O filter stack contains guard pages before and after. */
4200 vm_offset_t iofilter_stack_va = (vm_offset_t)iofilter_stacks_start + ARM_PGBYTES;
4201
4202 /**
4203 * Globally save off the beginning of the I/O filter stacks physical space so that
4204 * we can update its physical aperture mappings later in the bootstrap
4205 * process.
4206 */
4207 iofilter_stacks_start_pa = avail_start;
4208
4209 /* Map the I/O filter stacks for each CPU. */
4210 for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4211 /**
4212 * Map all of the I/O filter stack into the kernel's address space.
4213 */
4214 for (vm_offset_t cur_va = iofilter_stack_va; cur_va < (iofilter_stack_va + IOFILTER_STACK_SIZE); cur_va += ARM_PGBYTES) {
4215 assert(cur_va < (vm_offset_t)iofilter_stacks_end);
4216
4217 pt_entry_t *ptep = pmap_pte(kernel_pmap, cur_va);
4218 assert(*ptep == ARM_PTE_EMPTY);
4219
4220 pt_entry_t template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) |
4221 ARM_PTE_TYPE | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM);
4222
4223 #if __ARM_KERNEL_PROTECT__
4224 template |= ARM_PTE_NG;
4225 #endif /* __ARM_KERNEL_PROTECT__ */
4226
4227 write_pte(ptep, template);
4228 __builtin_arm_isb(ISB_SY);
4229
4230 avail_start += ARM_PGBYTES;
4231 }
4232
4233 #if KASAN
4234 kasan_map_shadow(iofilter_stack_va, IOFILTER_STACK_SIZE, false);
4235 #endif /* KASAN */
4236
4237 /**
4238 * Setup non-zero pmap per-cpu data fields. If the default value should
4239 * be zero, then you can assume the field is already set to that.
4240 */
4241 pmap_cpu_data_array[cpu_num].cpu_data.iofilter_stack = (void*)(iofilter_stack_va + IOFILTER_STACK_SIZE);
4242
4243 /**
4244 * Get the first VA of the next CPU's I/O filter stack. Need to skip the guard
4245 * page after the stack.
4246 */
4247 iofilter_stack_va += (IOFILTER_STACK_SIZE + ARM_PGBYTES);
4248 }
4249
4250 iofilter_stacks_end_pa = avail_start;
4251 #endif /* HAS_GUARDED_IO_FILTER */
4252
4253 /* Carve out scratch space for each cpu */
4254 for (unsigned int cpu_num = 0; cpu_num < MAX_CPUS; cpu_num++) {
4255 pmap_cpu_data_array[cpu_num].cpu_data.scratch_page = (void*)phystokv(avail_start);
4256 avail_start += PAGE_SIZE;
4257 }
4258 #endif /* XNU_MONITOR */
4259
4260 pmap_cpu_data_init();
4261 }
4262
4263 /**
4264 * Retrieve the pmap per-cpu data for the current CPU. On PPL-enabled systems
4265 * this data is managed separately from the general machine-specific per-cpu
4266 * data to handle the requirement that it must only be PPL-writable.
4267 *
4268 * @return The per-cpu pmap data for the current CPU.
4269 */
4270 pmap_cpu_data_t *
4271 pmap_get_cpu_data(void)
4272 {
4273 pmap_cpu_data_t *pmap_cpu_data = NULL;
4274
4275 #if XNU_MONITOR
4276 extern pmap_cpu_data_t* ml_get_ppl_cpu_data(void);
4277 pmap_cpu_data = ml_get_ppl_cpu_data();
4278 #else /* XNU_MONITOR */
4279 /**
4280 * On non-PPL systems, the pmap per-cpu data is stored in the general
4281 * machine-specific per-cpu data.
4282 */
4283 pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data;
4284 #endif /* XNU_MONITOR */
4285
4286 return pmap_cpu_data;
4287 }
4288
4289 /**
4290 * Retrieve the pmap per-cpu data for the specified cpu index.
4291 *
4292 * @return The per-cpu pmap data for the specified CPU, or NULL if no data exists for that CPU.
4293 */
4294 pmap_cpu_data_t *
4295 pmap_get_remote_cpu_data(unsigned int cpu)
4296 {
4297 #if XNU_MONITOR
4298 assert(cpu < MAX_CPUS);
4299 return &pmap_cpu_data_array[cpu].cpu_data;
4300 #else
4301 cpu_data_t *cpu_data = cpu_datap((int)cpu);
4302 if (cpu_data == NULL) {
4303 return NULL;
4304 } else {
4305 return &cpu_data->cpu_pmap_cpu_data;
4306 }
4307 #endif
4308 }
4309
4310 void
4311 pmap_mark_page_for_cache_flush(pmap_paddr_t pa)
4312 {
4313 if (!pa_valid(pa)) {
4314 return;
4315 }
4316 const unsigned int pai = pa_index(pa);
4317 pv_entry_t **pvh = pai_to_pvh(pai);
4318 pvh_lock(pai);
4319 pvh_set_flags(pvh, pvh_get_flags(pvh) | PVH_FLAG_FLUSH_NEEDED);
4320 pvh_unlock(pai);
4321 }
4322
4323 #if HAS_DC_INCPA
4324 void
4325 #else
4326 void __attribute__((noreturn))
4327 #endif
4328 pmap_flush_noncoherent_page(pmap_paddr_t paddr __unused)
4329 {
4330 assertf((paddr & PAGE_MASK) == 0, "%s: paddr 0x%llx not page-aligned",
4331 __func__, (unsigned long long)paddr);
4332
4333 #if HAS_DC_INCPA
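/*
 * Each iteration operates on one 4KB (1 << 12) granule of the page, so with
 * 16KB VM pages this issues four cache-maintenance operations. The raw
 * opcode below encodes the `dc incpa4k` operation noted in the comment.
 */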
4334 for (unsigned int i = 0; i < (PAGE_SIZE >> 12); ++i) {
4335 const register uint64_t dc_arg asm("x8") = paddr + (i << 12);
4336 /**
4337 * rdar://problem/106067403
4338 * __asm__ __volatile__("dc incpa4k, %0" : : "r"(dc_arg));
4339 */
4340 __asm__ __volatile__ (".long 0x201308" : : "r"(dc_arg));
4341 }
4342 __builtin_arm_dsb(DSB_OSH);
4343 #else
4344 panic("%s called on unsupported configuration", __func__);
4345 #endif /* HAS_DC_INCPA */
4346 }
4347