xref: /xnu-12377.81.4/osfmk/vm/vm_resident.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_page.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *
62  *	Resident memory management module.
63  */
64 #include <debug.h>
65 #include <libkern/OSDebug.h>
66 
67 #include <mach/clock_types.h>
68 #include <mach/vm_prot.h>
69 #include <mach/vm_statistics.h>
70 #include <mach/sdt.h>
71 #include <kern/counter.h>
72 #include <kern/exclaves_memory.h>
73 #include <kern/host_statistics.h>
74 #include <kern/sched_prim.h>
75 #include <kern/policy_internal.h>
76 #include <kern/task.h>
77 #include <kern/thread.h>
78 #include <kern/kalloc.h>
79 #include <kern/zalloc_internal.h>
80 #include <kern/ledger.h>
81 #include <kern/ecc.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_init_xnu.h>
84 #include <vm/vm_map_internal.h>
85 #include <vm/vm_page_internal.h>
86 #include <vm/vm_pageout_internal.h>
87 #include <vm/vm_kern_xnu.h>                 /* kmem_alloc() */
88 #include <vm/vm_compressor_pager_internal.h>
89 #include <kern/misc_protos.h>
90 #include <mach_debug/zone_info.h>
91 #include <vm/cpm_internal.h>
92 #include <pexpert/pexpert.h>
93 #include <pexpert/device_tree.h>
94 #include <san/kasan.h>
95 #include <vm/vm_log.h>
96 
97 #include <libkern/coreanalytics/coreanalytics.h>
98 #include <kern/backtrace.h>
99 #include <kern/telemetry.h>
100 
101 #include <vm/vm_protos_internal.h>
102 #include <vm/memory_object.h>
103 #include <vm/vm_purgeable_internal.h>
104 #include <vm/vm_compressor_internal.h>
105 #include <vm/vm_iokit.h>
106 #include <vm/vm_object_internal.h>
107 
108 #if HAS_MTE
109 #include <vm/vm_mteinfo_internal.h>
110 #endif /* HAS_MTE */
111 
112 #if defined (__x86_64__)
113 #include <i386/misc_protos.h>
114 #endif
115 
116 #if CONFIG_SPTM
117 #include <arm64/sptm/sptm.h>
118 #endif
119 
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123 
124 #if HIBERNATION
125 #include <IOKit/IOHibernatePrivate.h>
126 #include <machine/pal_hibernate.h>
127 #endif /* HIBERNATION */
128 
129 #if CONFIG_SECLUDED_MEMORY
130 static_assert(!XNU_VM_HAS_LOPAGE,
131     "VM_PAGE_ON_SECLUDED_Q and VM_PAGE_ON_FREE_LOPAGE_Q alias");
132 #endif
133 
134 #include <sys/kdebug.h>
135 
136 #if defined(HAS_APPLE_PAC)
137 #include <ptrauth.h>
138 #endif
139 #if defined(__arm64__)
140 #include <arm/cpu_internal.h>
141 #endif /* defined(__arm64__) */
142 
143 /*
144  * During single threaded early boot we don't initialize all pages.
145  * This avoids some delay during boot. They'll be initialized and
146  * added to the free list as needed or after we are multithreaded by
147  * what becomes the pageout thread.
148  *
149  * This slows down booting the DEBUG kernel, particularly on
150  * large memory systems, but is worthwhile for deterministically
151  * trapping uninitialized memory usage.
152  */
153 #if DEBUG
154 static TUNABLE(uint32_t, fillval, "fill", 0xDEB8F177);
155 #else
156 static TUNABLE(uint32_t, fillval, "fill", 0);
157 #endif
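/*
 * Illustrative note (a sketch, not code from this file): because "fillval" is
 * declared with TUNABLE(..., "fill", ...), the fill pattern can be overridden
 * from the boot-args, e.g. booting with
 *
 *	fill=0xDEADBEEF
 *
 * replaces the DEBUG default of 0xDEB8F177 (or the RELEASE default of 0) as
 * the pattern used to fill pages at initialization.
 */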
158 
159 #if MACH_ASSERT
160 
161 TUNABLE(bool, vm_check_refs_on_alloc, "vm_check_refs_on_alloc", false);
162 
163 #endif /* MACH_ASSERT */
164 
165 extern boolean_t vm_pageout_running;
166 extern thread_t  vm_pageout_scan_thread;
167 extern bool vps_dynamic_priority_enabled;
168 
169 const uint16_t vm_page_inactive_states =
170     BIT(VM_PAGE_ON_INACTIVE_INTERNAL_Q) |
171     BIT(VM_PAGE_ON_INACTIVE_EXTERNAL_Q) |
172     BIT(VM_PAGE_ON_INACTIVE_CLEANED_Q);
173 
174 const uint16_t vm_page_active_or_inactive_states =
175     vm_page_inactive_states |
176 #if CONFIG_SECLUDED_MEMORY
177     BIT(VM_PAGE_ON_SECLUDED_Q) |
178 #endif /* CONFIG_SECLUDED_MEMORY */
179     BIT(VM_PAGE_ON_ACTIVE_Q);
180 
181 const uint16_t vm_page_non_speculative_pageable_states =
182     vm_page_active_or_inactive_states |
183     BIT(VM_PAGE_ON_THROTTLED_Q);
184 
185 const uint16_t vm_page_pageable_states =
186     vm_page_non_speculative_pageable_states |
187     BIT(VM_PAGE_ON_SPECULATIVE_Q);
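/*
 * Illustrative sketch of how the masks above are consumed: a page's queue
 * state is a small integer, so testing membership in a whole set of states
 * collapses to a single bit test, e.g. (for some page pointer "m"):
 *
 *	if (vm_page_pageable_states & BIT(m->vmp_q_state)) {
 *		... the page sits on one of the pageable queues ...
 *	}
 */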
188 
189 #if CONFIG_SECLUDED_MEMORY
190 struct vm_page_secluded_data vm_page_secluded;
191 #endif /* CONFIG_SECLUDED_MEMORY */
192 #if HIBERNATION
193 static bool hibernate_rebuild_needed = false;
194 #endif /* HIBERNATION */
195 
196 #if DEVELOPMENT || DEBUG
197 extern struct memory_object_pager_ops shared_region_pager_ops;
198 unsigned int shared_region_pagers_resident_count = 0;
199 unsigned int shared_region_pagers_resident_peak = 0;
200 #endif /* DEVELOPMENT || DEBUG */
201 
202 
203 
204 unsigned int    PERCPU_DATA(start_color);
205 vm_page_t       PERCPU_DATA(free_pages);
206 SCALABLE_COUNTER_DEFINE(vm_cpu_free_count);
207 boolean_t       hibernate_cleaning_in_progress = FALSE;
208 
209 atomic_counter_t vm_guard_count;
210 
211 #if XNU_VM_HAS_LOPAGE
212 /*
213  * this interface exists to support hardware controllers
214  * incapable of generating DMAs with more than 32 bits
215  * of address on platforms with physical memory > 4G...
216  */
217 vm_page_queue_head_t    vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED;
218 uint32_t        vm_lopage_free_count   = 0;
219 uint32_t        vm_lopage_free_limit   = 0;
220 uint32_t        vm_lopage_lowater      = 0;
221 bool            vm_lopage_refill       = false;
222 bool            vm_lopage_needed       = false;
223 unsigned int    vm_lopages_allocated_q = 0;
224 unsigned int    vm_lopages_allocated_cpm_success = 0;
225 unsigned int    vm_lopages_allocated_cpm_failed = 0;
226 #endif /* XNU_VM_HAS_LOPAGE */
227 
228 
229 int             speculative_age_index = 0;
230 int             speculative_steal_index = 0;
231 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_RESERVED_SPECULATIVE_AGE_Q + 1];
232 
233 boolean_t       hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibernation code is looking at the VM queues.
234                                                           * Updated and checked behind the vm_page_queues_lock. */
235 
236 static void             vm_page_free_prepare(vm_page_t  page);
237 
238 #if HAS_MTE
239 void                    vm_page_wire_boot_tags(void);
240 #endif /* HAS_MTE */
241 
242 static void vm_tag_init(void);
243 
244 /* for debugging purposes */
245 SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
246     VM_PAGE_PACKED_FROM_ARRAY;
247 #ifndef __BUILDING_XNU_LIB_UNITTEST__ /* This is not a compile-time constant when building unit-test */
248 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
249     VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
250 #endif /* __BUILDING_XNU_LIB_UNITTEST__ */
251 
252 /*
253  *	Associated with page of user-allocatable memory is a
254  *	page structure.
255  */
256 
257 /*
258  *	These variables record the values returned by vm_page_bootstrap,
259  *	for debugging purposes.  The implementation of pmap_steal_memory
260  *	and pmap_startup here also uses them internally.
261  */
262 
263 vm_offset_t virtual_space_start;
264 vm_offset_t virtual_space_end;
265 uint32_t        vm_page_pages;
266 
267 /*
268  *	The vm_page_lookup() routine, which provides for fast
269  *	(virtual memory object, offset) to page lookup, employs
270  *	the following hash table.  The vm_page_{insert,remove}
271  *	routines install and remove associations in the table.
272  *	[This table is often called the virtual-to-physical,
273  *	or VP, table.]
274  */
275 typedef struct {
276 	vm_page_packed_t page_list;
277 #if     MACH_PAGE_HASH_STATS
278 	int             cur_count;              /* current count */
279 	int             hi_count;               /* high water mark */
280 #endif /* MACH_PAGE_HASH_STATS */
281 } vm_page_bucket_t;
282 
283 
284 #define BUCKETS_PER_LOCK        16
285 
286 SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets;                /* Array of buckets */
287 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_count = 0;       /* How big is array? */
288 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_mask;              /* Mask for hash function */
289 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_shift;             /* Shift for hash function */
290 SECURITY_READ_ONLY_LATE(uint32_t)           vm_page_bucket_hash;            /* Basic bucket hash */
291 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_lock_count = 0;  /* How big is array of locks? */
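/*
 * Sketch of the lookup flow (the real code lives in vm_page_lookup() and the
 * insert/remove routines): the (object, offset) pair is hashed down to a
 * bucket index using vm_page_hash_shift/vm_page_bucket_hash and masked with
 * vm_page_hash_mask, the bucket lock covering that bucket is taken (one lock
 * per BUCKETS_PER_LOCK buckets), and the bucket's packed singly-linked
 * page_list is walked looking for a page whose object and offset match.
 */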
292 
293 #ifndef VM_TAG_ACTIVE_UPDATE
294 #error VM_TAG_ACTIVE_UPDATE
295 #endif
296 #ifndef VM_TAG_SIZECLASSES
297 #error VM_TAG_SIZECLASSES
298 #endif
299 
300 /* for debugging */
301 SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
302 SECURITY_READ_ONLY_LATE(lck_ticket_t *) vm_page_bucket_locks;
303 
304 vm_allocation_site_t            vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
305 vm_allocation_site_t *          vm_allocation_sites[VM_MAX_TAG_VALUE];
306 #if VM_TAG_SIZECLASSES
307 static vm_allocation_zone_total_t **vm_allocation_zone_totals;
308 #endif /* VM_TAG_SIZECLASSES */
309 
310 vm_tag_t vm_allocation_tag_highest;
311 
312 #if VM_PAGE_BUCKETS_CHECK
313 boolean_t vm_page_buckets_check_ready = FALSE;
314 #if VM_PAGE_FAKE_BUCKETS
315 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
316 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
317 #endif /* VM_PAGE_FAKE_BUCKETS */
318 #endif /* VM_PAGE_BUCKETS_CHECK */
319 
320 #if     MACH_PAGE_HASH_STATS
321 /* This routine is only for debug.  It is intended to be called by
322  * hand by a developer using a kernel debugger.  This routine prints
323  * out vm_page_hash table statistics to the kernel debug console.
324  */
325 void
326 hash_debug(void)
327 {
328 	int     i;
329 	int     numbuckets = 0;
330 	int     highsum = 0;
331 	int     maxdepth = 0;
332 
333 	for (i = 0; i < vm_page_bucket_count; i++) {
334 		if (vm_page_buckets[i].hi_count) {
335 			numbuckets++;
336 			highsum += vm_page_buckets[i].hi_count;
337 			if (vm_page_buckets[i].hi_count > maxdepth) {
338 				maxdepth = vm_page_buckets[i].hi_count;
339 			}
340 		}
341 	}
342 	printf("Total number of buckets: %d\n", vm_page_bucket_count);
343 	printf("Number used buckets:     %d = %d%%\n",
344 	    numbuckets, 100 * numbuckets / vm_page_bucket_count);
345 	printf("Number unused buckets:   %d = %d%%\n",
346 	    vm_page_bucket_count - numbuckets,
347 	    100 * (vm_page_bucket_count - numbuckets) / vm_page_bucket_count);
348 	printf("Sum of bucket max depth: %d\n", highsum);
349 	printf("Average bucket depth:    %d.%2d\n",
350 	    highsum / vm_page_bucket_count,
351 	    highsum % vm_page_bucket_count);
352 	printf("Maximum bucket depth:    %d\n", maxdepth);
353 }
354 #endif /* MACH_PAGE_HASH_STATS */
355 
356 /*
357  *	The virtual page size is currently implemented as a runtime
358  *	variable, but is constant once initialized using vm_set_page_size.
359  *	This initialization must be done in the machine-dependent
360  *	bootstrap sequence, before calling other machine-independent
361  *	initializations.
362  *
363  *	All references to the virtual page size outside this
364  *	module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
365  *	constants.
366  */
367 #if defined(__arm64__)
368 vm_size_t       page_size;
369 vm_size_t       page_mask;
370 int             page_shift;
371 #else
372 vm_size_t       page_size  = PAGE_SIZE;
373 vm_size_t       page_mask  = PAGE_MASK;
374 int             page_shift = PAGE_SHIFT;
375 #endif
376 
377 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages;
378 #if XNU_VM_HAS_DELAYED_PAGES
379 vm_page_t                          vm_pages_end;
380 uint32_t                           vm_pages_count;
381 #else
382 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_end;
383 SECURITY_READ_ONLY_LATE(uint32_t)  vm_pages_count;
384 #endif /* XNU_VM_HAS_DELAYED_PAGES */
385 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
386 SECURITY_READ_ONLY_LATE(ppnum_t)   vm_pages_first_pnum;
387 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
388 #if HAS_MTE
389 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage;
390 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage_end;
391 #endif /* HAS_MTE */
392 #if CONFIG_SPTM
393 /*
394  * When used, these 128bit (MAX_COLORS bits) masks represent a "cluster"
395  * of contiguous free physical pages.
396  *
397  * For each cluster, there is an enqueue "index", which is -1 when there is no
398  * free page in the cluster, or the index in [0, 128) of the page that is
399  * enqueued on the vm_page_free_queue to represent the entire cluster.
400  *
401  * Grouping pages this way has two nice effects: it reduces manipulations of
402  * the doubly linked lists (the worst data structure known to man when
403  * considering cache misses), and it mechanically makes the VM serve more
404  * "contiguous" pages naturally.
405  */
406 static_assert(XNU_VM_HAS_LINEAR_PAGES_ARRAY);
407 SECURITY_READ_ONLY_LATE(__uint128_t *) _vm_pages_free_masks;
408 SECURITY_READ_ONLY_LATE(int8_t *)      _vm_pages_free_enqueue_idx;
409 #endif /* CONFIG_SPTM */
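/*
 * Worked example (MAX_COLORS == 128): physical page "pnum" belongs to cluster
 * index (pnum - pmap_first_pnum) / 128 and occupies bit (pnum & 127) of that
 * cluster's mask.  If only the pages at bits 3 and 70 of a cluster are free,
 * exactly those two bits are set; the enqueue index names the bit of the one
 * page that represents the cluster on the free queue (say 3).  When that page
 * is grabbed, the index moves on to 70, and it becomes -1 once the cluster
 * has no free pages left.
 */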
410 
411 
412 /*
413  *	Resident pages that represent real memory
414  *	are allocated from a set of free lists,
415  *	one per color.
416  */
417 SECURITY_READ_ONLY_LATE(unsigned int) vm_colors;
418 SECURITY_READ_ONLY_LATE(unsigned int) vm_color_mask; /* mask is == (vm_colors-1) */
419 unsigned int    vm_cache_geometry_colors = 0;   /* set by hw dependent code during startup */
420 unsigned int    vm_free_magazine_refill_limit = 0;
421 
422 struct vm_page_free_queue vm_page_queue_free;
423 
424 unsigned int    vm_page_free_wanted;
425 unsigned int    vm_page_free_wanted_privileged;
426 #if CONFIG_SECLUDED_MEMORY
427 unsigned int    vm_page_free_wanted_secluded;
428 #endif /* CONFIG_SECLUDED_MEMORY */
429 unsigned int    vm_page_free_count;
430 
431 unsigned int    vm_page_realtime_count;
432 
433 /*
434  *	Occasionally, the virtual memory system uses
435  *	resident page structures that do not refer to
436  *	real pages, for example to leave a page with
437  *	important state information in the VP table.
438  *
439  *	These page structures are allocated the way
440  *	most other kernel structures are.
441  */
442 SECURITY_READ_ONLY_LATE(zone_t) vm_page_zone;
443 vm_locks_array_t vm_page_locks;
444 
445 LCK_ATTR_DECLARE(vm_page_lck_attr, 0, 0);
446 LCK_GRP_DECLARE(vm_page_lck_grp_free, "vm_page_free");
447 LCK_GRP_DECLARE(vm_page_lck_grp_queue, "vm_page_queue");
448 LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
449 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
450 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
451 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
452 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
453 LCK_TICKET_DECLARE(vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
454 
455 unsigned int    vm_page_local_q_soft_limit = 250;
456 unsigned int    vm_page_local_q_hard_limit = 500;
457 struct vpl     *__zpercpu vm_page_local_q;
458 
459 /* N.B. Guard and fictitious pages must not
460  * be assigned a zero phys_page value.
461  */
462 /*
463  *	Fictitious pages don't have a physical address,
464  *	but we must initialize phys_page to something.
465  *	For debugging, this should be a strange value
466  *	that the pmap module can recognize in assertions.
467  */
468 const ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
469 
470 /*
471  *	Guard pages are not accessible so they don't
472  *      need a physical address, but we need to enter
473  *	one in the pmap.
474  *	Let's make it recognizable and make sure that
475  *	we don't use a real physical page with that
476  *	physical address.
477  */
478 const ppnum_t vm_page_guard_addr = (ppnum_t) -2;
479 
480 /*
481  *	Resident page structures are also chained on
482  *	queues that are used by the page replacement
483  *	system (pageout daemon).  These queues are
484  *	defined here, but are shared by the pageout
485  *	module.  The inactive queue is broken into
486  *	file backed and anonymous for convenience, as the
487  *	pageout daemon often assigns a higher
488  *	importance to anonymous pages (it is less likely to pick them)
489  */
490 vm_page_queue_head_t    vm_page_queue_active VM_PAGE_PACKED_ALIGNED;
491 vm_page_queue_head_t    vm_page_queue_inactive VM_PAGE_PACKED_ALIGNED;
492 #if CONFIG_SECLUDED_MEMORY
493 vm_page_queue_head_t    vm_page_queue_secluded VM_PAGE_PACKED_ALIGNED;
494 #endif /* CONFIG_SECLUDED_MEMORY */
495 vm_page_queue_head_t    vm_page_queue_anonymous VM_PAGE_PACKED_ALIGNED;  /* inactive memory queue for anonymous pages */
496 vm_page_queue_head_t    vm_page_queue_throttled VM_PAGE_PACKED_ALIGNED;
497 
498 queue_head_t    vm_objects_wired;
499 
500 vm_page_queue_head_t    vm_page_queue_donate VM_PAGE_PACKED_ALIGNED;
501 uint32_t        vm_page_donate_mode;
502 uint32_t        vm_page_donate_target, vm_page_donate_target_high, vm_page_donate_target_low;
503 uint32_t        vm_page_donate_count;
504 bool            vm_page_donate_queue_ripe;
505 
506 
507 vm_page_queue_head_t    vm_page_queue_background VM_PAGE_PACKED_ALIGNED;
508 uint32_t        vm_page_background_target;
509 uint32_t        vm_page_background_target_snapshot;
510 uint32_t        vm_page_background_count;
511 uint64_t        vm_page_background_promoted_count;
512 
513 uint32_t        vm_page_background_internal_count;
514 uint32_t        vm_page_background_external_count;
515 
516 uint32_t        vm_page_background_mode;
517 uint32_t        vm_page_background_exclude_external;
518 
519 unsigned int    vm_page_active_count;
520 unsigned int    vm_page_inactive_count;
521 unsigned int    vm_page_kernelcache_count;
522 #if CONFIG_SECLUDED_MEMORY
523 unsigned int    vm_page_secluded_count;
524 unsigned int    vm_page_secluded_count_free;
525 unsigned int    vm_page_secluded_count_inuse;
526 unsigned int    vm_page_secluded_count_over_target;
527 #endif /* CONFIG_SECLUDED_MEMORY */
528 unsigned int    vm_page_anonymous_count;
529 unsigned int    vm_page_throttled_count;
530 unsigned int    vm_page_speculative_count;
531 
532 unsigned int    vm_page_wire_count;
533 unsigned int    vm_page_wire_count_on_boot = 0;
534 unsigned int    vm_page_stolen_count = 0;
535 unsigned int    vm_page_wire_count_initial;
536 unsigned int    vm_page_gobble_count = 0;
537 unsigned int    vm_page_kern_lpage_count = 0;
538 
539 uint64_t        booter_size;  /* external so it can be found in core dumps */
540 
541 #define VM_PAGE_WIRE_COUNT_WARNING      0
542 #define VM_PAGE_GOBBLE_COUNT_WARNING    0
543 
544 unsigned int    vm_page_purgeable_count = 0; /* # of pages purgeable now */
545 unsigned int    vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
546 uint64_t        vm_page_purged_count = 0;    /* total count of purged pages */
547 
548 unsigned int    vm_page_xpmapped_external_count = 0;
549 unsigned int    vm_page_external_count = 0;
550 unsigned int    vm_page_internal_count = 0;
551 unsigned int    vm_page_pageable_external_count = 0;
552 unsigned int    vm_page_pageable_internal_count = 0;
553 
554 #if DEVELOPMENT || DEBUG
555 unsigned int    vm_page_speculative_recreated = 0;
556 unsigned int    vm_page_speculative_created = 0;
557 unsigned int    vm_page_speculative_used = 0;
558 #endif
559 
560 _Atomic unsigned int vm_page_swapped_count = 0;
561 
562 vm_page_queue_head_t    vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED;
563 
564 unsigned int    vm_page_cleaned_count = 0;
565 
566 uint64_t        max_valid_dma_address = 0xffffffffffffffffULL;
567 ppnum_t         max_valid_low_ppnum = PPNUM_MAX;
568 
569 
570 /*
571  *	Several page replacement parameters are also
572  *	shared with this module, so that page allocation
573  *	(done here in vm_page_alloc) can trigger the
574  *	pageout daemon.
575  */
576 unsigned int    vm_page_free_target = 0;
577 unsigned int    vm_page_free_min = 0;
578 unsigned int    vm_page_throttle_limit = 0;
579 unsigned int    vm_page_inactive_target = 0;
580 #if CONFIG_SECLUDED_MEMORY
581 unsigned int    vm_page_secluded_target = 0;
582 #endif /* CONFIG_SECLUDED_MEMORY */
583 unsigned int    vm_page_anonymous_min = 0;
584 unsigned int    vm_page_free_reserved = 0;
585 
586 
587 /*
588  *	The VM system has a couple of heuristics for deciding
589  *	that pages are "uninteresting" and should be placed
590  *	on the inactive queue as likely candidates for replacement.
591  *	These variables let the heuristics be controlled at run-time
592  *	to make experimentation easier.
593  */
594 
595 boolean_t vm_page_deactivate_hint = TRUE;
596 
597 struct vm_page_stats_reusable vm_page_stats_reusable;
598 
599 /*
600  *	vm_set_page_size:
601  *
602  *	Sets the page size, perhaps based upon the memory
603  *	size.  Must be called before any use of page-size
604  *	dependent functions.
605  *
606  *	Sets page_shift and page_mask from page_size.
607  */
608 void
609 vm_set_page_size(void)
610 {
611 	page_size  = PAGE_SIZE;
612 	page_mask  = PAGE_MASK;
613 	page_shift = PAGE_SHIFT;
614 
615 	if ((page_mask & page_size) != 0) {
616 		panic("vm_set_page_size: page size not a power of two");
617 	}
618 
619 	for (page_shift = 0;; page_shift++) {
620 		if ((1U << page_shift) == page_size) {
621 			break;
622 		}
623 	}
624 }
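/*
 * Worked example: with 4K pages (PAGE_SIZE == 4096, PAGE_MASK == 0xFFF) the
 * loop above terminates with page_shift == 12; with 16K pages it terminates
 * with page_shift == 14.
 */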
625 
626 #if HAS_MTE
627 
628 bool
629 vm_page_is_tag_storage_pnum(vm_page_t mem, ppnum_t pnum)
630 {
631 	return pmap_in_tag_storage_range(pnum) &&
632 	       !mteinfo_tag_storage_disabled(mem);
633 }
634 
635 #endif
636 
637 /*
638  * @abstract
639  * Given a page, returns the memory class of that page.
640  */
641 static vm_memory_class_t
642 vm_page_get_memory_class(vm_page_t mem __unused, ppnum_t pnum __unused)
643 {
644 	assert(!vm_page_is_fictitious(mem));
645 
646 #if XNU_VM_HAS_LOPAGE
647 	if (mem->vmp_lopage) {
648 		return VM_MEMORY_CLASS_LOPAGE;
649 	}
650 #endif /* XNU_VM_HAS_LOPAGE */
651 #if HAS_MTE
652 	if (mem->vmp_using_mte) {
653 		return VM_MEMORY_CLASS_TAGGED;
654 	} else if (!is_mte_enabled || !pmap_in_tag_storage_range(pnum)) {
655 		return VM_MEMORY_CLASS_REGULAR;
656 	} else if (mteinfo_tag_storage_disabled(mem)) {
657 		return VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
658 	} else {
659 		return VM_MEMORY_CLASS_TAG_STORAGE;
660 	}
661 #else /* !HAS_MTE */
662 	return VM_MEMORY_CLASS_REGULAR;
663 #endif /* !HAS_MTE */
664 }
665 
666 /*
667  * vm_page_is_restricted:
668  *
669  * Checks if a given vm_page_t is a restricted page.
670  */
671 inline bool
672 vm_page_is_restricted(vm_page_t mem)
673 {
674 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(mem);
675 	return pmap_is_page_restricted(pn);
676 }
677 
678 #ifdef __x86_64__
679 
680 #define MAX_CLUMP_SIZE      16
681 #define DEFAULT_CLUMP_SIZE  4
682 
683 unsigned int vm_clump_size, vm_clump_mask, vm_clump_shift, vm_clump_promote_threshold;
684 
685 #if DEVELOPMENT || DEBUG
686 unsigned long vm_clump_stats[MAX_CLUMP_SIZE + 1];
687 unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
688 
689 static inline void
690 vm_clump_update_stats(unsigned int c)
691 {
692 	assert(c <= vm_clump_size);
693 	if (c > 0 && c <= vm_clump_size) {
694 		vm_clump_stats[c] += c;
695 	}
696 	vm_clump_allocs += c;
697 }
698 #endif  /*  if DEVELOPMENT || DEBUG */
699 
700 /* Called once to setup the VM clump knobs */
701 static void
702 vm_page_setup_clump( void )
703 {
704 	unsigned int override, n;
705 
706 	vm_clump_size = DEFAULT_CLUMP_SIZE;
707 	if (PE_parse_boot_argn("clump_size", &override, sizeof(override))) {
708 		vm_clump_size = override;
709 	}
710 
711 	if (vm_clump_size > MAX_CLUMP_SIZE) {
712 		panic("vm_page_setup_clump:: clump_size is too large!");
713 	}
714 	if (vm_clump_size < 1) {
715 		panic("vm_page_setup_clump:: clump_size must be >= 1");
716 	}
717 	if ((vm_clump_size & (vm_clump_size - 1)) != 0) {
718 		panic("vm_page_setup_clump:: clump_size must be a power of 2");
719 	}
720 
721 	vm_clump_promote_threshold = vm_clump_size;
722 	vm_clump_mask = vm_clump_size - 1;
723 	for (vm_clump_shift = 0, n = vm_clump_size; n > 1; n >>= 1, vm_clump_shift++) {
724 		;
725 	}
726 
727 #if DEVELOPMENT || DEBUG
728 	bzero(vm_clump_stats, sizeof(vm_clump_stats));
729 	vm_clump_allocs = vm_clump_inserts = vm_clump_inrange = vm_clump_promotes = 0;
730 #endif  /*  if DEVELOPMENT || DEBUG */
731 }
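/*
 * Worked example: with the default clump_size of 4 the code above yields
 * vm_clump_mask == 3, vm_clump_shift == 2 and vm_clump_promote_threshold == 4;
 * booting with "clump_size=16" in the boot-args selects the maximum clump of
 * 16 pages (mask 15, shift 4).
 */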
732 
733 #endif /* __x86_64__ */
734 
735 void
736 vm_page_free_queue_init(vm_page_free_queue_t free_queue)
737 {
738 	for (unsigned int color = 0; color < MAX_COLORS; color++) {
739 		vm_page_queue_init(&free_queue->vmpfq_queues[color].qhead);
740 	}
741 }
742 
743 /*!
744  * @function vm_page_free_queue_for_class()
745  *
746  * @abstract
747  * Returns the appropriate free queue for the given class and page color.
748  */
749 __pure2
750 static vm_page_queue_t
751 vm_page_free_queue_for_class(vm_memory_class_t mem_class, unsigned int color)
752 {
753 	switch (mem_class) {
754 	case VM_MEMORY_CLASS_REGULAR:
755 #if HAS_MTE
756 	case VM_MEMORY_CLASS_TAGGED:
757 	case VM_MEMORY_CLASS_TAG_STORAGE:
758 		if (is_mte_enabled) {
759 			return NULL;
760 		}
761 	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
762 #endif
763 		return &vm_page_queue_free.vmpfq_queues[color].qhead;
764 #if XNU_VM_HAS_LOPAGE
765 	case VM_MEMORY_CLASS_LOPAGE:
766 		return &vm_lopage_queue_free;
767 #endif /* XNU_VM_HAS_LOPAGE */
768 #if CONFIG_SECLUDED_MEMORY
769 	case VM_MEMORY_CLASS_SECLUDED:
770 		return &vm_page_queue_secluded;
771 #endif
772 	}
773 }
774 
775 __pure2
776 static bool
777 vm_page_free_queue_has_colors(vm_memory_class_t mem_class)
778 {
779 	switch (mem_class) {
780 	case VM_MEMORY_CLASS_REGULAR:
781 #if HAS_MTE
782 	case VM_MEMORY_CLASS_TAGGED:
783 	case VM_MEMORY_CLASS_TAG_STORAGE:
784 	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
785 #endif
786 		return true;
787 #if XNU_VM_HAS_LOPAGE
788 	case VM_MEMORY_CLASS_LOPAGE:
789 		return false;
790 #endif /* XNU_VM_HAS_LOPAGE */
791 #if CONFIG_SECLUDED_MEMORY
792 	case VM_MEMORY_CLASS_SECLUDED:
793 		return false;
794 #endif
795 	}
796 }
797 
798 
799 #if CONFIG_SECLUDED_MEMORY
800 
801 static bool
802 vm_page_secluded_pool_eligible(vm_memory_class_t class)
803 {
804 	switch (class) {
805 #if XNU_VM_HAS_LOPAGE
806 	case VM_MEMORY_CLASS_LOPAGE:
807 		return false;
808 #endif /* XNU_VM_HAS_LOPAGE */
809 #if HAS_MTE
810 	case VM_MEMORY_CLASS_TAG_STORAGE:
811 	case VM_MEMORY_CLASS_TAGGED:
812 		return false;
813 #endif /* HAS_MTE */
814 	default:
815 		return true;
816 	}
817 }
818 
819 static bool
820 vm_page_secluded_pool_depleted(void)
821 {
822 	if (vm_page_free_count <= vm_page_free_reserved) {
823 		return false;
824 	}
825 	if (num_tasks_can_use_secluded_mem) {
826 		return false;
827 	}
828 	return vm_page_secluded_count < vm_page_secluded_target;
829 }
830 
831 #endif /* CONFIG_SECLUDED_MEMORY */
832 #if HIBERNATION
833 
834 __attribute__((overloadable))
835 static void
836 vm_page_free_queue_foreach(vm_page_queue_t queue, void (^block)(vm_page_t))
837 {
838 	vm_page_t page;
839 
840 	vm_page_queue_iterate(queue, page, vmp_pageq) {
841 		block(page);
842 	}
843 }
844 
845 __attribute__((overloadable))
846 static void
847 vm_page_free_queue_foreach(vm_page_free_queue_t queue, void (^block)(vm_page_t))
848 {
849 	for (unsigned int color = 0; color < vm_colors; color++) {
850 		vm_page_free_queue_foreach(&queue->vmpfq_queues[color].qhead, block);
851 	}
852 }
853 
854 #endif /* HIBERNATION */
855 #if CONFIG_SPTM
856 
857 static inline uint32_t
858 vm_pages_free_mask_len(void)
859 {
860 	extern pmap_paddr_t real_avail_end;
861 
862 	uint64_t pnums = atop(real_avail_end) - pmap_first_pnum;
863 	static_assert(8 * sizeof(__uint128_t) == MAX_COLORS);
864 	return (uint32_t)((pnums + MAX_COLORS - 1) / MAX_COLORS);
865 }
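/*
 * Worked example: each cluster covers MAX_COLORS == 128 physical pages, so a
 * managed range of, say, 1000 pages past pmap_first_pnum needs
 * (1000 + 127) / 128 == 8 cluster masks.
 */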
866 
867 static inline int8_t
868 vm_pages_free_mask_bit(ppnum_t pnum)
869 {
870 	return (int8_t)(pnum & (MAX_COLORS - 1));
871 }
872 
873 static inline uint32_t
874 vm_pages_free_mask_index(ppnum_t pnum)
875 {
876 	return (pnum - pmap_first_pnum) / MAX_COLORS;
877 }
878 
879 __pure2
880 static inline __uint128_t *
881 vm_pages_free_masks(void)
882 {
883 	return _vm_pages_free_masks;
884 }
885 
886 __pure2
887 static inline bitmap_t *
888 vm_pages_free_masks_as_bitmap(uint32_t index)
889 {
890 	/*
891 	 * this conversion is gross but helps with codegen for bit-wise
892 	 * accesses where the __uint128_t type is really yielding poor code.
893 	 *
894 	 * This conversion is only legal on little endian architectures.
895 	 */
896 #ifndef __LITTLE_ENDIAN__
897 #error unsupported configuration
898 #endif
899 	return (bitmap_t *)(_vm_pages_free_masks + index);
900 }
901 
902 __pure2
903 static inline int8_t *
904 vm_pages_free_enqueue_idx(uint32_t index)
905 {
906 	return &_vm_pages_free_enqueue_idx[index];
907 }
908 
909 /*!
910  * @brief
911  * Return the position of the next bit in "circular" order for a given cluster
912  * of pages, starting at and including @c bit.
913  */
914 static inline int8_t
915 vm_pages_free_mask_next_bit(uint32_t index, int8_t bit)
916 {
917 	__uint128_t value = vm_pages_free_masks()[index];
918 	__uint128_t mask  = ((__uint128_t)1 << bit) - 1;
919 
920 	if (value == 0) {
921 		return -1;
922 	}
923 
924 	if (value & ~mask) {
925 		value &= ~mask;
926 	}
927 	if ((uint64_t)value) {
928 		return (int8_t)__builtin_ctzll((uint64_t)value);
929 	}
930 	return 64 + (int8_t)__builtin_ctzll((uint64_t)(value >> 64));
931 }
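/*
 * Worked example: if a cluster's mask has exactly bits {3, 70} set, then
 * vm_pages_free_mask_next_bit(index, 64) returns 70 (the first set bit at or
 * above 64), vm_pages_free_mask_next_bit(index, 100) finds no set bit at or
 * above 100 and wraps around to return 3, and an all-zero mask returns -1.
 */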
932 
933 static inline bool
934 vm_pages_free_mask_test(uint32_t index, int8_t bit)
935 {
936 	return bitmap_test(vm_pages_free_masks_as_bitmap(index), bit);
937 }
938 
939 static inline void
940 vm_pages_free_mask_set(uint32_t index, int8_t bit)
941 {
942 	assert(!vm_pages_free_mask_test(index, bit));
943 	bitmap_set(vm_pages_free_masks_as_bitmap(index), bit);
944 }
945 
946 static inline void
947 vm_pages_free_mask_clear(uint32_t index, int8_t bit)
948 {
949 	assert(vm_pages_free_mask_test(index, bit));
950 	bitmap_clear(vm_pages_free_masks_as_bitmap(index), bit);
951 }
952 
953 #endif /* CONFIG_SPTM */
954 
955 __attribute__((always_inline))
956 void
957 vm_page_free_queue_enter(vm_memory_class_t class, vm_page_t mem, ppnum_t pnum)
958 {
959 	bool            enter_first;
960 	unsigned int    color;
961 	vm_page_queue_t queue;
962 
963 	if (startup_phase >= STARTUP_SUB_KMEM) {
964 		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
965 	}
966 
967 	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
968 	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
969 	    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
970 	    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
971 	    mem->vmp_next_m == 0 &&
972 	    mem->vmp_object == 0 &&
973 	    mem->vmp_wire_count == 0 &&
974 	    mem->vmp_busy &&
975 	    !mem->vmp_tabled &&
976 	    !mem->vmp_laundry &&
977 	    !mem->vmp_pmapped &&
978 	    !mem->vmp_wpmapped &&
979 	    !mem->vmp_realtime);
980 
981 	switch (class) {
982 #if XNU_VM_HAS_LOPAGE
983 	case VM_MEMORY_CLASS_LOPAGE:
984 		mem->vmp_q_state     = VM_PAGE_ON_FREE_LOPAGE_Q;
985 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
986 		mem->vmp_lopage      = true;
987 		mem->vmp_canonical   = true;
988 		enter_first          = true;
989 		break;
990 #endif /* XNU_VM_HAS_LOPAGE */
991 #if CONFIG_SECLUDED_MEMORY
992 	case VM_MEMORY_CLASS_SECLUDED:
993 		if (startup_phase >= STARTUP_SUB_KMEM) {
994 			LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
995 		}
996 		mem->vmp_q_state     = VM_PAGE_ON_SECLUDED_Q;
997 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
998 		mem->vmp_lopage      = false;
999 		mem->vmp_canonical   = true;
1000 		enter_first          = true;
1001 		break;
1002 #endif
1003 	default:
1004 		mem->vmp_q_state     = VM_PAGE_ON_FREE_Q;
1005 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1006 		mem->vmp_lopage      = false;
1007 		mem->vmp_canonical   = true;
1008 		enter_first          = false;
1009 		break;
1010 	}
1011 
1012 #if HAS_MTE
1013 	if (is_mte_enabled) {
1014 		switch (class) {
1015 		case VM_MEMORY_CLASS_REGULAR:
1016 			return mteinfo_covered_page_set_free(pnum, false);
1017 		case VM_MEMORY_CLASS_TAGGED:
1018 			return mteinfo_covered_page_set_free(pnum, true);
1019 		case VM_MEMORY_CLASS_TAG_STORAGE:
1020 			return mteinfo_tag_storage_set_inactive(mem, false);
1021 		default:
1022 			break;
1023 		}
1024 	}
1025 #endif /* HAS_MTE */
1026 
1027 	color = VM_PAGE_GET_COLOR_PNUM(pnum);
1028 	queue = vm_page_free_queue_for_class(class, color);
1029 #if CONFIG_SPTM
1030 	if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
1031 		uint32_t index = vm_pages_free_mask_index(pnum);
1032 		int8_t   bit   = vm_pages_free_mask_bit(pnum);
1033 
1034 		if (vm_pages_free_masks()[index] == 0) {
1035 			vm_page_queue_enter(queue, mem, vmp_pageq);
1036 			*vm_pages_free_enqueue_idx(index) = bit;
1037 		}
1038 		vm_pages_free_mask_set(index, bit);
1039 	} else
1040 #endif /* CONFIG_SPTM */
1041 	if (enter_first) {
1042 		vm_page_queue_enter_first(queue, mem, vmp_pageq);
1043 	} else {
1044 #if defined(__x86_64__)
1045 		vm_page_queue_enter_clump(queue, mem);
1046 #else
1047 		vm_page_queue_enter(queue, mem, vmp_pageq);
1048 #endif
1049 	}
1050 
1051 	switch (class) {
1052 	case VM_MEMORY_CLASS_REGULAR:
1053 		VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
1054 		VM_COUNTER_INC(&vm_page_free_count);
1055 		break;
1056 #if HAS_MTE
1057 	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1058 		VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
1059 		VM_COUNTER_INC(&vm_page_free_unmanaged_tag_storage_count);
1060 		/* these do not count toward the vm page free count */
1061 		break;
1062 #endif
1063 #if XNU_VM_HAS_LOPAGE
1064 	case VM_MEMORY_CLASS_LOPAGE:
1065 		VM_COUNTER_INC(&vm_lopage_free_count);
1066 		if (vm_lopage_free_count >= vm_lopage_free_limit) {
1067 			vm_lopage_refill = false;
1068 		}
1069 		break;
1070 #endif /* XNU_VM_HAS_LOPAGE */
1071 #if CONFIG_SECLUDED_MEMORY
1072 	case VM_MEMORY_CLASS_SECLUDED:
1073 		vm_page_secluded_count++;
1074 		vm_page_secluded_count_free++;
1075 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
1076 		break;
1077 #endif /* CONFIG_SECLUDED_MEMORY */
1078 	default:
1079 		__builtin_unreachable();
1080 	}
1081 }
1082 
1083 /*!
1084  * @typedef vmp_free_list_result_t
1085  *
1086  * @discussion
1087  * This data structure is used by vm_page_free_queue_add_list to track
1088  * how many pages were freed to which free lists, so that it can then drive
1089  * which waiters we are going to wake up.
1090  *
1091  * uint8_t counters are enough because we never free more than 64 pages at
1092  * a time, and this allows for the data structure to be passed by register.
1093  */
1094 typedef struct {
1095 	uint8_t vmpr_regular;
1096 #if HAS_MTE
1097 	uint8_t vmpr_taggable;
1098 	uint8_t vmpr_tag_storage;
1099 #endif /* HAS_MTE */
1100 	uint8_t vmpr_lopage;
1101 #if CONFIG_SECLUDED_MEMORY
1102 	uint8_t vmpr_secluded;
1103 #endif /* CONFIG_SECLUDED_MEMORY */
1104 } vmp_free_list_result_t;
1105 
1106 /*!
1107  * @abstract
1108  * Returns whether there are any threads blocked in VM_PAGE_WAIT().
1109  *
1110  * @discussion
1111  * The page free queue lock must be held.
1112  */
1113 static bool
1114 vm_page_free_queue_has_any_waiters(void)
1115 {
1116 	uint32_t result = 0;
1117 
1118 	result |= vm_page_free_wanted;
1119 	result |= vm_page_free_wanted_privileged;
1120 #if HAS_MTE
1121 	result |= vm_page_free_wanted_tagged;
1122 	result |= vm_page_free_wanted_tagged_privileged;
1123 #endif /* HAS_MTE */
1124 #if CONFIG_SECLUDED_MEMORY
1125 	result |= vm_page_free_wanted_secluded;
1126 #endif /* CONFIG_SECLUDED_MEMORY */
1127 
1128 	return result != 0;
1129 }
1130 
1131 void
1132 vm_page_free_wakeup(event_t event, uint32_t n)
1133 {
1134 	if (vps_dynamic_priority_enabled) {
1135 		if (n == UINT32_MAX) {
1136 			wakeup_all_with_inheritor(event, THREAD_AWAKENED);
1137 		} else {
1138 			while (n-- > 0) {
1139 				wakeup_one_with_inheritor(event, THREAD_AWAKENED,
1140 				    LCK_WAKE_DO_NOT_TRANSFER_PUSH, NULL);
1141 			}
1142 		}
1143 	} else {
1144 		thread_wakeup_nthreads(event, n);
1145 	}
1146 }
1147 
1148 /*!
1149  * @abstract
1150  * Helper to wakeup threads in VM_PAGE_WAIT() given
1151  * a vm_page_free_queue_enter_list() result.
1152  *
1153  * @discussion
1154  * The page free queue lock must be held, and is unlocked on return.
1155  *
1156  * @param vmpr          The result of a vm_page_free_queue_enter_list() call.
1157  */
1158 __attribute__((noinline))
1159 static void
1160 vm_page_free_queue_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr)
1161 {
1162 	unsigned int    need_wakeup = 0;
1163 	unsigned int    need_priv_wakeup = 0;
1164 #if CONFIG_SECLUDED_MEMORY
1165 	unsigned int    need_wakeup_secluded = 0;
1166 #endif /* CONFIG_SECLUDED_MEMORY */
1167 	unsigned int    unpriv_limit;
1168 #if HAS_MTE
1169 	unsigned int    need_tagged_wakeup = 0;
1170 	unsigned int    need_priv_tagged_wakeup = 0;
1171 	unsigned int    unpriv_tagged_limit;
1172 	unsigned int    n;
1173 	bool            wakeup_refill_thread = false;
1174 #endif /* HAS_MTE */
1175 
1176 #define DONATE_TO_WAITERS(wake, count, waiters_count, limit)  ({ \
1177 	uint32_t __n = MIN(MIN(waiters_count, vmpr.count), limit);              \
1178 	waiters_count -= __n;                                                   \
1179 	vmpr.count    -= __n;                                                   \
1180 	wake          += __n;                                                   \
1181 	__n;                                                                    \
1182 })
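	/*
	 * Worked example (sketch): if this batch freed 5 regular pages
	 * (vmpr.vmpr_regular == 5) and 3 VM-privileged threads are waiting,
	 * the privileged DONATE_TO_WAITERS() call below moves 3 of those
	 * pages to need_priv_wakeup, leaving vmpr_regular == 2 and
	 * vm_page_free_wanted_privileged == 0; the remaining 2 pages stay
	 * available for the later, unprivileged steps, subject to unpriv_limit.
	 */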
1183 
1184 	/*
1185 	 *	Step 1: privileged waiters get to be satisfied first
1186 	 */
1187 #if HAS_MTE
1188 	if (vm_page_free_wanted_tagged_privileged) {
1189 		DONATE_TO_WAITERS(need_priv_tagged_wakeup,
1190 		    vmpr_taggable, vm_page_free_wanted_tagged_privileged,
1191 		    UINT32_MAX);
1192 
1193 		/*
1194 		 * If we will not wake up privileged threads, and there are
1195 		 * tagged privileged waiters, we need the refill thread to do
1196 		 * an emergency activation or reclaim to fulfill this need.
1197 		 *
1198 		 * We need at least 2 extra free pages because the reclaim
1199 		 * path might need to relocate a page to give us one.
1200 		 */
1201 		if (!need_priv_tagged_wakeup &&
1202 		    vm_page_free_count >= vm_page_free_taggable_count + 2) {
1203 			wakeup_refill_thread = true;
1204 		}
1205 	}
1206 #endif /* HAS_MTE */
1207 	if (vm_page_free_wanted_privileged) {
1208 		DONATE_TO_WAITERS(need_priv_wakeup,
1209 		    vmpr_regular, vm_page_free_wanted_privileged,
1210 		    UINT32_MAX);
1211 #if HAS_MTE
1212 		DONATE_TO_WAITERS(need_priv_wakeup,
1213 		    vmpr_taggable, vm_page_free_wanted_privileged,
1214 		    UINT32_MAX);
1215 #endif /* HAS_MTE */
1216 	}
1217 
1218 
1219 	/*
1220 	 *	Step 2: the privileged reserve needs to be replenished
1221 	 *
1222 	 *	Let's make sure that we only wake up regular threads
1223 	 *	for free pages above the reserve threshold.
1224 	 */
1225 	if (vm_page_free_count <= vm_page_free_reserved) {
1226 		unpriv_limit = 0;
1227 	} else {
1228 		unpriv_limit = vm_page_free_count - vm_page_free_reserved;
1229 	}
1230 #if HAS_MTE
1231 	if (vm_page_free_taggable_count <= vm_page_free_reserved) {
1232 		unpriv_tagged_limit = 0;
1233 	} else {
1234 		unpriv_tagged_limit = vm_page_free_taggable_count -
1235 		    vm_page_free_reserved;
1236 	}
1237 #endif /* HAS_MTE */
1238 
1239 	/*
1240 	 *	Step 3: satisfy secluded waiters, using the secluded pool first,
1241 	 *	regular pages second.
1242 	 */
1243 #if CONFIG_SECLUDED_MEMORY
1244 	if (vm_page_free_wanted_secluded) {
1245 		DONATE_TO_WAITERS(need_wakeup_secluded,
1246 		    vmpr_secluded, vm_page_free_wanted_secluded,
1247 		    UINT32_MAX);
1248 		unpriv_limit -= DONATE_TO_WAITERS(need_wakeup_secluded,
1249 		    vmpr_regular, vm_page_free_wanted_secluded,
1250 		    unpriv_limit);
1251 
1252 		if (vm_page_free_wanted_secluded == 0) {
1253 			need_wakeup_secluded = UINT32_MAX;
1254 		}
1255 	}
1256 #endif /* CONFIG_SECLUDED_MEMORY */
1257 
1258 	/*
1259 	 *	Step 4: satisfy regular demand last.
1260 	 */
1261 #if HAS_MTE
1262 	if (vm_page_free_wanted_tagged) {
1263 		n = DONATE_TO_WAITERS(need_tagged_wakeup,
1264 		    vmpr_taggable, vm_page_free_wanted_tagged,
1265 		    MIN(unpriv_limit, unpriv_tagged_limit));
1266 
1267 		unpriv_limit -= n;
1268 		unpriv_tagged_limit -= n;
1269 
1270 		if (vm_page_free_wanted_tagged == 0) {
1271 			need_tagged_wakeup = UINT32_MAX;
1272 		} else if (vm_page_free_count >=
1273 		    MAX(vm_page_free_taggable_count + 2, vm_page_free_min)) {
1274 			/*
1275 			 * If we still have tagged waiters, and that rebalancing
1276 			 * pages would get us above vm_page_free_min, then wake
1277 			 * up the refill thread to help do that rebalance.
1278 			 */
1279 			wakeup_refill_thread = true;
1280 		}
1281 	}
1282 #endif /* HAS_MTE */
1283 	if (vm_page_free_wanted) {
1284 		unpriv_limit -= DONATE_TO_WAITERS(need_wakeup,
1285 		    vmpr_regular, vm_page_free_wanted,
1286 		    unpriv_limit);
1287 #if HAS_MTE
1288 		n = DONATE_TO_WAITERS(need_wakeup,
1289 		    vmpr_taggable, vm_page_free_wanted,
1290 		    MIN(unpriv_limit, unpriv_tagged_limit));
1291 
1292 		unpriv_limit -= n;
1293 		unpriv_tagged_limit -= n;
1294 #endif /* HAS_MTE */
1295 		if (vm_page_free_wanted == 0) {
1296 			need_wakeup = UINT32_MAX;
1297 		}
1298 	}
1299 
1300 	/*
1301 	 * We have updated waiter counts, and if this page release happens
1302 	 * from the context of a thread that's super low priority we might
1303 	 * starve waking up privileged threads.
1304 	 *
1305 	 * While we hold the free page lock, such threads would wake us up via
1306 	 * the mutex priority inheritance mechanism, but as soon as we drop the
1307 	 * lock all bets are off.
1308 	 *
1309 	 * To avoid this priority inversion that could really hurt the VM,
1310 	 * disable preemption until we've woken up everyone.
1311 	 */
1312 	disable_preemption();
1313 	vm_free_page_unlock();
1314 
1315 	/*
1316 	 * Dispatch privileged wakeups
1317 	 *
1318 	 * There shouldn't be that many VM-privileged threads,
1319 	 * so let's wake them all up, even if we don't quite
1320 	 * have enough pages to satisfy them all.
1321 	 */
1322 	if (need_priv_wakeup) {
1323 		vm_page_free_wakeup(&vm_page_free_wanted_privileged,
1324 		    UINT32_MAX);
1325 	}
1326 	if (need_wakeup) {
1327 		vm_page_free_wakeup(&vm_page_free_count, need_wakeup);
1328 	}
1329 #if HAS_MTE
1330 	if (need_priv_tagged_wakeup) {
1331 		vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1332 		    UINT32_MAX);
1333 	}
1334 	if (need_tagged_wakeup) {
1335 		vm_page_free_wakeup(&vm_page_free_wanted_tagged,
1336 		    need_tagged_wakeup);
1337 	}
1338 	if (wakeup_refill_thread) {
1339 		mteinfo_wake_fill_thread();
1340 	}
1341 #endif /* HAS_MTE */
1342 #if CONFIG_SECLUDED_MEMORY
1343 	if (need_wakeup_secluded) {
1344 		vm_page_free_wakeup(&vm_page_free_wanted_secluded,
1345 		    need_wakeup_secluded);
1346 	}
1347 #endif /* CONFIG_SECLUDED_MEMORY */
1348 
1349 	enable_preemption();
1350 
1351 #undef DONATE_TO_WAITERS
1352 }
1353 
1354 /*
1355  * @abstract
1356  * Given a list of pages, put each page on whichever global free queue is
1357  * appropriate.
1358  *
1359  * @discussion
1360  * Must be called with the VM free page lock unlocked.
1361  *
1362  * The list must contain less than 255 elements.
1363  */
1364 #if HAS_MTE
1365 /*
1366  * To put it more bluntly: this will demux pages onto the free tag storage
1367  * queue or the global free queue, as appropriate.  If we start freeing tagged
1368  * pages onto the free tagged queue, this function should be updated to deal
1369  * with that too.
1370  */
1371 #endif /* HAS_MTE */
1372 static void
1373 vm_page_free_queue_enter_list(vm_page_list_t list, vmp_release_options_t opts)
1374 {
1375 	bool                   page_queues_unlock = false;
1376 	bool                   page_queues_locked = false;
1377 	bool                   do_secluded = false;
1378 	vmp_free_list_result_t result = { };
1379 	vm_page_t              mem;
1380 
1381 	LCK_MTX_ASSERT(&vm_page_queue_lock,
1382 	    (opts & VMP_RELEASE_Q_LOCKED)
1383 	    ? LCK_MTX_ASSERT_OWNED
1384 	    : LCK_MTX_ASSERT_NOTOWNED);
1385 
1386 	/*
1387 	 * Hibernation and startup do not really need the lock because
1388 	 * these are single threaded paths, so from the PoV of that function,
1389 	 * it's as if VMP_RELEASE_Q_LOCKED was passed.
1390 	 */
1391 	page_queues_locked = (opts & (VMP_RELEASE_STARTUP |
1392 	    VMP_RELEASE_HIBERNATE |
1393 	    VMP_RELEASE_Q_LOCKED));
1394 
1395 #if CONFIG_SECLUDED_MEMORY
1396 	do_secluded = vm_page_secluded_pool_depleted();
1397 #if HAS_MTE
1398 	if (do_secluded && list.vmpl_has_tagged &&
1399 	    (opts & VMP_RELEASE_Q_LOCKED) == 0) {
1400 		/*
1401 		 * Try to do the untagging so that pages become eligible
1402 		 * for the secluded pool while holding as few locks
1403 		 * as possible.
1404 		 *
1405 		 * This does mean we shouldn't do this retyping if the page
1406 		 * queue lock is held for real. The only path doing this
1407 		 * right now is vm_page_free() which is one page at a time,
1408 		 * so it's probably "fine" to not contribute these to the
1409 		 * secluded pool.
1410 		 */
1411 		const unified_page_list_t pmap_batch_list = {
1412 			.page_slist = list.vmpl_head,
1413 			.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
1414 		};
1415 
1416 		pmap_unmake_tagged_pages(&pmap_batch_list);
1417 		vm_page_list_foreach(mem, list) {
1418 			mem->vmp_using_mte = false;
1419 		}
1420 		list.vmpl_has_tagged = false;
1421 		list.vmpl_has_untagged = true;
1422 	}
1423 #endif /* HAS_MTE */
1424 #endif /* CONFIG_SECLUDED_MEMORY */
1425 
1426 	if (!page_queues_locked && (list.vmpl_has_realtime || do_secluded)) {
1427 		vm_page_lock_queues();
1428 		page_queues_locked = true;
1429 		page_queues_unlock = true;
1430 	}
1431 
1432 	if (opts & VMP_RELEASE_STARTUP) {
1433 		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1434 	} else {
1435 		vm_free_page_lock_spin();
1436 	}
1437 
1438 	vm_page_list_foreach_consume(mem, &list) {
1439 		ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
1440 		vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
1441 
1442 		if (mem->vmp_realtime) {
1443 			mem->vmp_realtime = false;
1444 			VM_COUNTER_DEC(&vm_page_realtime_count);
1445 		}
1446 
1447 #if XNU_VM_HAS_LOPAGE
1448 		if ((class == VM_MEMORY_CLASS_REGULAR ||
1449 		    class == VM_MEMORY_CLASS_LOPAGE) &&
1450 		    vm_lopage_refill &&
1451 		    vm_lopage_free_count < vm_lopage_free_limit &&
1452 		    pnum < max_valid_low_ppnum) {
1453 			class = VM_MEMORY_CLASS_LOPAGE;
1454 		} else {
1455 			class = VM_MEMORY_CLASS_REGULAR;
1456 		}
1457 #endif /* XNU_VM_HAS_LOPAGE */
1458 
1459 #if CONFIG_SECLUDED_MEMORY
1460 		/*
1461 		 * XXX FBDP TODO: also avoid refilling secluded queue
1462 		 * when some IOKit objects are already grabbing from it...
1463 		 */
1464 		if (page_queues_locked &&
1465 		    vm_page_secluded_pool_eligible(class) &&
1466 		    vm_page_secluded_pool_depleted()) {
1467 			class = VM_MEMORY_CLASS_SECLUDED;
1468 		}
1469 #endif /* CONFIG_SECLUDED_MEMORY */
1470 
1471 		vm_page_free_queue_enter(class, mem, pnum);
1472 
1473 		switch (class) {
1474 		case VM_MEMORY_CLASS_REGULAR:
1475 #if HAS_MTE
1476 			if (is_mte_enabled && mteinfo_covered_page_taggable(pnum)) {
1477 				result.vmpr_taggable++;
1478 				break;
1479 			}
1480 			OS_FALLTHROUGH;
1481 		case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1482 #endif /* HAS_MTE */
1483 			result.vmpr_regular++;
1484 			break;
1485 #if HAS_MTE
1486 		case VM_MEMORY_CLASS_TAGGED:
1487 			result.vmpr_taggable++;
1488 			break;
1489 		case VM_MEMORY_CLASS_TAG_STORAGE:
1490 			result.vmpr_tag_storage++;
1491 			break;
1492 #endif /* HAS_MTE */
1493 #if XNU_VM_HAS_LOPAGE
1494 		case VM_MEMORY_CLASS_LOPAGE:
1495 			result.vmpr_lopage++;
1496 			break;
1497 #endif /* XNU_VM_HAS_LOPAGE */
1498 #if CONFIG_SECLUDED_MEMORY
1499 		case VM_MEMORY_CLASS_SECLUDED:
1500 			result.vmpr_secluded++;
1501 			continue;
1502 #endif /* CONFIG_SECLUDED_MEMORY */
1503 		}
1504 	}
1505 
1506 	if (page_queues_unlock) {
1507 		vm_page_unlock_queues();
1508 	}
1509 
1510 	vm_pageout_vminfo.vm_page_pages_freed += list.vmpl_count;
1511 	VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE,
1512 	    DBG_FUNC_NONE, list.vmpl_count, 0, 0, 0);
1513 
1514 	if (opts & VMP_RELEASE_STARTUP) {
1515 		/*
1516 		 * On purpose skip the VM_CHECK_MEMORYSTATUS,
1517 		 * pmap_startup() will do it,
1518 		 * and the caller holds the free queue lock the whole time.
1519 		 */
1520 		return;
1521 	}
1522 
1523 	if (vm_page_free_queue_has_any_waiters()) {
1524 		vm_page_free_queue_handle_wakeups_and_unlock(result);
1525 	} else {
1526 		vm_free_page_unlock();
1527 	}
1528 
1529 	if ((opts & VMP_RELEASE_HIBERNATE) == 0) {
1530 		/*
1531 		 * Skip VM_CHECK_MEMORYSTATUS here as
1532 		 * hibernate_rebuild_vm_structs() will run it after the last flush.
1533 		 */
1534 		VM_CHECK_MEMORYSTATUS;
1535 	}
1536 }
1537 
1538 __attribute__((always_inline))
1539 void
1540 vm_page_free_queue_remove(
1541 	vm_memory_class_t       class,
1542 	vm_page_t               mem,
1543 	ppnum_t                 pnum,
1544 	vm_page_q_state_t       q_state)
1545 {
1546 	unsigned int    color;
1547 	vm_page_queue_t queue;
1548 
1549 	if (startup_phase >= STARTUP_SUB_KMEM) {
1550 		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1551 	}
1552 
1553 	mem->vmp_q_state = q_state;
1554 
1555 #if HAS_MTE
1556 	if (is_mte_enabled) {
1557 		switch (class) {
1558 		case VM_MEMORY_CLASS_REGULAR:
1559 			return mteinfo_covered_page_set_used(pnum, false);
1560 		case VM_MEMORY_CLASS_TAGGED:
1561 			return mteinfo_covered_page_set_used(pnum, true);
1562 		case VM_MEMORY_CLASS_TAG_STORAGE:
1563 			return mteinfo_tag_storage_set_claimed(mem);
1564 		default:
1565 			break;
1566 		}
1567 	}
1568 #endif /* HAS_MTE */
1569 
1570 	color = VM_PAGE_GET_COLOR_PNUM(pnum);
1571 	queue = vm_page_free_queue_for_class(class, color);
1572 #if CONFIG_SPTM
1573 	if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
1574 		uint32_t index = vm_pages_free_mask_index(pnum);
1575 		int8_t   bit   = vm_pages_free_mask_bit(pnum);
1576 
1577 		vm_pages_free_mask_clear(index, bit);
1578 		if (*vm_pages_free_enqueue_idx(index) == bit) {
1579 			vm_page_queue_remove(queue, mem, vmp_pageq);
1580 			bit = vm_pages_free_mask_next_bit(index, bit);
1581 			*vm_pages_free_enqueue_idx(index) = bit;
1582 
1583 			if (bit != -1) {
1584 				assert(vm_pages_free_mask_test(index, bit));
1585 				pnum  = (pnum & -MAX_COLORS) + bit;
1586 				mem   = vm_page_find_canonical(pnum);
1587 				color = VM_PAGE_GET_COLOR_PNUM(pnum);
1588 				queue = vm_page_free_queue_for_class(class, color);
1589 				vm_page_queue_enter(queue, mem, vmp_pageq);
1590 			}
1591 		}
1592 	} else
1593 #endif /* CONFIG_SPTM */
1594 	{
1595 		vm_page_queue_remove(queue, mem, vmp_pageq);
1596 	}
1597 
1598 	switch (class) {
1599 	case VM_MEMORY_CLASS_REGULAR:
1600 		VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
1601 		VM_COUNTER_DEC(&vm_page_free_count);
1602 		break;
1603 #if HAS_MTE
1604 	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1605 		VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
1606 		VM_COUNTER_DEC(&vm_page_free_unmanaged_tag_storage_count);
1607 		/* these do not participate to the vm page free count */
1608 		break;
1609 #endif /* HAS_MTE */
1610 #if XNU_VM_HAS_LOPAGE
1611 	case VM_MEMORY_CLASS_LOPAGE:
1612 		VM_COUNTER_DEC(&vm_lopage_free_count);
1613 		vm_lopages_allocated_q += 1;
1614 		if (vm_lopage_free_count < vm_lopage_lowater) {
1615 			vm_lopage_refill = true;
1616 		}
1617 		break;
1618 #endif /* XNU_VM_HAS_LOPAGE */
1619 	default:
1620 		__builtin_unreachable();
1621 	}
1622 }
1623 
1624 vm_page_list_t
1625 vm_page_free_queue_grab(
1626 	vm_grab_options_t       options __unused,
1627 	vm_memory_class_t       class,
1628 	unsigned int            num_pages,
1629 	vm_page_q_state_t       q_state)
1630 {
1631 	unsigned int  *colorp;
1632 	unsigned int   color;
1633 #if defined(__x86_64__)
1634 	unsigned int   clump_end = 1;
1635 	unsigned int   sub_count = 0;
1636 #endif /* __x86_64__ */
1637 	vm_page_list_t list      = { };
1638 
1639 	if (startup_phase >= STARTUP_SUB_KMEM) {
1640 		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1641 	}
1642 	assert(get_preemption_level() != 0);
1643 	assert(q_state <= VM_PAGE_Q_STATE_LAST_VALID_VALUE);
1644 
1645 #if HAS_MTE
1646 	if (is_mte_enabled && class != VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
1647 		return mteinfo_free_queue_grab(options, class, num_pages, q_state);
1648 	}
1649 #endif /* HAS_MTE */
1650 
1651 	colorp = PERCPU_GET(start_color);
1652 	color  = *colorp;
1653 
1654 	/* Get the pages. */
1655 	while (list.vmpl_count < num_pages) {
1656 		uint32_t        color_offset = 1;
1657 		vm_page_queue_t queue;
1658 		vm_page_t       mem;
1659 
1660 		queue = vm_page_free_queue_for_class(class, color);
1661 		if (!vm_page_free_queue_has_colors(class)) {
1662 			assert(!vm_page_queue_empty(queue));
1663 			color_offset = 0;
1664 		}
1665 		while (vm_page_queue_empty(queue)) {
1666 			color = (color + 1) & vm_color_mask;
1667 			queue = vm_page_free_queue_for_class(class, color);
1668 		}
1669 
1670 #if defined(__x86_64__)
1671 		if (class == VM_MEMORY_CLASS_REGULAR) {
1672 			/*
1673 			 * x86_64 uses a bespoke free queue scheme, where the free path
1674 			 * tries to cluster clumps of contiguous pages together on
1675 			 * the free queue to optimize for the platform's memory
1676 			 * controller.
1677 			 */
1678 			vm_page_queue_remove_first_with_clump(queue, mem, clump_end);
1679 			sub_count++;
1680 			if (clump_end) {
1681 #if DEVELOPMENT || DEBUG
1682 				vm_clump_update_stats(sub_count);
1683 #endif /* DEVELOPMENT || DEBUG */
1684 				sub_count = 0;
1685 			} else {
1686 				/* Only change colors at the end of a clump. */
1687 				color_offset = 0;
1688 			}
1689 		} else
1690 #endif /* defined(__x86_64__) */
1691 		{
1692 			/* Other targets default to rotating colors after each pop. */
1693 			vm_page_queue_remove_first(queue, mem, vmp_pageq);
1694 		}
1695 
1696 #if CONFIG_SPTM
1697 		if (vm_pages_free_masks()) {
1698 			ppnum_t   pnum       = VM_PAGE_GET_PHYS_PAGE(mem);
1699 			ppnum_t   first_pnum = pnum & -MAX_COLORS;
1700 			uint32_t  index      = vm_pages_free_mask_index(pnum);
1701 			int8_t    bit        = vm_pages_free_mask_bit(pnum);
1702 
1703 			for (;;) {
1704 				vm_pages_free_mask_clear(index, bit);
1705 				mem->vmp_q_state = q_state;
1706 				vm_page_list_push(&list, mem);
1707 
1708 				bit = (bit + 1) & (MAX_COLORS - 1);
1709 
1710 				if (!vm_pages_free_mask_test(index, bit) ||
1711 				    num_pages <= list.vmpl_count) {
1712 					break;
1713 				}
1714 				mem = vm_page_find_canonical(first_pnum + bit);
1715 			}
1716 
1717 			color = bit & vm_color_mask;
1718 
1719 			bit = vm_pages_free_mask_next_bit(index, bit);
1720 			*vm_pages_free_enqueue_idx(index) = bit;
1721 
1722 			if (bit != -1) {
1723 				assert(vm_pages_free_mask_test(index, bit));
1724 				mem   = vm_page_find_canonical(first_pnum + bit);
1725 				queue = vm_page_free_queue_for_class(class,
1726 				    bit & vm_color_mask);
1727 				vm_page_queue_enter_first(queue, mem, vmp_pageq);
1728 			}
1729 		} else
1730 #endif /* CONFIG_SPTM */
1731 		{
1732 			/* Set the page to the client's desired queue state. */
1733 			mem->vmp_q_state = q_state;
1734 			vm_page_list_push(&list, mem);
1735 
1736 			color = (color + color_offset) & vm_color_mask;
1737 		}
1738 	}
1739 
1740 	switch (class) {
1741 	case VM_MEMORY_CLASS_REGULAR:
1742 		VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
1743 		VM_COUNTER_SUB(&vm_page_free_count, list.vmpl_count);
1744 		break;
1745 #if HAS_MTE
1746 	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1747 		VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
1748 		VM_COUNTER_SUB(&vm_page_free_unmanaged_tag_storage_count, list.vmpl_count);
1749 		/* these do not count toward the vm page free count */
1750 		break;
1751 #endif /* HAS_MTE */
1752 #if XNU_VM_HAS_LOPAGE
1753 	case VM_MEMORY_CLASS_LOPAGE:
1754 		VM_COUNTER_SUB(&vm_lopage_free_count, list.vmpl_count);
1755 		vm_lopages_allocated_q += list.vmpl_count;
1756 		if (vm_lopage_free_count < vm_lopage_lowater) {
1757 			vm_lopage_refill = true;
1758 		}
1759 		break;
1760 #endif /* XNU_VM_HAS_LOPAGE */
1761 	default:
1762 		__builtin_unreachable();
1763 	}
1764 
1765 	/* Record the next page color the CPU should try to get. */
1766 	*colorp = color;
1767 #if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
1768 	vm_clump_update_stats(sub_count);
1769 #endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */
1770 
1771 	return list;
1772 }
1773 
1774 
1775 #define COLOR_GROUPS_TO_STEAL   4
1776 
1777 /* Called once during startup, once the cache geometry is known.
1778  */
1779 static void
1780 vm_page_set_colors( void )
1781 {
1782 	unsigned int    n, override;
1783 
1784 #if defined (__x86_64__)
1785 	/* adjust #colors because we need to color outside the clump boundary */
1786 	vm_cache_geometry_colors >>= vm_clump_shift;
1787 #endif
1788 	if (PE_parse_boot_argn("colors", &override, sizeof(override))) {                /* colors specified as a boot-arg? */
1789 		n = override;
1790 	} else if (vm_cache_geometry_colors) {                  /* do we know what the cache geometry is? */
1791 		n = vm_cache_geometry_colors;
1792 	} else {
1793 		n = DEFAULT_COLORS;                             /* use default if all else fails */
1794 	}
1795 	if (n == 0) {
1796 		n = 1;
1797 	}
1798 	if (n > MAX_COLORS) {
1799 		n = MAX_COLORS;
1800 	}
1801 
1802 	/* the count must be a power of 2  */
1803 	if ((n & (n - 1)) != 0) {
1804 		n = DEFAULT_COLORS;                             /* use default if all else fails */
1805 	}
1806 	vm_colors = n;
1807 	vm_color_mask = n - 1;
1808 
1809 	vm_free_magazine_refill_limit = vm_colors * COLOR_GROUPS_TO_STEAL;
1810 
1811 #if defined (__x86_64__)
1812 	/* adjust for reduction in colors due to clumping and multiple cores */
1813 	if (real_ncpus) {
1814 		vm_free_magazine_refill_limit *= (vm_clump_size * real_ncpus);
1815 	}
1816 #endif
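	/*
	 * Illustrative numbers: with 32 colors, vm_color_mask is 31 and the
	 * refill limit starts at 32 * COLOR_GROUPS_TO_STEAL = 128 pages; on
	 * x86_64 it is further scaled by (vm_clump_size * real_ncpus).
	 */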
1817 }
1818 
1819 #if XNU_VM_HAS_DELAYED_PAGES
1820 
1821 static uint32_t vm_delayed_count = 0;    /* when non-zero, indicates we may have more pages to init */
1822 static ppnum_t delay_above_pnum = PPNUM_MAX;
1823 
1824 /*
1825  * For x86, the first 8 GB initializes quickly and gives us lots of lowmem + memory above it to start off with.
1826  * If ARM ever uses delayed page initialization, this value may need to be quite different.
1827  */
1828 #define DEFAULT_DELAY_ABOVE_PHYS_GB (8)
1829 
1830 /*
1831  * When we have to dip into more delayed pages due to low memory, free up
1832  * a large chunk to get things back to normal. This avoids contention on the
1833  * delayed code allocating page by page.
1834  */
1835 #define VM_DELAY_PAGE_CHUNK ((1024 * 1024 * 1024) / PAGE_SIZE)
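/* i.e. 1 GiB worth of pages: 262,144 pages at a 4K page size, 65,536 at 16K */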
1836 
1837 /*
1838  * Get and initialize the next delayed page.
1839  */
1840 __attribute__((noinline))
1841 static vm_page_t
1842 vm_get_delayed_page(vm_grab_options_t grab_options)
1843 {
1844 	vm_page_t p;
1845 	ppnum_t   pnum;
1846 
1847 	/*
1848 	 * Get a new page if we have one.
1849 	 */
1850 	vm_free_page_lock();
1851 	if (vm_delayed_count == 0) {
1852 		vm_free_page_unlock();
1853 		return NULL;
1854 	}
1855 
1856 	if (!pmap_next_page(&pnum)) {
1857 		vm_delayed_count = 0;
1858 		vm_free_page_unlock();
1859 		return NULL;
1860 	}
1861 
1862 
1863 	assert(vm_delayed_count > 0);
1864 	--vm_delayed_count;
1865 
1866 #if defined(__x86_64__)
1867 	/* x86 cluster code requires increasing phys_page in vm_pages[] */
1868 	if (vm_pages_count > 0) {
1869 		assert(pnum > vm_page_get(vm_pages_count - 1)->vmp_phys_page);
1870 	}
1871 #endif
1872 	p = vm_page_get(vm_pages_count);
1873 	assert(p < vm_pages_end);
1874 	vm_page_init(p, pnum);
1875 	++vm_pages_count;
1876 	++vm_page_pages;
1877 	vm_free_page_unlock();
1878 
1879 	/*
1880 	 * These pages were initially counted as wired, undo that now.
1881 	 */
1882 	if (grab_options & VM_PAGE_GRAB_Q_LOCK_HELD) {
1883 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1884 	} else {
1885 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
1886 		vm_page_lockspin_queues();
1887 	}
1888 	--vm_page_wire_count;
1889 	--vm_page_wire_count_initial;
1890 	if (vm_page_wire_count_on_boot != 0) {
1891 		--vm_page_wire_count_on_boot;
1892 	}
1893 	if (!(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
1894 		vm_page_unlock_queues();
1895 	}
1896 
1897 
1898 	if (fillval) {
1899 		fillPage(pnum, fillval);
1900 	}
1901 	return p;
1902 }
1903 
1904 /*
1905  * Free all remaining delayed pages to the free lists.
1906  */
1907 void
1908 vm_free_delayed_pages(void)
1909 {
1910 	vm_page_t   p;
1911 	vm_page_t   list = NULL;
1912 	uint_t      cnt = 0;
1913 	vm_offset_t start_free_va;
1914 	int64_t     free_size;
1915 
1916 	while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
1917 		if (vm_himemory_mode) {
1918 			vm_page_release(p, VMP_RELEASE_NONE);
1919 		} else {
1920 			p->vmp_snext = list;
1921 			list = p;
1922 		}
1923 		++cnt;
1924 	}
1925 
1926 	/*
1927 	 * When not in himemory mode, free the pages in reverse order so that the
1928 	 * low memory pages end up first on the free lists (LIFO).
1929 	 */
1930 	while (list != NULL) {
1931 		p = list;
1932 		list = p->vmp_snext;
1933 		p->vmp_snext = NULL;
1934 		vm_page_release(p, VMP_RELEASE_NONE);
1935 	}
1936 #if DEVELOPMENT || DEBUG
1937 	kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
1938 #endif
1939 
1940 	/*
1941 	 * Free up any unused full pages at the end of the vm_pages[] array
1942 	 */
1943 	start_free_va = round_page((vm_offset_t)vm_page_get(vm_pages_count));
1944 
1945 #if defined(__x86_64__)
1946 	/*
1947 	 * Since x86 might have used large pages for vm_pages[], we can't
1948 	 * free starting in the middle of a partially used large page.
1949 	 */
1950 	if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
1951 		start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
1952 	}
1953 #endif
1954 	if (start_free_va < (vm_offset_t)vm_pages_end) {
1955 		free_size = trunc_page((vm_offset_t)vm_pages_end - start_free_va);
1956 		if (free_size > 0) {
1957 			ml_static_mfree(start_free_va, (vm_offset_t)free_size);
1958 			vm_pages_end = (void *)start_free_va;
1959 
1960 			/*
1961 			 * Note there's no locking here, as only this thread will ever change this value.
1962 			 * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
1963 			 */
1964 			vm_page_stolen_count -= (free_size >> PAGE_SHIFT);
1965 
1966 #if DEVELOPMENT || DEBUG
1967 			kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
1968 			    (long)free_size, (long)start_free_va);
1969 #endif
1970 		}
1971 	}
1972 }
1973 
1974 /*
1975  * Try and free up enough delayed pages to match a contig memory allocation.
1976  */
1977 static void
1978 vm_free_delayed_pages_contig(
1979 	uint_t    npages,
1980 	ppnum_t   max_pnum,
1981 	ppnum_t   pnum_mask)
1982 {
1983 	vm_page_t p;
1984 	ppnum_t   pnum;
1985 	uint_t    cnt = 0;
1986 
1987 	/*
1988 	 * Treat 0 as the absolute max page number.
1989 	 */
1990 	if (max_pnum == 0) {
1991 		max_pnum = PPNUM_MAX;
1992 	}
1993 
1994 	/*
1995 	 * Free till we get a properly aligned start page
1996 	 */
1997 	for (;;) {
1998 		p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
1999 		if (p == NULL) {
2000 			return;
2001 		}
2002 		pnum = VM_PAGE_GET_PHYS_PAGE(p);
2003 		vm_page_release(p, VMP_RELEASE_NONE);
2004 		if (pnum >= max_pnum) {
2005 			return;
2006 		}
2007 		if ((pnum & pnum_mask) == 0) {
2008 			break;
2009 		}
2010 	}
2011 
2012 	/*
2013 	 * Having a healthy pool of free pages will help performance. We don't
2014 	 * want to fall back to the delayed code for every page allocation.
2015 	 */
2016 	if (vm_page_free_count < VM_DELAY_PAGE_CHUNK) {
2017 		npages += VM_DELAY_PAGE_CHUNK;
2018 	}
2019 
2020 	/*
2021 	 * Now free up the pages
2022 	 */
2023 	for (cnt = 1; cnt < npages; ++cnt) {
2024 		p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
2025 		if (p == NULL) {
2026 			return;
2027 		}
2028 		vm_page_release(p, VMP_RELEASE_NONE);
2029 	}
2030 }
2031 
2032 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2033 
2034 #define ROUNDUP_NEXTP2(X) (1U << (32 - __builtin_clz((X) - 1)))
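/*
 * Rounds up to the next power of two, e.g. ROUNDUP_NEXTP2(5) == 8,
 * ROUNDUP_NEXTP2(8) == 8 and ROUNDUP_NEXTP2(9) == 16.  X must be at least 2,
 * since __builtin_clz(0) is undefined.
 */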
2035 
2036 void
2037 vm_page_init_local_q(unsigned int num_cpus)
2038 {
2039 	struct vpl *t_local_q;
2040 
2041 	/*
2042 	 * no point in this for a uni-processor system
2043 	 */
2044 	if (num_cpus >= 2) {
2045 		ml_cpu_info_t cpu_info;
2046 
2047 		/*
2048 		 * Force the allocation alignment to a cacheline: the `vpl`
2049 		 * struct contains a lock that is taken cross-CPU, so we want
2050 		 * to isolate the rest of the per-CPU data from it and avoid
2051 		 * false sharing.
2052 		 */
2053 
2054 		ml_cpu_get_info(&cpu_info);
2055 
2056 		t_local_q = zalloc_percpu_permanent(sizeof(struct vpl),
2057 		    cpu_info.cache_line_size - 1);
2058 
2059 		zpercpu_foreach(lq, t_local_q) {
2060 			VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
2061 			vm_page_queue_init(&lq->vpl_queue);
2062 		}
2063 
2064 		/* make the initialization visible to all cores */
2065 		os_atomic_store(&vm_page_local_q, t_local_q, release);
2066 	}
2067 }
2068 
2069 /*
2070  * vm_init_before_launchd
2071  *
2072  * This should be called right before launchd is loaded.
2073  */
2074 void
2075 vm_init_before_launchd(void)
2076 {
2077 	vm_page_lockspin_queues();
2078 	vm_page_wire_count_on_boot = vm_page_wire_count;
2079 	vm_page_unlock_queues();
2080 }
2081 
2082 
2083 /*
2084  *	vm_page_bootstrap:
2085  *
2086  *	Initializes the resident memory module.
2087  *
2088  *	Allocates memory for the page cells, and
2089  *	for the object/offset-to-page hash table headers.
2090  *	Each page cell is initialized and placed on the free list.
2091  *	Returns the range of available kernel virtual memory.
2092  */
2093 __startup_func
2094 void
2095 vm_page_bootstrap(
2096 	vm_offset_t             *startp,
2097 	vm_offset_t             *endp)
2098 {
2099 	unsigned int            i;
2100 	unsigned int            log1;
2101 	unsigned int            log2;
2102 	unsigned int            size;
2103 
2104 	/*
2105 	 *	Initialize the page queues.
2106 	 */
2107 
2108 	lck_mtx_init(&vm_page_queue_free_lock, &vm_page_lck_grp_free, &vm_page_lck_attr);
2109 	lck_mtx_init(&vm_page_queue_lock, &vm_page_lck_grp_queue, &vm_page_lck_attr);
2110 	lck_mtx_init(&vm_purgeable_queue_lock, &vm_page_lck_grp_purge, &vm_page_lck_attr);
2111 
2112 	for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
2113 		int group;
2114 
2115 		purgeable_queues[i].token_q_head = 0;
2116 		purgeable_queues[i].token_q_tail = 0;
2117 		for (group = 0; group < NUM_VOLATILE_GROUPS; group++) {
2118 			queue_init(&purgeable_queues[i].objq[group]);
2119 		}
2120 
2121 		purgeable_queues[i].type = i;
2122 		purgeable_queues[i].new_pages = 0;
2123 #if MACH_ASSERT
2124 		purgeable_queues[i].debug_count_tokens = 0;
2125 		purgeable_queues[i].debug_count_objects = 0;
2126 #endif
2127 	}
2128
2129 	purgeable_nonvolatile_count = 0;
2130 	queue_init(&purgeable_nonvolatile_queue);
2131 
2132 	vm_page_free_queue_init(&vm_page_queue_free);
2133 #if XNU_VM_HAS_LOPAGE
2134 	vm_page_queue_init(&vm_lopage_queue_free);
2135 #endif /* XNU_VM_HAS_LOPAGE */
2136 	vm_page_queue_init(&vm_page_queue_active);
2137 	vm_page_queue_init(&vm_page_queue_inactive);
2138 #if CONFIG_SECLUDED_MEMORY
2139 	vm_page_queue_init(&vm_page_queue_secluded);
2140 #endif /* CONFIG_SECLUDED_MEMORY */
2141 	vm_page_queue_init(&vm_page_queue_cleaned);
2142 	vm_page_queue_init(&vm_page_queue_throttled);
2143 	vm_page_queue_init(&vm_page_queue_anonymous);
2144 	queue_init(&vm_objects_wired);
2145 
2146 	for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
2147 		vm_page_queue_init(&vm_page_queue_speculative[i].age_q);
2148 
2149 		vm_page_queue_speculative[i].age_ts.tv_sec = 0;
2150 		vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
2151 	}
2152 
2153 	vm_page_queue_init(&vm_page_queue_donate);
2154 	vm_page_queue_init(&vm_page_queue_background);
2155 
2156 	vm_page_background_count = 0;
2157 	vm_page_background_internal_count = 0;
2158 	vm_page_background_external_count = 0;
2159 	vm_page_background_promoted_count = 0;
2160 
2161 	vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25);
2162 
2163 	if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) {
2164 		vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX;
2165 	}
2166 
2167 #if    defined(__LP64__)
2168 	vm_page_background_mode = VM_PAGE_BG_ENABLED;
2169 	vm_page_donate_mode = VM_PAGE_DONATE_ENABLED;
2170 #else
2171 	vm_page_background_mode = VM_PAGE_BG_DISABLED;
2172 	vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2173 #endif
2174 	vm_page_background_exclude_external = 0;
2175 
2176 	PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode));
2177 	PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external));
2178 	PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target));
2179 
2180 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && vm_page_background_mode != VM_PAGE_BG_ENABLED) {
2181 		vm_page_background_mode = VM_PAGE_BG_DISABLED;
2182 	}
2183 
2184 	PE_parse_boot_argn("vm_page_donate_mode", &vm_page_donate_mode, sizeof(vm_page_donate_mode));
2185 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED && vm_page_donate_mode != VM_PAGE_DONATE_ENABLED) {
2186 		vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2187 	}
2188 
2189 	vm_page_donate_target_high = VM_PAGE_DONATE_TARGET_HIGHWATER;
2190 	vm_page_donate_target_low = VM_PAGE_DONATE_TARGET_LOWWATER;
2191 	vm_page_donate_target = vm_page_donate_target_high;
2192 	vm_page_donate_count = 0;
2193 
2194 	vm_page_free_wanted = 0;
2195 	vm_page_free_wanted_privileged = 0;
2196 #if CONFIG_SECLUDED_MEMORY
2197 	vm_page_free_wanted_secluded = 0;
2198 #endif /* CONFIG_SECLUDED_MEMORY */
2199 
2200 #if defined (__x86_64__)
2201 	/* this must be called before vm_page_set_colors() */
2202 	vm_page_setup_clump();
2203 #endif
2204 
2205 	vm_page_set_colors();
2206 
2207 	for (vm_tag_t t = 0; t < VM_KERN_MEMORY_FIRST_DYNAMIC; t++) {
2208 		vm_allocation_sites_static[t].refcount = 2;
2209 		vm_allocation_sites_static[t].tag = t;
2210 		vm_allocation_sites[t] = &vm_allocation_sites_static[t];
2211 	}
2212 	vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].refcount = 2;
2213 	vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].tag = VM_KERN_MEMORY_ANY;
2214 	vm_allocation_sites[VM_KERN_MEMORY_ANY] = &vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC];
2215 
2216 	/*
2217 	 *	Steal memory for the map and zone subsystems.
2218 	 *
2219 	 *	make sure initialize_ram_ranges() has run before we steal pages for the first time on arm
2220 	 */
2221 	(void)pmap_free_pages();
2222 
2223 	kernel_startup_initialize_upto(STARTUP_SUB_PMAP_STEAL);
2224 
2225 	/*
2226 	 *	Allocate (and initialize) the virtual-to-physical
2227 	 *	table hash buckets.
2228 	 *
2229 	 *	The number of buckets should be a power of two to
2230 	 *	get a good hash function.  The following computation
2231 	 *	chooses the first power of two that is at least as
2232 	 *	large as the number of physical pages in the system.
2233 	 */
2234 
2235 	if (vm_page_bucket_count == 0) {
2236 		unsigned int npages = pmap_free_pages();
2237 
2238 		vm_page_bucket_count = 1;
2239 		while (vm_page_bucket_count < npages) {
2240 			vm_page_bucket_count <<= 1;
2241 		}
2242 	}
2243 	vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
2244 
2245 	vm_page_hash_mask = vm_page_bucket_count - 1;
2246 
2247 	/*
2248 	 *	Calculate object shift value for hashing algorithm:
2249 	 *		O = log2(sizeof(struct vm_object))
2250 	 *		B = log2(vm_page_bucket_count)
2251 	 *	        hash shifts the object left by
2252 	 *		B/2 - O
2253 	 */
2254 	size = vm_page_bucket_count;
2255 	for (log1 = 0; size > 1; log1++) {
2256 		size /= 2;
2257 	}
2258 	size = sizeof(struct vm_object);
2259 	for (log2 = 0; size > 1; log2++) {
2260 		size /= 2;
2261 	}
2262 	vm_page_hash_shift = log1 / 2 - log2 + 1;
2263 
2264 	vm_page_bucket_hash = 1 << ((log1 + 1) >> 1);           /* Get (ceiling of sqrt of table size) */
2265 	vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2);          /* Get (ceiling of quadroot of table size) */
2266 	vm_page_bucket_hash |= 1;                                                       /* Set bit and add 1 - always must be 1 to ensure unique series */
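	/*
	 * For example, with vm_page_bucket_count == 65536 (log1 == 16), this
	 * yields (1 << 8) | (1 << 4) | 1 == 0x111: 0x100 is sqrt(65536) and
	 * 0x10 is its fourth root.
	 */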
2267 
2268 	if (vm_page_hash_mask & vm_page_bucket_count) {
2269 		printf("vm_page_bootstrap: WARNING -- strange page hash\n");
2270 	}
2271 
2272 #if VM_PAGE_BUCKETS_CHECK
2273 #if VM_PAGE_FAKE_BUCKETS
2274 	/*
2275 	 * Allocate a decoy set of page buckets, to detect
2276 	 * any stomping there.
2277 	 */
2278 	vm_page_fake_buckets = (vm_page_bucket_t *)
2279 	    pmap_steal_memory(vm_page_bucket_count *
2280 	    sizeof(vm_page_bucket_t), 0);
2281 	vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
2282 	vm_page_fake_buckets_end =
2283 	    vm_map_round_page((vm_page_fake_buckets_start +
2284 	    (vm_page_bucket_count *
2285 	    sizeof(vm_page_bucket_t))),
2286 	    PAGE_MASK);
2287 	char *cp;
2288 	for (cp = (char *)vm_page_fake_buckets_start;
2289 	    cp < (char *)vm_page_fake_buckets_end;
2290 	    cp++) {
2291 		*cp = 0x5a;
2292 	}
2293 #endif /* VM_PAGE_FAKE_BUCKETS */
2294 #endif /* VM_PAGE_BUCKETS_CHECK */
2295 
2296 	kernel_debug_string_early("vm_page_buckets");
2297 	vm_page_buckets = (vm_page_bucket_t *)
2298 	    pmap_steal_memory(vm_page_bucket_count *
2299 	    sizeof(vm_page_bucket_t), 0);
2300 
2301 	kernel_debug_string_early("vm_page_bucket_locks");
2302 	vm_page_bucket_locks = (lck_ticket_t *)
2303 	    pmap_steal_memory(vm_page_bucket_lock_count *
2304 	    sizeof(lck_ticket_t), 0);
2305 
2306 	for (i = 0; i < vm_page_bucket_count; i++) {
2307 		vm_page_bucket_t *bucket = &vm_page_buckets[i];
2308 
2309 		bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL);
2310 #if     MACH_PAGE_HASH_STATS
2311 		bucket->cur_count = 0;
2312 		bucket->hi_count = 0;
2313 #endif /* MACH_PAGE_HASH_STATS */
2314 	}
2315 
2316 	for (i = 0; i < vm_page_bucket_lock_count; i++) {
2317 		lck_ticket_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket);
2318 	}
2319 
2320 	vm_tag_init();
2321 
2322 #if VM_PAGE_BUCKETS_CHECK
2323 	vm_page_buckets_check_ready = TRUE;
2324 #endif /* VM_PAGE_BUCKETS_CHECK */
2325 
2326 	/*
2327 	 *	Machine-dependent code allocates the resident page table.
2328 	 *	It uses vm_page_init to initialize the page frames.
2329 	 *	The code also returns to us the virtual space available
2330 	 *	to the kernel.  We don't trust the pmap module
2331 	 *	to get the alignment right.
2332 	 */
2333 
2334 	kernel_debug_string_early("pmap_startup");
2335 	pmap_startup(&virtual_space_start, &virtual_space_end);
2336 	virtual_space_start = round_page(virtual_space_start);
2337 	virtual_space_end = trunc_page(virtual_space_end);
2338 
2339 	*startp = virtual_space_start;
2340 	*endp = virtual_space_end;
2341 
2342 	/*
2343 	 *	Compute the initial "wire" count.
2344 	 *	Up until now, the pages which have been set aside are not under
2345 	 *	the VM system's control, so although they aren't explicitly
2346 	 *	wired, they nonetheless can't be moved. At this moment,
2347 	 *	all VM managed pages are "free", courtesy of pmap_startup.
2348 	 */
2349 	assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
2350 	vm_page_wire_count = ((unsigned int) atop_64(max_mem)) -
2351 	    vm_page_free_count - vm_lopage_free_count;
2352 #if CONFIG_SECLUDED_MEMORY
2353 	vm_page_wire_count -= vm_page_secluded_count;
2354 #endif
2355 #if HAS_MTE
2356 	/*
2357 	 * Discount any tag storage pages that we have set aside in
2358 	 * vm_page_release_startup().
2359 	 */
2360 	vm_page_wire_count -= mte_tag_storage_count;
2361 #endif
2362 	vm_page_wire_count_initial = vm_page_wire_count;
2363 
2364 	/* capture this for later use */
2365 	booter_size = ml_get_booter_memory_size();
2366 
2367 	printf("vm_page_bootstrap: %d free pages, %d wired pages"
2368 #if XNU_VM_HAS_DELAYED_PAGES
2369 	    ", (up to %d of which are delayed free)"
2370 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2371 	    "%c",
2372 	    vm_page_free_count,
2373 	    vm_page_wire_count,
2374 #if XNU_VM_HAS_DELAYED_PAGES
2375 	    vm_delayed_count,
2376 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2377 	    '\n');
2378 
2379 	kernel_debug_string_early("vm_page_bootstrap complete");
2380 }
2381 
2382 #ifndef MACHINE_PAGES
2383 /*
2384  * This is the early boot time allocator for data structures needed to bootstrap the VM system.
2385  * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
2386  * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
2387  */
2388 __static_testable void *
2389 pmap_steal_memory_internal(
2390 	vm_size_t size,
2391 	vm_size_t alignment,
2392 	boolean_t might_free,
2393 	unsigned int flags,
2394 	pmap_mapping_type_t mapping_type)
2395 {
2396 	kern_return_t kr;
2397 	vm_offset_t addr;
2398 	vm_offset_t end = 0;
2399 	vm_offset_t map_addr;
2400 	ppnum_t phys_page;
2401 	unsigned int pmap_flags;
2402 
2403 	if (size > UINT64_MAX - sizeof(void *)) {
2404 		panic("pmap_steal_memory(): size: 0x%lx", size);
2405 	}
2406 	/*
2407 	 * Size needs to be aligned to word size.
2408 	 */
2409 	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
2410 
2411 	/*
2412 	 * Alignment defaults to word size if not specified.
2413 	 */
2414 	if (alignment == 0) {
2415 		alignment = sizeof(void*);
2416 	}
2417 
2418 	/*
2419 	 * Alignment must be no greater than a page and must be a power of two.
2420 	 */
2421 	assert(alignment <= PAGE_SIZE);
2422 	assert((alignment & (alignment - 1)) == 0);
2423 
2424 	/*
2425 	 * On the first call, get the initial values for virtual address space
2426 	 * and page align them.
2427 	 */
2428 	if (virtual_space_start == virtual_space_end) {
2429 		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
2430 		virtual_space_start = round_page(virtual_space_start);
2431 		virtual_space_end = trunc_page(virtual_space_end);
2432 
2433 #if defined(__x86_64__)
2434 		/*
2435 		 * Release remaining unused section of preallocated KVA and the 4K page tables
2436 		 * that map it. This makes the VA available for large page mappings.
2437 		 */
2438 		Idle_PTs_release(virtual_space_start, virtual_space_end);
2439 #endif
2440 	}
2441 
2442 	/*
2443 	 * Allocate the virtual space for this request. On x86, we'll align to a large page
2444 	 * address if the size is big enough to back with at least 1 large page.
2445 	 */
2446 #if defined(__x86_64__)
2447 	if (size >= I386_LPGBYTES) {
2448 		virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
2449 	}
2450 #endif
2451 	virtual_space_start = (virtual_space_start + (alignment - 1)) & ~(alignment - 1);
2452 	addr = virtual_space_start;
2453 	virtual_space_start += size;
2454 
2455 	//kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size);	/* (TEST/DEBUG) */
2456 
2457 	/*
2458 	 * Allocate and map physical pages to back the new virtual space.
2459 	 */
2460 	map_addr = round_page(addr);
2461 	if (os_add_overflow(addr, size, &end)) {
2462 		panic("pmap_steal_memory() overflow, addr: %lx, size: 0x%lx", addr, size);
2463 	}
2464 	while (map_addr < end) {
2465 #if defined(__x86_64__)
2466 		/*
2467 		 * Back with a large page if properly aligned on x86
2468 		 */
2469 		if ((map_addr & I386_LPGMASK) == 0 &&
2470 		    map_addr + I386_LPGBYTES <= addr + size &&
2471 		    pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
2472 		    pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
2473 			kr = pmap_enter(kernel_pmap, map_addr, phys_page,
2474 			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
2475 			    VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE, mapping_type);
2476 
2477 			if (kr != KERN_SUCCESS) {
2478 				panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
2479 				    (unsigned long)map_addr, phys_page);
2480 			}
2481 			map_addr += I386_LPGBYTES;
2482 			vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
2483 			vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
2484 			vm_page_kern_lpage_count++;
2485 			continue;
2486 		}
2487 #endif
2488 
2489 		if (!pmap_next_page_hi(&phys_page, might_free)) {
2490 			panic("pmap_steal_memory() size: 0x%llx", (uint64_t)size);
2491 		}
2492 
2493 #if defined(__x86_64__)
2494 		pmap_pre_expand(kernel_pmap, map_addr);
2495 #endif
2496 		pmap_flags = flags ? flags : VM_WIMG_USE_DEFAULT;
2497 
2498 #if HAS_MTE
2499 		if (pmap_flags & VM_MEM_MAP_MTE) {
2500 			mteinfo_covered_page_set_stolen_tagged(phys_page);
2501 			pmap_make_tagged_page(phys_page);
2502 		}
2503 #endif /* HAS_MTE */
2504 
2505 		kr = pmap_enter(kernel_pmap, map_addr, phys_page,
2506 		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
2507 		    pmap_flags, FALSE, mapping_type);
2508 
2509 		if (kr != KERN_SUCCESS) {
2510 			panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
2511 			    (unsigned long)map_addr, phys_page);
2512 		}
2513 		map_addr += PAGE_SIZE;
2514 
2515 		/*
2516 		 * Account for newly stolen memory
2517 		 */
2518 		vm_page_wire_count++;
2519 		vm_page_stolen_count++;
2520 	}
2521 
2522 #if defined(__x86_64__)
2523 	/*
2524 	 * The call with might_free is currently the last use of pmap_steal_memory*().
2525 	 * Notify the pmap layer to record which high pages were allocated so far.
2526 	 */
2527 	if (might_free) {
2528 		pmap_hi_pages_done();
2529 	}
2530 #endif
2531 #if KASAN
2532 	kasan_notify_address(round_page(addr), size);
2533 #endif
2534 	return (void *) addr;
2535 }
2536 
2537 __mockable void *
2538 pmap_steal_memory(
2539 	vm_size_t size,
2540 	vm_size_t alignment)
2541 {
2542 	return pmap_steal_memory_internal(size, alignment, FALSE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
2543 }
2544 
2545 void *
2546 pmap_steal_freeable_memory(
2547 	vm_size_t size)
2548 {
2549 	return pmap_steal_memory_internal(size, 0, TRUE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
2550 }
2551 
2552 #if HAS_MTE
2553 void *
2554 pmap_steal_zone_memory(
2555 	vm_size_t size,
2556 	vm_size_t alignment)
2557 {
2558 	return pmap_steal_memory_internal(size, alignment, FALSE, VM_MEM_MAP_MTE, PMAP_MAPPING_TYPE_RESTRICTED);
2559 }
2560 #endif /* HAS_MTE */
2561 
2562 
2563 #if CONFIG_SECLUDED_MEMORY
2564 /* boot-args to control secluded memory */
2565 TUNABLE_DT(unsigned int, secluded_mem_mb, "/defaults", "kern.secluded_mem_mb", "secluded_mem_mb", 0, TUNABLE_DT_NONE);
2566 /* IOKit can use secluded memory */
2567 TUNABLE(bool, secluded_for_iokit, "secluded_for_iokit", true);
2568 /* apps can use secluded memory */
2569 TUNABLE(bool, secluded_for_apps, "secluded_for_apps", true);
2570 /* filecache can use seclude memory */
2571 TUNABLE(secluded_filecache_mode_t, secluded_for_filecache, "secluded_for_filecache", SECLUDED_FILECACHE_RDONLY);
2572 uint64_t secluded_shutoff_trigger = 0;
2573 uint64_t secluded_shutoff_headroom = 150 * 1024 * 1024; /* original value from N56 */
2574 #endif /* CONFIG_SECLUDED_MEMORY */
2575 
2576 
2577 #if defined(__arm64__)
2578 extern void patch_low_glo_vm_page_info(void *, void *, uint32_t);
2579 #endif
2580 
2581 void vm_page_release_startup(vm_page_t mem);
2582 __mockable void
2583 pmap_startup(
2584 	vm_offset_t     *startp,
2585 	vm_offset_t     *endp)
2586 {
2587 	unsigned int    npages;
2588 	ppnum_t         phys_page;
2589 	uint64_t        mem_sz;
2590 	uint64_t        start_ns;
2591 	uint64_t        now_ns;
2592 	uint32_t        divisor;
2593 #if XNU_VM_HAS_DELAYED_PAGES
2594 	uint_t          low_page_count = 0;
2595 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2596 
2597 	/*
2598 	 * make sure we are aligned on a 64 byte boundary
2599 	 * for VM_PAGE_PACK_PTR (it clips off the low-order
2600 	 * 6 bits of the pointer)
2601 	 */
2602 	if (virtual_space_start != virtual_space_end) {
2603 		virtual_space_start = round_page(virtual_space_start);
2604 	}
2605 
2606 	/*
2607 	 * We calculate how many page frames we will have
2608 	 * and then allocate the page structures in one chunk.
2609 	 *
2610 	 * Note that the calculation here doesn't take into account
2611 	 * the memory needed to map what's being allocated, i.e. the page
2612 	 * table entries. So the actual number of pages we get will be
2613 	 * less than this. To do someday: include that in the computation.
2614 	 *
2615 	 * Also for ARM, we don't use the count of free_pages, but rather the
2616 	 * range from last page to first page (ignore holes due to retired pages).
2617 	 */
2618 
2619 	/*
2620 	 * Initialize and release the page frames.
2621 	 */
2622 	kernel_debug_string_early("page_frame_init");
2623 	absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
2624 	if (fillval) {
2625 		kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
2626 	}
2627 
2628 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2629 	mem_sz  = ptoa(pmap_free_pages_span());
2630 #if HAS_MTE
2631 	if (!is_mte_enabled)
2632 #endif /* HAS_MTE */
2633 #if CONFIG_SPTM
2634 	{
2635 		uint32_t count = vm_pages_free_mask_len();
2636 
2637 		_vm_pages_free_masks = pmap_steal_memory(count *
2638 		    sizeof(__uint128_t), sizeof(__uint128_t));
2639 		_vm_pages_free_enqueue_idx = pmap_steal_memory(count, sizeof(uint8_t));
2640 		bzero(_vm_pages_free_masks, count * sizeof(__uint128_t));
2641 		memset(_vm_pages_free_enqueue_idx, 0xff, count);
2642 	}
2643 #endif /* CONFIG_SPTM */
2644 #else
2645 	mem_sz  = ptoa(pmap_free_pages());
2646 #endif
2647 	mem_sz += round_page(virtual_space_start) - virtual_space_start;   /* Account for any slop */
2648 	divisor = PAGE_SIZE + sizeof(struct vm_page);
2649 	npages  = (uint32_t)((mem_sz + divisor - 1) / divisor); /* scaled to include the vm_page_ts */
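	/*
	 * Splitting mem_sz by this divisor leaves room for both the page
	 * frames and their vm_page structures.  For example (hypothetical
	 * sizes): with 16K pages and an 80-byte struct vm_page, 8 GiB of
	 * memory yields about 8 GiB / 16464 bytes, i.e. roughly 521k page
	 * frames rather than 524288.
	 */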
2650 
2651 
2652 	vm_pages     = pmap_steal_freeable_memory(npages * sizeof(struct vm_page));
2653 	vm_pages_end = vm_page_get(npages);
2654 
2655 #if CONFIG_SECLUDED_MEMORY
2656 	/*
2657 	 * Figure out how much secluded memory to have before we start
2658 	 * releasing pages to the free lists.
2659 	 * The default, if specified nowhere else, is no secluded mem.
2660 	 */
2661 	vm_page_secluded_target = (unsigned int)atop_64(secluded_mem_mb * 1024ULL * 1024ULL);
2662 
2663 	/*
2664 	 * Allow a really large app to effectively use secluded memory until it exits.
2665 	 */
2666 	if (vm_page_secluded_target != 0) {
2667 		/*
2668 		 * Get an amount from boot-args, else use 1/2 of max_mem.
2669 		 * 1/2 max_mem was chosen from a Peace daemon tentpole test which
2670 		 * used munch to induce jetsam thrashing of false idle daemons on N56.
2671 		 */
2672 		int secluded_shutoff_mb;
2673 		if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
2674 		    sizeof(secluded_shutoff_mb))) {
2675 			secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
2676 		} else {
2677 			secluded_shutoff_trigger = max_mem / 2;
2678 		}
2679 
2680 		/* ensure the headroom value is sensible and avoid underflows */
2681 		assert(secluded_shutoff_trigger == 0 || secluded_shutoff_trigger > secluded_shutoff_headroom);
2682 	}
2683 #endif /* CONFIG_SECLUDED_MEMORY */
2684 
2685 #if defined(__x86_64__)
2686 
2687 	/*
2688 	 * Decide how much memory we delay freeing at boot time.
2689 	 */
2690 	uint32_t delay_above_gb;
2691 	if (!PE_parse_boot_argn("delay_above_gb", &delay_above_gb, sizeof(delay_above_gb))) {
2692 		delay_above_gb = DEFAULT_DELAY_ABOVE_PHYS_GB;
2693 	}
2694 
2695 	if (delay_above_gb == 0) {
2696 		delay_above_pnum = PPNUM_MAX;
2697 	} else {
2698 		delay_above_pnum = delay_above_gb * (1024 * 1024 * 1024 / PAGE_SIZE);
2699 	}
2700 
2701 	/* make sure we have sane breathing room: 1G above low memory */
2702 	if (delay_above_pnum <= max_valid_low_ppnum) {
2703 		delay_above_pnum = max_valid_low_ppnum + ((1024 * 1024 * 1024) >> PAGE_SHIFT);
2704 	}
2705 
2706 	if (delay_above_pnum < PPNUM_MAX) {
2707 		printf("pmap_startup() delaying init/free of page nums > 0x%x\n", delay_above_pnum);
2708 	}
2709 
2710 #endif /* defined(__x86_64__) */
2711 
2712 
2713 	vm_free_page_lock();
2714 
2715 	for (uint32_t i = 0; i < npages && pmap_next_page(&phys_page); i++) {
2716 #if XNU_VM_HAS_DELAYED_PAGES
2717 		if (phys_page < max_valid_low_ppnum) {
2718 			++low_page_count;
2719 		}
2720 
2721 		/* Are we at high enough pages to delay the rest? */
2722 		if (low_page_count > vm_lopage_free_limit &&
2723 		    phys_page > delay_above_pnum) {
2724 			vm_delayed_count = pmap_free_pages();
2725 			assert3u(vm_pages_count + vm_delayed_count, <=, npages);
2726 			break;
2727 		}
2728 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2729 
2730 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2731 		if (i == 0) {
2732 			vm_pages_first_pnum = phys_page;
2733 			patch_low_glo_vm_page_info(vm_pages, vm_pages_end,
2734 			    vm_pages_first_pnum);
2735 #if HAS_MTE
2736 			if (is_mte_enabled) {
2737 				vm_pages_tag_storage = vm_page_get(
2738 					(mte_tag_storage_start_pnum - vm_pages_first_pnum));
2739 				vm_pages_tag_storage_end = vm_tag_storage_page_get(mte_tag_storage_count);
2740 				assert3p(vm_pages_tag_storage_end, <=, vm_pages_end);
2741 			}
2742 #endif /* HAS_MTE */
2743 		}
2744 #else
2745 		/* The x86 clump freeing code requires increasing ppn's to work correctly */
2746 		if (i > 0) {
2747 			assert(phys_page > vm_page_get(i - 1)->vmp_phys_page);
2748 		}
2749 #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */
2750 
2751 		++vm_pages_count;
2752 		vm_page_init(vm_page_get(i), phys_page);
2753 		if (fillval) {
2754 			fillPage(phys_page, fillval);
2755 		}
2756 		if (vm_himemory_mode) {
2757 			vm_page_release_startup(vm_page_get(i));
2758 		}
2759 	}
2760 
2761 	vm_page_pages = vm_pages_count; /* used to report to user space */
2762 
2763 	if (!vm_himemory_mode) {
2764 		for (uint32_t i = npages; i-- > 0;) {
2765 			/* skip retired pages */
2766 			if (!VMP_ERROR_GET(vm_page_get(i))) {
2767 				vm_page_release_startup(vm_page_get(i));
2768 			}
2769 		}
2770 	}
2771 
2772 	vm_free_page_unlock();
2773 
2774 	absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns);
2775 	printf("pmap_startup() init/release time: %lld microsec\n",
2776 	    (now_ns - start_ns) / NSEC_PER_USEC);
2777 #if XNU_VM_HAS_DELAYED_PAGES
2778 	printf("pmap_startup() delayed init/release of %d pages\n",
2779 	    vm_delayed_count);
2780 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2781 
2782 	/*
2783 	 * Validate packing will work properly.  This needs to be done last
2784 	 * after vm_pages_count has been computed.
2785 	 */
2786 	if (npages >= VM_PAGE_PACKED_FROM_ARRAY) {
2787 		panic("pmap_startup(): too many pages to support vm_page packing");
2788 	}
2789 	if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_pages)) != vm_pages) {
2790 		panic("VM_PAGE_PACK_PTR failed on vm_pages - %p", vm_pages);
2791 	}
2792 	if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_page_get(vm_pages_count - 1))) !=
2793 	    vm_page_get(vm_pages_count - 1)) {
2794 		panic("VM_PAGE_PACK_PTR failed on vm_pages_end - %p",
2795 		    vm_page_get(vm_pages_count - 1));
2796 	}
2797 
2798 	VM_CHECK_MEMORYSTATUS;
2799 
2800 	/*
2801 	 * We have to re-align virtual_space_start,
2802 	 * because pmap_steal_memory has been using it.
2803 	 */
2804 	virtual_space_start = round_page(virtual_space_start);
2805 	*startp = virtual_space_start;
2806 	*endp = virtual_space_end;
2807 }
2808 #endif  /* MACHINE_PAGES */
2809 
2810 /*
2811  * Create the zone that represents the vm_pages[] array. Nothing ever allocates
2812  * Create the zone that represents the vm_pages[] array. Nothing ever allocates
2813  * from or frees to this zone. It's just here for reporting purposes via the zprint command.
2814  */
2815 void
2816 vm_pages_array_finalize(void)
2817 {
2818 	(void)zone_create_ext("vm pages array", sizeof(struct vm_page),
2819 	    ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, ZONE_ID_VM_PAGES, ^(zone_t z) {
2820 		uint64_t vm_page_zone_pages, vm_page_array_zone_data_size;
2821 
2822 		zone_set_exhaustible(z, 0, true);
2823 		/*
2824 		 * Reflect size and usage information for vm_pages[].
2825 		 */
2826 
2827 		z->z_elems_avail = (uint32_t)(vm_pages_end - vm_pages);
2828 		z->z_elems_free = z->z_elems_avail - vm_pages_count;
2829 		zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
2830 		vm_pages_count * sizeof(struct vm_page);
2831 		vm_page_array_zone_data_size = (uint64_t)vm_pages_end - (uint64_t)vm_pages;
2832 		vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
2833 		z->z_wired_cur += vm_page_zone_pages;
2834 		z->z_wired_hwm = z->z_wired_cur;
2835 		z->z_va_cur = z->z_wired_cur;
2836 		/* since zone accounts for these, take them out of stolen */
2837 		VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
2838 	});
2839 }
2840 
2841 /*
2842  * Create the vm_pages zone. This is used for the vm_page structures for the pages
2843  * that are scavenged from other boot time usages by ml_static_mfree(). As such,
2844  * this needs to happen in early VM bootstrap.
2845  */
2846 
2847 __startup_func
2848 static void
2849 vm_page_module_init(void)
2850 {
2851 	vm_size_t vm_page_with_ppnum_size;
2852 
2853 	/*
2854 	 * Since the pointers to elements in this zone will be packed, they
2855 	 * must have appropriate size. Not strictly what sizeof() reports.
2856 	 */
2857 	vm_page_with_ppnum_size =
2858 	    (sizeof(struct vm_page_with_ppnum) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
2859 	    ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
2860 
2861 	vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
2862 	    ZC_ALIGNMENT_REQUIRED | ZC_VM,
2863 	    ZONE_ID_ANY, ^(zone_t z) {
2864 		/*
2865 		 * The number "10" is a small number that is larger than the number
2866 		 * of fictitious pages that any single caller will attempt to allocate
2867 		 * without blocking.
2868 		 *
2869 		 * The largest such number at the moment is kmem_alloc()
2870 		 * when 2 guard pages are asked. 10 is simply a somewhat larger number,
2871 		 * taking into account the 50% hysteresis the zone allocator uses.
2872 		 *
2873 		 * Note: this works at all because the zone allocator
2874 		 *       doesn't ever allocate fictitious pages.
2875 		 */
2876 		zone_raise_reserve(z, 10);
2877 	});
2878 }
2879 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
2880 
2881 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2882 /*
2883  * Radix tree of pages within the [pmap_first_pnum, vm_pages_first_pnum) range,
2884  * in order to support page lookup by pnum (@see vm_page_find_canonical()),
2885  * which corresponds to pages returned to the VM via @c ml_static_mfree().
2886  *
2887  * Kernel vm pages are never freed, which means that this data structure
2888  * is insert only.
2889  *
2890  * Empirically we have about 4-5k such pages, typically in only few rather dense
2891  * Empirically we have about 4-5k such pages, typically in only a few rather dense
2892  *
2893  * A radix tree works well with the distribution of keys, but also allows for
2894  * a straightforward lockless lookup path.
2895  */
2896 
2897 #define VM_PAGE_RADIX_FANOUT_SHIFT  8
2898 #define VM_PAGE_RADIX_FANOUT        (1u << VM_PAGE_RADIX_FANOUT_SHIFT)
2899 
2900 typedef uint32_t vm_page_radix_ptr_t;
2901 
2902 typedef struct vm_page_radix_node {
2903 	vm_page_radix_ptr_t     vmpr_array[VM_PAGE_RADIX_FANOUT];
2904 } *vm_page_radix_node_t;
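/*
 * With a fanout of 256, an index is consumed 8 bits per level, most
 * significant bits first.  For example (hypothetical index): a level-1 root
 * resolves index 0x6543 through key 0x65 at level 1 and key 0x43 at level 0.
 */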
2905 
2906 static LCK_GRP_DECLARE(vm_pages_radix_lock_grp, "VM pages radix");
2907 static LCK_MTX_DECLARE(vm_pages_radix_lock, &vm_pages_radix_lock_grp);
2908 
2909 static SECURITY_READ_ONLY_LATE(uintptr_t) vm_pages_radix_root;
2910 static uint32_t vm_pages_radix_count;
2911 
2912 static vm_page_radix_node_t
2913 vm_page_radix_node_unpack(vm_page_radix_ptr_t ptr)
2914 {
2915 	return (vm_page_radix_node_t)VM_UNPACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2916 }
2917 
2918 static vm_page_radix_ptr_t
2919 vm_page_radix_node_pack(vm_page_radix_node_t node)
2920 {
2921 	vm_offset_t ptr = (vm_offset_t)node;
2922 
2923 	VM_ASSERT_POINTER_PACKABLE(ptr, VM_PAGE_PACKED_PTR);
2924 	return (vm_page_radix_ptr_t)VM_PACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2925 }
2926 
2927 static uint32_t
2928 vm_page_radix_key(uint32_t level, uint32_t index)
2929 {
2930 	uint32_t key = index >> (VM_PAGE_RADIX_FANOUT_SHIFT * level);
2931 
2932 	return key & (VM_PAGE_RADIX_FANOUT - 1);
2933 }
2934 
2935 static vm_page_radix_ptr_t *
2936 vm_page_radix_slot(vm_page_radix_node_t node, uint32_t level, uint32_t index)
2937 {
2938 	return node->vmpr_array + vm_page_radix_key(level, index);
2939 }
2940 
2941 __startup_func
2942 __attribute__((noinline))
2943 static vm_page_radix_node_t
2944 vm_pages_radix_init_root(uint32_t *levelp)
2945 {
2946 	uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum - 1;
2947 	vm_page_radix_node_t root;
2948 	uint32_t level;
2949 	vm_size_t size;
2950 
2951 	/*
2952 	 * Init a top-level node right away, to cover any index within
2953 	 * [0, vm_pages_first_pnum - pmap_first_pnum)
2954 	 */
2955 	level = (fls(max_index | 1) - 1) / VM_PAGE_RADIX_FANOUT_SHIFT;
2956 	size  = (vm_page_radix_key(level, max_index) + 1) *
2957 	    sizeof(vm_page_radix_ptr_t);
2958 
2959 	root  = zalloc_permanent(size, ZALIGN_64);
2960 
2961 	/*
2962 	 * Pack the level into the root pointer low bits,
2963 	 * so that pointer and level can be read atomically.
2964 	 *
2965 	 * See vm_pages_radix_load_root().
2966 	 */
2967 	os_atomic_store(&vm_pages_radix_root, (uintptr_t)root | level, release);
2968 
2969 	*levelp = level;
2970 	return root;
2971 }
2972 
2973 static vm_page_radix_node_t
2974 vm_pages_radix_node_alloc(vm_page_radix_ptr_t *slot)
2975 {
2976 	vm_page_radix_node_t node;
2977 
2978 	node = zalloc_permanent(sizeof(struct vm_page_radix_node),
2979 	    VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
2980 	os_atomic_store(slot, vm_page_radix_node_pack(node), release);
2981 	return node;
2982 }
2983 
2984 static vm_page_radix_node_t
2985 vm_pages_radix_load_root(uint32_t *level)
2986 {
2987 	const uintptr_t VM_PAGE_RADIX_LEVEL_MASK = 0x7ul;
2988 
2989 	uintptr_t root = os_atomic_load(&vm_pages_radix_root, dependency);
2990 
2991 	*level = root & VM_PAGE_RADIX_LEVEL_MASK;
2992 	root &= ~VM_PAGE_RADIX_LEVEL_MASK;
2993 	return (vm_page_radix_node_t)root;
2994 }
2995 
2996 vm_page_t
2997 vm_pages_radix_next(uint32_t *cursor, ppnum_t *pnum)
2998 {
2999 	const uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum;
3000 	vm_page_radix_node_t node;
3001 	uint32_t level, index;
3002 
3003 	index  = *cursor;
3004 	node   = vm_pages_radix_load_root(&level);
3005 
3006 	if (node == NULL) {
3007 		return VM_PAGE_NULL;
3008 	}
3009 
3010 	while (index < max_index) {
3011 		vm_page_radix_ptr_t *slot = vm_page_radix_slot(node, level, index);
3012 		vm_page_radix_ptr_t  ptr  = os_atomic_load(slot, dependency);
3013 
3014 		if (ptr == 0) {
3015 			uint32_t stride = 1 << (VM_PAGE_RADIX_FANOUT_SHIFT * level);
3016 
3017 			index = (index + stride) & -stride;
3018 			if (vm_page_radix_key(level, index) == 0) {
3019 				/* restart lookup at the top */
3020 				node = vm_pages_radix_load_root(&level);
3021 			}
3022 		} else if (level > 0) {
3023 			node   = vm_page_radix_node_unpack(ptr);
3024 			level -= 1;
3025 		} else {
3026 			*cursor = index + 1;
3027 			if (pnum) {
3028 				*pnum = pmap_first_pnum + index;
3029 			}
3030 			return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
3031 		}
3032 	}
3033 
3034 	if (pnum) {
3035 		*pnum = 0;
3036 	}
3037 	return VM_PAGE_NULL;
3038 }
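/*
 * A minimal iteration sketch over all radix-tracked pages (the pattern the
 * vm_pages_radix_for_each() macro used below presumably expands to):
 *
 *	uint32_t  cursor = 0;
 *	ppnum_t   pnum;
 *	vm_page_t m;
 *
 *	while ((m = vm_pages_radix_next(&cursor, &pnum)) != VM_PAGE_NULL) {
 *		...	m is the canonical vm_page_t for physical page pnum
 *	}
 */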
3039 
3040 #if DEBUG || DEVELOPMENT
3041 
3042 static int
3043 vm_page_radix_verify_test(int64_t in __unused, int64_t *out)
3044 {
3045 	uint32_t count = 0;
3046 	vm_page_t mem;
3047 
3048 	lck_mtx_lock(&vm_pages_radix_lock);
3049 
3050 	vm_pages_radix_for_each(mem) {
3051 		count++;
3052 		assert(mem == vm_page_find_canonical(VM_PAGE_GET_PHYS_PAGE(mem)));
3053 	}
3054 
3055 	assert(count == vm_pages_radix_count);
3056 
3057 	lck_mtx_unlock(&vm_pages_radix_lock);
3058 
3059 	*out = 1;
3060 	return 0;
3061 }
3062 SYSCTL_TEST_REGISTER(vm_page_radix_verify, vm_page_radix_verify_test);
3063 
3064 #endif /* DEBUG || DEVELOPMENT */
3065 
3066 __attribute__((noinline))
3067 static void
3068 vm_pages_radix_insert(ppnum_t pnum, vm_page_t page)
3069 {
3070 	vm_page_radix_ptr_t *slot;
3071 	vm_page_radix_node_t node;
3072 	uint32_t level, index;
3073 
3074 	assert(!vm_page_in_array(page));
3075 	index = pnum - pmap_first_pnum;
3076 
3077 	lck_mtx_lock(&vm_pages_radix_lock);
3078 
3079 	node = vm_pages_radix_load_root(&level);
3080 	if (node == NULL) {
3081 		node = vm_pages_radix_init_root(&level);
3082 	}
3083 
3084 	for (; level > 0; level--) {
3085 		slot = vm_page_radix_slot(node, level, index);
3086 		if (*slot == 0) {
3087 			node = vm_pages_radix_node_alloc(slot);
3088 		} else {
3089 			node = vm_page_radix_node_unpack(*slot);
3090 		}
3091 	}
3092 
3093 	slot = vm_page_radix_slot(node, 0, index);
3094 	assert(*slot == 0);
3095 	os_atomic_store(slot, VM_PAGE_PACK_PTR(page), release);
3096 	vm_pages_radix_count++;
3097 
3098 	lck_mtx_unlock(&vm_pages_radix_lock);
3099 }
3100 
3101 __abortlike
3102 static void
3103 vm_page_for_ppnum_panic(ppnum_t pnum)
3104 {
3105 	if (pnum < pmap_first_pnum) {
3106 		panic("physical page is before the start of DRAM: %#x < %#x)",
3107 		    pnum, pmap_first_pnum);
3108 	}
3109 	panic("physical page is beyond the end of managed DRAM: %#x >= %#x)",
3110 	    pnum, vm_pages_first_pnum + vm_pages_count);
3111 }
3112 
3113 vm_page_t
3114 vm_page_find_canonical(ppnum_t pnum)
3115 {
3116 	vm_page_radix_ptr_t *slot;
3117 	vm_page_radix_node_t node;
3118 	vm_page_radix_ptr_t  ptr;
3119 	uint32_t level, index;
3120 
3121 	if (pnum < pmap_first_pnum) {
3122 		vm_page_for_ppnum_panic(pnum);
3123 	}
3124 
3125 	if (pnum >= vm_pages_first_pnum + vm_pages_count) {
3126 		/*
3127 		 * We could receive requests for pages which are beyond xnu's managed space (e.g. ECC errors).
3128 		 * These need to be handled gracefully, so we return VM_PAGE_NULL here.
3129 		 */
3130 		return VM_PAGE_NULL;
3131 	}
3132 
3133 	if (__probable(pnum >= vm_pages_first_pnum)) {
3134 		return vm_page_get(pnum - vm_pages_first_pnum);
3135 	}
3136 
3137 	index = pnum - pmap_first_pnum;
3138 	node  = vm_pages_radix_load_root(&level);
3139 
3140 	for (; node && level > 0; level--) {
3141 		slot = vm_page_radix_slot(node, level, index);
3142 		ptr  = os_atomic_load(slot, dependency);
3143 		node = vm_page_radix_node_unpack(ptr);
3144 	}
3145 
3146 	if (__probable(node)) {
3147 		slot = vm_page_radix_slot(node, 0, index);
3148 		ptr  = os_atomic_load(slot, dependency);
3149 		return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
3150 	}
3151 
3152 	return VM_PAGE_NULL;
3153 }
3154 
3155 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
3156 
3157 /*!
3158  * @function vm_page_create()
3159  *
3160  * @brief
3161  * Common helper for all vm_page_create* functions.
3162  */
3163 vm_page_t
3164 vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags)
3165 {
3166 	vm_page_t m;
3167 
3168 	m = zalloc_flags(vm_page_zone, flags);
3169 	if (m) {
3170 		vm_page_init(m, phys_page);
3171 		if (phys_page == vm_page_guard_addr) {
3172 			counter_inc(&vm_guard_count);
3173 		}
3174 	}
3175 	if (canonical) {
3176 		assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
3177 		m->vmp_canonical = true;
3178 #if HAS_MTE
3179 		m->vmp_using_mte = pmap_is_tagged_page(phys_page);
3180 #endif /* HAS_MTE */
3181 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
3182 		vm_pages_radix_insert(phys_page, m);
3183 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
3184 		vm_free_page_lock();
3185 		vm_page_pages++;
3186 		vm_free_page_unlock();
3187 	}
3188 	return m;
3189 }
3190 
3191 /*
3192  *	Routine:	vm_page_create_canonical
3193  *	Purpose:
3194  *		After the VM system is up, machine-dependent code
3195  *		may stumble across more physical memory.  For example,
3196  *		memory that it was reserving for a frame buffer.
3197  *		vm_page_create_canonical turns this memory into available pages.
3198  */
3199 
3200 void
3201 vm_page_create_canonical(ppnum_t phys_page)
3202 {
3203 	vm_page_t m;
3204 
3205 	m = vm_page_create(phys_page, true, Z_WAITOK);
3206 	vm_page_release(m, VMP_RELEASE_NONE);
3207 }
3208 
3209 
3210 /*
3211  *	vm_page_hash:
3212  *
3213  *	Distributes the object/offset key pair among hash buckets.
3214  *
3215  *	NOTE:	The bucket count must be a power of 2
3216  */
3217 #define vm_page_hash(object, offset) (\
3218 	( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
3219 	 & vm_page_hash_mask)
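/*
 * A lookup or insert against the hash table follows this pattern (as used by
 * vm_page_insert_internal() below):
 *
 *	hash_id     = vm_page_hash(object, offset);
 *	bucket      = &vm_page_buckets[hash_id];
 *	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
 */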
3220 
3221 
3222 /*
3223  *	vm_page_insert:		[ internal use only ]
3224  *
3225  *	Inserts the given mem entry into the object/object-page
3226  *	table and object list.
3227  *
3228  *	The object must be locked.
3229  */
3230 void
3231 vm_page_insert(
3232 	vm_page_t               mem,
3233 	vm_object_t             object,
3234 	vm_object_offset_t      offset)
3235 {
3236 	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, FALSE, FALSE, NULL);
3237 }
3238 
3239 void
3240 vm_page_insert_wired(
3241 	vm_page_t               mem,
3242 	vm_object_t             object,
3243 	vm_object_offset_t      offset,
3244 	vm_tag_t                tag)
3245 {
3246 	vm_page_insert_internal(mem, object, offset, tag, FALSE, TRUE, FALSE, FALSE, NULL);
3247 }
3248 
3249 void
3250 vm_page_insert_internal(
3251 	vm_page_t               mem,
3252 	vm_object_t             object,
3253 	vm_object_offset_t      offset,
3254 	vm_tag_t                tag,
3255 	boolean_t               queues_lock_held,
3256 	boolean_t               insert_in_hash,
3257 	boolean_t               batch_pmap_op,
3258 	boolean_t               batch_accounting,
3259 	uint64_t                *delayed_ledger_update)
3260 {
3261 	vm_page_bucket_t        *bucket;
3262 	lck_ticket_t            *bucket_lock;
3263 	int                     hash_id;
3264 	task_t                  owner;
3265 	int                     ledger_idx_volatile;
3266 	int                     ledger_idx_nonvolatile;
3267 	int                     ledger_idx_volatile_compressed;
3268 	int                     ledger_idx_nonvolatile_compressed;
3269 	int                     ledger_idx_composite;
3270 	int                     ledger_idx_external_wired;
3271 	boolean_t               do_footprint;
3272 
3273 #if 0
3274 	/*
3275 	 * we may not hold the page queue lock
3276 	 * so this check isn't safe to make
3277 	 */
3278 	VM_PAGE_CHECK(mem);
3279 #endif
3280 
3281 	assertf(page_aligned(offset), "0x%llx\n", offset);
3282 
3283 	assert(!VM_PAGE_WIRED(mem) || !vm_page_is_canonical(mem) ||
3284 	    (tag != VM_KERN_MEMORY_NONE));
3285 
3286 #if HAS_MTE
3287 	assert_mte_vmo_matches_vmp(object, mem);
3288 #endif /* HAS_MTE */
3289 	vm_object_lock_assert_exclusive(object);
3290 	LCK_MTX_ASSERT(&vm_page_queue_lock,
3291 	    queues_lock_held ? LCK_MTX_ASSERT_OWNED
3292 	    : LCK_MTX_ASSERT_NOTOWNED);
3293 
3294 	if (queues_lock_held == FALSE) {
3295 		assert(!VM_PAGE_PAGEABLE(mem));
3296 	}
3297 
3298 	if (insert_in_hash == TRUE) {
3299 #if DEBUG || VM_PAGE_BUCKETS_CHECK
3300 		if (mem->vmp_tabled || mem->vmp_object) {
3301 			panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
3302 			    "already in (obj=%p,off=0x%llx)",
3303 			    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
3304 		}
3305 #endif
3306 		if (object->internal && (offset >= object->vo_size)) {
3307 			panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
3308 			    mem, object, offset, object->vo_size);
3309 		}
3310 
3311 		assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);
3312 
3313 		/*
3314 		 *	Record the object/offset pair in this page
3315 		 */
3316 
3317 		mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
3318 		mem->vmp_offset = offset;
3319 
3320 #if CONFIG_SECLUDED_MEMORY
3321 		if (object->eligible_for_secluded) {
3322 			vm_page_secluded.eligible_for_secluded++;
3323 		}
3324 #endif /* CONFIG_SECLUDED_MEMORY */
3325 
3326 		/*
3327 		 *	Insert it into the object/offset hash table
3328 		 */
3329 		hash_id = vm_page_hash(object, offset);
3330 		bucket = &vm_page_buckets[hash_id];
3331 		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3332 
3333 		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3334 
3335 		mem->vmp_next_m = bucket->page_list;
3336 		bucket->page_list = VM_PAGE_PACK_PTR(mem);
3337 		assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));
3338 
3339 #if     MACH_PAGE_HASH_STATS
3340 		if (++bucket->cur_count > bucket->hi_count) {
3341 			bucket->hi_count = bucket->cur_count;
3342 		}
3343 #endif /* MACH_PAGE_HASH_STATS */
3344 		mem->vmp_hashed = TRUE;
3345 		lck_ticket_unlock(bucket_lock);
3346 	}
3347 
3348 	{
3349 		unsigned int    cache_attr;
3350 
3351 		cache_attr = object->wimg_bits & VM_WIMG_MASK;
3352 
3353 #if HAS_MTE
3354 		/*
3355 		 * Set the cache attributes only if they are neither the default
3356 		 * attributes nor VM_WIMG_MTE: in the MTE case they would already
3357 		 * have been set before the page was inserted into this object,
3358 		 * so there is no need to take the hit of setting them again.
3359 		 *
3360 		 *
3361 		 */
3362 		if (cache_attr == VM_WIMG_MTE) {
3363 			if (vm_object_is_mte_mappable_with_page(object, mem)) {
3364 				/*
3365 				 * By now, we expect non-fictitious pages to have already been
3366 				 * tagged. This should happen in mteinfo_page_list_fix_tagging()
3367 				 * when the page is inserted onto the per-CPU free tagged queue.
3368 				 */
3369 				assert(mem->vmp_using_mte);
3370 				assert(pmap_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem)) == VM_WIMG_MTE);
3371 			} else {
3372 				/*
3373 				 * We don't want the object for fictitious pages to have its
3374 				 * cache attributes set if the object is MTE.
3375 				 */
3376 			}
3377 		} else {
3378 #endif /* HAS_MTE */
3379 
3380 		if (cache_attr != VM_WIMG_USE_DEFAULT) {
3381 			PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
3382 		}
3383 
3384 #if HAS_MTE
3385 	}
3386 #endif
3387 	}
3388 
3389 	/*
3390 	 *	Now link into the object's list of backed pages.
3391 	 */
3392 	vm_page_queue_enter(&object->memq, mem, vmp_listq);
3393 	object->memq_hint = mem;
3394 	mem->vmp_tabled = TRUE;
3395 
3396 	/*
3397 	 *	Show that the object has one more resident page.
3398 	 */
3399 
3400 	object->resident_page_count++;
3401 	if (VM_PAGE_WIRED(mem)) {
3402 		assert(mem->vmp_wire_count > 0);
3403 		VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
3404 		VM_OBJECT_WIRED_PAGE_ADD(object, mem);
3405 		VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
3406 	}
3407 	assert(object->resident_page_count >= object->wired_page_count);
3408 
3409 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
3410 	vm_object_set_chead_hint(object);
3411 #endif
3412 
3413 #if DEVELOPMENT || DEBUG
3414 	if (object->object_is_shared_cache &&
3415 	    object->pager != NULL &&
3416 	    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3417 		int new, old;
3418 		assert(!object->internal);
3419 		new = OSAddAtomic(+1, &shared_region_pagers_resident_count);
3420 		do {
3421 			old = shared_region_pagers_resident_peak;
3422 		} while (old < new &&
3423 		    !OSCompareAndSwap(old, new, &shared_region_pagers_resident_peak));
3424 	}
3425 #endif /* DEVELOPMENT || DEBUG */
3426 
3427 	if (batch_accounting == FALSE) {
3428 		if (object->internal) {
3429 			OSAddAtomic(1, &vm_page_internal_count);
3430 		} else {
3431 			OSAddAtomic(1, &vm_page_external_count);
3432 		}
3433 	}
3434 
3435 	/*
3436 	 * It wouldn't make sense to insert a "reusable" page in
3437 	 * an object (the page would have been marked "reusable" only
3438 	 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
3439 	 * in the object at that time).
3440 	 * But a page could be inserted in an "all_reusable" object, if
3441 	 * something faults it in (a vm_read() from another task or a
3442 	 * "use-after-free" issue in user space, for example).  It can
3443 	 * also happen if we're relocating a page from that object to
3444 	 * a different physical page during a physically-contiguous
3445 	 * allocation.
3446 	 */
3447 	assert(!mem->vmp_reusable);
3448 	if (object->all_reusable) {
3449 		OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
3450 	}
3451 
3452 	if (object->purgable == VM_PURGABLE_DENY &&
3453 	    !object->vo_ledger_tag) {
3454 		owner = TASK_NULL;
3455 	} else {
3456 		owner = VM_OBJECT_OWNER(object);
3457 		vm_object_ledger_tag_ledgers(object,
3458 		    &ledger_idx_volatile,
3459 		    &ledger_idx_nonvolatile,
3460 		    &ledger_idx_volatile_compressed,
3461 		    &ledger_idx_nonvolatile_compressed,
3462 		    &ledger_idx_composite,
3463 		    &ledger_idx_external_wired,
3464 		    &do_footprint);
3465 	}
3466 	if (owner &&
3467 	    object->internal &&
3468 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
3469 	    object->purgable == VM_PURGABLE_DENY ||
3470 	    VM_PAGE_WIRED(mem))) {
3471 		if (delayed_ledger_update) {
3472 			*delayed_ledger_update += PAGE_SIZE;
3473 		} else {
3474 			/* more non-volatile bytes */
3475 			ledger_credit(owner->ledger,
3476 			    ledger_idx_nonvolatile,
3477 			    PAGE_SIZE);
3478 			if (do_footprint) {
3479 				/* more footprint */
3480 				ledger_credit(owner->ledger,
3481 				    task_ledgers.phys_footprint,
3482 				    PAGE_SIZE);
3483 			} else if (ledger_idx_composite != -1) {
3484 				ledger_credit(owner->ledger,
3485 				    ledger_idx_composite,
3486 				    PAGE_SIZE);
3487 			}
3488 		}
3489 	} else if (owner &&
3490 	    object->internal &&
3491 	    (object->purgable == VM_PURGABLE_VOLATILE ||
3492 	    object->purgable == VM_PURGABLE_EMPTY)) {
3493 		assert(!VM_PAGE_WIRED(mem));
3494 		/* more volatile bytes */
3495 		ledger_credit(owner->ledger,
3496 		    ledger_idx_volatile,
3497 		    PAGE_SIZE);
3498 	}
3499 
3500 	if (object->purgable == VM_PURGABLE_VOLATILE) {
3501 		if (VM_PAGE_WIRED(mem)) {
3502 			OSAddAtomic(+1, &vm_page_purgeable_wired_count);
3503 		} else {
3504 			OSAddAtomic(+1, &vm_page_purgeable_count);
3505 		}
3506 	} else if (object->purgable == VM_PURGABLE_EMPTY &&
3507 	    mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
3508 		/*
3509 		 * This page belongs to a purged VM object but hasn't
3510 		 * been purged (because it was "busy").
3511 		 * It's in the "throttled" queue and hence not
3512 		 * visible to vm_pageout_scan().  Move it to a pageable
3513 		 * queue, so that it can eventually be reclaimed, instead
3514 		 * of lingering in the "empty" object.
3515 		 */
3516 		if (queues_lock_held == FALSE) {
3517 			vm_page_lockspin_queues();
3518 		}
3519 		vm_page_deactivate(mem);
3520 		if (queues_lock_held == FALSE) {
3521 			vm_page_unlock_queues();
3522 		}
3523 	}
3524 
3525 #if HAS_MTE
3526 	/*
3527 	 * If adding pages to the compressor object, account for whether it's
3528 	 * tag storage or not.
3529 	 */
3530 	if (object == compressor_object) {
3531 		if (vm_page_is_tag_storage(mem)) {
3532 			counter_inc(&compressor_tag_storage_pages_in_pool);
3533 		} else {
3534 			counter_inc(&compressor_non_tag_storage_pages_in_pool);
3535 		}
3536 	}
3537 #endif /* HAS_MTE */
3538 
3539 #if VM_OBJECT_TRACKING_OP_MODIFIED
3540 	if (vm_object_tracking_btlog &&
3541 	    object->internal &&
3542 	    object->resident_page_count == 0 &&
3543 	    object->pager == NULL &&
3544 	    object->shadow != NULL &&
3545 	    object->shadow->vo_copy == object) {
3546 		btlog_record(vm_object_tracking_btlog, object,
3547 		    VM_OBJECT_TRACKING_OP_MODIFIED,
3548 		    btref_get(__builtin_frame_address(0), 0));
3549 	}
3550 #endif /* VM_OBJECT_TRACKING_OP_MODIFIED */
3551 }
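
/*
 * Sketch of the delayed-ledger pattern accepted above (hypothetical
 * batch caller): instead of taking one ledger_credit() per page, a
 * caller may pass an accumulator through delayed_ledger_update and
 * credit the owner's nonvolatile ledger once for the whole batch.
 *
 *	uint64_t delayed = 0;
 *
 *	for each page m in the batch:
 *		vm_page_insert_internal(m, object, offset, tag,
 *		    FALSE, TRUE, FALSE, TRUE, &delayed);
 *	if (delayed != 0)
 *		ledger_credit(owner->ledger, ledger_idx_nonvolatile, delayed);
 */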
3552 
3553 /*
3554  *	vm_page_replace:
3555  *
3556  *	Exactly like vm_page_insert, except that we first
3557  *	remove any existing page at the given offset in object.
3558  *
3559  *	The object must be locked.
3560  */
3561 void
3562 vm_page_replace(
3563 	vm_page_t               mem,
3564 	vm_object_t             object,
3565 	vm_object_offset_t      offset)
3566 {
3567 	vm_page_bucket_t *bucket;
3568 	vm_page_t        found_m = VM_PAGE_NULL;
3569 	lck_ticket_t    *bucket_lock;
3570 	int              hash_id;
3571 
3572 #if 0
3573 	/*
3574 	 * we don't hold the page queue lock
3575 	 * so this check isn't safe to make
3576 	 */
3577 	VM_PAGE_CHECK(mem);
3578 #endif
3579 #if HAS_MTE
3580 	assert_mte_vmo_matches_vmp(object, mem);
3581 #endif /* HAS_MTE */
3582 	vm_object_lock_assert_exclusive(object);
3583 #if DEBUG || VM_PAGE_BUCKETS_CHECK
3584 	if (mem->vmp_tabled || mem->vmp_object) {
3585 		panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
3586 		    "already in (obj=%p,off=0x%llx)",
3587 		    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
3588 	}
3589 #endif
3590 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3591 
3592 	assert(!VM_PAGE_PAGEABLE(mem));
3593 
3594 	/*
3595 	 *	Record the object/offset pair in this page
3596 	 */
3597 	mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
3598 	mem->vmp_offset = offset;
3599 
3600 	/*
3601 	 *	Insert it into the object/offset hash table,
3602 	 *	replacing any page that might have been there.
3603 	 */
3604 
3605 	hash_id = vm_page_hash(object, offset);
3606 	bucket = &vm_page_buckets[hash_id];
3607 	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3608 
3609 	lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3610 
3611 	if (bucket->page_list) {
3612 		vm_page_packed_t *mp = &bucket->page_list;
3613 		vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp));
3614 
3615 		do {
3616 			/*
3617 			 * compare packed object pointers
3618 			 */
3619 			if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
3620 				/*
3621 				 * Remove old page from hash list
3622 				 */
3623 				*mp = m->vmp_next_m;
3624 				m->vmp_hashed = FALSE;
3625 				m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3626 
3627 				found_m = m;
3628 				break;
3629 			}
3630 			mp = &m->vmp_next_m;
3631 		} while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));
3632 
3633 		mem->vmp_next_m = bucket->page_list;
3634 	} else {
3635 		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3636 	}
3637 	/*
3638 	 * insert new page at head of hash list
3639 	 */
3640 	bucket->page_list = VM_PAGE_PACK_PTR(mem);
3641 	mem->vmp_hashed = TRUE;
3642 
3643 	lck_ticket_unlock(bucket_lock);
3644 
3645 	if (found_m) {
3646 		/*
3647 		 * there was already a page at the specified
3648 		 * offset for this object... remove it from
3649 		 * the object and free it back to the free list
3650 		 */
3651 		vm_page_free_unlocked(found_m, FALSE);
3652 	}
3653 	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, FALSE, FALSE, FALSE, NULL);
3654 }
3655 
3656 /*
3657  *	vm_page_remove:		[ internal use only ]
3658  *
3659  *	Removes the given mem entry from the object/offset-page
3660  *	table and the object page list.
3661  *
3662  *	The object must be locked.
3663  */
3664 
3665 void
3666 vm_page_remove(
3667 	vm_page_t       mem,
3668 	boolean_t       remove_from_hash)
3669 {
3670 	vm_page_bucket_t *bucket;
3671 	vm_page_t       this;
3672 	lck_ticket_t   *bucket_lock;
3673 	int             hash_id;
3674 	task_t          owner;
3675 	vm_object_t     m_object;
3676 	int             ledger_idx_volatile;
3677 	int             ledger_idx_nonvolatile;
3678 	int             ledger_idx_volatile_compressed;
3679 	int             ledger_idx_nonvolatile_compressed;
3680 	int             ledger_idx_composite;
3681 	int             ledger_idx_external_wired;
3682 	int             do_footprint;
3683 
3684 	m_object = VM_PAGE_OBJECT(mem);
3685 
3686 	vm_object_lock_assert_exclusive(m_object);
3687 	assert(mem->vmp_tabled);
3688 	assert(!mem->vmp_cleaning);
3689 	assert(!mem->vmp_laundry);
3690 
3691 	if (VM_PAGE_PAGEABLE(mem)) {
3692 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3693 	}
3694 #if 0
3695 	/*
3696 	 * we don't hold the page queue lock
3697 	 * so this check isn't safe to make
3698 	 */
3699 	VM_PAGE_CHECK(mem);
3700 #endif
3701 	if (remove_from_hash == TRUE) {
3702 		/*
3703 		 *	Remove from the object/offset hash table
3704 		 */
3705 		hash_id = vm_page_hash(m_object, mem->vmp_offset);
3706 		bucket = &vm_page_buckets[hash_id];
3707 		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3708 
3709 		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3710 
3711 		if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
3712 			/* optimize for common case */
3713 
3714 			bucket->page_list = mem->vmp_next_m;
3715 		} else {
3716 			vm_page_packed_t        *prev;
3717 
3718 			for (prev = &this->vmp_next_m;
3719 			    (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
3720 			    prev = &this->vmp_next_m) {
3721 				continue;
3722 			}
3723 			*prev = this->vmp_next_m;
3724 		}
3725 #if     MACH_PAGE_HASH_STATS
3726 		bucket->cur_count--;
3727 #endif /* MACH_PAGE_HASH_STATS */
3728 		mem->vmp_hashed = FALSE;
3729 		this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3730 		lck_ticket_unlock(bucket_lock);
3731 	}
3732 	/*
3733 	 *	Now remove from the object's list of backed pages.
3734 	 */
3735 
3736 	vm_page_remove_internal(mem);
3737 
3738 	/*
3739 	 *	And show that the object has one fewer resident
3740 	 *	page.
3741 	 */
3742 
3743 	assert(m_object->resident_page_count > 0);
3744 	m_object->resident_page_count--;
3745 
3746 #if DEVELOPMENT || DEBUG
3747 	if (m_object->object_is_shared_cache &&
3748 	    m_object->pager != NULL &&
3749 	    m_object->pager->mo_pager_ops == &shared_region_pager_ops) {
3750 		assert(!m_object->internal);
3751 		OSAddAtomic(-1, &shared_region_pagers_resident_count);
3752 	}
3753 #endif /* DEVELOPMENT || DEBUG */
3754 
3755 	if (m_object->internal) {
3756 #if DEBUG
3757 		assert(vm_page_internal_count);
3758 #endif /* DEBUG */
3759 
3760 		OSAddAtomic(-1, &vm_page_internal_count);
3761 	} else {
3762 		assert(vm_page_external_count);
3763 		OSAddAtomic(-1, &vm_page_external_count);
3764 
3765 		if (mem->vmp_xpmapped) {
3766 			assert(vm_page_xpmapped_external_count);
3767 			OSAddAtomic(-1, &vm_page_xpmapped_external_count);
3768 		}
3769 	}
3770 	if (!m_object->internal &&
3771 	    m_object->cached_list.next &&
3772 	    m_object->cached_list.prev) {
3773 		if (m_object->resident_page_count == 0) {
3774 			vm_object_cache_remove(m_object);
3775 		}
3776 	}
3777 
3778 	if (VM_PAGE_WIRED(mem)) {
3779 		assert(mem->vmp_wire_count > 0);
3780 		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
3781 		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
3782 		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
3783 	}
3784 	assert(m_object->resident_page_count >=
3785 	    m_object->wired_page_count);
3786 	if (mem->vmp_reusable) {
3787 		assert(m_object->reusable_page_count > 0);
3788 		m_object->reusable_page_count--;
3789 		assert(m_object->reusable_page_count <=
3790 		    m_object->resident_page_count);
3791 		mem->vmp_reusable = FALSE;
3792 		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
3793 		vm_page_stats_reusable.reused_remove++;
3794 	} else if (m_object->all_reusable) {
3795 		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
3796 		vm_page_stats_reusable.reused_remove++;
3797 	}
3798 
3799 	if (m_object->purgable == VM_PURGABLE_DENY &&
3800 	    !m_object->vo_ledger_tag) {
3801 		owner = TASK_NULL;
3802 	} else {
3803 		owner = VM_OBJECT_OWNER(m_object);
3804 		vm_object_ledger_tag_ledgers(m_object,
3805 		    &ledger_idx_volatile,
3806 		    &ledger_idx_nonvolatile,
3807 		    &ledger_idx_volatile_compressed,
3808 		    &ledger_idx_nonvolatile_compressed,
3809 		    &ledger_idx_composite,
3810 		    &ledger_idx_external_wired,
3811 		    &do_footprint);
3812 	}
3813 	if (owner &&
3814 	    m_object->internal &&
3815 	    (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
3816 	    m_object->purgable == VM_PURGABLE_DENY ||
3817 	    VM_PAGE_WIRED(mem))) {
3818 		/* less non-volatile bytes */
3819 		ledger_debit(owner->ledger,
3820 		    ledger_idx_nonvolatile,
3821 		    PAGE_SIZE);
3822 		if (do_footprint) {
3823 			/* less footprint */
3824 			ledger_debit(owner->ledger,
3825 			    task_ledgers.phys_footprint,
3826 			    PAGE_SIZE);
3827 		} else if (ledger_idx_composite != -1) {
3828 			ledger_debit(owner->ledger,
3829 			    ledger_idx_composite,
3830 			    PAGE_SIZE);
3831 		}
3832 	} else if (owner &&
3833 	    m_object->internal &&
3834 	    (m_object->purgable == VM_PURGABLE_VOLATILE ||
3835 	    m_object->purgable == VM_PURGABLE_EMPTY)) {
3836 		assert(!VM_PAGE_WIRED(mem));
3837 		/* less volatile bytes */
3838 		ledger_debit(owner->ledger,
3839 		    ledger_idx_volatile,
3840 		    PAGE_SIZE);
3841 	}
3842 
3843 	if (m_object->purgable == VM_PURGABLE_VOLATILE) {
3844 		if (VM_PAGE_WIRED(mem)) {
3845 			assert(vm_page_purgeable_wired_count > 0);
3846 			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
3847 		} else {
3848 			assert(vm_page_purgeable_count > 0);
3849 			OSAddAtomic(-1, &vm_page_purgeable_count);
3850 		}
3851 	}
3852 
3853 #if HAS_MTE
3854 	/*
3855 	 * If removing pages from the compressor object, account for whether it's
3856 	 * tag storage or not.
3857 	 */
3858 	if (m_object == compressor_object) {
3859 		if (vm_page_is_tag_storage(mem)) {
3860 			counter_dec(&compressor_tag_storage_pages_in_pool);
3861 		} else {
3862 			counter_dec(&compressor_non_tag_storage_pages_in_pool);
3863 		}
3864 	}
3865 
3866 	assert_mte_vmo_matches_vmp(m_object, mem);
3867 	if (!vm_object_is_mte_mappable(m_object)) {
3868 #endif /* HAS_MTE */
3869 	if (m_object->set_cache_attr == TRUE) {
3870 		pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
3871 	}
3872 #if HAS_MTE
3873 }
3874 #endif /* HAS_MTE */
3875 
3876 	mem->vmp_tabled = FALSE;
3877 	mem->vmp_object = 0;
3878 	mem->vmp_offset = (vm_object_offset_t) -1;
3879 }
3880 
3881 
3882 /*
3883  *	vm_page_lookup:
3884  *
3885  *	Returns the page associated with the object/offset
3886  *	pair specified; if none is found, VM_PAGE_NULL is returned.
3887  *
3888  *	The object must be locked.  No side effects.
3889  */
3890 
3891 #define VM_PAGE_HASH_LOOKUP_THRESHOLD   10
3892 
3893 #if DEBUG_VM_PAGE_LOOKUP
3894 
3895 struct {
3896 	uint64_t        vpl_total;
3897 	uint64_t        vpl_empty_obj;
3898 	uint64_t        vpl_bucket_NULL;
3899 	uint64_t        vpl_hit_hint;
3900 	uint64_t        vpl_hit_hint_next;
3901 	uint64_t        vpl_hit_hint_prev;
3902 	uint64_t        vpl_fast;
3903 	uint64_t        vpl_slow;
3904 	uint64_t        vpl_hit;
3905 	uint64_t        vpl_miss;
3906 
3907 	uint64_t        vpl_fast_elapsed;
3908 	uint64_t        vpl_slow_elapsed;
3909 } vm_page_lookup_stats __attribute__((aligned(8)));
3910 
3911 #endif
3912 
3913 #define KDP_VM_PAGE_WALK_MAX    1000
3914 
3915 vm_page_t
3916 kdp_vm_page_lookup(
3917 	vm_object_t             object,
3918 	vm_object_offset_t      offset)
3919 {
3920 	vm_page_t cur_page;
3921 	int num_traversed = 0;
3922 
3923 	if (not_in_kdp) {
3924 		panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
3925 	}
3926 
3927 	vm_page_queue_iterate(&object->memq, cur_page, vmp_listq) {
3928 		if (cur_page->vmp_offset == offset) {
3929 			return cur_page;
3930 		}
3931 		num_traversed++;
3932 
3933 		if (num_traversed >= KDP_VM_PAGE_WALK_MAX) {
3934 			return VM_PAGE_NULL;
3935 		}
3936 	}
3937 
3938 	return VM_PAGE_NULL;
3939 }
3940 
3941 vm_page_t
3942 vm_page_lookup(
3943 	vm_object_t             object,
3944 	vm_object_offset_t      offset)
3945 {
3946 	vm_page_t       mem;
3947 	vm_page_bucket_t *bucket;
3948 	vm_page_queue_entry_t   qe;
3949 	lck_ticket_t   *bucket_lock = NULL;
3950 	int             hash_id;
3951 #if DEBUG_VM_PAGE_LOOKUP
3952 	uint64_t        start, elapsed;
3953 
3954 	OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total);
3955 #endif
3956 
3957 #if KASAN_TBI
3958 	if (is_kernel_object(object)) {
3959 		offset = vm_memtag_canonicalize_kernel(offset);
3960 	}
3961 #endif /* KASAN_TBI */
3962 
3963 	vm_object_lock_assert_held(object);
3964 	assertf(page_aligned(offset), "offset 0x%llx\n", offset);
3965 
3966 	if (object->resident_page_count == 0) {
3967 #if DEBUG_VM_PAGE_LOOKUP
3968 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_empty_obj);
3969 #endif
3970 		return VM_PAGE_NULL;
3971 	}
3972 
3973 	mem = object->memq_hint;
3974 
3975 	if (mem != VM_PAGE_NULL) {
3976 		assert(VM_PAGE_OBJECT(mem) == object);
3977 
3978 		if (mem->vmp_offset == offset) {
3979 #if DEBUG_VM_PAGE_LOOKUP
3980 			OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
3981 #endif
3982 			return mem;
3983 		}
3984 		qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);
3985 
3986 		if (!vm_page_queue_end(&object->memq, qe)) {
3987 			vm_page_t       next_page;
3988 
3989 			next_page = (vm_page_t)((uintptr_t)qe);
3990 			assert(VM_PAGE_OBJECT(next_page) == object);
3991 
3992 			if (next_page->vmp_offset == offset) {
3993 				object->memq_hint = next_page; /* new hint */
3994 #if DEBUG_VM_PAGE_LOOKUP
3995 				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
3996 #endif
3997 				return next_page;
3998 			}
3999 		}
4000 		qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);
4001 
4002 		if (!vm_page_queue_end(&object->memq, qe)) {
4003 			vm_page_t prev_page;
4004 
4005 			prev_page = (vm_page_t)((uintptr_t)qe);
4006 			assert(VM_PAGE_OBJECT(prev_page) == object);
4007 
4008 			if (prev_page->vmp_offset == offset) {
4009 				object->memq_hint = prev_page; /* new hint */
4010 #if DEBUG_VM_PAGE_LOOKUP
4011 				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
4012 #endif
4013 				return prev_page;
4014 			}
4015 		}
4016 	}
4017 	/*
4018 	 * Search the hash table for this object/offset pair
4019 	 */
4020 	hash_id = vm_page_hash(object, offset);
4021 	bucket = &vm_page_buckets[hash_id];
4022 
4023 	/*
4024 	 * Since we hold the object lock, we are guaranteed that no
4025 	 * new pages can be inserted into this object... this in turn
4026 	 * guarantees that the page we're looking for can't exist
4027 	 * if the bucket it hashes to is currently NULL, even when looked
4028 	 * at outside the scope of the hash bucket lock... this is a
4029 	 * really cheap optimization to avoid taking the lock.
4030 	 */
4031 	if (!bucket->page_list) {
4032 #if DEBUG_VM_PAGE_LOOKUP
4033 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_bucket_NULL);
4034 #endif
4035 		return VM_PAGE_NULL;
4036 	}
4037 
4038 #if DEBUG_VM_PAGE_LOOKUP
4039 	start = mach_absolute_time();
4040 #endif
4041 	if (object->resident_page_count <= VM_PAGE_HASH_LOOKUP_THRESHOLD) {
4042 		/*
4043 		 * on average, it's roughly 3 times faster to run a short memq list
4044 		 * than to take the spin lock and go through the hash list
4045 		 */
4046 		mem = (vm_page_t)vm_page_queue_first(&object->memq);
4047 
4048 		while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4049 			if (mem->vmp_offset == offset) {
4050 				break;
4051 			}
4052 
4053 			mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
4054 		}
4055 		if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4056 			mem = NULL;
4057 		}
4058 	} else {
4059 		vm_page_object_t        packed_object;
4060 
4061 		packed_object = VM_PAGE_PACK_OBJECT(object);
4062 
4063 		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
4064 
4065 		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
4066 
4067 		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
4068 		    mem != VM_PAGE_NULL;
4069 		    mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
4070 #if 0
4071 			/*
4072 			 * we don't hold the page queue lock
4073 			 * so this check isn't safe to make
4074 			 */
4075 			VM_PAGE_CHECK(mem);
4076 #endif
4077 			if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) {
4078 				break;
4079 			}
4080 		}
4081 		lck_ticket_unlock(bucket_lock);
4082 	}
4083 
4084 #if DEBUG_VM_PAGE_LOOKUP
4085 	elapsed = mach_absolute_time() - start;
4086 
4087 	if (bucket_lock) {
4088 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_slow);
4089 		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_slow_elapsed);
4090 	} else {
4091 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_fast);
4092 		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_fast_elapsed);
4093 	}
4094 	if (mem != VM_PAGE_NULL) {
4095 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit);
4096 	} else {
4097 		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss);
4098 	}
4099 #endif
4100 	if (mem != VM_PAGE_NULL) {
4101 		assert(VM_PAGE_OBJECT(mem) == object);
4102 
4103 		object->memq_hint = mem;
4104 	}
4105 	return mem;
4106 }
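
/*
 * Example lookup (hypothetical caller): the offset must be page aligned
 * and the object lock must be held across the call, and for as long as
 * the returned page is used.
 *
 *	vm_object_lock(object);
 *	m = vm_page_lookup(object, offset);
 *	if (m != VM_PAGE_NULL) {
 *		... use m while the object remains locked ...
 *	}
 *	vm_object_unlock(object);
 */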
4107 
4108 
4109 /*
4110  *	vm_page_rename:
4111  *
4112  *	Move the given memory entry from its
4113  *	current object to the specified target object/offset.
4114  *
4115  *	The object must be locked.
4116  */
4117 void
4118 vm_page_rename(
4119 	vm_page_t               mem,
4120 	vm_object_t             new_object,
4121 	vm_object_offset_t      new_offset)
4122 {
4123 	boolean_t       internal_to_external, external_to_internal;
4124 	vm_tag_t        tag;
4125 	vm_object_t     m_object;
4126 
4127 	m_object = VM_PAGE_OBJECT(mem);
4128 
4129 	assert(m_object != new_object);
4130 	assert(m_object);
4131 
4132 	/*
4133 	 *	Changes to mem->vmp_object require the page lock because
4134 	 *	the pageout daemon uses that lock to get the object.
4135 	 */
4136 	vm_page_lockspin_queues();
4137 
4138 	internal_to_external = FALSE;
4139 	external_to_internal = FALSE;
4140 
4141 	if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
4142 		/*
4143 		 * it's much easier to get the vm_page_pageable_xxx accounting correct
4144 		 * if we first move the page to the active queue... it's going to end
4145 		 * up there anyway, and we don't call vm_page_rename() frequently enough
4146 		 * for this to matter.
4147 		 */
4148 		vm_page_queues_remove(mem, FALSE);
4149 		vm_page_activate(mem);
4150 	}
4151 	if (VM_PAGE_PAGEABLE(mem)) {
4152 		if (m_object->internal && !new_object->internal) {
4153 			internal_to_external = TRUE;
4154 		}
4155 		if (!m_object->internal && new_object->internal) {
4156 			external_to_internal = TRUE;
4157 		}
4158 	}
4159 
4160 	tag = m_object->wire_tag;
4161 	vm_page_remove(mem, TRUE);
4162 	vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL);
4163 
4164 	if (internal_to_external) {
4165 		vm_page_pageable_internal_count--;
4166 		vm_page_pageable_external_count++;
4167 	} else if (external_to_internal) {
4168 		vm_page_pageable_external_count--;
4169 		vm_page_pageable_internal_count++;
4170 	}
4171 
4172 	vm_page_unlock_queues();
4173 }
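
/*
 * Hypothetical usage: both the page's current object and the destination
 * object are expected to be locked exclusively, since vm_page_remove()
 * and vm_page_insert_internal() assert exclusive ownership of the source
 * and destination objects respectively.
 *
 *	vm_object_lock(src_object);
 *	vm_object_lock(dst_object);
 *	vm_page_rename(m, dst_object, dst_offset);
 *	vm_object_unlock(dst_object);
 *	vm_object_unlock(src_object);
 */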
4174 
4175 /*
4176  *	vm_page_init:
4177  *
4178  *	Initialize the fields in a new page.
4179  *	This takes a structure with random values and initializes it
4180  *	so that it can be given to vm_page_release or vm_page_insert.
4181  */
4182 void
4183 vm_page_init(vm_page_t mem, ppnum_t phys_page)
4184 {
4185 	assert(phys_page);
4186 
4187 #if DEBUG
4188 	if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
4189 		if (!(pmap_valid_page(phys_page))) {
4190 			panic("vm_page_init: non-DRAM phys_page 0x%x", phys_page);
4191 		}
4192 	}
4193 #endif /* DEBUG */
4194 
4195 	/*
4196 	 * Initialize the fields of the vm_page. If adding any new fields to vm_page,
4197 	 * try to use initial values which match 0. This minimizes the number of writes
4198 	 * needed for boot-time initialization.
4199 	 */
4200 	assert(VM_PAGE_NOT_ON_Q == 0);
4201 	assert(sizeof(*mem) % sizeof(uintptr_t) == 0);
4202 	*mem = (struct vm_page) {
4203 		.vmp_offset      = (vm_object_offset_t)-1,
4204 		.vmp_q_state     = VM_PAGE_NOT_ON_Q,
4205 		.vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
4206 		.vmp_canonical   = vm_page_in_array(mem),
4207 		.vmp_busy        = true,
4208 	};
4209 
4210 	VM_PAGE_INIT_PHYS_PAGE(mem, phys_page);
4211 
4212 #if 0
4213 	/*
4214 	 * we're leaving this turned off for now... currently pages
4215 	 * come off the free list and are either immediately dirtied/referenced
4216 	 * due to zero-fill or COW faults, or are used to read or write files...
4217 	 * in the file I/O case, the UPL mechanism takes care of clearing
4218 	 * the state of the HW ref/mod bits in a somewhat fragile way.
4219 	 * Since we may change the way this works in the future (to toughen it up),
4220 	 * I'm leaving this as a reminder of where these bits could get cleared
4221 	 */
4222 
4223 	/*
4224 	 * make sure both the h/w referenced and modified bits are
4225 	 * clear at this point... we are especially dependent on
4226 	 * not finding a 'stale' h/w modified in a number of spots
4227 	 * once this page goes back into use
4228 	 */
4229 	pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
4230 #endif
4231 }
4232 
4233 vm_page_t
4234 vm_page_create_fictitious(void)
4235 {
4236 	return vm_page_create(vm_page_fictitious_addr, false, Z_WAITOK);
4237 }
4238 
4239 vm_page_t
4240 vm_page_create_guard(bool canwait)
4241 {
4242 	return vm_page_create(vm_page_guard_addr, false, canwait ? Z_WAITOK : Z_NOWAIT);
4243 }
4244 
4245 vm_page_t
4246 vm_page_create_private(ppnum_t base_page)
4247 {
4248 	assert(base_page != vm_page_fictitious_addr &&
4249 	    base_page != vm_page_guard_addr);
4250 	return vm_page_create(base_page, false, Z_WAITOK);
4251 }
4252 
4253 bool
4254 vm_page_is_canonical(const struct vm_page *m)
4255 {
4256 	return m->vmp_canonical;
4257 }
4258 
4259 bool
4260 vm_page_is_fictitious(const struct vm_page *m)
4261 {
4262 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
4263 	if (vm_page_in_array(m)) {
4264 		return false;
4265 	}
4266 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
4267 	switch (VM_PAGE_GET_PHYS_PAGE(m)) {
4268 	case vm_page_guard_addr:
4269 	case vm_page_fictitious_addr:
4270 		return true;
4271 	default:
4272 		return false;
4273 	}
4274 }
4275 
4276 bool
4277 vm_page_is_guard(const struct vm_page *m)
4278 {
4279 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
4280 	if (vm_page_in_array(m)) {
4281 		return false;
4282 	}
4283 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
4284 	return VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr;
4285 }
4286 
4287 bool
4288 vm_page_is_private(const struct vm_page *m)
4289 {
4290 	return !vm_page_is_canonical(m) && !vm_page_is_fictitious(m);
4291 }
4292 
4293 void
4294 vm_page_make_private(vm_page_t m, ppnum_t base_page)
4295 {
4296 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4297 	assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr);
4298 
4299 	VM_PAGE_SET_PHYS_PAGE(m, base_page);
4300 }
4301 
4302 void
4303 vm_page_reset_private(vm_page_t m)
4304 {
4305 	assert(vm_page_is_private(m));
4306 
4307 	VM_PAGE_SET_PHYS_PAGE(m, vm_page_fictitious_addr);
4308 }
4309 
4310 /*
4311  *	vm_page_release_fictitious:
4312  *
4313  *	Release a fictitious page to the zone pool
4314  */
4315 static void
4316 vm_page_release_fictitious(vm_page_t m)
4317 {
4318 	assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
4319 	    (m->vmp_q_state == VM_PAGE_IS_WIRED));
4320 	assert(vm_page_is_fictitious(m));
4321 	assert(!m->vmp_realtime);
4322 
4323 	if (vm_page_is_guard(m)) {
4324 		counter_dec(&vm_guard_count);
4325 	}
4326 	zfree(vm_page_zone, m);
4327 }
4328 
4329 /*
4330  *	vm_pool_low():
4331  *
4332  *	Return true if it is not likely that a non-vm_privileged thread
4333  *	can get memory without blocking.  Advisory only, since the
4334  *	situation may change under us.
4335  */
4336 bool
4337 vm_pool_low(void)
4338 {
4339 	/* No locking, at worst we will fib. */
4340 	return vm_page_free_count <= vm_page_free_reserved;
4341 }
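
/*
 * Example (hypothetical caller): because the result is advisory, it is
 * best used as a cheap backpressure hint for work that can be deferred,
 * not as a hard guarantee.
 *
 *	if (vm_pool_low()) {
 *		return KERN_RESOURCE_SHORTAGE;
 *	}
 */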
4342 
4343 boolean_t vm_darkwake_mode = FALSE;
4344 
4345 /*
4346  * vm_update_darkwake_mode():
4347  *
4348  * Tells the VM that the system is in / out of darkwake.
4349  *
4350  * Today, the VM only lowers/raises the background queue target
4351  * so as to favor consuming more/less background pages when
4352  * darkwake is ON/OFF.
4353  *
4354  * We might need to do more things in the future.
4355  */
4356 
4357 void
4358 vm_update_darkwake_mode(boolean_t darkwake_mode)
4359 {
4360 #if XNU_TARGET_OS_OSX && defined(__arm64__)
4361 #pragma unused(darkwake_mode)
4362 	assert(vm_darkwake_mode == FALSE);
4363 	/*
4364 	 * Darkwake mode isn't supported on Apple Silicon macOS.
4365 	 */
4366 	return;
4367 #else /* XNU_TARGET_OS_OSX && __arm64__ */
4368 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
4369 
4370 	vm_page_lockspin_queues();
4371 
4372 	if (vm_darkwake_mode == darkwake_mode) {
4373 		/*
4374 		 * No change.
4375 		 */
4376 		vm_page_unlock_queues();
4377 		return;
4378 	}
4379 
4380 	vm_darkwake_mode = darkwake_mode;
4381 
4382 	if (vm_darkwake_mode == TRUE) {
4383 		/* save background target to restore later */
4384 		vm_page_background_target_snapshot = vm_page_background_target;
4385 
4386 		/* target is set to 0...no protection for background pages */
4387 		vm_page_background_target = 0;
4388 	} else if (vm_darkwake_mode == FALSE) {
4389 		if (vm_page_background_target_snapshot) {
4390 			vm_page_background_target = vm_page_background_target_snapshot;
4391 		}
4392 	}
4393 	vm_page_unlock_queues();
4394 #endif
4395 }
4396 
4397 void
4398 vm_page_update_special_state(vm_page_t mem)
4399 {
4400 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR || mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
4401 		return;
4402 	}
4403 
4404 	switch (mem->vmp_on_specialq) {
4405 	case VM_PAGE_SPECIAL_Q_BG:
4406 	{
4407 		task_t  my_task = current_task_early();
4408 
4409 		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4410 			return;
4411 		}
4412 
4413 		if (my_task) {
4414 			if (task_get_darkwake_mode(my_task)) {
4415 				return;
4416 			}
4417 		}
4418 
4419 		if (my_task) {
4420 			if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) {
4421 				return;
4422 			}
4423 		}
4424 		vm_page_lockspin_queues();
4425 
4426 		vm_page_background_promoted_count++;
4427 
4428 		vm_page_remove_from_specialq(mem);
4429 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4430 
4431 		vm_page_unlock_queues();
4432 		break;
4433 	}
4434 
4435 	case VM_PAGE_SPECIAL_Q_DONATE:
4436 	{
4437 		task_t  my_task = current_task_early();
4438 
4439 		if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4440 			return;
4441 		}
4442 
4443 		if (my_task->donates_own_pages == false) {
4444 			vm_page_lockspin_queues();
4445 
4446 			vm_page_remove_from_specialq(mem);
4447 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4448 
4449 			vm_page_unlock_queues();
4450 		}
4451 		break;
4452 	}
4453 
4454 	default:
4455 	{
4456 		assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
4457 		    VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
4458 		break;
4459 	}
4460 	}
4461 }
4462 
4463 
4464 void
4465 vm_page_assign_special_state(vm_page_t mem, vm_page_specialq_t mode)
4466 {
4467 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
4468 		return;
4469 	}
4470 
4471 	switch (mode) {
4472 	case VM_PAGE_SPECIAL_Q_BG:
4473 	{
4474 		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4475 			return;
4476 		}
4477 
4478 		task_t  my_task = current_task_early();
4479 
4480 		if (my_task) {
4481 			if (task_get_darkwake_mode(my_task)) {
4482 				mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
4483 				return;
4484 			}
4485 		}
4486 
4487 		if (my_task) {
4488 			mem->vmp_on_specialq = (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG) ? VM_PAGE_SPECIAL_Q_BG : VM_PAGE_SPECIAL_Q_EMPTY);
4489 		}
4490 		break;
4491 	}
4492 
4493 	case VM_PAGE_SPECIAL_Q_DONATE:
4494 	{
4495 		if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4496 			return;
4497 		}
4498 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
4499 		break;
4500 	}
4501 
4502 	default:
4503 		break;
4504 	}
4505 }
4506 
4507 
4508 void
4509 vm_page_remove_from_specialq(vm_page_t mem)
4510 {
4511 	vm_object_t     m_object;
4512 
4513 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4514 
4515 	switch (mem->vmp_on_specialq) {
4516 	case VM_PAGE_SPECIAL_Q_BG:
4517 	{
4518 		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4519 			vm_page_queue_remove(&vm_page_queue_background, mem, vmp_specialq);
4520 
4521 			mem->vmp_specialq.next = 0;
4522 			mem->vmp_specialq.prev = 0;
4523 
4524 			vm_page_background_count--;
4525 
4526 			m_object = VM_PAGE_OBJECT(mem);
4527 
4528 			if (m_object->internal) {
4529 				vm_page_background_internal_count--;
4530 			} else {
4531 				vm_page_background_external_count--;
4532 			}
4533 		}
4534 		break;
4535 	}
4536 
4537 	case VM_PAGE_SPECIAL_Q_DONATE:
4538 	{
4539 		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4540 			vm_page_queue_remove((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4541 			mem->vmp_specialq.next = 0;
4542 			mem->vmp_specialq.prev = 0;
4543 			vm_page_donate_count--;
4544 			if (vm_page_donate_queue_ripe && (vm_page_donate_count < vm_page_donate_target)) {
4545 				assert(vm_page_donate_target == vm_page_donate_target_low);
4546 				vm_page_donate_target = vm_page_donate_target_high;
4547 				vm_page_donate_queue_ripe = false;
4548 			}
4549 		}
4550 
4551 		break;
4552 	}
4553 
4554 	default:
4555 	{
4556 		assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
4557 		    VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
4558 		break;
4559 	}
4560 	}
4561 }
4562 
4563 
4564 void
4565 vm_page_add_to_specialq(vm_page_t mem, boolean_t first)
4566 {
4567 	vm_object_t     m_object;
4568 
4569 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4570 
4571 	if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4572 		return;
4573 	}
4574 
4575 	switch (mem->vmp_on_specialq) {
4576 	case VM_PAGE_SPECIAL_Q_BG:
4577 	{
4578 		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4579 			return;
4580 		}
4581 
4582 		m_object = VM_PAGE_OBJECT(mem);
4583 
4584 		if (vm_page_background_exclude_external && !m_object->internal) {
4585 			return;
4586 		}
4587 
4588 		if (first == TRUE) {
4589 			vm_page_queue_enter_first(&vm_page_queue_background, mem, vmp_specialq);
4590 		} else {
4591 			vm_page_queue_enter(&vm_page_queue_background, mem, vmp_specialq);
4592 		}
4593 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
4594 
4595 		vm_page_background_count++;
4596 
4597 		if (m_object->internal) {
4598 			vm_page_background_internal_count++;
4599 		} else {
4600 			vm_page_background_external_count++;
4601 		}
4602 		break;
4603 	}
4604 
4605 	case VM_PAGE_SPECIAL_Q_DONATE:
4606 	{
4607 		if (first == TRUE) {
4608 			vm_page_queue_enter_first((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4609 		} else {
4610 			vm_page_queue_enter((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4611 		}
4612 		vm_page_donate_count++;
4613 		if (!vm_page_donate_queue_ripe && (vm_page_donate_count > vm_page_donate_target)) {
4614 			assert(vm_page_donate_target == vm_page_donate_target_high);
4615 			vm_page_donate_target = vm_page_donate_target_low;
4616 			vm_page_donate_queue_ripe = true;
4617 		}
4618 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
4619 		break;
4620 	}
4621 
4622 	default:
4623 		break;
4624 	}
4625 }
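
/*
 * Note on the donate-queue hysteresis visible in the add/remove paths
 * above (a reading of the code, not an authoritative description): once
 * vm_page_donate_count rises above vm_page_donate_target_high, the queue
 * is marked "ripe" and the target drops to vm_page_donate_target_low; it
 * stays ripe until the count falls back below that low target, at which
 * point the high target is restored.
 */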
4626 
4627 /*!
4628  * @brief
4629  * Prepares a page that has been successfully grabbed for the caller.
4630  *
4631  * @discussion
4632  * This function will update accounting, emit trace events, ...
4633  */
4634 static vm_page_t
4635 vm_page_grab_finalize(vm_grab_options_t grab_options __unused, vm_page_t mem)
4636 {
4637 	task_t task;
4638 
4639 #if MACH_ASSERT
4640 	/*
4641 	 * For all free pages, no matter their provenance...
4642 	 * ensure they are not referenced anywhere,
4643 	 * and their state is clean.
4644 	 */
4645 	if (vm_check_refs_on_alloc) {
4646 		pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
4647 	}
4648 	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
4649 	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
4650 	    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
4651 	    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
4652 	    mem->vmp_next_m == 0 &&
4653 	    mem->vmp_object == 0 &&
4654 	    mem->vmp_wire_count == 0 &&
4655 	    mem->vmp_busy &&
4656 	    !mem->vmp_tabled &&
4657 	    !mem->vmp_laundry &&
4658 	    !mem->vmp_pmapped &&
4659 	    !mem->vmp_wpmapped &&
4660 	    !mem->vmp_realtime);
4661 #endif /* MACH_ASSERT */
4662 
4663 	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
4664 	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
4665 
4666 #if HAS_MTE
4667 	if (!(grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
4668 		assert(!vm_page_is_tag_storage(mem));
4669 	}
4670 	if (grab_options & VM_PAGE_GRAB_MTE) {
4671 		assert(mem->vmp_using_mte);
4672 		VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB_MTE,
4673 		    DBG_FUNC_NONE, grab_options, 0, 0, 0);
4674 	} else
4675 #endif /* HAS_MTE */
4676 	{
4677 		VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB,
4678 		    DBG_FUNC_NONE, grab_options, 0, 0, 0);
4679 	}
4680 
4681 	counter_inc(&vm_page_grab_count);
4682 
4683 	task = current_task_early();
4684 	if (task != TASK_NULL) {
4685 		ledger_credit(task->ledger, task_ledgers.pages_grabbed, 1);
4686 	}
4687 	if (task != TASK_NULL && task != kernel_task) {
4688 		/*
4689 		 * tag:DONATE this is where the donate state of the page
4690 		 * is decided according to what task grabs it
4691 		 */
4692 		if (task->donates_own_pages) {
4693 			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
4694 		} else {
4695 			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
4696 		}
4697 	}
4698 
4699 	return mem;
4700 }
4701 
4702 #if __x86_64__
4703 /*
4704  * This can be switched to FALSE to help debug drivers
4705  * that are having problems with memory > 4G.
4706  */
4707 boolean_t       vm_himemory_mode = TRUE;
4708 #endif /* __x86_64__ */
4709 
4710 #if XNU_VM_HAS_LOPAGE
4711 
4712 vm_page_t
4713 vm_page_grablo(vm_grab_options_t grab_options)
4714 {
4715 	vm_page_t mem = VM_PAGE_NULL;
4716 
4717 	if (!vm_lopage_needed) {
4718 		return vm_page_grab_options(grab_options);
4719 	}
4720 
4721 	vm_free_page_lock_spin();
4722 	if (vm_lopage_free_count) {
4723 #if LCK_MTX_USE_ARCH
4724 		/*
4725 		 * Intel locks do not always disable preemption
4726 		 * for lck_mtx_lock_spin(), and vm_page_free_queue_grab()
4727 		 * really wants that.
4728 		 */
4729 		disable_preemption();
4730 #endif
4731 		mem = vm_page_free_queue_grab(grab_options,
4732 		    VM_MEMORY_CLASS_LOPAGE, 1, VM_PAGE_NOT_ON_Q).vmpl_head;
4733 #if LCK_MTX_USE_ARCH
4734 		enable_preemption();
4735 #endif
4736 	}
4737 	vm_free_page_unlock();
4738 
4739 	if (mem == VM_PAGE_NULL) {
4740 		if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
4741 			vm_free_page_lock_spin();
4742 			vm_lopages_allocated_cpm_failed++;
4743 			vm_free_page_unlock();
4744 
4745 			return VM_PAGE_NULL;
4746 		}
4747 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
4748 
4749 		mem->vmp_busy = TRUE;
4750 
4751 		vm_page_lockspin_queues();
4752 
4753 		mem->vmp_gobbled = FALSE;
4754 		vm_page_gobble_count--;
4755 		vm_page_wire_count--;
4756 
4757 		vm_lopages_allocated_cpm_success++;
4758 		vm_page_unlock_queues();
4759 	}
4760 
4761 	return vm_page_grab_finalize(grab_options, mem);
4762 }
4763 
4764 #endif /* XNU_VM_HAS_LOPAGE */
4765 #if CONFIG_SECLUDED_MEMORY
4766 
4767 /*!
4768  * @brief
4769  * Attempt to allocate a page from the secluded queue
4770  *
4771  * @discussion
4772  * This function will check that the caller is eligible
4773  * for the secluded pool, and if not, return VM_PAGE_NULL.
4774  */
4775 __attribute__((noinline))
4776 static vm_page_t
4777 vm_page_grab_secluded(vm_grab_options_t grab_options)
4778 {
4779 	vm_page_t       mem;
4780 	vm_object_t     object;
4781 	int             refmod_state;
4782 
4783 #if HAS_MTE
4784 	if (grab_options & VM_PAGE_GRAB_MTE) {
4785 		return VM_PAGE_NULL;
4786 	}
4787 #endif /* HAS_MTE */
4788 	if (vm_page_secluded_count == 0) {
4789 		return VM_PAGE_NULL;
4790 	}
4791 
4792 	if (grab_options & VM_PAGE_GRAB_SECLUDED) {
4793 		vm_page_secluded.grab_for_iokit++;
4794 	} else if (!task_can_use_secluded_mem(current_task(), TRUE)) {
4795 		return VM_PAGE_NULL;
4796 	}
4797 
4798 
4799 	/* secluded queue is protected by the VM page queue lock */
4800 	vm_page_lock_queues();
4801 
4802 	if (vm_page_secluded_count == 0) {
4803 		/* no secluded pages to grab... */
4804 		vm_page_unlock_queues();
4805 		return VM_PAGE_NULL;
4806 	}
4807 
4808 #if 00
4809 	/* can we grab from the secluded queue? */
4810 	if (vm_page_secluded_count > vm_page_secluded_target ||
4811 	    (vm_page_secluded_count > 0 &&
4812 	    task_can_use_secluded_mem(current_task(), TRUE))) {
4813 		/* OK */
4814 	} else {
4815 		/* can't grab from secluded queue... */
4816 		vm_page_unlock_queues();
4817 		return VM_PAGE_NULL;
4818 	}
4819 #endif
4820 
4821 	/* we can grab a page from secluded queue! */
4822 	assert((vm_page_secluded_count_free +
4823 	    vm_page_secluded_count_inuse) ==
4824 	    vm_page_secluded_count);
4825 	if (current_task()->task_can_use_secluded_mem) {
4826 		assert(num_tasks_can_use_secluded_mem > 0);
4827 	}
4828 	assert(!vm_page_queue_empty(&vm_page_queue_secluded));
4829 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4830 	mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
4831 	assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
4832 	vm_page_queues_remove(mem, TRUE);
4833 
4834 	object = VM_PAGE_OBJECT(mem);
4835 
4836 	assert(!vm_page_is_fictitious(mem));
4837 	assert(!VM_PAGE_WIRED(mem));
4838 	if (object == VM_OBJECT_NULL) {
4839 		/* free for grab! */
4840 		vm_page_unlock_queues();
4841 		vm_page_secluded.grab_success_free++;
4842 		goto out_success;
4843 	}
4844 
4845 	assert(!object->internal);
4846 //	vm_page_pageable_external_count--;
4847 
4848 	if (!vm_object_lock_try(object)) {
4849 //		printf("SECLUDED: page %p: object %p locked\n", mem, object);
4850 		vm_page_secluded.grab_failure_locked++;
4851 reactivate_secluded_page:
4852 		vm_page_activate(mem);
4853 		vm_page_unlock_queues();
4854 		return VM_PAGE_NULL;
4855 	}
4856 	if (mem->vmp_busy ||
4857 	    mem->vmp_cleaning ||
4858 	    mem->vmp_laundry) {
4859 		/* can't steal page in this state... */
4860 		vm_object_unlock(object);
4861 		vm_page_secluded.grab_failure_state++;
4862 		goto reactivate_secluded_page;
4863 	}
4864 	if (mem->vmp_realtime) {
4865 		/* don't steal pages used by realtime threads... */
4866 		vm_object_unlock(object);
4867 		vm_page_secluded.grab_failure_realtime++;
4868 		goto reactivate_secluded_page;
4869 	}
4870 
4871 	mem->vmp_busy = TRUE;
4872 	refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
4873 	if (refmod_state & VM_MEM_REFERENCED) {
4874 		mem->vmp_reference = TRUE;
4875 	}
4876 	if (refmod_state & VM_MEM_MODIFIED) {
4877 		SET_PAGE_DIRTY(mem, FALSE);
4878 	}
4879 	if (mem->vmp_dirty || mem->vmp_precious) {
4880 		/* can't grab a dirty page; re-activate */
4881 //		printf("SECLUDED: dirty page %p\n", mem);
4882 		vm_page_wakeup_done(object, mem);
4883 		vm_page_secluded.grab_failure_dirty++;
4884 		vm_object_unlock(object);
4885 		goto reactivate_secluded_page;
4886 	}
4887 	if (mem->vmp_reference) {
4888 		/* it's been used but we do need to grab a page... */
4889 	}
4890 
4891 	vm_page_unlock_queues();
4892 
4893 	/* finish what vm_page_free() would have done... */
4894 	vm_page_free_prepare_object(mem, TRUE);
4895 	vm_object_unlock(object);
4896 	object = VM_OBJECT_NULL;
4897 
4898 	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
4899 	vm_page_secluded.grab_success_other++;
4900 
4901 out_success:
4902 
4903 	if (grab_options & VM_PAGE_GRAB_SECLUDED) {
4904 		vm_page_secluded.grab_for_iokit_success++;
4905 	}
4906 	return mem;
4907 }
4908 
4909 uint64_t
4910 vm_page_secluded_drain(void)
4911 {
4912 	vm_page_t local_freeq;
4913 	int local_freed;
4914 	uint64_t num_reclaimed;
4915 	unsigned int saved_secluded_count, saved_secluded_target;
4916 
4917 	num_reclaimed = 0;
4918 	local_freeq = NULL;
4919 	local_freed = 0;
4920 
4921 	vm_page_lock_queues();
4922 
4923 	saved_secluded_count = vm_page_secluded_count;
4924 	saved_secluded_target = vm_page_secluded_target;
4925 	vm_page_secluded_target = 0;
4926 	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
4927 	while (vm_page_secluded_count) {
4928 		vm_page_t secluded_page;
4929 
4930 		assert((vm_page_secluded_count_free +
4931 		    vm_page_secluded_count_inuse) ==
4932 		    vm_page_secluded_count);
4933 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
4934 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
4935 
4936 		vm_page_queues_remove(secluded_page, FALSE);
4937 		assert(!vm_page_is_fictitious(secluded_page));
4938 		assert(!VM_PAGE_WIRED(secluded_page));
4939 
4940 		if (secluded_page->vmp_object == 0) {
4941 			/* transfer to free queue */
4942 			assert(secluded_page->vmp_busy);
4943 			secluded_page->vmp_snext = local_freeq;
4944 			local_freeq = secluded_page;
4945 			local_freed += 1;
4946 		} else {
4947 			/* transfer to head of active queue */
4948 			vm_page_enqueue_active(secluded_page, FALSE);
4949 			secluded_page = VM_PAGE_NULL;
4950 		}
4951 		num_reclaimed++;
4952 	}
4953 	vm_page_secluded_target = saved_secluded_target;
4954 	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
4955 
4956 //	printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);
4957 
4958 	vm_page_unlock_queues();
4959 
4960 	if (local_freed) {
4961 		vm_page_free_list(local_freeq, TRUE);
4962 		local_freeq = NULL;
4963 		local_freed = 0;
4964 	}
4965 
4966 	return num_reclaimed;
4967 }
4968 
4969 #endif /* CONFIG_SECLUDED_MEMORY */
4970 
4971 /*!
4972  * @brief
4973  * Attempts to allocate a page from the specified per-cpu page queue.
4974  */
4975 static vm_page_t
4976 vm_page_grab_from_cpu(vm_page_t *cpu_list, scalable_counter_t *counter)
4977 {
4978 	vm_page_t mem = _vm_page_list_pop(cpu_list);
4979 
4980 	if (mem != VM_PAGE_NULL) {
4981 #if HIBERNATION
4982 		if (hibernate_rebuild_needed) {
4983 			panic("should not modify cpu->free_pages while hibernating");
4984 		}
4985 #endif /* HIBERNATION */
4986 		counter_dec_preemption_disabled(counter);
4987 	}
4988 	return mem;
4989 }
4990 
4991 #if HAS_MTE
4992 /*!
4993  * @brief
4994  * Attempts to allocate a page from the per-CPU queue of free, claimable tag storage pages.
4995  */
4996 static vm_page_t
4997 vm_page_grab_claimed_from_cpu(mte_pcpu_t pcpu, vm_grab_options_t options)
4998 {
4999 	vm_page_t mem = VM_PAGE_NULL;
5000 
5001 	if (!(options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
5002 		return VM_PAGE_NULL;
5003 	}
5004 
5005 	if (vm_page_queue_empty(&pcpu->free_claimed_pages)) {
5006 		return VM_PAGE_NULL;
5007 	}
5008 
5009 	lck_ticket_lock(&pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);
5010 
5011 	if (!vm_page_queue_empty(&pcpu->free_claimed_pages)) {
5012 		vm_page_queue_remove_first(&pcpu->free_claimed_pages,
5013 		    mem, vmp_pageq);
5014 		counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
5015 		counter_inc(&vm_cpu_claimed_count);
5016 		/* must be done immediately to synchronize with stealing */
5017 		mem->vmp_q_state  = VM_PAGE_NOT_ON_Q;
5018 		mem->vmp_local_id = 0;
5019 	}
5020 
5021 	lck_ticket_unlock(&pcpu->free_claimed_lock);
5022 
5023 	return mem;
5024 }
5025 #endif /* HAS_MTE */
5026 
5027 /*!
5028  * @brief
5029  * Attempts to allocate pages from free queues, and to populate the per-cpu
5030  * queue as a side effect.
5031  *
5032  * @discussion
5033  * This function will take the properties of the allocating thread into account
5034  * to decide how many pages it can allocate.
5035  *
5036  * If the free queues are depleted, then it will return VM_PAGE_NULL.
5037  */
5038 __attribute__((noinline))
5039 static vm_page_t
5040 vm_page_grab_slow(vm_grab_options_t grab_options)
5041 {
5042 #if HAS_MTE
5043 	unsigned int        mte_draw = 0;
5044 	unsigned int        mte_slop = 0;
5045 #endif /* HAS_MTE */
5046 	unsigned int        target   = vm_free_magazine_refill_limit;
5047 	vm_memory_class_t   class    = VM_MEMORY_CLASS_REGULAR;
5048 	vm_page_t           mem      = VM_PAGE_NULL;
5049 	vm_page_list_t      list     = { };
5050 	vm_page_t          *cpu_list = NULL;
5051 	scalable_counter_t *counter  = NULL;
5052 
5053 	vm_free_page_lock_spin();
5054 #if LCK_MTX_USE_ARCH
5055 	/* Intel doesn't disable preemption with vm_free_page_lock_spin() */
5056 	disable_preemption();
5057 #endif /* LCK_MTX_USE_ARCH */
5058 	cpu_list = PERCPU_GET(free_pages);
5059 	counter  = &vm_cpu_free_count;
5060 #if HAS_MTE
5061 	if (grab_options & VM_PAGE_GRAB_MTE) {
5062 again:
5063 		cpu_list = &PERCPU_GET(mte_pcpu)->free_tagged_pages;
5064 		counter  = &vm_cpu_free_tagged_count;
5065 		target   = vm_free_magazine_refill_limit / 2;
5066 		class    = VM_MEMORY_CLASS_TAGGED;
5067 		mte_slop = 0;
5068 	} else if (grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
5069 		/*
5070 		 * Note that this is the last time we'll explicitly try to grab
5071 		 * free, claimable pages. If it comes down to it, we'll grab either
5072 		 * normal or dead tag storage pages in vm_page_free_queue_grab()
5073 		 * and hopefully refill the per-CPU free claimable queue.
5074 		 */
5075 		mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
5076 		mem = vm_page_grab_claimed_from_cpu(mte_pcpu, grab_options);
5077 	}
5078 	if (mem == VM_PAGE_NULL)
5079 #endif /* HAS_MTE */
5080 	{
5081 		mem = vm_page_grab_from_cpu(cpu_list, counter);
5082 	}
5083 	if (mem != VM_PAGE_NULL) {
5084 #if LCK_MTX_USE_ARCH
5085 		enable_preemption();
5086 #endif /* LCK_MTX_USE_ARCH */
5087 		vm_free_page_unlock();
5088 		return mem;
5089 	}
5090 
5091 	if (vm_page_free_count <= vm_page_free_reserved) {
5092 		if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5093 			target = 0;
5094 		} else if (vm_page_free_count == 0) {
5095 			target = 0;
5096 		} else {
5097 			target = 1;
5098 		}
5099 	} else {
5100 		target = MIN(target, vm_page_free_count - vm_page_free_reserved);
5101 	}
5102 #if HAS_MTE
5103 	if (grab_options & VM_PAGE_GRAB_MTE) {
5104 		mte_draw = target;
5105 		target   = 0;
5106 		if (vm_page_free_taggable_count < mte_draw + vm_page_free_min &&
5107 		    vm_page_free_count >= mte_draw + vm_page_free_min &&
5108 		    !(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
5109 			/*
5110 			 * If the mte draw is such that we deplete our reserves,
5111 			 * but there are enough free untaggable pages available,
5112 			 * attempt to activate pages in order to rebalance
5113 			 * toward the taggable pool.
5114 			 *
5115 			 * If the operation succeeds, the free page queue lock
5116 			 * was dropped and we need to re-take it from the top.
5117 			 */
5118 			if (mteinfo_tag_storage_try_activate(mte_draw +
5119 			    vm_page_free_min - vm_page_free_taggable_count,
5120 			    /* lock_spin */ true)) {
5121 				goto again;
5122 			}
5123 		}
5124 	} else if (target > vm_page_free_count - vm_page_free_taggable_count) {
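		/*
		 * Regular (untagged) allocation: only dip into the taggable
		 * pool for whatever the untaggable pool cannot cover.
		 */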
5125 		mte_draw = target - (vm_page_free_count - vm_page_free_taggable_count);
5126 		target   = (vm_page_free_count - vm_page_free_taggable_count);
5127 	} else {
5128 		mte_draw = 0;
5129 	}
5130 
5131 	if (vm_page_free_taggable_count <= vm_page_free_reserved) {
5132 		if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5133 			mte_draw = 0;
5134 		} else if (vm_page_free_taggable_count == 0) {
5135 			mte_draw = 0;
5136 		} else if (target) {
5137 			mte_draw = 0;
5138 		} else {
5139 			mte_draw = 1;
5140 		}
5141 	} else {
5142 		mte_draw = MIN(mte_draw,
5143 		    vm_page_free_taggable_count - vm_page_free_reserved);
5144 	}
5145 
5146 	target += mte_draw;
5147 #endif /* HAS_MTE */
5148 
5149 #if HIBERNATION
5150 	if (target > 0 && hibernate_rebuild_needed) {
5151 		panic("should not modify CPU free_pages while hibernating");
5152 	}
5153 #endif /* HIBERNATION */
5154 
5155 	/*
5156 	 * Convert the lock hold into a mutex, to signal to waiters that the
5157 	 * lock may be held for longer.
5158 	 */
5159 #if !LCK_MTX_USE_ARCH
5160 	disable_preemption();
5161 #endif /* !LCK_MTX_USE_ARCH */
5162 	vm_free_page_lock_convert();
5163 
5164 	if (target != 0) {
5165 		list = vm_page_free_queue_grab(grab_options, class, target,
5166 		    VM_PAGE_ON_FREE_LOCAL_Q);
5167 	}
5168 
5169 #if VM_PAGE_WIRE_COUNT_WARNING
5170 	if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) {
5171 		printf("mk: vm_page_grab(): high wired page count of %d\n",
5172 		    vm_page_wire_count);
5173 	}
5174 #endif
5175 #if VM_PAGE_GOBBLE_COUNT_WARNING
5176 	if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) {
5177 		printf("mk: vm_page_grab(): high gobbled page count of %d\n",
5178 		    vm_page_gobble_count);
5179 	}
5180 #endif
5181 
5182 	if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5183 		thread_wakeup(&vm_page_free_wanted);
5184 	}
5185 
5186 	vm_free_page_unlock();
5187 
5188 	VM_CHECK_MEMORYSTATUS;
5189 
5190 	if (list.vmpl_head) {
5191 #if HAS_MTE
5192 		mteinfo_page_list_fix_tagging(class, &list);
5193 #endif /* HAS_MTE */
5194 		/* Steal a page off the list for the caller. */
5195 		mem = vm_page_list_pop(&list);
5196 
5197 		/* Add the remaining pages to the CPU's free list. */
5198 		assert(*cpu_list == VM_PAGE_NULL);
5199 		*cpu_list = list.vmpl_head;
5200 		counter_add_preemption_disabled(counter, list.vmpl_count);
5201 	}
5202 
5203 	enable_preemption();
5204 
5205 	return mem;
5206 }
5207 
5208 vm_page_t
5209 vm_page_grab_options(vm_grab_options_t options)
5210 {
5211 #if HAS_MTE
5212 	mte_pcpu_t          mte_pcpu;
5213 	vm_page_t          *cpu_list;
5214 	scalable_counter_t *counter;
5215 #endif
5216 	vm_page_t           mem;
5217 
5218 restart:
5219 
5220 	/*
5221 	 *	Step 1: look at the CPU magazines.
5222 	 */
5223 
5224 	disable_preemption();
5225 #if HAS_MTE
5226 	mte_pcpu = PERCPU_GET(mte_pcpu);
5227 	if (options & VM_PAGE_GRAB_MTE) {
5228 		cpu_list = &mte_pcpu->free_tagged_pages;
5229 		counter  = &vm_cpu_free_tagged_count;
5230 		mem      = VM_PAGE_NULL;
5231 	} else {
5232 		cpu_list = PERCPU_GET(free_pages);
5233 		counter  = &vm_cpu_free_count;
5234 		mem      = VM_PAGE_NULL;
5235 	}
5236 
5237 	if (options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
5238 		mem = vm_page_grab_claimed_from_cpu(mte_pcpu, options);
5239 	}
5240 	if (mem == VM_PAGE_NULL) {
5241 		mem = vm_page_grab_from_cpu(cpu_list, counter);
5242 	}
5243 #else
5244 	mem = vm_page_grab_from_cpu(PERCPU_GET(free_pages), &vm_cpu_free_count);
5245 #endif /* HAS_MTE */
5246 	enable_preemption();
5247 
5248 	if (mem != VM_PAGE_NULL) {
5249 		return vm_page_grab_finalize(options, mem);
5250 	}
5251 
5252 #if XNU_VM_HAS_DELAYED_PAGES
5253 	/*
5254 	 *	If free count is low and we have delayed pages from early boot,
5255 	 *	get one of those instead.
5256 	 */
5257 	if (__improbable(vm_delayed_count > 0 &&
5258 	    vm_page_free_count <= vm_page_free_target)) {
5259 		mem = vm_get_delayed_page(options);
5260 		if (mem != VM_PAGE_NULL) {
5261 			return vm_page_grab_finalize(options, mem);
5262 		}
5263 	}
5264 #endif /* XNU_VM_HAS_DELAYED_PAGES */
5265 
5266 
5267 	/*
5268 	 *	Step 2: Try to promote pages from the free queues,
5269 	 *	        or the secluded queue if appropriate.
5270 	 */
5271 
5272 	mem = vm_page_grab_slow(options);
5273 	if (mem != VM_PAGE_NULL) {
5274 		return vm_page_grab_finalize(options, mem);
5275 	}
5276 
5277 #if CONFIG_SECLUDED_MEMORY
5278 	mem = vm_page_grab_secluded(options);
5279 	if (mem != VM_PAGE_NULL) {
5280 		return vm_page_grab_finalize(options, mem);
5281 	}
5282 #endif /* CONFIG_SECLUDED_MEMORY */
5283 
5284 
5285 	/*
5286 	 *	Step 3: Privileged threads block and retry, others fail.
5287 	 */
5288 
5289 #if HAS_MTE
5290 	if (options & VM_PAGE_GRAB_MTE) {
5291 		current_thread()->page_wait_class = VM_MEMORY_CLASS_TAGGED;
5292 	} else {
5293 		current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
5294 	}
5295 #endif /* HAS_MTE */
5296 	if ((options & VM_PAGE_GRAB_NOPAGEWAIT) == 0 &&
5297 	    (current_thread()->options & TH_OPT_VMPRIV) != 0) {
5298 		VM_PAGE_WAIT();
5299 		goto restart;
5300 	}
5301 
5302 	return VM_PAGE_NULL;
5303 }
5304 
5305 vm_grab_options_t
5306 vm_page_grab_options_for_object(vm_object_t object __unused)
5307 {
5308 	vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
5309 
5310 #if CONFIG_SECLUDED_MEMORY
5311 	if (object->can_grab_secluded) {
5312 		options |= VM_PAGE_GRAB_SECLUDED;
5313 	}
5314 #endif /* CONFIG_SECLUDED_MEMORY */
5315 #if HAS_MTE
5316 	if (vm_object_is_mte_mappable(object)) {
5317 		options |= VM_PAGE_GRAB_MTE;
5318 	}
5319 #endif /* HAS_MTE */
5320 
5321 	return options;
5322 }
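
/*
 * Illustrative (non-normative) usage: a caller typically derives grab options
 * from the object it is populating and then allocates, e.g.:
 *
 *	vm_grab_options_t opts = vm_page_grab_options_for_object(object);
 *	vm_page_t         page = vm_page_grab_options(opts);
 *	if (page == VM_PAGE_NULL) {
 *		// free queues depleted: wait (vm_page_wait) or fail,
 *		// depending on the caller's policy
 *	}
 */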
5323 
5324 /*!
5325  * @function vm_page_free_queue_steal()
5326  *
5327  * @abstract
5328  * Steal a given page from the free queues.
5329  *
5330  * @discussion
5331  * The given page must be on its free queue; otherwise state may be corrupted.
5332  *
5333  * Internally, the free queue is not synchronized, so any locking must be done
5334  * outside of this function.
5335  *
5336  * This function, like vm_page_grab(), takes care of waking up
5337  * page out scan as needed.
5338  */
5339 static void
5340 vm_page_free_queue_steal(vm_grab_options_t options, vm_page_t mem)
5341 {
5342 	ppnum_t           pnum  = VM_PAGE_GET_PHYS_PAGE(mem);
5343 	vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
5344 
5345 	assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
5346 	assert(!mem->vmp_lopage && mem->vmp_busy);
5347 
5348 	vm_page_free_queue_remove(class, mem, pnum, VM_PAGE_NOT_ON_Q);
5349 	vm_page_grab_finalize(options, mem);
5350 
5351 	if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5352 		thread_wakeup(&vm_page_free_wanted);
5353 	}
5354 }
5355 
5356 #if HAS_MTE
5357 /*!
5358  * @function _vm_page_wait_wakeup_fill_thread()
5359  *
5360  * @abstract
5361  * Given the number of waiters, return whether the MTE fill thread should
5362  * wake up.
5363  *
5364  * @discussion
5365  * The idea is to wake up the MTE fill thread without explicitly triggering
5366  * pageout_scan(), which means @c vm_page_free_count must be at least
5367  * @c vm_page_free_min. On top of that, it's possible that tag storage pages
5368  * may get relocated, which means that some free untagged pages will be needed
5369  * to activate a tag storage page. This function uses the naive, pessimistic
5370  * heuristic that a given tag storage page does not have many free covered
5371  * pages, and some number of those tag storage pages will need to be relocated.
5372  *
5373  * The free queue lock should be held during this function.
5374  *
5375  * @param n_waiters					The number of waiters for tagged memory.
5376  *
5377  * @returns							Whether the system has enough free pages to
5378  *                                  wake up the MTE fill thread.
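 *
 * For example (illustrative numbers only): with vm_page_free_min at 2000
 * pages and 10 tagged waiters, the fill thread is only woken once
 * vm_page_free_count exceeds 2000 + (3 * 10) / 2 = 2015 pages.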
5379  */
5380 static bool
5381 _vm_page_wait_wakeup_fill_thread(uint32_t n_waiters)
5382 {
5383 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
5384 	return vm_page_free_count > vm_page_free_min + (3 * n_waiters) / 2;
5385 }
5386 #endif /* HAS_MTE */
5387 
5388 /*
5389  *	vm_page_wait:
5390  *
5391  *	Wait for a page to become available.
5392  *	If there are plenty of free pages, then we don't sleep.
5393  *
5394  *	Returns:
5395  *		TRUE:  There may be another page, try again
5396  *		FALSE: We were interrupted out of our wait, don't try again
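 *
 *	Typical usage (illustrative sketch; compare the grab/wait retry loop
 *	in the fallback path of vm_page_part_zero_fill() below):
 *
 *		while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
 *			if (!vm_page_wait(THREAD_UNINT))
 *				break;		// interrupted, don't retry
 *		}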
5397  */
5398 
5399 boolean_t
5400 vm_page_wait(int interruptible)
5401 {
5402 	/*
5403 	 *	We can't use vm_page_free_reserved to make this
5404 	 *	determination.  Consider: some thread might
5405 	 *	need to allocate two pages.  The first allocation
5406 	 *	succeeds, the second fails.  After the first page is freed,
5407 	 *	a call to vm_page_wait must really block.
5408 	 */
5409 	kern_return_t wait_result   = THREAD_NOT_WAITING;
5410 	thread_t      cur_thread    = current_thread();
5411 	bool          is_privileged = cur_thread->options & TH_OPT_VMPRIV;
5412 	bool          need_wakeup   = false;
5413 	event_t       wait_event    = NULL;
5414 #if HAS_MTE
5415 	bool              wakeup_refill_thread = false;
5416 #endif /* HAS_MTE */
5417 
5418 	vm_free_page_lock_spin();
5419 
5420 #if HAS_MTE
5421 	if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5422 		if (is_privileged) {
5423 			if (vm_page_free_taggable_count) {
5424 				vm_free_page_unlock();
5425 				goto out;
5426 			}
5427 
5428 			if (vm_page_free_wanted_tagged_privileged++ == 0) {
5429 				wakeup_refill_thread = true;
5430 			}
5431 
5432 			wait_event = (event_t)&vm_page_free_wanted_tagged_privileged;
5433 		} else if (vm_page_free_taggable_count >= vm_page_free_target) {
5434 			vm_free_page_unlock();
5435 			goto out;
5436 		} else {
5437 			if (vm_page_free_wanted_tagged++ == 0) {
5438 				wakeup_refill_thread = true;
5439 			}
5440 
5441 			wait_event = (event_t)&vm_page_free_wanted_tagged;
5442 		}
5443 	} else
5444 #endif /* HAS_MTE */
5445 	if (is_privileged) {
5446 		if (vm_page_free_count) {
5447 			vm_free_page_unlock();
5448 			goto out;
5449 		}
5450 
5451 		if (vm_page_free_wanted_privileged++ == 0) {
5452 			need_wakeup = true;
5453 		}
5454 
5455 		wait_event = (event_t)&vm_page_free_wanted_privileged;
5456 	} else if (vm_page_free_count >= vm_page_free_target) {
5457 		vm_free_page_unlock();
5458 		goto out;
5459 #if CONFIG_SECLUDED_MEMORY
5460 	} else if (secluded_for_apps &&
5461 	    task_can_use_secluded_mem(current_task(), FALSE)) {
5462 #if 00
5463 		/* XXX FBDP: need pageq lock for this... */
5464 		/* XXX FBDP: might wait even if pages available, */
5465 		/* XXX FBDP: hopefully not for too long... */
5466 		if (vm_page_secluded_count > 0) {
5467 			vm_free_page_unlock();
5468 			goto out;
5469 		}
5470 #endif
5471 		if (vm_page_free_wanted_secluded++ == 0) {
5472 			need_wakeup = true;
5473 		}
5474 
5475 		wait_event = (event_t)&vm_page_free_wanted_secluded;
5476 #endif /* CONFIG_SECLUDED_MEMORY */
5477 	} else {
5478 		if (vm_page_free_wanted++ == 0) {
5479 			need_wakeup = true;
5480 		}
5481 
5482 		wait_event = (event_t)&vm_page_free_count;
5483 	}
5484 
5485 #if HAS_MTE
5486 	/*
5487 	 * If we're here, it means that the free taggable count is low.
5488 	 * If there are enough free pages in the system, we can ask the
5489 	 * fill thread to convert some free untagged pages to free tagged
5490 	 * pages. Otherwise, we will wake up pageout_scan(), which will
5491 	 * free pages, and on the free path, the fill thread will get woken up
5492 	 * (see vm_page_free_queue_handle_wakeups_and_unlock()).
5493 	 *
5494 	 * The fill thread will run or not run under a variety of conditions
5495 	 * (see mteinfo_tag_storage_active_should_refill() for more details),
5496 	 * but what's relevant here is that the fill thread will run so long
5497 	 * as there are tagged waiters. We should at least ensure that the
5498 	 * system has enough free untagged memory to service the existing
5499 	 * tagged waiters.
5500 	 */
5501 	if (wakeup_refill_thread) {
5502 		uint32_t total_tagged_waiters = vm_page_free_wanted_tagged_privileged +
5503 		    vm_page_free_wanted_tagged;
5504 		if (_vm_page_wait_wakeup_fill_thread(total_tagged_waiters)) {
5505 			/* Enough free pages to cover the tagged waiters; leave wakeup_refill_thread set. */
5506 		} else {
5507 			/*
5508 			 * Otherwise, wake up pageout_scan(), and the fill thread will
5509 			 * run later.
5510 			 */
5511 			wakeup_refill_thread = false;
5512 			need_wakeup = true;
5513 		}
5514 	}
5515 
5516 #endif /* HAS_MTE */
5517 	if (vm_pageout_running) {
5518 		need_wakeup = false;
5519 	}
5520 
5521 	/*
5522 	 * We don't do a vm_pageout_scan wakeup if we already have
5523 	 * some waiters because vm_pageout_scan checks for waiters
5524 	 * before it returns and does so behind the vm_page_queue_free_lock,
5525 	 * which we own when we bump the waiter counts.
5526 	 */
5527 
5528 	if (vps_dynamic_priority_enabled) {
5529 		/*
5530 		 * We are waking up vm_pageout_scan here. If it needs
5531 		 * the vm_page_queue_free_lock before we unlock it
5532 		 * we'll end up just blocking and incur an extra
5533 		 * context switch. Could be a perf. issue.
5534 		 */
5535 
5536 #if HAS_MTE
5537 		if (cur_thread->page_wait_class != VM_MEMORY_CLASS_REGULAR) {
5538 			panic("vm_page_wait does not support MTE+vps_dynamic_priority_enabled");
5539 		}
5540 #endif /* HAS_MTE */
5541 		if (need_wakeup) {
5542 			thread_wakeup((event_t)&vm_page_free_wanted);
5543 		}
5544 
5545 		/*
5546 		 * LD: This event is going to get recorded every time because
5547 		 * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
5548 		 * We just block in that routine.
5549 		 */
5550 		VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
5551 		    vm_page_free_wanted_privileged,
5552 		    vm_page_free_wanted,
5553 #if CONFIG_SECLUDED_MEMORY
5554 		    vm_page_free_wanted_secluded,
5555 #else /* CONFIG_SECLUDED_MEMORY */
5556 		    0,
5557 #endif /* CONFIG_SECLUDED_MEMORY */
5558 		    0);
5559 		wait_result =  lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
5560 		    LCK_SLEEP_UNLOCK,
5561 		    wait_event,
5562 		    vm_pageout_scan_thread,
5563 		    interruptible,
5564 		    0);
5565 	} else {
5566 		wait_result = assert_wait(wait_event, interruptible);
5567 
5568 		vm_free_page_unlock();
5569 
5570 		if (need_wakeup) {
5571 			thread_wakeup((event_t)&vm_page_free_wanted);
5572 		}
5573 #if HAS_MTE
5574 		if (wakeup_refill_thread) {
5575 			assert(!need_wakeup);
5576 			mteinfo_wake_fill_thread();
5577 		}
5578 #endif /* HAS_MTE */
5579 
5580 		if (wait_result != THREAD_WAITING) {
5581 			goto out;
5582 		}
5583 
5584 #if HAS_MTE
5585 		if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5586 			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5587 			    DBG_VM_PAGE_MTE_WAIT_BLOCK,
5588 			    DBG_FUNC_START,
5589 			    vm_page_free_wanted_tagged_privileged,
5590 			    vm_page_free_wanted_tagged,
5591 			    0,
5592 			    0);
5593 			wait_result = thread_block(THREAD_CONTINUE_NULL);
5594 			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5595 			    DBG_VM_PAGE_MTE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5596 			goto out;
5597 		}
5598 #endif /* HAS_MTE */
5599 
5600 		VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5601 		    DBG_VM_PAGE_WAIT_BLOCK,
5602 		    DBG_FUNC_START,
5603 		    vm_page_free_wanted_privileged,
5604 		    vm_page_free_wanted,
5605 #if CONFIG_SECLUDED_MEMORY
5606 		    vm_page_free_wanted_secluded,
5607 #else /* CONFIG_SECLUDED_MEMORY */
5608 		    0,
5609 #endif /* CONFIG_SECLUDED_MEMORY */
5610 		    0);
5611 		wait_result = thread_block(THREAD_CONTINUE_NULL);
5612 		VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5613 		    DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5614 	}
5615 
5616 out:
5617 #if HAS_MTE
5618 	cur_thread->page_wait_class = VM_MEMORY_CLASS_REGULAR;
5619 #endif /* HAS_MTE */
5620 	return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
5621 }
5622 
5623 /*
5624  *	vm_page_free_prepare:
5625  *
5626  *	Removes page from any queue it may be on
5627  *	and disassociates it from its VM object.
5628  *
5629  *	Object and page queues must be locked prior to entry.
5630  */
5631 static void
5632 vm_page_free_prepare(
5633 	vm_page_t       mem)
5634 {
5635 	vm_page_free_prepare_queues(mem);
5636 	vm_page_free_prepare_object(mem, TRUE);
5637 #if CONFIG_SPTM
5638 	/**
5639 	 * The pmap should retype frames as necessary when pmap_recycle_page()
5640 	 * is called. In order to catch potential cases where this does not
5641 	 * happen, add an appropriate assert here. This code should be
5642 	 * executed on every frame that is about to be released to the VM.
5643 	 */
5644 	const sptm_paddr_t paddr = ((uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)) << PAGE_SHIFT;
5645 	__unused const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
5646 
5647 	assert(frame_type == XNU_DEFAULT);
5648 #endif /* CONFIG_SPTM */
5649 
5650 #if HAS_MTE
5651 	/*
5652 	 * At this point, any busy bit on `mem` has been cleared. If the refill
5653 	 * thread wanted this page, update the cell state from PINNED to CLAIMED.
5654 	 *
5655 	 * We only expect to come through here when swap-ins/outs have erred.
5656 	 */
5657 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && mem->vmp_ts_wanted) {
5658 		mteinfo_tag_storage_wakeup(mem, false);
5659 	}
5660 #endif /* HAS_MTE */
5661 }
5662 
5663 
5664 void
5665 vm_page_free_prepare_queues(
5666 	vm_page_t       mem)
5667 {
5668 	vm_object_t     m_object;
5669 
5670 	VM_PAGE_CHECK(mem);
5671 
5672 	assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
5673 	assert(!mem->vmp_cleaning);
5674 	m_object = VM_PAGE_OBJECT(mem);
5675 
5676 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5677 	if (m_object) {
5678 		vm_object_lock_assert_exclusive(m_object);
5679 	}
5680 	if (mem->vmp_laundry) {
5681 		/*
5682 		 * We may have to free a page while it's being laundered
5683 		 * if we lost its pager (due to a forced unmount, for example).
5684 		 * We need to call vm_pageout_steal_laundry() before removing
5685 		 * the page from its VM object, so that we can remove it
5686 		 * from its pageout queue and adjust the laundry accounting.
5687 		 */
5688 		vm_pageout_steal_laundry(mem, TRUE);
5689 	}
5690 
5691 	vm_page_queues_remove(mem, TRUE);
5692 
5693 	if (mem->vmp_realtime) {
5694 		mem->vmp_realtime = false;
5695 		VM_COUNTER_DEC(&vm_page_realtime_count);
5696 	}
5697 
5698 	if (VM_PAGE_WIRED(mem)) {
5699 		assert(mem->vmp_wire_count > 0);
5700 
5701 		if (m_object) {
5702 			task_t          owner;
5703 			int             ledger_idx_volatile;
5704 			int             ledger_idx_nonvolatile;
5705 			int             ledger_idx_volatile_compressed;
5706 			int             ledger_idx_nonvolatile_compressed;
5707 			int             ledger_idx_composite;
5708 			int             ledger_idx_external_wired;
5709 			boolean_t       do_footprint;
5710 
5711 			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
5712 			VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
5713 			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
5714 
5715 			assert(m_object->resident_page_count >=
5716 			    m_object->wired_page_count);
5717 
5718 			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
5719 				OSAddAtomic(+1, &vm_page_purgeable_count);
5720 				assert(vm_page_purgeable_wired_count > 0);
5721 				OSAddAtomic(-1, &vm_page_purgeable_wired_count);
5722 			}
5723 			if (m_object->internal &&
5724 			    m_object->vo_owner != TASK_NULL &&
5725 			    (m_object->purgable == VM_PURGABLE_VOLATILE ||
5726 			    m_object->purgable == VM_PURGABLE_EMPTY)) {
5727 				owner = VM_OBJECT_OWNER(m_object);
5728 				vm_object_ledger_tag_ledgers(
5729 					m_object,
5730 					&ledger_idx_volatile,
5731 					&ledger_idx_nonvolatile,
5732 					&ledger_idx_volatile_compressed,
5733 					&ledger_idx_nonvolatile_compressed,
5734 					&ledger_idx_composite,
5735 					&ledger_idx_external_wired,
5736 					&do_footprint);
5737 				/*
5738 				 * While wired, this page was accounted
5739 				 * as "non-volatile" but it should now
5740 				 * be accounted as "volatile".
5741 				 */
5742 				/* one less "non-volatile"... */
5743 				ledger_debit(owner->ledger,
5744 				    ledger_idx_nonvolatile,
5745 				    PAGE_SIZE);
5746 				if (do_footprint) {
5747 					/* ... and "phys_footprint" */
5748 					ledger_debit(owner->ledger,
5749 					    task_ledgers.phys_footprint,
5750 					    PAGE_SIZE);
5751 				} else if (ledger_idx_composite != -1) {
5752 					ledger_debit(owner->ledger,
5753 					    ledger_idx_composite,
5754 					    PAGE_SIZE);
5755 				}
5756 				/* one more "volatile" */
5757 				ledger_credit(owner->ledger,
5758 				    ledger_idx_volatile,
5759 				    PAGE_SIZE);
5760 			}
5761 		}
5762 		if (vm_page_is_canonical(mem)) {
5763 			vm_page_wire_count--;
5764 		}
5765 
5766 #if HAS_MTE
5767 		mteinfo_decrement_wire_count(mem, true);
5768 #endif /* HAS_MTE */
5769 
5770 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
5771 		mem->vmp_iopl_wired = false;
5772 		mem->vmp_wire_count = 0;
5773 		assert(!mem->vmp_gobbled);
5774 	} else if (mem->vmp_gobbled) {
5775 		if (vm_page_is_canonical(mem)) {
5776 			vm_page_wire_count--;
5777 		}
5778 		vm_page_gobble_count--;
5779 	}
5780 }
5781 
5782 /*
5783  * like vm_page_init, but we have to preserve fields related to phys page
5784  */
5785 inline static void
5786 vm_page_reset_canonical(vm_page_t mem)
5787 {
5788 	*mem = (struct vm_page){
5789 		.vmp_offset      = (vm_object_offset_t)-1,
5790 		.vmp_q_state     = VM_PAGE_NOT_ON_Q,
5791 		.vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
5792 #if XNU_VM_HAS_LOPAGE
5793 		.vmp_lopage      = mem->vmp_lopage,
5794 #endif /* XNU_VM_HAS_LOPAGE */
5795 		.vmp_canonical   = true,
5796 		.vmp_busy        = true,
5797 		.vmp_realtime    = mem->vmp_realtime,
5798 #if HAS_MTE
5799 		.vmp_using_mte   = mem->vmp_using_mte,
5800 #endif
5801 #if !XNU_VM_HAS_LINEAR_PAGES_ARRAY
5802 		.vmp_phys_page   = mem->vmp_phys_page,
5803 #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */
5804 	};
5805 	/* ECC information lives outside `struct vm_page`, so it is preserved */
5806 }
5807 
5808 void
5809 vm_page_free_prepare_object(vm_page_t mem, boolean_t remove_from_hash)
5810 {
5811 	if (mem->vmp_tabled) {
5812 		vm_page_remove(mem, remove_from_hash);  /* clears tabled, object, offset */
5813 	}
5814 	vm_page_wakeup(VM_OBJECT_NULL, mem);               /* clears wanted */
5815 
5816 	if (vm_page_is_private(mem)) {
5817 		vm_page_reset_private(mem);
5818 	}
5819 	if (vm_page_is_canonical(mem)) {
5820 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
5821 		    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
5822 		    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
5823 		    mem->vmp_next_m == 0);
5824 
5825 		pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5826 
5827 		vm_page_reset_canonical(mem);
5828 	}
5829 }
5830 
5831 /*
5832  *	vm_page_release:
5833  *
5834  *	Return a page to the free list.
5835  *
5836  *	Keep in sync with vm_page_free_list().
5837  */
5838 
5839 void
5840 vm_page_release(vm_page_t mem, vmp_release_options_t options)
5841 {
5842 	if (options & VMP_RELEASE_Q_LOCKED) {
5843 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5844 	} else {
5845 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5846 	}
5847 
5848 	assert(vm_page_is_canonical(mem));
5849 	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
5850 
5851 	if ((options & VMP_RELEASE_SKIP_FREE_CHECK) == 0) {
5852 		pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5853 	}
5854 
5855 	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5856 
5857 
5858 	vm_page_free_queue_enter_list(vm_page_list_for_page(mem), options);
5859 }
5860 
5861 /*
5862  * This version of vm_page_release() is used only at startup
5863  * when we are single-threaded and pages are being released
5864  * for the first time. Hence, no locking or unnecessary checks are made.
5865  * Note: VM_CHECK_MEMORYSTATUS invoked by the caller.
5866  */
5867 void
5868 vm_page_release_startup(vm_page_t mem)
5869 {
5870 #if HAS_MTE
5871 	if (pmap_in_tag_storage_range(VM_PAGE_GET_PHYS_PAGE(mem)) && is_mte_enabled) {
5872 		/*
5873 		 * Add the MTE tag page to the FREE_MTE_TAG queue.  These pages
5874 		 * can be used/claimed for other purposes (other than tag pages)
5875 		 * provided that they can be reclaimed quickly without waiting
5876 		 * on I/O, e.g. readonly/clean file pages.
5877 		 */
5878 		mteinfo_tag_storage_release_startup(mem);
5879 		return;
5880 	}
5881 #endif /* HAS_MTE */
5882 	vm_page_free_queue_enter_list(vm_page_list_for_page(mem),
5883 	    VMP_RELEASE_STARTUP);
5884 }
5885 
5886 /*
5887  *	vm_page_free:
5888  *
5889  *	Returns the given page to the free list,
5890  *	disassociating it with any VM object.
5891  *
5892  *	Object and page queues must be locked prior to entry.
5893  */
5894 void
5895 vm_page_free(vm_page_t mem)
5896 {
5897 	vm_page_free_prepare(mem);
5898 
5899 	if (vm_page_is_canonical(mem)) {
5900 		/* page queues are locked */
5901 		vm_page_release(mem, VMP_RELEASE_Q_LOCKED |
5902 		    VMP_RELEASE_SKIP_FREE_CHECK);
5903 	} else {
5904 		vm_page_release_fictitious(mem);
5905 	}
5906 }
5907 
5908 
5909 void
5910 vm_page_free_unlocked(vm_page_t mem, boolean_t remove_from_hash)
5911 {
5912 	vm_page_lockspin_queues();
5913 	vm_page_free_prepare_queues(mem);
5914 	vm_page_unlock_queues();
5915 
5916 	vm_page_free_prepare_object(mem, remove_from_hash);
5917 
5918 	if (vm_page_is_canonical(mem)) {
5919 		/* page queues are not locked */
5920 		vm_page_release(mem, VMP_RELEASE_SKIP_FREE_CHECK);
5921 	} else {
5922 		vm_page_release_fictitious(mem);
5923 	}
5924 }
5925 
5926 
5927 /*
5928  * Free a list of pages.  The list can be up to several hundred pages,
5929  * as blocked up by vm_pageout_scan().
5930  * The big win is not having to take the free list lock once
5931  * per page.
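 * Internally the list is chopped into batches of VMP_FREE_BATCH_SIZE pages,
 * so the free-queue lock is taken once per batch rather than once per page.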
5932  *
5933  * The VM page queues lock (vm_page_queue_lock) should NOT be held.
5934  * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held.
5935  *
5936  * Keep in sync with vm_page_release().
5937  */
5938 void
5939 vm_page_free_list(vm_page_t freeq, bool prepare_object)
5940 {
5941 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5942 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
5943 
5944 	while (freeq) {
5945 		vm_page_list_t list = { };
5946 
5947 		while (list.vmpl_count < VMP_FREE_BATCH_SIZE && freeq) {
5948 			vm_page_t mem = _vm_page_list_pop(&freeq);
5949 
5950 			assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
5951 			    (mem->vmp_q_state == VM_PAGE_IS_WIRED));
5952 
5953 			if (prepare_object) {
5954 				vm_page_free_prepare_object(mem, TRUE);
5955 			}
5956 
5957 			if (vm_page_is_fictitious(mem)) {
5958 				vm_page_release_fictitious(mem);
5959 				continue;
5960 			}
5961 
5962 			if (!prepare_object) {
5963 				/* vm_page_free_prepare_object() checked it */
5964 				pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5965 			}
5966 
5967 			pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5968 
5969 
5970 			/*
5971 			 * IMPORTANT: we can't set the page "free" here
5972 			 * because that would make the page eligible for
5973 			 * a physically-contiguous allocation (see
5974 			 * vm_page_find_contiguous()) right away (we don't
5975 			 * hold the vm_page_queue_free lock).  That would
5976 			 * cause trouble because the page is not actually
5977 			 * in the free queue yet...
5978 			 */
5979 
5980 			vm_page_list_push(&list, mem);
5981 		}
5982 
5983 		if (list.vmpl_count) {
5984 			vm_page_free_queue_enter_list(list, VMP_RELEASE_NONE);
5985 		}
5986 	}
5987 }
5988 
5989 
5990 /*
5991  *	vm_page_wire:
5992  *
5993  *	Mark this page as wired down by yet
5994  *	another map, removing it from paging queues
5995  *	as necessary.
5996  *
5997  *	The page's object and the page queues must be locked.
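 *
 *	Wirings nest: each call bumps vmp_wire_count, and the page only becomes
 *	pageable again after a matching number of vm_page_unwire() calls.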
5998  */
5999 
6000 
6001 void
6002 vm_page_wire(
6003 	vm_page_t mem,
6004 	vm_tag_t           tag,
6005 	boolean_t          check_memorystatus)
6006 {
6007 	vm_object_t     m_object;
6008 
6009 	m_object = VM_PAGE_OBJECT(mem);
6010 
6011 //	dbgLog(current_thread(), mem->vmp_offset, m_object, 1);	/* (TEST/DEBUG) */
6012 
6013 	VM_PAGE_CHECK(mem);
6014 	if (m_object) {
6015 		vm_object_lock_assert_exclusive(m_object);
6016 	} else {
6017 		/*
6018 		 * In theory, the page should be in an object before it
6019 		 * gets wired, since we need to hold the object lock
6020 		 * to update some fields in the page structure.
6021 		 * However, some code (i386 pmap, for example) might want
6022 		 * to wire a page before it gets inserted into an object.
6023 		 * That's somewhat OK, as long as nobody else can get to
6024 		 * that page and update it at the same time.
6025 		 */
6026 	}
6027 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6028 	if (!VM_PAGE_WIRED(mem)) {
6029 		if (mem->vmp_laundry) {
6030 			vm_pageout_steal_laundry(mem, TRUE);
6031 		}
6032 
6033 		vm_page_queues_remove(mem, TRUE);
6034 
6035 		assert(mem->vmp_wire_count == 0);
6036 		mem->vmp_q_state = VM_PAGE_IS_WIRED;
6037 
6038 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
6039 		if (mem->vmp_unmodified_ro == true) {
6040 			/* Object and PageQ locks are held*/
6041 			mem->vmp_unmodified_ro = false;
6042 			os_atomic_dec(&compressor_ro_uncompressed, relaxed);
6043 			vm_object_compressor_pager_state_clr(VM_PAGE_OBJECT(mem), mem->vmp_offset);
6044 		}
6045 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
6046 
6047 		if (m_object) {
6048 			task_t          owner;
6049 			int             ledger_idx_volatile;
6050 			int             ledger_idx_nonvolatile;
6051 			int             ledger_idx_volatile_compressed;
6052 			int             ledger_idx_nonvolatile_compressed;
6053 			int             ledger_idx_composite;
6054 			int             ledger_idx_external_wired;
6055 			boolean_t       do_footprint;
6056 
6057 			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6058 			VM_OBJECT_WIRED_PAGE_ADD(m_object, mem);
6059 			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, tag);
6060 
6061 			assert(m_object->resident_page_count >=
6062 			    m_object->wired_page_count);
6063 			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6064 				assert(vm_page_purgeable_count > 0);
6065 				OSAddAtomic(-1, &vm_page_purgeable_count);
6066 				OSAddAtomic(1, &vm_page_purgeable_wired_count);
6067 			}
6068 			if (m_object->internal &&
6069 			    m_object->vo_owner != TASK_NULL &&
6070 			    (m_object->purgable == VM_PURGABLE_VOLATILE ||
6071 			    m_object->purgable == VM_PURGABLE_EMPTY)) {
6072 				owner = VM_OBJECT_OWNER(m_object);
6073 				vm_object_ledger_tag_ledgers(
6074 					m_object,
6075 					&ledger_idx_volatile,
6076 					&ledger_idx_nonvolatile,
6077 					&ledger_idx_volatile_compressed,
6078 					&ledger_idx_nonvolatile_compressed,
6079 					&ledger_idx_composite,
6080 					&ledger_idx_external_wired,
6081 					&do_footprint);
6082 				/* less volatile bytes */
6083 				ledger_debit(owner->ledger,
6084 				    ledger_idx_volatile,
6085 				    PAGE_SIZE);
6086 				/* more not-quite-volatile bytes */
6087 				ledger_credit(owner->ledger,
6088 				    ledger_idx_nonvolatile,
6089 				    PAGE_SIZE);
6090 				if (do_footprint) {
6091 					/* more footprint */
6092 					ledger_credit(owner->ledger,
6093 					    task_ledgers.phys_footprint,
6094 					    PAGE_SIZE);
6095 				} else if (ledger_idx_composite != -1) {
6096 					ledger_credit(owner->ledger,
6097 					    ledger_idx_composite,
6098 					    PAGE_SIZE);
6099 				}
6100 			}
6101 
6102 			if (m_object->all_reusable) {
6103 				/*
6104 				 * Wired pages are not counted as "re-usable"
6105 				 * in "all_reusable" VM objects, so nothing
6106 				 * to do here.
6107 				 */
6108 			} else if (mem->vmp_reusable) {
6109 				/*
6110 				 * This page is not "re-usable" when it's
6111 				 * wired, so adjust its state and the
6112 				 * accounting.
6113 				 */
6114 				vm_page_lockconvert_queues();
6115 				vm_object_reuse_pages(m_object,
6116 				    mem->vmp_offset,
6117 				    mem->vmp_offset + PAGE_SIZE_64,
6118 				    FALSE);
6119 			}
6120 		}
6121 		assert(!mem->vmp_reusable);
6122 
6123 		if (vm_page_is_canonical(mem) && !mem->vmp_gobbled) {
6124 			vm_page_wire_count++;
6125 		}
6126 		if (mem->vmp_gobbled) {
6127 			vm_page_gobble_count--;
6128 		}
6129 		mem->vmp_gobbled = FALSE;
6130 
6131 		if (check_memorystatus == TRUE) {
6132 			VM_CHECK_MEMORYSTATUS;
6133 		}
6134 	}
6135 	assert(!mem->vmp_gobbled);
6136 	assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
6137 	mem->vmp_wire_count++;
6138 
6139 #if HAS_MTE
6140 	if (mem->vmp_wire_count == 1 && tag != VM_KERN_MEMORY_MTAG) {
6141 		/*
6142 		 * Only notify Mte Info if the caller isn't
6143 		 * mteinfo_tag_storage_wire_locked().
6144 		 */
6145 		mteinfo_increment_wire_count(mem);
6146 	}
6147 #endif /* HAS_MTE */
6148 
6149 	if (__improbable(mem->vmp_wire_count == 0)) {
6150 		panic("vm_page_wire(%p): wire_count overflow", mem);
6151 	}
6152 	VM_PAGE_CHECK(mem);
6153 }
6154 
6155 /*
6156  *	vm_page_unwire:
6157  *
6158  *	Release one wiring of this page, potentially
6159  *	enabling it to be paged again.
6160  *
6161  *	The page's object and the page queues must be locked.
6162  */
6163 void
6164 vm_page_unwire(
6165 	vm_page_t       mem,
6166 	boolean_t       queueit)
6167 {
6168 	vm_object_t     m_object;
6169 
6170 	m_object = VM_PAGE_OBJECT(mem);
6171 
6172 //	dbgLog(current_thread(), mem->vmp_offset, m_object, 0);	/* (TEST/DEBUG) */
6173 
6174 	VM_PAGE_CHECK(mem);
6175 	assert(VM_PAGE_WIRED(mem));
6176 	assert(mem->vmp_wire_count > 0);
6177 	assert(!mem->vmp_gobbled);
6178 	assert(m_object != VM_OBJECT_NULL);
6179 	vm_object_lock_assert_exclusive(m_object);
6180 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6181 	if (--mem->vmp_wire_count == 0) {
6182 		task_t          owner;
6183 		int             ledger_idx_volatile;
6184 		int             ledger_idx_nonvolatile;
6185 		int             ledger_idx_volatile_compressed;
6186 		int             ledger_idx_nonvolatile_compressed;
6187 		int             ledger_idx_composite;
6188 		int             ledger_idx_external_wired;
6189 		boolean_t       do_footprint;
6190 
6191 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
6192 		mem->vmp_iopl_wired = false;
6193 
6194 		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6195 		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
6196 		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
6197 		if (vm_page_is_canonical(mem)) {
6198 			vm_page_wire_count--;
6199 		}
6200 
6201 #if HAS_MTE
6202 		mteinfo_decrement_wire_count(mem, true);
6203 #endif /* HAS_MTE */
6204 
6205 		assert(m_object->resident_page_count >=
6206 		    m_object->wired_page_count);
6207 		if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6208 			OSAddAtomic(+1, &vm_page_purgeable_count);
6209 			assert(vm_page_purgeable_wired_count > 0);
6210 			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
6211 		}
6212 		if (m_object->internal &&
6213 		    m_object->vo_owner != TASK_NULL &&
6214 		    (m_object->purgable == VM_PURGABLE_VOLATILE ||
6215 		    m_object->purgable == VM_PURGABLE_EMPTY)) {
6216 			owner = VM_OBJECT_OWNER(m_object);
6217 			vm_object_ledger_tag_ledgers(
6218 				m_object,
6219 				&ledger_idx_volatile,
6220 				&ledger_idx_nonvolatile,
6221 				&ledger_idx_volatile_compressed,
6222 				&ledger_idx_nonvolatile_compressed,
6223 				&ledger_idx_composite,
6224 				&ledger_idx_external_wired,
6225 				&do_footprint);
6226 			/* more volatile bytes */
6227 			ledger_credit(owner->ledger,
6228 			    ledger_idx_volatile,
6229 			    PAGE_SIZE);
6230 			/* less not-quite-volatile bytes */
6231 			ledger_debit(owner->ledger,
6232 			    ledger_idx_nonvolatile,
6233 			    PAGE_SIZE);
6234 			if (do_footprint) {
6235 				/* less footprint */
6236 				ledger_debit(owner->ledger,
6237 				    task_ledgers.phys_footprint,
6238 				    PAGE_SIZE);
6239 			} else if (ledger_idx_composite != -1) {
6240 				ledger_debit(owner->ledger,
6241 				    ledger_idx_composite,
6242 				    PAGE_SIZE);
6243 			}
6244 		}
6245 		assert(!is_kernel_object(m_object));
6246 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
6247 
6248 		if (queueit == TRUE) {
6249 			if (m_object->purgable == VM_PURGABLE_EMPTY) {
6250 				vm_page_deactivate(mem);
6251 			} else {
6252 				vm_page_activate(mem);
6253 			}
6254 		}
6255 
6256 		VM_CHECK_MEMORYSTATUS;
6257 	}
6258 	VM_PAGE_CHECK(mem);
6259 }
6260 
6261 /*
6262  *	vm_page_deactivate:
6263  *
6264  *	Returns the given page to the inactive list,
6265  *	indicating that no physical maps have access
6266  *	to this page.  [Used by the physical mapping system.]
6267  *
6268  *	The page queues must be locked.
6269  */
6270 void
6271 vm_page_deactivate(
6272 	vm_page_t       m)
6273 {
6274 	vm_page_deactivate_internal(m, TRUE);
6275 }
6276 
6277 
6278 void
6279 vm_page_deactivate_internal(
6280 	vm_page_t       m,
6281 	boolean_t       clear_hw_reference)
6282 {
6283 	vm_object_t     m_object;
6284 
6285 	m_object = VM_PAGE_OBJECT(m);
6286 
6287 	VM_PAGE_CHECK(m);
6288 	assert(!is_kernel_object(m_object));
6289 	assert(!vm_page_is_guard(m));
6290 
6291 //	dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6);	/* (TEST/DEBUG) */
6292 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6293 	/*
6294 	 *	This page is no longer very interesting.  If it was
6295 	 *	interesting (active or inactive/referenced), then we
6296 	 *	clear the reference bit and (re)enter it in the
6297 	 *	inactive queue.  Note wired pages should not have
6298 	 *	their reference bit cleared.
6299 	 */
6300 	assert( !(m->vmp_absent && !m->vmp_unusual));
6301 
6302 	if (m->vmp_gobbled) {           /* can this happen? */
6303 		assert( !VM_PAGE_WIRED(m));
6304 
6305 		if (vm_page_is_canonical(m)) {
6306 			vm_page_wire_count--;
6307 		}
6308 		vm_page_gobble_count--;
6309 		m->vmp_gobbled = FALSE;
6310 	}
6311 	/*
6312 	 * if this page is currently on the pageout queue, we can't do the
6313 	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6314 	 * and we can't remove it manually since we would need the object lock
6315 	 * (which is not required here) to decrement the activity_in_progress
6316 	 * reference which is held on the object while the page is in the pageout queue...
6317 	 * just let the normal laundry processing proceed
6318 	 */
6319 	if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6320 	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6321 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
6322 	    VM_PAGE_WIRED(m)) {
6323 		return;
6324 	}
6325 	if (!m->vmp_absent && clear_hw_reference == TRUE) {
6326 		vm_page_lockconvert_queues();
6327 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
6328 	}
6329 
6330 	m->vmp_reference = FALSE;
6331 	m->vmp_no_cache = FALSE;
6332 
6333 	if (!VM_PAGE_INACTIVE(m)) {
6334 		vm_page_queues_remove(m, FALSE);
6335 
6336 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
6337 		    m->vmp_dirty && m_object->internal &&
6338 		    (m_object->purgable == VM_PURGABLE_DENY ||
6339 		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
6340 		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
6341 			vm_page_check_pageable_safe(m);
6342 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
6343 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
6344 			vm_page_throttled_count++;
6345 		} else {
6346 			if (m_object->named &&
6347 			    os_ref_get_count_raw(&m_object->ref_count) == 1) {
6348 				vm_page_speculate(m, FALSE);
6349 #if DEVELOPMENT || DEBUG
6350 				vm_page_speculative_recreated++;
6351 #endif
6352 			} else {
6353 				vm_page_enqueue_inactive(m, FALSE);
6354 			}
6355 		}
6356 	}
6357 }
6358 
6359 /*
6360  * vm_page_enqueue_cleaned
6361  *
6362  * Put the page on the cleaned queue, mark it cleaned, etc.
6363  * Being on the cleaned queue (and having m->clean_queue set)
6364  * does ** NOT ** guarantee that the page is clean!
6365  *
6366  * Call with the queues lock held.
6367  */
6368 
6369 void
6370 vm_page_enqueue_cleaned(vm_page_t m)
6371 {
6372 	vm_object_t     m_object;
6373 
6374 	m_object = VM_PAGE_OBJECT(m);
6375 
6376 	assert(!vm_page_is_guard(m));
6377 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6378 	assert(!(m->vmp_absent && !m->vmp_unusual));
6379 
6380 	if (VM_PAGE_WIRED(m)) {
6381 		return;
6382 	}
6383 
6384 	if (m->vmp_gobbled) {
6385 		if (vm_page_is_canonical(m)) {
6386 			vm_page_wire_count--;
6387 		}
6388 		vm_page_gobble_count--;
6389 		m->vmp_gobbled = FALSE;
6390 	}
6391 	/*
6392 	 * if this page is currently on the pageout queue, we can't do the
6393 	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6394 	 * and we can't remove it manually since we would need the object lock
6395 	 * (which is not required here) to decrement the activity_in_progress
6396 	 * reference which is held on the object while the page is in the pageout queue...
6397 	 * just let the normal laundry processing proceed
6398 	 */
6399 	if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6400 	    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
6401 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6402 		return;
6403 	}
6404 	vm_page_queues_remove(m, FALSE);
6405 
6406 	vm_page_check_pageable_safe(m);
6407 	vm_page_queue_enter(&vm_page_queue_cleaned, m, vmp_pageq);
6408 	m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
6409 	vm_page_cleaned_count++;
6410 
6411 	vm_page_inactive_count++;
6412 	if (m_object->internal) {
6413 		vm_page_pageable_internal_count++;
6414 	} else {
6415 		vm_page_pageable_external_count++;
6416 	}
6417 	vm_page_add_to_specialq(m, TRUE);
6418 	VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
6419 }
6420 
6421 /*
6422  *	vm_page_activate:
6423  *
6424  *	Put the specified page on the active list (if appropriate).
6425  *
6426  *	The page queues must be locked.
6427  */
6428 
6429 void
6430 vm_page_activate(
6431 	vm_page_t       m)
6432 {
6433 	vm_object_t     m_object;
6434 
6435 	m_object = VM_PAGE_OBJECT(m);
6436 
6437 	VM_PAGE_CHECK(m);
6438 #ifdef  FIXME_4778297
6439 	assert(!is_kernel_object(m_object));
6440 #endif
6441 	assert(!vm_page_is_guard(m));
6442 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6443 	assert( !(m->vmp_absent && !m->vmp_unusual));
6444 
6445 	if (m->vmp_gobbled) {
6446 		assert( !VM_PAGE_WIRED(m));
6447 		if (vm_page_is_canonical(m)) {
6448 			vm_page_wire_count--;
6449 		}
6450 		vm_page_gobble_count--;
6451 		m->vmp_gobbled = FALSE;
6452 	}
6453 	/*
6454 	 * if this page is currently on the pageout queue, we can't do the
6455 	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6456 	 * and we can't remove it manually since we would need the object lock
6457 	 * (which is not required here) to decrement the activity_in_progress
6458 	 * reference which is held on the object while the page is in the pageout queue...
6459 	 * just let the normal laundry processing proceed
6460 	 */
6461 	if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6462 	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6463 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6464 		return;
6465 	}
6466 
6467 #if DEBUG
6468 	if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) {
6469 		panic("vm_page_activate: already active");
6470 	}
6471 #endif
6472 
6473 	if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
6474 		DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
6475 		DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
6476 	}
6477 
6478 	/*
6479 	 * A freshly activated page should be promoted in the donation queue,
6480 	 * so we remove it here (preserving its hint) and re-enqueue it in
6481 	 * vm_page_enqueue_active().
6482 	 */
6483 	vm_page_queues_remove(m, ((m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) ? TRUE : FALSE));
6484 
6485 	if (!VM_PAGE_WIRED(m)) {
6486 		vm_page_check_pageable_safe(m);
6487 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
6488 		    m->vmp_dirty && m_object->internal &&
6489 		    (m_object->purgable == VM_PURGABLE_DENY ||
6490 		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
6491 		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
6492 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
6493 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
6494 			vm_page_throttled_count++;
6495 		} else {
6496 #if CONFIG_SECLUDED_MEMORY
6497 			if (secluded_for_filecache &&
6498 			    vm_page_secluded_target != 0 &&
6499 			    num_tasks_can_use_secluded_mem == 0 &&
6500 			    m_object->eligible_for_secluded &&
6501 			    !m->vmp_realtime) {
6502 				vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
6503 				m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
6504 				vm_page_secluded_count++;
6505 				VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
6506 				vm_page_secluded_count_inuse++;
6507 				assert(!m_object->internal);
6508 //				vm_page_pageable_external_count++;
6509 			} else
6510 #endif /* CONFIG_SECLUDED_MEMORY */
6511 			vm_page_enqueue_active(m, FALSE);
6512 		}
6513 		m->vmp_reference = TRUE;
6514 		m->vmp_no_cache = FALSE;
6515 	}
6516 	VM_PAGE_CHECK(m);
6517 }
6518 
6519 
6520 /*
6521  *      vm_page_speculate:
6522  *
6523  *      Put the specified page on the speculative list (if appropriate).
6524  *
6525  *      The page queues must be locked.
6526  */
6527 void
6528 vm_page_speculate(
6529 	vm_page_t       m,
6530 	boolean_t       new)
6531 {
6532 	struct vm_speculative_age_q     *aq;
6533 	vm_object_t     m_object;
6534 
6535 	m_object = VM_PAGE_OBJECT(m);
6536 
6537 	VM_PAGE_CHECK(m);
6538 	vm_page_check_pageable_safe(m);
6539 
6540 	assert(!vm_page_is_guard(m));
6541 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6542 	assert(!(m->vmp_absent && !m->vmp_unusual));
6543 	assert(m_object->internal == FALSE);
6544 
6545 	/*
6546 	 * if this page is currently on the pageout queue, we can't do the
6547 	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6548 	 * and we can't remove it manually since we would need the object lock
6549 	 * (which is not required here) to decrement the activity_in_progress
6550 	 * reference which is held on the object while the page is in the pageout queue...
6551 	 * just let the normal laundry processing proceed
6552 	 */
6553 	if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6554 	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6555 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6556 		return;
6557 	}
6558 
6559 	vm_page_queues_remove(m, FALSE);
6560 
6561 	if (!VM_PAGE_WIRED(m)) {
6562 		mach_timespec_t         ts;
6563 		clock_sec_t sec;
6564 		clock_nsec_t nsec;
6565 
6566 		clock_get_system_nanotime(&sec, &nsec);
6567 		ts.tv_sec = (unsigned int) sec;
6568 		ts.tv_nsec = nsec;
6569 
6570 		if (vm_page_speculative_count == 0) {
6571 			speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6572 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6573 
6574 			aq = &vm_page_queue_speculative[speculative_age_index];
6575 
6576 			/*
6577 			 * set the timer to begin a new group
6578 			 */
6579 			aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6580 			aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6581 
6582 			ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6583 		} else {
6584 			aq = &vm_page_queue_speculative[speculative_age_index];
6585 
6586 			if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
6587 				speculative_age_index++;
6588 
6589 				if (speculative_age_index > vm_page_max_speculative_age_q) {
6590 					speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6591 				}
6592 				if (speculative_age_index == speculative_steal_index) {
6593 					speculative_steal_index = speculative_age_index + 1;
6594 
6595 					if (speculative_steal_index > vm_page_max_speculative_age_q) {
6596 						speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6597 					}
6598 				}
6599 				aq = &vm_page_queue_speculative[speculative_age_index];
6600 
6601 				if (!vm_page_queue_empty(&aq->age_q)) {
6602 					vm_page_speculate_ageit(aq);
6603 				}
6604 
6605 				aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6606 				aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6607 
6608 				ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6609 			}
6610 		}
6611 		vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
6612 		m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
6613 		vm_page_speculative_count++;
6614 		vm_page_pageable_external_count++;
6615 
6616 		if (new == TRUE) {
6617 			vm_object_lock_assert_exclusive(m_object);
6618 
6619 			m_object->pages_created++;
6620 #if DEVELOPMENT || DEBUG
6621 			vm_page_speculative_created++;
6622 #endif
6623 		}
6624 	}
6625 	VM_PAGE_CHECK(m);
6626 }
6627 
6628 
6629 /*
6630  * move pages from the specified aging bin to
6631  * the speculative bin that pageout_scan claims from
6632  *
6633  *      The page queues must be locked.
6634  */
6635 void
6636 vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
6637 {
6638 	struct vm_speculative_age_q     *sq;
6639 	vm_page_t       t;
6640 
6641 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
6642 
6643 	if (vm_page_queue_empty(&sq->age_q)) {
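		/*
		 * Aged queue is empty: adopt aq's list wholesale and repoint
		 * the head and tail back-links at sq's queue head.
		 */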
6644 		sq->age_q.next = aq->age_q.next;
6645 		sq->age_q.prev = aq->age_q.prev;
6646 
6647 		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
6648 		t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);
6649 
6650 		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6651 		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6652 	} else {
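		/*
		 * Otherwise append aq's list to the tail of the existing aged
		 * queue, fixing up the packed prev/next pointers at the join.
		 */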
6653 		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6654 		t->vmp_pageq.next = aq->age_q.next;
6655 
6656 		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
6657 		t->vmp_pageq.prev = sq->age_q.prev;
6658 
6659 		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
6660 		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6661 
6662 		sq->age_q.prev = aq->age_q.prev;
6663 	}
6664 	vm_page_queue_init(&aq->age_q);
6665 }
6666 
6667 
6668 void
6669 vm_page_lru(
6670 	vm_page_t       m)
6671 {
6672 	VM_PAGE_CHECK(m);
6673 	assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
6674 	assert(!vm_page_is_guard(m));
6675 
6676 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6677 
6678 	if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
6679 		/*
6680 		 * we don't need to do all the other work that
6681 		 * vm_page_queues_remove and vm_page_enqueue_inactive
6682 		 * bring along for the ride
6683 		 */
6684 		assert(!m->vmp_laundry);
6685 		assert(!vm_page_is_private(m));
6686 
6687 		m->vmp_no_cache = FALSE;
6688 
6689 		vm_page_queue_remove(&vm_page_queue_inactive, m, vmp_pageq);
6690 		vm_page_queue_enter(&vm_page_queue_inactive, m, vmp_pageq);
6691 
6692 		return;
6693 	}
6694 	/*
6695 	 * if this page is currently on the pageout queue, we can't do the
6696 	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6697 	 * and we can't remove it manually since we would need the object lock
6698 	 * (which is not required here) to decrement the activity_in_progress
6699 	 * reference which is held on the object while the page is in the pageout queue...
6700 	 * just let the normal laundry processing proceed
6701 	 */
6702 	if (m->vmp_laundry || vm_page_is_private(m) ||
6703 	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6704 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
6705 	    VM_PAGE_WIRED(m)) {
6706 		return;
6707 	}
6708 
6709 	m->vmp_no_cache = FALSE;
6710 
6711 	vm_page_queues_remove(m, FALSE);
6712 
6713 	vm_page_enqueue_inactive(m, FALSE);
6714 }
6715 
6716 
6717 void
6718 vm_page_reactivate_all_throttled(void)
6719 {
6720 	vm_page_t       first_throttled, last_throttled;
6721 	vm_page_t       first_active;
6722 	vm_page_t       m;
6723 	int             extra_active_count;
6724 	int             extra_internal_count, extra_external_count;
6725 	vm_object_t     m_object;
6726 
6727 	if (!VM_DYNAMIC_PAGING_ENABLED()) {
6728 		return;
6729 	}
6730 
6731 	extra_active_count = 0;
6732 	extra_internal_count = 0;
6733 	extra_external_count = 0;
6734 	vm_page_lock_queues();
6735 	if (!vm_page_queue_empty(&vm_page_queue_throttled)) {
6736 		/*
6737 		 * Switch "throttled" pages to "active".
6738 		 */
6739 		vm_page_queue_iterate(&vm_page_queue_throttled, m, vmp_pageq) {
6740 			VM_PAGE_CHECK(m);
6741 			assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
6742 
6743 			m_object = VM_PAGE_OBJECT(m);
6744 
6745 			extra_active_count++;
6746 			if (m_object->internal) {
6747 				extra_internal_count++;
6748 			} else {
6749 				extra_external_count++;
6750 			}
6751 
6752 			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
6753 			VM_PAGE_CHECK(m);
6754 			vm_page_add_to_specialq(m, FALSE);
6755 		}
6756 
6757 		/*
6758 		 * Transfer the entire throttled queue to a regular LRU page queues.
6759 		 * We insert it at the head of the active queue, so that these pages
6760 		 * get re-evaluated by the LRU algorithm first, since they've been
6761 		 * completely out of it until now.
6762 		 */
6763 		first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
6764 		last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled);
6765 		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
6766 		if (vm_page_queue_empty(&vm_page_queue_active)) {
6767 			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
6768 		} else {
6769 			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
6770 		}
6771 		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
6772 		first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
6773 		last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
6774 
6775 #if DEBUG
6776 		printf("reactivated %d throttled pages\n", vm_page_throttled_count);
6777 #endif
6778 		vm_page_queue_init(&vm_page_queue_throttled);
6779 		/*
6780 		 * Adjust the global page counts.
6781 		 */
6782 		vm_page_active_count += extra_active_count;
6783 		vm_page_pageable_internal_count += extra_internal_count;
6784 		vm_page_pageable_external_count += extra_external_count;
6785 		vm_page_throttled_count = 0;
6786 	}
6787 	assert(vm_page_throttled_count == 0);
6788 	assert(vm_page_queue_empty(&vm_page_queue_throttled));
6789 	vm_page_unlock_queues();
6790 }
6791 
6792 
6793 /*
6794  * move pages from the indicated local queue to the global active queue
6795  * it's OK to fail if we're below the hard limit and force == FALSE
6796  * the nolocks == TRUE case is to allow this function to be run on
6797  * the hibernate path
6798  */
6799 
6800 void
6801 vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
6802 {
6803 	struct vpl      *lq;
6804 	vm_page_t       first_local, last_local;
6805 	vm_page_t       first_active;
6806 	vm_page_t       m;
6807 	uint32_t        count = 0;
6808 
6809 	if (vm_page_local_q == NULL) {
6810 		return;
6811 	}
6812 
6813 	lq = zpercpu_get_cpu(vm_page_local_q, lid);
6814 
6815 	if (nolocks == FALSE) {
6816 		if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
6817 			if (!vm_page_trylockspin_queues()) {
6818 				return;
6819 			}
6820 		} else {
6821 			vm_page_lockspin_queues();
6822 		}
6823 
6824 		VPL_LOCK(&lq->vpl_lock);
6825 	}
6826 	if (lq->vpl_count) {
6827 		/*
6828 		 * Switch "local" pages to "active".
6829 		 */
6830 		assert(!vm_page_queue_empty(&lq->vpl_queue));
6831 
6832 		vm_page_queue_iterate(&lq->vpl_queue, m, vmp_pageq) {
6833 			VM_PAGE_CHECK(m);
6834 			vm_page_check_pageable_safe(m);
6835 			assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
6836 			assert(!vm_page_is_fictitious(m));
6837 
6838 			if (m->vmp_local_id != lid) {
6839 				panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
6840 			}
6841 
6842 			m->vmp_local_id = 0;
6843 			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
6844 			VM_PAGE_CHECK(m);
6845 			vm_page_add_to_specialq(m, FALSE);
6846 			count++;
6847 		}
6848 		if (count != lq->vpl_count) {
6849 			panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d", count, lq->vpl_count);
6850 		}
6851 
6852 		/*
6853 		 * Transfer the entire local queue to the regular LRU page queues.
6854 		 */
6855 		first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue);
6856 		last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue);
6857 		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
6858 
6859 		if (vm_page_queue_empty(&vm_page_queue_active)) {
6860 			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6861 		} else {
6862 			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6863 		}
6864 		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
6865 		first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
6866 		last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
6867 
6868 		vm_page_queue_init(&lq->vpl_queue);
6869 		/*
6870 		 * Adjust the global page counts.
6871 		 */
6872 		vm_page_active_count += lq->vpl_count;
6873 		vm_page_pageable_internal_count += lq->vpl_internal_count;
6874 		vm_page_pageable_external_count += lq->vpl_external_count;
6875 		lq->vpl_count = 0;
6876 		lq->vpl_internal_count = 0;
6877 		lq->vpl_external_count = 0;
6878 	}
6879 	assert(vm_page_queue_empty(&lq->vpl_queue));
6880 
6881 	if (nolocks == FALSE) {
6882 		VPL_UNLOCK(&lq->vpl_lock);
6883 
6884 		vm_page_balance_inactive(count / 4);
6885 		vm_page_unlock_queues();
6886 	}
6887 }
6888 
6889 /*
6890  *	vm_page_part_zero_fill:
6891  *
6892  *	Zero-fill a part of the page.
6893  */
6894 #define PMAP_ZERO_PART_PAGE_IMPLEMENTED
6895 void
6896 vm_page_part_zero_fill(
6897 	vm_page_t       m,
6898 	vm_offset_t     m_pa,
6899 	vm_size_t       len)
6900 {
6901 #if 0
6902 	/*
6903 	 * we don't hold the page queue lock
6904 	 * so this check isn't safe to make
6905 	 */
6906 	VM_PAGE_CHECK(m);
6907 #endif
6908 
6909 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
6910 	pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len);
6911 #else
6912 	vm_page_t       tmp;
6913 	while (1) {
6914 		tmp = vm_page_grab();
6915 		if (tmp == VM_PAGE_NULL) {
6916 			vm_page_wait(THREAD_UNINT);
6917 			continue;
6918 		}
6919 		break;
6920 	}
6921 	vm_page_zero_fill(
6922 		tmp
6923 #if HAS_MTE
6924 		, false /* zero_tags */
6925 #endif /* HAS_MTE */
6926 		);
6927 	if (m_pa != 0) {
6928 		vm_page_part_copy(m, 0, tmp, 0, m_pa);
6929 	}
6930 	if ((m_pa + len) < PAGE_SIZE) {
6931 		vm_page_part_copy(m, m_pa + len, tmp,
6932 		    m_pa + len, PAGE_SIZE - (m_pa + len));
6933 	}
6934 	vm_page_copy(tmp, m);
6935 	VM_PAGE_FREE(tmp);
6936 #endif
6937 }
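/*
 * A minimal usage sketch: a typical caller zeroes the portion of a page that
 * has no backing data, e.g. the tail beyond 'valid_bytes' (a hypothetical
 * count of bytes already initialized in the page):
 *
 *	vm_page_part_zero_fill(m, valid_bytes, PAGE_SIZE - valid_bytes);
 */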
6938 
6939 /*!
6940  * @function vm_page_zero_fill
6941  *
6942  * @abstract
6943  * Zero-fill the specified page.
6944  *
6945  * @param m				the page to be zero-filled.
6946  */
6947 #if HAS_MTE && !defined(KASAN)
6948 /*!
6949  * @param zero_tags     if true, and the page is MTE-tagged, its corresponding tags will be zeroed.
6950  */
6951 #endif /* HAS_MTE && !defined(KASAN) */
6952 void
6953 vm_page_zero_fill(
6954 	vm_page_t       m
6955 #if HAS_MTE
6956 	, bool zero_tags
6957 #endif /* HAS_MTE */
6958 	)
6959 {
6960 	int options = 0;
6961 #if 0
6962 	/*
6963 	 * we don't hold the page queue lock
6964 	 * so this check isn't safe to make
6965 	 */
6966 	VM_PAGE_CHECK(m);
6967 #endif
6968 
6969 //	dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0);		/* (BRINGUP) */
6970 #if HAS_MTE
6971 	assert(!zero_tags || VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6972 
6973 	/*
6974 	 *       TODO: this can be checked more easily using m->vmp_using_mte once
6975 	 *       page reclamation work is complete
6976 	 */
6977 	if (zero_tags && vm_object_is_mte_mappable(VM_PAGE_OBJECT(m))) {
6978 		options = cppvZeroPageTags;
6979 		KDBG(VMDBG_CODE(DBG_VM_PAGE_MTE_ZFOD) | DBG_FUNC_NONE,
6980 		    VM_KERNEL_ADDRHIDE(m), VM_KERNEL_ADDRHIDE(VM_PAGE_OBJECT(m)),
6981 		    m->vmp_offset);
6982 	}
6983 #endif /* HAS_MTE */
6984 	pmap_zero_page_with_options(VM_PAGE_GET_PHYS_PAGE(m), options);
6985 }
6986 
6987 /*
6988  *	vm_page_part_copy:
6989  *
6990  *	copy part of one page to another
6991  *
6992  *	This function is currently only consumed downstream of a
6993  *	vm_map_copy_overwrite(). The implementation has a simpler contract
6994  *	than vm_page_copy() as there's a restricted set of cases that
6995  *	are allowed to be overwriteable. If vm_map_entry_is_overwriteable()
6996  *	is expanded, this function may have to be adjusted.
6997  */
6998 void
6999 vm_page_part_copy(
7000 	vm_page_t       src_m,
7001 	vm_offset_t     src_pa,
7002 	vm_page_t       dst_m,
7003 	vm_offset_t     dst_pa,
7004 	vm_size_t       len)
7005 {
7006 #if 0
7007 	/*
7008 	 * we don't hold the page queue lock
7009 	 * so this check isn't safe to make
7010 	 */
7011 	VM_PAGE_CHECK(src_m);
7012 	VM_PAGE_CHECK(dst_m);
7013 #endif
7014 
7015 	/*
7016 	 * Copying from/into restricted pages is a security issue,
7017 	 * as it allows the restricted pages' policies to be bypassed.
7018 	 */
7019 	if (vm_page_is_restricted(src_m)) {
7020 		panic("%s: cannot copy from a restricted page", __func__);
7021 	}
7022 
7023 	if (vm_page_is_restricted(dst_m)) {
7024 		panic("%s: cannot copy into a restricted page", __func__);
7025 	}
7026 
7027 #if HAS_MTE
7028 	/*
7029 	 * As an example of a necessary expansion for vm_page_part_copy(),
7030 	 * MTE objects are currently not overwriteable, but whenever
7031 	 * rdar://134375521 ([VM MTE] Handle overwriting of MTE objects)
7032 	 * gets dealt with, we'll have to update the call down here to pass
7033 	 * the right flags to bcopy_phys().
7034 	 */
7035 #endif /* HAS_MTE */
7036 
7037 	pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa,
7038 	    VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len);
7039 }
7040 
7041 /*
7042  *	vm_page_copy:
7043  *
7044  *	Copy one page to another
7045  */
7046 
7047 int vm_page_copy_cs_validations = 0;
7048 int vm_page_copy_cs_tainted = 0;
7049 
7050 void
7051 vm_page_copy(
7052 	vm_page_t       src_m,
7053 	vm_page_t       dest_m)
7054 {
7055 	vm_object_t     src_m_object;
7056 	int             options = 0;
7057 
7058 	src_m_object = VM_PAGE_OBJECT(src_m);
7059 
7060 #if 0
7061 	/*
7062 	 * we don't hold the page queue lock
7063 	 * so this check isn't safe to make
7064 	 */
7065 	VM_PAGE_CHECK(src_m);
7066 	VM_PAGE_CHECK(dest_m);
7067 #endif
7068 	vm_object_lock_assert_held(src_m_object);
7069 
7070 	/*
7071 	 * Copying from/into restricted pages is a security issue,
7072 	 * as it allows the restricted pages' policies to be bypassed.
7073 	 */
7074 	if (vm_page_is_restricted(src_m)) {
7075 		panic("%s: cannot copy from a restricted page", __func__);
7076 	}
7077 
7078 	if (vm_page_is_restricted(dest_m)) {
7079 		panic("%s: cannot copy into a restricted page", __func__);
7080 	}
7081 
7082 	if (src_m_object != VM_OBJECT_NULL &&
7083 	    src_m_object->code_signed) {
7084 		/*
7085 		 * We're copying a page from a code-signed object.
7086 		 * Whoever ends up mapping the copy page might care about
7087 		 * the original page's integrity, so let's validate the
7088 		 * source page now.
7089 		 */
7090 		vm_page_copy_cs_validations++;
7091 		vm_page_validate_cs(src_m, PAGE_SIZE, 0);
7092 #if DEVELOPMENT || DEBUG
7093 		DTRACE_VM4(codesigned_copy,
7094 		    vm_object_t, src_m_object,
7095 		    vm_object_offset_t, src_m->vmp_offset,
7096 		    int, src_m->vmp_cs_validated,
7097 		    int, src_m->vmp_cs_tainted);
7098 #endif /* DEVELOPMENT || DEBUG */
7099 	}
7100 
7101 	/*
7102 	 * Propagate the cs_tainted bit to the copy page. Do not propagate
7103 	 * the cs_validated bit.
7104 	 */
7105 	dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
7106 	dest_m->vmp_cs_nx = src_m->vmp_cs_nx;
7107 	if (dest_m->vmp_cs_tainted) {
7108 		vm_page_copy_cs_tainted++;
7109 	}
7110 
7111 #if HAS_MTE
7112 	/*
7113 	 * vm_page_copy-ing from an untagged page into a tagged page
7114 	 * would happen with tag checking disabled and actually potentially be
7115 	 * an MTE violation.
7116 	 */
7117 	if (!src_m->vmp_using_mte && dest_m->vmp_using_mte) {
7118 		panic("Attempt to write to an MTE tagged page through the physical aperture");
7119 	}
7120 
7121 	if (src_m->vmp_using_mte) {
7122 		/* If we are copying from an MTE-enabled page, disable tag checking */
7123 		options |= cppvDisableTagCheck;
7124 
7125 		if (dest_m->vmp_using_mte) {
7126 			/*
7127 			 * If both source and destination are tagged, this means that we are
7128 			 * either CoWing or relocating a page. Tags need to follow along.
7129 			 */
7130 			options |= cppvCopyTags;
7131 		}
7132 	}
7133 #endif /* HAS_MTE */
7134 
7135 	dest_m->vmp_error = VMP_ERROR_GET(src_m); /* sliding src_m might have failed... */
7136 	pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m), options);
7137 }
7138 
7139 #if MACH_ASSERT
7140 static void
7141 _vm_page_print(
7142 	vm_page_t       p)
7143 {
7144 	printf("vm_page %p: \n", p);
7145 	printf("  pageq: next=%p prev=%p\n",
7146 	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
7147 	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
7148 	printf("  listq: next=%p prev=%p\n",
7149 	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
7150 	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
7151 	printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
7152 	printf("  object=%p offset=0x%llx\n", VM_PAGE_OBJECT(p), p->vmp_offset);
7153 	printf("  wire_count=%u\n", p->vmp_wire_count);
7154 	printf("  q_state=%u\n", p->vmp_q_state);
7155 
7156 	printf("  %slaundry, %sref, %sgobbled, %sprivate\n",
7157 	    (p->vmp_laundry ? "" : "!"),
7158 	    (p->vmp_reference ? "" : "!"),
7159 	    (p->vmp_gobbled ? "" : "!"),
7160 	    (vm_page_is_private(p) ? "" : "!"));
7161 	printf("  %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
7162 	    (p->vmp_busy ? "" : "!"),
7163 	    (p->vmp_wanted ? "" : "!"),
7164 	    (p->vmp_tabled ? "" : "!"),
7165 	    (vm_page_is_fictitious(p) ? "" : "!"),
7166 	    (p->vmp_pmapped ? "" : "!"),
7167 	    (p->vmp_wpmapped ? "" : "!"));
7168 	printf("  %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
7169 	    (p->vmp_free_when_done ? "" : "!"),
7170 	    (p->vmp_absent ? "" : "!"),
7171 	    (VMP_ERROR_GET(p) ? "" : "!"),
7172 	    (p->vmp_dirty ? "" : "!"),
7173 	    (p->vmp_cleaning ? "" : "!"),
7174 	    (p->vmp_precious ? "" : "!"),
7175 	    (p->vmp_clustered ? "" : "!"));
7176 	printf("  %soverwriting, %srestart, %sunusual\n",
7177 	    (p->vmp_overwriting ? "" : "!"),
7178 	    (p->vmp_restart ? "" : "!"),
7179 	    (p->vmp_unusual ? "" : "!"));
7180 	printf("  cs_validated=%d, cs_tainted=%d, cs_nx=%d, %sno_cache\n",
7181 	    p->vmp_cs_validated,
7182 	    p->vmp_cs_tainted,
7183 	    p->vmp_cs_nx,
7184 	    (p->vmp_no_cache ? "" : "!"));
7185 
7186 	printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
7187 }
7188 
7189 /*
7190  *	Check that the list of pages is ordered by
7191  *	ascending physical address and has no holes.
7192  */
7193 static int
7194 vm_page_verify_contiguous(
7195 	vm_page_t       pages,
7196 	unsigned int    npages)
7197 {
7198 	vm_page_t               m;
7199 	unsigned int            page_count;
7200 	vm_offset_t             prev_addr;
7201 
7202 	prev_addr = VM_PAGE_GET_PHYS_PAGE(pages);
7203 	page_count = 1;
7204 	for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
7205 		if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
7206 			printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
7207 			    m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m));
7208 			printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
7209 			panic("vm_page_verify_contiguous:  not contiguous!");
7210 		}
7211 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
7212 		++page_count;
7213 	}
7214 	if (page_count != npages) {
7215 		printf("pages %p actual count 0x%x but requested 0x%x\n",
7216 		    pages, page_count, npages);
7217 		panic("vm_page_verify_contiguous:  count error");
7218 	}
7219 	return 1;
7220 }
7221 
7222 
7223 /*
7224  *	Check the free lists for proper length etc.
7225  */
7226 static boolean_t vm_page_verify_this_free_list_enabled = FALSE;
7227 static unsigned int
7228 vm_page_verify_free_list(
7229 	vm_page_queue_head_t    *vm_page_queue,
7230 	unsigned int    color,
7231 	vm_page_t       look_for_page,
7232 	boolean_t       expect_page)
7233 {
7234 	unsigned int    npages;
7235 	vm_page_t       m;
7236 	vm_page_t       prev_m;
7237 	boolean_t       found_page;
7238 
7239 	if (!vm_page_verify_this_free_list_enabled) {
7240 		return 0;
7241 	}
7242 
7243 	found_page = FALSE;
7244 	npages = 0;
7245 	prev_m = (vm_page_t)((uintptr_t)vm_page_queue);
7246 
7247 	vm_page_queue_iterate(vm_page_queue, m, vmp_pageq) {
7248 		if (m == look_for_page) {
7249 			found_page = TRUE;
7250 		}
7251 		if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) {
7252 			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p",
7253 			    color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
7254 		}
7255 		if (!m->vmp_busy) {
7256 			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy",
7257 			    color, npages, m);
7258 		}
7259 		if (color != (unsigned int) -1) {
7260 			if (VM_PAGE_GET_COLOR(m) != color) {
7261 				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u",
7262 				    color, npages, m, VM_PAGE_GET_COLOR(m), color);
7263 			}
7264 			if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) {
7265 				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d",
7266 				    color, npages, m, m->vmp_q_state);
7267 			}
7268 		} else {
7269 			if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) {
7270 				panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d",
7271 				    npages, m, m->vmp_q_state);
7272 			}
7273 		}
7274 		++npages;
7275 		prev_m = m;
7276 	}
7277 	if (look_for_page != VM_PAGE_NULL) {
7278 		unsigned int other_color;
7279 
7280 		if (expect_page && !found_page) {
7281 			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
7282 			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7283 			_vm_page_print(look_for_page);
7284 			for (other_color = 0;
7285 			    other_color < vm_colors;
7286 			    other_color++) {
7287 				if (other_color == color) {
7288 					continue;
7289 				}
7290 				vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[other_color].qhead,
7291 				    other_color, look_for_page, FALSE);
7292 			}
7293 #if XNU_VM_HAS_LOPAGE
7294 			if (color == (unsigned int) -1) {
7295 				vm_page_verify_free_list(&vm_lopage_queue_free,
7296 				    (unsigned int) -1, look_for_page, FALSE);
7297 			}
7298 #endif /* XNU_VM_HAS_LOPAGE */
7299 			panic("vm_page_verify_free_list(color=%u)", color);
7300 		}
7301 		if (!expect_page && found_page) {
7302 			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
7303 			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7304 		}
7305 	}
7306 	return npages;
7307 }
7308 
7309 static boolean_t vm_page_verify_all_free_lists_enabled = FALSE;
7310 static void
7311 vm_page_verify_free_lists(void)
7312 {
7313 	unsigned int    color, npages, nlopages;
7314 	boolean_t       toggle = TRUE;
7315 
7316 	if (!vm_page_verify_all_free_lists_enabled) {
7317 		return;
7318 	}
7319 
7320 	npages = 0;
7321 	nlopages = 0;
7322 
7323 	vm_free_page_lock();
7324 
7325 	if (vm_page_verify_this_free_list_enabled == TRUE) {
7326 		/*
7327 		 * This variable has been set globally for extra checking of
7328 		 * each free list Q. Since we didn't set it, we don't own it
7329 		 * and we shouldn't toggle it.
7330 		 */
7331 		toggle = FALSE;
7332 	}
7333 
7334 	if (toggle == TRUE) {
7335 		vm_page_verify_this_free_list_enabled = TRUE;
7336 	}
7337 
7338 	for (color = 0; color < vm_colors; color++) {
7339 		npages += vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[color].qhead,
7340 		    color, VM_PAGE_NULL, FALSE);
7341 	}
7342 #if XNU_VM_HAS_LOPAGE
7343 	nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
7344 	    (unsigned int) -1,
7345 	    VM_PAGE_NULL, FALSE);
7346 #endif /* XNU_VM_HAS_LOPAGE */
7347 	if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) {
7348 		panic("vm_page_verify_free_lists:  "
7349 		    "npages %u free_count %d nlopages %u lo_free_count %u",
7350 		    npages, vm_page_free_count, nlopages, vm_lopage_free_count);
7351 	}
7352 
7353 	if (toggle == TRUE) {
7354 		vm_page_verify_this_free_list_enabled = FALSE;
7355 	}
7356 
7357 	vm_free_page_unlock();
7358 }
7359 
7360 #endif  /* MACH_ASSERT */
7361 
7362 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
7363 
7364 /*
7365  *	CONTIGUOUS PAGE ALLOCATION AND HELPER FUNCTIONS
7366  */
7367 
7368 /*
7369  * Helper function used to determine if a page can be relocated
7370  * A page is relocatable if it is in a stable non-transient state
7371  * and if the page being relocated is compatible with the reason for reloc
7372  * The page queue lock must be held, and the object lock too, if the page
7373  * is in an object.
7374  */
7375 boolean_t
7376 vm_page_is_relocatable(vm_page_t m, vm_relocate_reason_t reloc_reason)
7377 {
7378 
7379 	if (VM_PAGE_WIRED(m) || m->vmp_gobbled || m->vmp_laundry || m->vmp_wanted ||
7380 	    m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
7381 		/*
7382 		 * Page is in a transient state
7383 		 * or a state we don't want to deal with.
7384 		 */
7385 		return FALSE;
7386 	} else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
7387 	    (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
7388 #if XNU_VM_HAS_LOPAGE
7389 	    (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
7390 #endif /* XNU_VM_HAS_LOPAGE */
7391 	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
7392 		/*
7393 		 * Page needs to be on one of our queues (other than the pageout or special
7394 		 * free queues) or it needs to belong to the compressor pool (which is now
7395 		 * indicated by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out from
7396 		 * the check for VM_PAGE_NOT_ON_Q) in order for it to be stable behind the
7397 		 * locks we hold at this point...
7398 		 */
7399 		return FALSE;
7400 	} else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) &&
7401 	    (!m->vmp_tabled || m->vmp_busy)) {
7402 		/*
7403 		 * pages on the free list are always 'busy'
7404 		 * so we couldn't test for 'busy' in the check
7405 		 * for the transient states... pages that are
7406 		 * 'free' are never 'tabled', so we also couldn't
7407 		 * test for 'tabled'.  So we check here to make
7408 		 * sure that a non-free page is not busy and is
7409 		 * tabled on an object...
7410 		 */
7411 		return FALSE;
7412 	}
7413 
7414 	/*
7415 	 * Lastly, check the page against the relocation reason; the page may
7416 	 * be in a relocatable state, but not be a page we WANT to relocate for
7417 	 * the caller's use case.
7418 	 */
7419 	switch (reloc_reason) {
7420 #if HAS_MTE
7421 	case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
7422 	{
7423 		/*
7424 		 * Relocating the content of tag storage pages so the
7425 		 * fill thread can reclaim a page is perfectly valid,
7426 		 * unless the page is busy.
7427 		 */
7428 		if (m->vmp_busy) {
7429 			return FALSE;
7430 		}
7431 		break;
7432 	}
7433 	case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
7434 #endif /* HAS_MTE */
7435 	case VM_RELOCATE_REASON_CONTIGUOUS:
7436 	{
7437 #if HAS_MTE
7438 		/*
7439 		 * Tag storage pages may be needed for tag storage.  Because
7440 		 * the contiguous allocator is likely being used for wired
7441 		 * allocations, this page is not eligible to be relocated in
7442 		 * this case.
7443 		 */
7444 		if (vm_page_is_tag_storage(m)) {
7445 			return FALSE;
7446 		}
7447 #endif /* HAS_MTE */
7448 		break;
7449 	}
7450 
7451 	default:
7452 	{
7453 		panic("Invalid relocation reason %u", reloc_reason);
7454 		__builtin_unreachable();
7455 	}
7456 	}
7457 
7458 	return TRUE;
7459 }
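/*
 * A minimal usage sketch of the expected locking around this check: the page
 * queues lock must be held, plus the object lock when the page is tabled.
 *
 *	vm_page_lock_queues();
 *	vm_object_lock(object);
 *	if (vm_page_is_relocatable(m, VM_RELOCATE_REASON_CONTIGUOUS)) {
 *		// page is stable enough to steal or relocate here
 *	}
 *	vm_object_unlock(object);
 *	vm_page_unlock_queues();
 */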
7460 
7461 /*
7462  * Free up the given page by possibly relocating its contents to a new page.
7463  * If the page is on an object, the object lock must be held.
7464  *
7465  * Whether or not the page is considered relocatable is contingent on the
7466  * reason it is being relocated.
7467  *
7468  * Return the new page back to the caller if requested, as done in
7469  * vm_object_iopl_wire_full().
7470  *
7471  * The VM page queues lock must also be held.
7472  *
7473  * @returns
7474  * - KERN_SUCCESS               if the relocation was successful.
7475  * - KERN_INVALID_OBJECT        if @c m1's object is VM_OBJECT_NULL.
7476  * - KERN_FAILURE               if the relocation failed due to @c m1's state.
7477  * - KERN_RESOURCE_SHORTAGE     if no page could be allocated to relocate @c m1.
7478  */
7479 kern_return_t
7480 vm_page_relocate(
7481 	vm_page_t            m1,
7482 	int                 *compressed_pages,
7483 	vm_relocate_reason_t reloc_reason,
7484 	vm_page_t*           new_page)
7485 {
7486 	int refmod = 0;
7487 	vm_object_t object = VM_PAGE_OBJECT(m1);
7488 	kern_return_t kr;
7489 
7490 	switch (reloc_reason) {
7491 	case VM_RELOCATE_REASON_CONTIGUOUS:
7492 	{
7493 #if HAS_MTE
7494 		/*
7495 		 * The contiguous allocator should not be considering tag
7496 		 * storage pages.
7497 		 */
7498 		assert(!vm_page_is_tag_storage(m1));
7499 #endif /* HAS_MTE */
7500 		break;
7501 	}
7502 #if HAS_MTE
7503 	case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
7504 	{
7505 		/*
7506 		 * If we are trying to reclaim tag storage, we should be
7507 		 * relocating a tag storage page.
7508 		 */
7509 		assert(vm_page_is_tag_storage(m1));
7510 		if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7511 			vm_page_tag_storage_compressor_relocation_count++;
7512 		}
7513 		break;
7514 	}
7515 	case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
7516 	{
7517 		assert(vm_page_is_tag_storage(m1) && m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7518 		vm_page_tag_storage_wire_relocation_count++;
7519 		break;
7520 	}
7521 #endif /* HAS_MTE */
7522 	default:
7523 	{
7524 		panic("Unrecognized relocation reason %u\n", reloc_reason);
7525 		break;
7526 	}
7527 	}
7528 
7529 	if (object == VM_OBJECT_NULL) {
7530 		return KERN_INVALID_OBJECT;
7531 	}
7532 
7533 	vm_object_lock_assert_held(object);
7534 
7535 	if (VM_PAGE_WIRED(m1) ||
7536 	    m1->vmp_gobbled ||
7537 	    m1->vmp_laundry ||
7538 	    m1->vmp_wanted ||
7539 	    m1->vmp_cleaning ||
7540 	    m1->vmp_overwriting ||
7541 	    m1->vmp_free_when_done ||
7542 	    m1->vmp_busy ||
7543 	    m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
7544 		return KERN_FAILURE;
7545 	}
7546 
7547 	boolean_t disconnected = FALSE;
7548 	boolean_t reusable = FALSE;
7549 
7550 	/*
7551 	 * Pages from reusable objects can be reclaimed directly.
7552 	 */
7553 	if ((m1->vmp_reusable || object->all_reusable) &&
7554 	    m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q && !m1->vmp_dirty &&
7555 	    !m1->vmp_reference) {
7556 		/*
7557 		 * reusable page...
7558 		 */
7559 
7560 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
7561 		disconnected = TRUE;
7562 		if (refmod == 0) {
7563 			/*
7564 			 * ... not reused: can steal without relocating contents.
7565 			 */
7566 			reusable = TRUE;
7567 		}
7568 	}
7569 
7570 	if ((m1->vmp_pmapped && !reusable) || m1->vmp_dirty || m1->vmp_precious) {
7571 		vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;
7572 		vm_object_offset_t offset;
7573 		int copy_page_options = 0;
7574 
7575 #if HAS_MTE
7576 		if (m1->vmp_using_mte) {
7577 			grab_options |= VM_PAGE_GRAB_MTE;
7578 			copy_page_options |= cppvCopyTags;
7579 		}
7580 #endif /* HAS_MTE */
7581 		/* page is not reusable, we need to allocate a new page
7582 		 * and move its contents there.
7583 		 */
7584 		vm_page_t m2 = vm_page_grab_options(grab_options);
7585 
7586 		if (m2 == VM_PAGE_NULL) {
7587 			return KERN_RESOURCE_SHORTAGE;
7588 		}
7589 
7590 		if (!disconnected) {
7591 			if (m1->vmp_pmapped) {
7592 				refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
7593 			} else {
7594 				refmod = 0;
7595 			}
7596 		}
7597 
7598 #if HAS_MTE
7599 		assert(m1->vmp_using_mte == m2->vmp_using_mte);
7600 		if (m1->vmp_using_mte) {
7601 			assert(pmap_is_tagged_page(VM_PAGE_GET_PHYS_PAGE(m2)));
7602 			copy_page_options |= (cppvCopyTags | cppvDisableTagCheck);
7603 		}
7604 #endif /* HAS_MTE */
7605 		/* copy the page's contents */
7606 		pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2), copy_page_options);
7607 
7608 		/* copy the page's state */
7609 		assert(!VM_PAGE_WIRED(m1));
7610 		assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
7611 		assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
7612 		assert(!m1->vmp_laundry);
7613 		m2->vmp_reference = m1->vmp_reference;
7614 		assert(!m1->vmp_gobbled);
7615 		m2->vmp_no_cache = m1->vmp_no_cache;
7616 		m2->vmp_xpmapped = 0;
7617 		assert(!m1->vmp_busy);
7618 		assert(!m1->vmp_wanted);
7619 		assert(vm_page_is_canonical(m1));
7620 		m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
7621 		m2->vmp_wpmapped = m1->vmp_wpmapped;
7622 		assert(!m1->vmp_free_when_done);
7623 		m2->vmp_absent = m1->vmp_absent;
7624 		m2->vmp_error = VMP_ERROR_GET(m1);
7625 		m2->vmp_dirty = m1->vmp_dirty;
7626 		assert(!m1->vmp_cleaning);
7627 		m2->vmp_precious = m1->vmp_precious;
7628 		m2->vmp_clustered = m1->vmp_clustered;
7629 		assert(!m1->vmp_overwriting);
7630 		m2->vmp_restart = m1->vmp_restart;
7631 		m2->vmp_unusual = m1->vmp_unusual;
7632 		m2->vmp_cs_validated = m1->vmp_cs_validated;
7633 		m2->vmp_cs_tainted = m1->vmp_cs_tainted;
7634 		m2->vmp_cs_nx = m1->vmp_cs_nx;
7635 
7636 		m2->vmp_realtime = m1->vmp_realtime;
7637 		m1->vmp_realtime = false;
7638 
7639 		/*
7640 		 * If m1 had really been reusable,
7641 		 * we would have just stolen it, so
7642 		 * let's not propagate its "reusable"
7643 		 * bit and assert that m2 is not
7644 		 * marked as "reusable".
7645 		 */
7646 		// m2->vmp_reusable	= m1->vmp_reusable;
7647 		assert(!m2->vmp_reusable);
7648 
7649 		// assert(!m1->vmp_lopage);
7650 
7651 		if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7652 			m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
7653 			/*
7654 			 * We just grabbed m2 up above and so it isn't
7655 			 * going to be on any special Q as yet and so
7656 			 * we don't need to 'remove' it from the special
7657 			 * queues. Just resetting the state should be enough.
7658 			 */
7659 			m2->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
7660 		}
7661 
7662 		/*
7663 		 * page may need to be flushed if
7664 		 * it is marshalled into a UPL
7665 		 * that is going to be used by a device
7666 		 * that doesn't support coherency
7667 		 */
7668 		m2->vmp_written_by_kernel = TRUE;
7669 
7670 		/*
7671 		 * make sure we clear the ref/mod state
7672 		 * from the pmap layer... else we risk
7673 		 * inheriting state from the last time
7674 		 * this page was used...
7675 		 */
7676 		pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
7677 		    VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7678 
7679 		if (refmod & VM_MEM_REFERENCED) {
7680 			m2->vmp_reference = TRUE;
7681 		}
7682 		if (refmod & VM_MEM_MODIFIED) {
7683 			SET_PAGE_DIRTY(m2, TRUE);
7684 		}
7685 		offset = m1->vmp_offset;
7686 
7687 		/*
7688 		 * completely cleans up the state
7689 		 * of the page so that it is ready
7690 		 * to be put onto the free list, or
7691 		 * for this purpose it looks like it
7692 		 * just came off of the free list
7693 		 */
7694 		vm_page_free_prepare(m1);
7695 
7696 		/*
7697 		 * now put the substitute page on the object
7698 		 */
7699 		vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
7700 		    TRUE, FALSE, FALSE, NULL);
7701 
7702 		/*
7703 		 * Return the relocated vm_page_t if the caller wants to know.
7704 		 */
7705 		if (new_page) {
7706 			*new_page = m2;
7707 		}
7708 
7709 		if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7710 			m2->vmp_pmapped = TRUE;
7711 			m2->vmp_wpmapped = TRUE;
7712 
7713 			kr = pmap_enter_check(kernel_pmap, (vm_map_offset_t)m2->vmp_offset, m2,
7714 			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, TRUE);
7715 
7716 			assert(kr == KERN_SUCCESS);
7717 
7718 			if (compressed_pages) {
7719 				++*compressed_pages;
7720 			}
7721 		} else {
7722 			/* relocated page was not used by the compressor
7723 			 * put it on either the active or inactive lists */
7724 			if (m2->vmp_reference) {
7725 				vm_page_activate(m2);
7726 			} else {
7727 				vm_page_deactivate(m2);
7728 			}
7729 		}
7730 
7731 		/* unset the busy flag (pages on the free queue are busy) and notify if wanted */
7732 		vm_page_wakeup_done(object, m2);
7733 	} else {
7734 		assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7735 
7736 		/*
7737 		 * completely cleans up the state
7738 		 * of the page so that it is ready
7739 		 * to be put onto the free list, or
7740 		 * for this purpose it looks like it
7741 		 * just came off of the free list
7742 		 */
7743 		vm_page_free_prepare(m1);
7744 
7745 		if (new_page) {
7746 			vm_page_t m2;
7747 			vm_object_offset_t offset;
7748 			vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;
7749 
7750 			/* The caller still wanted a page, so let's give them a new one. */
7751 			offset = m1->vmp_offset;
7752 #if HAS_MTE
7753 			if (m1->vmp_using_mte) {
7754 				grab_options |= VM_PAGE_GRAB_MTE;
7755 			}
7756 #endif /* HAS_MTE */
7757 			m2 = vm_page_grab_options(grab_options);
7758 
7759 			if (m2 == VM_PAGE_NULL) {
7760 				return KERN_RESOURCE_SHORTAGE;
7761 			}
7762 
7763 			/*
7764 			 * make sure we clear the ref/mod state
7765 			 * from the pmap layer... else we risk
7766 			 * inheriting state from the last time
7767 			 * this page was used...
7768 			 */
7769 			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
7770 			    VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7771 
7772 			offset = m1->vmp_offset;
7773 
7774 			/*
7775 			 * now put the substitute page on the object
7776 			 */
7777 			vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
7778 			    TRUE, FALSE, FALSE, NULL);
7779 
7780 			*new_page = m2;
7781 		}
7782 	}
7783 
7784 	/* we're done here */
7785 	return KERN_SUCCESS;
7786 }
7787 
7788 /*
7789  *	CONTIGUOUS PAGE ALLOCATION
7790  *
7791  *	Find a region large enough to contain at least n pages
7792  *	of contiguous physical memory.
7793  *
7794  *	This is done by traversing the vm_page_t array in a linear fashion
7795  *	we assume that the vm_page_t array has the available physical pages in an
7796  *	ordered, ascending list... this is currently true of all our implementations
7797  *      and must remain so... there can be 'holes' in the array...  we also can
7798  *	no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
7799  *      which used to happen via 'vm_page_convert'... that function was no longer
7800  *      being called and was removed...
7801  *
7802  *	The basic flow consists of stabilizing some of the interesting state of
7803  *	a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
7804  *	sweep at the beginning of the array looking for pages that meet our criteria
7805  *	for a 'stealable' page... currently we are pretty conservative... if the page
7806  *	meets this criteria and is physically contiguous to the previous page in the 'run'
7807  *      we keep developing it.  If we hit a page that doesn't fit, we reset our state
7808  *	and start to develop a new run... if at this point we've already considered
7809  *      at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
7810  *	and mutex_pause (which will yield the processor), to keep the latency low w/r
7811  *	to other threads trying to acquire free pages (or move pages from q to q),
7812  *	and then continue from the spot we left off... we only make 1 pass through the
7813  *	array.  Once we have a 'run' that is long enough, we'll go into the loop
7814  *      which steals the pages from the queues they're currently on... pages on the free
7815  *	queue can be stolen directly... pages that are on any of the other queues
7816  *	must be removed from the object they are tabled on... this requires taking the
7817  *      object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
7818  *	or if the state of the page behind the vm_object lock is no longer viable, we'll
7819  *	dump the pages we've currently stolen back to the free list, and pick up our
7820  *	scan from the point where we aborted the 'current' run.
7821  *
7822  *
7823  *	Requirements:
7824  *		- neither vm_page_queue nor vm_free_list lock can be held on entry
7825  *
7826  *	Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
7827  * Algorithm: see the description above; the code below follows it step by step.
7828  * Algorithm:
7829  */
7830 
7831 #define MAX_CONSIDERED_BEFORE_YIELD     1000
7832 
7833 
7834 #define RESET_STATE_OF_RUN()    \
7835 	MACRO_BEGIN             \
7836 	prevcontaddr = -2;      \
7837 	start_pnum = -1;        \
7838 	free_considered = 0;    \
7839 	substitute_needed = 0;  \
7840 	npages = 0;             \
7841 	MACRO_END
7842 
7843 /*
7844  * Can we steal in-use (i.e. not free) pages when searching for
7845  * physically-contiguous pages ?
7846  */
7847 #define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL 1
7848 
7849 static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
7850 #if DEBUG
7851 int vm_page_find_contig_debug = 0;
7852 #endif
7853 
7854 static vm_page_t
7855 vm_page_find_contiguous(
7856 	unsigned int    contig_pages,
7857 	ppnum_t         max_pnum,
7858 	ppnum_t         pnum_mask,
7859 	boolean_t       wire,
7860 	int             flags)
7861 {
7862 	vm_page_list_t  list = { };
7863 	ppnum_t         prevcontaddr = 0;
7864 	ppnum_t         start_pnum = 0;
7865 	unsigned int    npages = 0, considered = 0, scanned = 0;
7866 	unsigned int    page_idx = 0, start_idx = 0, last_idx = 0, orig_last_idx = 0;
7867 	unsigned int    idx_last_contig_page_found = 0;
7868 	int             free_considered = 0, free_available = 0;
7869 	int             substitute_needed = 0;
7870 	int             zone_gc_called = 0;
7871 	boolean_t       wrapped;
7872 	kern_return_t   kr;
7873 #if DEBUG
7874 	clock_sec_t     tv_start_sec = 0, tv_end_sec = 0;
7875 	clock_usec_t    tv_start_usec = 0, tv_end_usec = 0;
7876 #endif
7877 
7878 	int             yielded = 0;
7879 	int             dumped_run = 0;
7880 	int             stolen_pages = 0;
7881 	int             compressed_pages = 0;
7882 
7883 
7884 	if (contig_pages == 0) {
7885 		return VM_PAGE_NULL;
7886 	}
7887 
7888 full_scan_again:
7889 
7890 #if MACH_ASSERT
7891 	vm_page_verify_free_lists();
7892 #endif
7893 #if DEBUG
7894 	clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
7895 #endif
7896 	PAGE_REPLACEMENT_ALLOWED(TRUE);
7897 
7898 #if XNU_VM_HAS_DELAYED_PAGES
7899 	/*
7900 	 * If there are still delayed pages, try to free up some that match.
7901 	 */
7902 	if (__improbable(vm_delayed_count != 0 && contig_pages != 0)) {
7903 		vm_free_delayed_pages_contig(contig_pages, max_pnum, pnum_mask);
7904 	}
7905 #endif /* XNU_VM_HAS_DELAYED_PAGES */
7906 
7907 	vm_page_lock_queues();
7908 	vm_free_page_lock();
7909 
7910 	RESET_STATE_OF_RUN();
7911 
7912 	scanned = 0;
7913 	considered = 0;
7914 	free_available = vm_page_free_count - vm_page_free_reserved;
7915 
7916 	wrapped = FALSE;
7917 
7918 	if (flags & KMA_LOMEM) {
7919 		idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
7920 	} else {
7921 		idx_last_contig_page_found =  vm_page_find_contiguous_last_idx;
7922 	}
7923 
7924 	orig_last_idx = idx_last_contig_page_found;
7925 	last_idx = orig_last_idx;
7926 
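	/*
	 * Single linear pass over the vm_page_t array: resume from wherever the
	 * previous search left off, and grow the current run while each page is
	 * physically contiguous with the previous one and in a stealable state;
	 * any misfit resets the run (see RESET_STATE_OF_RUN).
	 */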
7927 	for (page_idx = last_idx, start_idx = last_idx;
7928 	    npages < contig_pages && page_idx < vm_pages_count;
7929 	    page_idx++) {
7930 		vm_page_t m = NULL;
7931 
7932 retry:
7933 		if (wrapped &&
7934 		    npages == 0 &&
7935 		    page_idx >= orig_last_idx) {
7936 			/*
7937 			 * We're back where we started and we haven't
7938 			 * found any suitable contiguous range.  Let's
7939 			 * give up.
7940 			 */
7941 			break;
7942 		}
7943 		scanned++;
7944 		m = vm_page_get(page_idx);
7945 
7946 		assert(vm_page_is_canonical(m));
7947 
7948 		if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
7949 			/* no more low pages... */
7950 			break;
7951 		}
7952 		if (!npages && ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) {
7953 			/*
7954 			 * not aligned
7955 			 */
7956 			RESET_STATE_OF_RUN();
7957 		} else if (!vm_page_is_relocatable(m,
7958 		    VM_RELOCATE_REASON_CONTIGUOUS)) {
7959 			/* page is not relocatable */
7961 			RESET_STATE_OF_RUN();
7962 		} else {
7963 			if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) {
7964 				if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) {
7965 					RESET_STATE_OF_RUN();
7966 					goto did_consider;
7967 				} else {
7968 					npages = 1;
7969 					start_idx = page_idx;
7970 					start_pnum = VM_PAGE_GET_PHYS_PAGE(m);
7971 				}
7972 			} else {
7973 				npages++;
7974 			}
7975 			prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
7976 
7977 			VM_PAGE_CHECK(m);
7978 			if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
7979 				free_considered++;
7980 			} else {
7981 				/*
7982 				 * This page is not free.
7983 				 * If we can't steal used pages,
7984 				 * we have to give up this run
7985 				 * and keep looking.
7986 				 * Otherwise, we might need to
7987 				 * move the contents of this page
7988 				 * into a substitute page.
7989 				 */
7990 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
7991 				if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
7992 					substitute_needed++;
7993 				}
7994 #else
7995 				RESET_STATE_OF_RUN();
7996 #endif
7997 			}
7998 
7999 			if ((free_considered + substitute_needed) > free_available) {
8000 				/*
8001 				 * if we let this run continue
8002 				 * we will end up dropping the vm_page_free_count
8003 				 * below the reserve limit... we need to abort
8004 				 * this run, but we can at least re-consider this
8005 				 * page... thus the jump back to 'retry'
8006 				 */
8007 				RESET_STATE_OF_RUN();
8008 
8009 				if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
8010 					considered++;
8011 					goto retry;
8012 				}
8013 				/*
8014 				 * free_available == 0
8015 				 * so can't consider any free pages... if
8016 				 * we went to retry in this case, we'd
8017 				 * get stuck looking at the same page
8018 				 * w/o making any forward progress
8019 				 * we also want to take this path if we've already
8020 				 * reached our limit that controls the lock latency
8021 				 */
8022 			}
8023 		}
8024 did_consider:
8025 		if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
8026 			PAGE_REPLACEMENT_ALLOWED(FALSE);
8027 
8028 			vm_free_page_unlock();
8029 			vm_page_unlock_queues();
8030 
8031 			mutex_pause(0);
8032 
8033 			PAGE_REPLACEMENT_ALLOWED(TRUE);
8034 
8035 			vm_page_lock_queues();
8036 			vm_free_page_lock();
8037 
8038 			RESET_STATE_OF_RUN();
8039 			/*
8040 			 * reset our free page limit since we
8041 			 * dropped the lock protecting the vm_page_free_queue
8042 			 */
8043 			free_available = vm_page_free_count - vm_page_free_reserved;
8044 			considered = 0;
8045 
8046 			yielded++;
8047 
8048 			goto retry;
8049 		}
8050 		considered++;
8051 	} /* main for-loop end */
8052 
8053 	if (npages != contig_pages) {
8054 		if (!wrapped) {
8055 			/*
8056 			 * We didn't find a contiguous range but we didn't
8057 			 * start from the very first page.
8058 			 * Start again from the very first page.
8059 			 */
8060 			RESET_STATE_OF_RUN();
8061 			if (flags & KMA_LOMEM) {
8062 				idx_last_contig_page_found  = vm_page_lomem_find_contiguous_last_idx = 0;
8063 			} else {
8064 				idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
8065 			}
8066 			last_idx = 0;
8067 			page_idx = last_idx;
8068 			wrapped = TRUE;
8069 			goto retry;
8070 		}
8071 		vm_free_page_unlock();
8072 	} else {
8073 		vm_page_t m1;
8074 		unsigned int cur_idx;
8075 		unsigned int tmp_start_idx;
8076 		vm_object_t locked_object = VM_OBJECT_NULL;
8077 		bool abort_run = false;
8078 
8079 		assert(page_idx - start_idx == contig_pages);
8080 
8081 		tmp_start_idx = start_idx;
8082 
8083 		/*
8084 		 * first pass through to pull the free pages
8085 		 * off of the free queue so that in case we
8086 		 * need substitute pages, we won't grab any
8087 		 * of the free pages in the run... we'll clear
8088 		 * the 'free' bit in the 2nd pass, and even in
8089 		 * an abort_run case, we'll collect all of the
8090 		 * free pages in this run and return them to the free list
8091 		 */
8092 		while (start_idx < page_idx) {
8093 			vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
8094 
8095 			m1 = vm_page_get(start_idx++);
8096 
8097 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
8098 			assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
8099 #endif
8100 			if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
8101 				vm_page_free_queue_steal(options, m1);
8102 			}
8103 		}
8104 		if (flags & KMA_LOMEM) {
8105 			vm_page_lomem_find_contiguous_last_idx = page_idx;
8106 		} else {
8107 			vm_page_find_contiguous_last_idx = page_idx;
8108 		}
8109 
8110 		/*
8111 		 * we can drop the free queue lock at this point since
8112 		 * we've pulled any 'free' candidates off of the list
8113 		 * we need it dropped so that we can do a vm_page_grab
8114 		 * when substituing for pmapped/dirty pages
8115 		 * when substituting for pmapped/dirty pages
8116 		vm_free_page_unlock();
8117 
8118 		start_idx = tmp_start_idx;
8119 		cur_idx = page_idx - 1;
8120 
8121 		while (start_idx++ < page_idx) {
8122 			/*
8123 			 * must go through the list from back to front
8124 			 * so that the page list is created in the
8125 			 * correct order - low -> high phys addresses
8126 			 */
8127 			m1 = vm_page_get(cur_idx--);
8128 
8129 			if (m1->vmp_object == 0) {
8130 				/*
8131 				 * page has already been removed from
8132 				 * the free list in the 1st pass
8133 				 */
8134 				assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8135 				assert(m1->vmp_offset == (vm_object_offset_t) -1);
8136 				assert(m1->vmp_busy);
8137 				assert(!m1->vmp_wanted);
8138 				assert(!m1->vmp_laundry);
8139 			} else {
8140 				/*
8141 				 * try to relocate/steal the page
8142 				 */
8143 				if (abort_run) {
8144 					continue;
8145 				}
8146 
8147 				assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
8148 
8149 				vm_object_t object = VM_PAGE_OBJECT(m1);
8150 
8151 				if (object != locked_object) {
8152 					if (locked_object) {
8153 						vm_object_unlock(locked_object);
8154 						locked_object = VM_OBJECT_NULL;
8155 					}
8156 					if (vm_object_lock_try(object)) {
8157 						locked_object = object;
8158 					} else {
8159 						/* object must be locked to relocate its pages */
8160 						tmp_start_idx = cur_idx;
8161 						abort_run = true;
8162 						continue;
8163 					}
8164 				}
8165 
8166 				kr = vm_page_relocate(m1, &compressed_pages, VM_RELOCATE_REASON_CONTIGUOUS, NULL);
8167 				if (kr != KERN_SUCCESS) {
8168 					if (locked_object) {
8169 						vm_object_unlock(locked_object);
8170 						locked_object = VM_OBJECT_NULL;
8171 					}
8172 					tmp_start_idx = cur_idx;
8173 					abort_run = true;
8174 					continue;
8175 				}
8176 
8177 				stolen_pages++;
8178 			}
8179 
8180 			/* m1 is ours at this point ... */
8181 
8182 			if (m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
8183 				/*
8184 				 * The Q state is preserved on m1 because vm_page_queues_remove doesn't
8185 				 * change it for pages marked as used-by-compressor.
8186 				 */
8187 				vm_page_assign_special_state(m1, VM_PAGE_SPECIAL_Q_BG);
8188 			}
8189 			VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
8190 			vm_page_list_push(&list, m1);
8191 		}
8192 
8193 		if (locked_object) {
8194 			vm_object_unlock(locked_object);
8195 			locked_object = VM_OBJECT_NULL;
8196 		}
8197 
8198 		if (abort_run) {
8199 			/*
8200 			 * want the index of the last
8201 			 * page in this run that was
8202 			 * successfully 'stolen', so back
8203 			 * it up 1 for the auto-decrement on use
8204 			 * and 1 more to bump back over this page
8205 			 */
8206 			page_idx = tmp_start_idx + 2;
8207 			if (page_idx >= vm_pages_count) {
8208 				if (wrapped) {
8209 					if (list.vmpl_count) {
8210 						vm_page_unlock_queues();
8211 						vm_page_free_list(list.vmpl_head, FALSE);
8212 						vm_page_lock_queues();
8213 						list = (vm_page_list_t){ };
8214 					}
8215 					dumped_run++;
8216 					goto done_scanning;
8217 				}
8218 				page_idx = last_idx = 0;
8219 				wrapped = TRUE;
8220 			}
8221 			abort_run = false;
8222 
8223 			/*
8224 			 * This run was aborted: reset the run state, note where
8225 			 * to resume the scan, and return any pages we had already
8226 			 * pulled off the free queues back to the free list.
8227 			 */
8228 			RESET_STATE_OF_RUN();
8229 
8230 			if (flags & KMA_LOMEM) {
8231 				idx_last_contig_page_found  = vm_page_lomem_find_contiguous_last_idx = page_idx;
8232 			} else {
8233 				idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
8234 			}
8235 
8236 			last_idx = page_idx;
8237 
8238 			if (list.vmpl_count) {
8239 				vm_page_unlock_queues();
8240 				vm_page_free_list(list.vmpl_head, FALSE);
8241 				vm_page_lock_queues();
8242 				list = (vm_page_list_t){ };
8243 			}
8244 			dumped_run++;
8245 
8246 			vm_free_page_lock();
8247 			/*
8248 			 * reset our free page limit since we
8249 			 * dropped the lock protecting the vm_page_free_queue
8250 			 */
8251 			free_available = vm_page_free_count - vm_page_free_reserved;
8252 			goto retry;
8253 		}
8254 #if HAS_MTE
8255 		else if (list.vmpl_has_tagged) {
8256 			const unified_page_list_t pmap_batch_list = {
8257 				.page_slist = list.vmpl_head,
8258 				.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
8259 			};
8260 
8261 			/*
8262 			 * We successfully found a contiguous range we could
8263 			 * steal all the pages from.  As a last step, make
8264 			 * certain all pages are regular pages, or convert
8265 			 * any non-regular pages to regular pages.
8266 			 */
8267 			vm_page_unlock_queues();
8268 
8269 			/* Make any tagged pages we stole non-tagged. */
8270 			pmap_unmake_tagged_pages(&pmap_batch_list);
8271 
8272 			vm_free_page_lock();
8273 
8274 			/* Clear the vm_page and mteinfo tagged state for the pages we just untagged. */
8275 			vm_page_list_foreach(m1, list) {
8276 				if (m1->vmp_using_mte) {
8277 					ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(m1);
8278 
8279 					m1->vmp_using_mte = false;
8280 					mteinfo_covered_page_clear_tagged(pnum);
8281 				}
8282 			}
8283 			list.vmpl_has_tagged = false;
8284 			list.vmpl_has_untagged = true;
8285 
8286 			vm_free_page_unlock();
8287 			vm_page_lock_queues();
8288 		}
8289 #endif /* HAS_MTE */
8290 
8291 		vm_page_list_foreach(m1, list) {
8292 			assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8293 			assert(m1->vmp_wire_count == 0);
8294 
8295 			if (wire == TRUE) {
8296 				m1->vmp_wire_count++;
8297 				m1->vmp_q_state = VM_PAGE_IS_WIRED;
8298 
8299 #if HAS_MTE
8300 				if (m1->vmp_wire_count == 1) {
8301 					mteinfo_increment_wire_count(m1);
8302 				}
8303 #endif /* HAS_MTE */
8304 			} else {
8305 				m1->vmp_gobbled = TRUE;
8306 			}
8307 		}
8308 		if (wire == FALSE) {
8309 			vm_page_gobble_count += npages;
8310 		}
8311 
8312 		/*
8313 		 * gobbled pages are also counted as wired pages
8314 		 */
8315 		vm_page_wire_count += npages;
8316 
8317 		assert(vm_page_verify_contiguous(list.vmpl_head, npages));
8318 	}
8319 done_scanning:
8320 	PAGE_REPLACEMENT_ALLOWED(FALSE);
8321 
8322 	vm_page_unlock_queues();
8323 
8324 #if DEBUG
8325 	clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
8326 
8327 	tv_end_sec -= tv_start_sec;
8328 	if (tv_end_usec < tv_start_usec) {
8329 		tv_end_sec--;
8330 		tv_end_usec += 1000000;
8331 	}
8332 	tv_end_usec -= tv_start_usec;
8333 	if (tv_end_usec >= 1000000) {
8334 		tv_end_sec++;
8335 		tv_end_usec -= 1000000;
8336 	}
8337 	if (vm_page_find_contig_debug) {
8338 		printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds...  started at %d...  scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages... stole %d compressed pages\n",
8339 		    __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8340 		    (long)tv_end_sec, tv_end_usec, orig_last_idx,
8341 		        scanned, yielded, dumped_run, stolen_pages, compressed_pages);
8342 	}
8343 
8344 #endif
8345 #if MACH_ASSERT
8346 	vm_page_verify_free_lists();
8347 #endif
8348 	if (list.vmpl_count == 0 && zone_gc_called < 2) {
8349 		printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
8350 		    __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8351 		        scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
8352 
8353 		if (consider_buffer_cache_collect != NULL) {
8354 			(void)(*consider_buffer_cache_collect)(1);
8355 		}
8356 
8357 		zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
8358 
8359 		zone_gc_called++;
8360 
8361 		printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
8362 		goto full_scan_again;
8363 	}
8364 
8365 	return list.vmpl_head;
8366 }
8367 
8368 /*
8369  *	Allocate a list of contiguous, wired pages.
8370  */
8371 kern_return_t
8372 cpm_allocate(
8373 	vm_size_t       size,
8374 	vm_page_t       *list,
8375 	ppnum_t         max_pnum,
8376 	ppnum_t         pnum_mask,
8377 	boolean_t       wire,
8378 	int             flags)
8379 {
8380 	vm_page_t               pages;
8381 	unsigned int            npages;
8382 
8383 	if (size % PAGE_SIZE != 0) {
8384 		return KERN_INVALID_ARGUMENT;
8385 	}
8386 
8387 	npages = (unsigned int) (size / PAGE_SIZE);
8388 	if (npages != size / PAGE_SIZE) {
8389 		/* 32-bit overflow */
8390 		return KERN_INVALID_ARGUMENT;
8391 	}
8392 
8393 	/*
8394 	 *	Obtain a pointer to a subset of the free
8395 	 *	list large enough to satisfy the request;
8396 	 *	the region will be physically contiguous.
8397 	 */
8398 	pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
8399 
8400 	if (pages == VM_PAGE_NULL) {
8401 		return KERN_NO_SPACE;
8402 	}
8403 	/*
8404 	 * determine need for wakeups
8405 	 */
8406 	if (vm_page_free_count < vm_page_free_min) {
8407 		vm_free_page_lock();
8408 		if (vm_pageout_running == FALSE) {
8409 			vm_free_page_unlock();
8410 			thread_wakeup((event_t) &vm_page_free_wanted);
8411 		} else {
8412 			vm_free_page_unlock();
8413 		}
8414 	}
8415 
8416 	VM_CHECK_MEMORYSTATUS;
8417 
8418 	/*
8419 	 *	The CPM pages should now be available and
8420 	 *	ordered by ascending physical address.
8421 	 */
8422 	assert(vm_page_verify_contiguous(pages, npages));
8423 
8424 	if (flags & KMA_ZERO) {
8425 		for (vm_page_t m = pages; m; m = NEXT_PAGE(m)) {
8426 			vm_page_zero_fill(
8427 				m
8428 #if HAS_MTE
8429 				, false
8430 #endif /* HAS_MTE */
8431 				);
8432 		}
8433 	}
8434 
8435 	*list = pages;
8436 	return KERN_SUCCESS;
8437 }
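/*
 * A minimal usage sketch (values are illustrative): request 1MB of wired,
 * physically contiguous memory with no alignment constraint (pnum_mask == 0)
 * and no physical-address ceiling (max_pnum == 0):
 *
 *	vm_page_t pages;
 *
 *	if (cpm_allocate(1024 * 1024, &pages, 0, 0, TRUE, 0) == KERN_SUCCESS) {
 *		// pages is a NEXT_PAGE-linked list in ascending physical order
 *	}
 */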
8438 
8439 
8440 unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
8441 
8442 /*
8443  * when working on a 'run' of pages, it is necessary to hold
8444  * the vm_page_queue_lock (a hot global lock) for certain operations
8445  * on the page... however, the majority of the work can be done
8446  * while merely holding the object lock... in fact there are certain
8447  * collections of pages that don't require any work brokered by the
8448  * vm_page_queue_lock... to mitigate the time spent behind the global
8449  * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
8450  * while doing all of the work that doesn't require the vm_page_queue_lock...
8451  * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
8452  * necessary work for each page... we will grab the busy bit on the page
8453  * if it's not already held so that vm_page_do_delayed_work can drop the object lock
8454  * if it can't immediately take the vm_page_queue_lock in order to compete
8455  * for the locks in the same order that vm_pageout_scan takes them.
8456  * the operation names are modeled after the names of the routines that
8457  * need to be called in order to make the changes very obvious in the
8458  * original loop
8459  *
8460  * On certain configurations, this function may return failure if any of
8461  * the pages in the run has a mapping state that doesn't allow the specified
8462  * operation.  In that case, it will still fully process the run of pages
8463  * in order to avoid requiring the caller to partially undo the work done
8464  * here.
8465  */
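/*
 * A minimal sketch of the intended caller pattern (batch sizes and DW_* bits
 * vary by caller): work is accumulated per page while only the object lock is
 * held, then flushed through this function.
 *
 *	struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
 *	struct vm_page_delayed_work *dwp = &dw_array[0];
 *	int dw_count = 0;
 *
 *	// ... for each page m of interest, while holding the object lock:
 *	dwp->dw_m = m;
 *	dwp->dw_mask = DW_vm_page_activate;
 *	dwp++;
 *	dw_count++;
 *	if (dw_count >= vm_max_delayed_work_limit) {
 *		vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
 *		dwp = &dw_array[0];
 *		dw_count = 0;
 *	}
 *
 *	// ... and after the loop:
 *	if (dw_count) {
 *		vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
 *	}
 */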
8466 
8467 kern_return_t
8468 vm_page_do_delayed_work(
8469 	vm_object_t     object,
8470 	vm_tag_t        tag,
8471 	struct vm_page_delayed_work *dwp,
8472 	int             dw_count)
8473 {
8474 	kern_return_t   kr = KERN_SUCCESS;
8475 	int             j;
8476 	vm_page_t       m;
8477 	vm_page_t       local_free_q = VM_PAGE_NULL;
8478 
8479 	/*
8480 	 * pageout_scan takes the vm_page_lock_queues first
8481 	 * then tries for the object lock... to avoid what
8482 	 * is effectively a lock inversion, we'll go to the
8483 	 * trouble of taking them in that same order... otherwise
8484 	 * if this object contains the majority of the pages resident
8485 	 * in the UBC (or a small set of large objects actively being
8486 	 * worked on contain the majority of the pages), we could
8487 	 * cause the pageout_scan thread to 'starve' in its attempt
8488 	 * to find pages to move to the free queue, since it has to
8489 	 * successfully acquire the object lock of any candidate page
8490 	 * before it can steal/clean it.
8491 	 */
8492 	if (!vm_page_trylock_queues()) {
8493 		vm_object_unlock(object);
8494 
8495 		/*
8496 		 * "Turnstile enabled vm_pageout_scan" can be runnable
8497 		 * for a very long time without getting on a core.
8498 		 * If this is a higher priority thread it could be
8499 		 * waiting here for a very long time respecting the fact
8500 		 * that pageout_scan would like its object after VPS does
8501 		 * a mutex_pause(0).
8502 		 * So we cap the number of yields in the vm_object_lock_avoid()
8503 		 * case to a single mutex_pause(0) which will give vm_pageout_scan
8504 		 * 10us to run and grab the object if needed.
8505 		 */
8506 		vm_page_lock_queues();
8507 
8508 		for (j = 0;; j++) {
8509 			if ((!vm_object_lock_avoid(object) ||
8510 			    (vps_dynamic_priority_enabled && (j > 0))) &&
8511 			    _vm_object_lock_try(object)) {
8512 				break;
8513 			}
8514 			vm_page_unlock_queues();
8515 			mutex_pause(j);
8516 			vm_page_lock_queues();
8517 		}
8518 	}
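	/*
	 * Both the page queues lock and the object lock are held from here on:
	 * apply the operation(s) recorded in each delayed-work entry to its page.
	 */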
8519 	for (j = 0; j < dw_count; j++, dwp++) {
8520 		m = dwp->dw_m;
8521 
8522 		if (dwp->dw_mask & DW_vm_pageout_throttle_up) {
8523 			vm_pageout_throttle_up(m);
8524 		}
8525 #if CONFIG_PHANTOM_CACHE
8526 		if (dwp->dw_mask & DW_vm_phantom_cache_update) {
8527 			vm_phantom_cache_update(m);
8528 		}
8529 #endif
8530 		if (dwp->dw_mask & DW_vm_page_wire) {
8531 			vm_page_wire(m, tag, FALSE);
8532 			if (dwp->dw_mask & DW_vm_page_iopl_wire) {
8533 #if CONFIG_SPTM
8534 				/*
8535 				 * The SPTM's security model prevents us from allowing writable I/O
8536 				 * mappings of executable pages.  We need to check that here,
8537 				 * in the same place that we set vmp_iopl_wired, because this
8538 				 * function may have transiently dropped the VM object lock
8539 				 * before reaching this point, which means that frontloading
8540 				 * this check in the caller may not work in all cases.
8541 				 */
8542 				if ((dwp->dw_mask & DW_vm_page_iopl_wire_write) && PMAP_PAGE_IS_USER_EXECUTABLE(m)) {
8543 					if (kr == KERN_SUCCESS) {
8544 						kr = KERN_PROTECTION_FAILURE;
8545 						vm_map_guard_exception(VM_PAGE_GET_PHYS_PAGE(m), kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE);
8546 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
8547 						    KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE),
8548 						    (uintptr_t)(VM_PAGE_GET_PHYS_PAGE(m)));
8549 					}
8550 				} else {
8551 					m->vmp_iopl_wired = true;
8552 				}
8553 #else
8554 				m->vmp_iopl_wired = true;
8555 #endif /* CONFIG_SPTM */
8556 			}
8557 		} else if (dwp->dw_mask & DW_vm_page_unwire) {
8558 			boolean_t       queueit;
8559 
8560 			queueit = (dwp->dw_mask & (DW_vm_page_free | DW_vm_page_deactivate_internal)) ? FALSE : TRUE;
8561 
8562 			vm_page_unwire(m, queueit);
8563 		}
8564 		if (dwp->dw_mask & DW_vm_page_free) {
8565 			vm_page_free_prepare_queues(m);
8566 
8567 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
8568 			/*
8569 			 * Add this page to our list of reclaimed pages,
8570 			 * to be freed later.
8571 			 */
8572 			m->vmp_snext = local_free_q;
8573 			local_free_q = m;
8574 		} else {
8575 			if (dwp->dw_mask & DW_vm_page_deactivate_internal) {
8576 				vm_page_deactivate_internal(m, FALSE);
8577 			} else if (dwp->dw_mask & DW_vm_page_activate) {
8578 				if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
8579 					vm_page_activate(m);
8580 				}
8581 			} else if (dwp->dw_mask & DW_vm_page_speculate) {
8582 				vm_page_speculate(m, TRUE);
8583 			} else if (dwp->dw_mask & DW_enqueue_cleaned) {
8584 				/*
8585 				 * if we didn't hold the object lock and did this,
8586 				 * we might disconnect the page, then someone might
8587 				 * soft fault it back in, then we would put it on the
8588 				 * cleaned queue, and so we would have a referenced (maybe even dirty)
8589 				 * page on that queue, which we don't want
8590 				 */
8591 				int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8592 
8593 				if ((refmod_state & VM_MEM_REFERENCED)) {
8594 					/*
8595 					 * this page has been touched since it got cleaned; let's activate it
8596 					 * if it hasn't already been
8597 					 */
8598 					VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
8599 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
8600 
8601 					if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
8602 						vm_page_activate(m);
8603 					}
8604 				} else {
8605 					m->vmp_reference = FALSE;
8606 					vm_page_enqueue_cleaned(m);
8607 				}
8608 			} else if (dwp->dw_mask & DW_vm_page_lru) {
8609 				vm_page_lru(m);
8610 			} else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
8611 				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
8612 					vm_page_queues_remove(m, TRUE);
8613 				}
8614 			}
8615 			if (dwp->dw_mask & DW_set_reference) {
8616 				m->vmp_reference = TRUE;
8617 			} else if (dwp->dw_mask & DW_clear_reference) {
8618 				m->vmp_reference = FALSE;
8619 			}
8620 
8621 			if (dwp->dw_mask & DW_move_page) {
8622 				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
8623 					vm_page_queues_remove(m, FALSE);
8624 
8625 					assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
8626 
8627 					vm_page_enqueue_inactive(m, FALSE);
8628 				}
8629 			}
8630 			if (dwp->dw_mask & DW_clear_busy) {
8631 				m->vmp_busy = FALSE;
8632 			}
8633 
8634 			if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8635 				vm_page_wakeup(object, m);
8636 			}
8637 #if HAS_MTE
8638 			if (dwp->dw_mask & DW_vm_page_wakeup_tag_storage) {
8639 				assert(m->vmp_ts_wanted);
8640 				mteinfo_tag_storage_wakeup(m, false);
8641 			}
8642 #endif /* HAS_MTE */
8643 		}
8644 	}
8645 	vm_page_unlock_queues();
8646 
8647 	if (local_free_q) {
8648 		vm_page_free_list(local_free_q, TRUE);
8649 	}
8650 
8651 	VM_CHECK_MEMORYSTATUS;
8652 
8653 	return kr;
8654 }
8655 
8656 __abortlike
8657 static void
8658 __vm_page_alloc_list_failed_panic(
8659 	vm_size_t       page_count,
8660 	kma_flags_t     flags,
8661 	kern_return_t   kr)
8662 {
8663 	panic("vm_page_alloc_list(%zd, 0x%x) failed unexpectedly with %d",
8664 	    (size_t)page_count, flags, kr);
8665 }
8666 
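/*
 * vm_page_alloc_list:
 *
 * Grab "page_count" free pages and chain them through vmp_snext, honoring
 * the kma_flags_t that affect page selection and waiting behavior
 * (KMA_LOMEM, KMA_NOPAGEWAIT, KMA_TAG, KMA_COMPRESSOR, KMA_ZERO,
 * KMA_NOENCRYPT, KMA_NOFAIL).  On success the chain is returned in *list;
 * on failure any pages grabbed so far are freed, unless KMA_NOFAIL was
 * passed, in which case we panic instead of returning an error.
 */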
8667 kern_return_t
8668 vm_page_alloc_list(vm_size_t page_count, kma_flags_t flags, vm_page_t *list)
8669 {
8670 	vm_page_t       page_list = VM_PAGE_NULL;
8671 	vm_page_t       mem;
8672 	kern_return_t   kr = KERN_SUCCESS;
8673 	int             page_grab_count = 0;
8674 	task_t          task;
8675 
8676 	for (vm_size_t i = 0; i < page_count; i++) {
8677 		for (;;) {
8678 			vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
8679 
8680 #if HAS_MTE
8681 			if (flags & KMA_TAG) {
8682 				options |= VM_PAGE_GRAB_MTE;
8683 			}
8684 			if (vm_mte_tag_storage_for_compressor && (flags & KMA_COMPRESSOR)) {
8685 				/*
8686 				 * These pages will be used in the compressor pool.
8687 				 * Prefer tag storage pages for these allocations.
8688 				 */
8689 				options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
8690 			}
8691 #endif /* HAS_MTE */
8692 			if (flags & KMA_NOPAGEWAIT) {
8693 				options |= VM_PAGE_GRAB_NOPAGEWAIT;
8694 			}
8695 			if (flags & KMA_LOMEM) {
8696 				mem = vm_page_grablo(options);
8697 			} else {
8698 				mem = vm_page_grab_options(options);
8699 			}
8700 
8701 			if (mem != VM_PAGE_NULL) {
8702 				break;
8703 			}
8704 
8705 			if (flags & KMA_NOPAGEWAIT) {
8706 				kr = KERN_RESOURCE_SHORTAGE;
8707 				goto out;
8708 			}
8709 			if ((flags & KMA_LOMEM) && vm_lopage_needed) {
8710 				kr = KERN_RESOURCE_SHORTAGE;
8711 				goto out;
8712 			}
8713 
8714 			/* VM privileged threads should have waited in vm_page_grab() and not get here. */
8715 			assert(!(current_thread()->options & TH_OPT_VMPRIV));
8716 
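			/*
			 * For large requests (more than a quarter of physical
			 * memory), fail up front if wired memory plus the free
			 * target already leave too little room, rather than
			 * waiting for pages that can never materialize.
			 * KMA_NOFAIL requests skip this check and keep waiting.
			 */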
8717 			if ((flags & KMA_NOFAIL) == 0 && ptoa_64(page_count) > max_mem / 4) {
8718 				uint64_t unavailable = ptoa_64(vm_page_wire_count + vm_page_free_target);
8719 				if (unavailable > max_mem || ptoa_64(page_count) > (max_mem - unavailable)) {
8720 					kr = KERN_RESOURCE_SHORTAGE;
8721 					goto out;
8722 				}
8723 			}
8724 			VM_PAGE_WAIT();
8725 		}
8726 
8727 		page_grab_count++;
8728 		mem->vmp_snext = page_list;
8729 		page_list = mem;
8730 	}
8731 
8732 	if ((KMA_ZERO | KMA_NOENCRYPT) & flags) {
8733 		for (mem = page_list; mem; mem = mem->vmp_snext) {
8734 			vm_page_zero_fill(
8735 				mem
8736 #if HAS_MTE
8737 				, false /* zero_tags */
8738 #endif /* HAS_MTE */
8739 				);
8740 		}
8741 	}
8742 
8743 out:
8744 	task = current_task_early();
8745 	if (task != NULL) {
8746 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
8747 	}
8748 	counter_add(&vm_page_grab_count_kern, page_grab_count);
8749 
8750 	if (kr == KERN_SUCCESS) {
8751 		*list = page_list;
8752 	} else if (flags & KMA_NOFAIL) {
8753 		__vm_page_alloc_list_failed_panic(page_count, flags, kr);
8754 	} else {
8755 		vm_page_free_list(page_list, FALSE);
8756 	}
8757 
8758 	return kr;
8759 }
8760 
8761 void
8762 vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
8763 {
8764 	page->vmp_offset = offset;
8765 }
8766 
8767 vm_page_t
8768 vm_page_get_next(vm_page_t page)
8769 {
8770 	return page->vmp_snext;
8771 }
8772 
8773 vm_object_offset_t
8774 vm_page_get_offset(vm_page_t page)
8775 {
8776 	return page->vmp_offset;
8777 }
8778 
8779 ppnum_t
8780 vm_page_get_phys_page(vm_page_t page)
8781 {
8782 	return VM_PAGE_GET_PHYS_PAGE(page);
8783 }
8784 
8785 
8786 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8787 
8788 #if HIBERNATION
8789 
8790 static uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
8791 
8792 struct hibernate_statistics {
8793 	int hibernate_considered;
8794 	int hibernate_reentered_on_q;
8795 	int hibernate_found_dirty;
8796 	int hibernate_skipped_cleaning;
8797 	int hibernate_skipped_transient;
8798 	int hibernate_skipped_precious;
8799 	int hibernate_skipped_external;
8800 	int hibernate_queue_nolock;
8801 	int hibernate_queue_paused;
8802 	int hibernate_throttled;
8803 	int hibernate_throttle_timeout;
8804 	int hibernate_drained;
8805 	int hibernate_drain_timeout;
8806 	int cd_lock_failed;
8807 	int cd_found_precious;
8808 	int cd_found_wired;
8809 	int cd_found_busy;
8810 	int cd_found_unusual;
8811 	int cd_found_cleaning;
8812 	int cd_found_laundry;
8813 	int cd_found_dirty;
8814 	int cd_found_xpmapped;
8815 	int cd_skipped_xpmapped;
8816 	int cd_local_free;
8817 	int cd_total_free;
8818 	int cd_vm_page_wire_count;
8819 	int cd_vm_struct_pages_unneeded;
8820 	int cd_pages;
8821 	int cd_discarded;
8822 	int cd_count_wire;
8823 } hibernate_stats;
8824 
8825 #if CONFIG_SPTM
8826 /**
8827  * On SPTM-based systems don't save any executable pages into the hibernation
8828  * image. The SPTM has stronger guarantees around not allowing write access to
8829  * the executable pages than on older systems, which prevents XNU from being
8830  * able to restore any pages mapped as executable.
8831  */
8832 #define HIBERNATE_XPMAPPED_LIMIT        0ULL
8833 #else /* CONFIG_SPTM */
8834 /*
8835  * clamp the number of 'xpmapped' pages we'll sweep into the hibernation image
8836  * so that we don't overrun the estimated image size, which would
8837  * result in a hibernation failure.
8838  *
8839  * We use a size value instead of pages because we don't want to take up more space
8840  * on disk if the system has a 16K page size vs 4K. Also, we are not guaranteed
8841  * to have that additional space available.
8842  *
8843  * Since this was set at 40000 pages on X86 we are going to use 160MB as our
8844  * xpmapped size.
8845  */
8846 #define HIBERNATE_XPMAPPED_LIMIT        ((160 * 1024 * 1024ULL) / PAGE_SIZE)
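/*
 * For reference: 160MB is 40960 pages with a 4K page size and 10240 pages
 * with a 16K page size.
 */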
8847 #endif /* CONFIG_SPTM */
8848 
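/*
 * Wait for the given pageout queue to drain, blocking in 5 second chunks.
 * Returns 0 on success (or on a timeout of the external queue), 1 if the
 * internal queue fails to drain in time.
 */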
8849 static int
8850 hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
8851 {
8852 	wait_result_t   wait_result;
8853 
8854 	vm_page_lock_queues();
8855 
8856 	while (!vm_page_queue_empty(&q->pgo_pending)) {
8857 		q->pgo_draining = TRUE;
8858 
8859 		assert_wait_timeout((event_t) (&q->pgo_laundry + 1), THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
8860 
8861 		vm_page_unlock_queues();
8862 
8863 		wait_result = thread_block(THREAD_CONTINUE_NULL);
8864 
8865 		if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) {
8866 			hibernate_stats.hibernate_drain_timeout++;
8867 
8868 			if (q == &vm_pageout_queue_external) {
8869 				return 0;
8870 			}
8871 
8872 			return 1;
8873 		}
8874 		vm_page_lock_queues();
8875 
8876 		hibernate_stats.hibernate_drained++;
8877 	}
8878 	vm_page_unlock_queues();
8879 
8880 	return 0;
8881 }
8882 
8883 
8884 boolean_t hibernate_skip_external = FALSE;
8885 
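/*
 * Walk up to "qcount" pages of the given queue, pushing dirty pages into
 * the laundry (vm_pageout_cluster) so they get cleaned before hibernation;
 * pages that can't or shouldn't be cleaned are moved to the tail of the
 * queue.  Returns non-zero if the flush was aborted.
 */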
8886 static int
8887 hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
8888 {
8889 	vm_page_t       m;
8890 	vm_object_t     l_object = NULL;
8891 	vm_object_t     m_object = NULL;
8892 	int             refmod_state = 0;
8893 	int             try_failed_count = 0;
8894 	int             retval = 0;
8895 	int             current_run = 0;
8896 	struct  vm_pageout_queue *iq;
8897 	struct  vm_pageout_queue *eq;
8898 	struct  vm_pageout_queue *tq;
8899 
8900 	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START,
8901 	    VM_KERNEL_UNSLIDE_OR_PERM(q), qcount);
8902 
8903 	iq = &vm_pageout_queue_internal;
8904 	eq = &vm_pageout_queue_external;
8905 
8906 	vm_page_lock_queues();
8907 
8908 	while (qcount && !vm_page_queue_empty(q)) {
8909 		if (current_run++ == 1000) {
8910 			if (hibernate_should_abort()) {
8911 				retval = 1;
8912 				break;
8913 			}
8914 			current_run = 0;
8915 		}
8916 
8917 		m = (vm_page_t) vm_page_queue_first(q);
8918 		m_object = VM_PAGE_OBJECT(m);
8919 
8920 		/*
8921 		 * check to see if we currently are working
8922 		 * with the same object... if so, we've
8923 		 * already got the lock
8924 		 */
8925 		if (m_object != l_object) {
8926 			/*
8927 			 * the object associated with candidate page is
8928 			 * different from the one we were just working
8929 			 * with... dump the lock if we still own it
8930 			 */
8931 			if (l_object != NULL) {
8932 				vm_object_unlock(l_object);
8933 				l_object = NULL;
8934 			}
8935 			/*
8936 			 * Try to lock object; since we've already got the
8937 			 * page queues lock, we can only 'try' for this one.
8938 			 * if the 'try' fails, we need to do a mutex_pause
8939 			 * to allow the owner of the object lock a chance to
8940 			 * run...
8941 			 */
8942 			if (!vm_object_lock_try_scan(m_object)) {
8943 				if (try_failed_count > 20) {
8944 					hibernate_stats.hibernate_queue_nolock++;
8945 
8946 					goto reenter_pg_on_q;
8947 				}
8948 
8949 				vm_page_unlock_queues();
8950 				mutex_pause(try_failed_count++);
8951 				vm_page_lock_queues();
8952 
8953 				hibernate_stats.hibernate_queue_paused++;
8954 				continue;
8955 			} else {
8956 				l_object = m_object;
8957 			}
8958 		}
8959 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m)) {
8960 			/*
8961 			 * page is not to be cleaned
8962 			 * put it back on the head of its queue
8963 			 */
8964 			if (m->vmp_cleaning) {
8965 				hibernate_stats.hibernate_skipped_cleaning++;
8966 			} else {
8967 				hibernate_stats.hibernate_skipped_transient++;
8968 			}
8969 
8970 			goto reenter_pg_on_q;
8971 		}
8972 		if (m_object->vo_copy == VM_OBJECT_NULL) {
8973 			if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
8974 				/*
8975 				 * let the normal hibernate image path
8976 				 * deal with these
8977 				 */
8978 				goto reenter_pg_on_q;
8979 			}
8980 		}
8981 		if (!m->vmp_dirty && m->vmp_pmapped) {
8982 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
8983 
8984 			if ((refmod_state & VM_MEM_MODIFIED)) {
8985 				SET_PAGE_DIRTY(m, FALSE);
8986 			}
8987 		} else {
8988 			refmod_state = 0;
8989 		}
8990 
8991 		if (!m->vmp_dirty) {
8992 			/*
8993 			 * page is not to be cleaned
8994 			 * put it back on the head of its queue
8995 			 */
8996 			if (m->vmp_precious) {
8997 				hibernate_stats.hibernate_skipped_precious++;
8998 			}
8999 
9000 			goto reenter_pg_on_q;
9001 		}
9002 
9003 		if (hibernate_skip_external == TRUE && !m_object->internal) {
9004 			hibernate_stats.hibernate_skipped_external++;
9005 
9006 			goto reenter_pg_on_q;
9007 		}
9008 		tq = NULL;
9009 
9010 		if (m_object->internal) {
9011 			if (VM_PAGE_Q_THROTTLED(iq)) {
9012 				tq = iq;
9013 			}
9014 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
9015 			tq = eq;
9016 		}
9017 
9018 		if (tq != NULL) {
9019 			wait_result_t   wait_result;
9020 			int             wait_count = 5;
9021 
9022 			if (l_object != NULL) {
9023 				vm_object_unlock(l_object);
9024 				l_object = NULL;
9025 			}
9026 
9027 			while (retval == 0) {
9028 				tq->pgo_throttled = TRUE;
9029 
9030 				assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);
9031 
9032 				vm_page_unlock_queues();
9033 
9034 				wait_result = thread_block(THREAD_CONTINUE_NULL);
9035 
9036 				vm_page_lock_queues();
9037 
9038 				if (wait_result != THREAD_TIMED_OUT) {
9039 					break;
9040 				}
9041 				if (!VM_PAGE_Q_THROTTLED(tq)) {
9042 					break;
9043 				}
9044 
9045 				if (hibernate_should_abort()) {
9046 					retval = 1;
9047 				}
9048 
9049 				if (--wait_count == 0) {
9050 					hibernate_stats.hibernate_throttle_timeout++;
9051 
9052 					if (tq == eq) {
9053 						hibernate_skip_external = TRUE;
9054 						break;
9055 					}
9056 					retval = 1;
9057 				}
9058 			}
9059 			if (retval) {
9060 				break;
9061 			}
9062 
9063 			hibernate_stats.hibernate_throttled++;
9064 
9065 			continue;
9066 		}
9067 		/*
9068 		 * we've already factored out pages in the laundry which
9069 		 * means this page can't be on the pageout queue so it's
9070 		 * safe to do the vm_page_queues_remove
9071 		 */
9072 		vm_page_queues_remove(m, TRUE);
9073 
9074 		if (m_object->internal == TRUE) {
9075 			pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL);
9076 		}
9077 
9078 		vm_pageout_cluster(m);
9079 
9080 		hibernate_stats.hibernate_found_dirty++;
9081 
9082 		goto next_pg;
9083 
9084 reenter_pg_on_q:
9085 		vm_page_queue_remove(q, m, vmp_pageq);
9086 		vm_page_queue_enter(q, m, vmp_pageq);
9087 
9088 		hibernate_stats.hibernate_reentered_on_q++;
9089 next_pg:
9090 		hibernate_stats.hibernate_considered++;
9091 
9092 		qcount--;
9093 		try_failed_count = 0;
9094 	}
9095 	if (l_object != NULL) {
9096 		vm_object_unlock(l_object);
9097 		l_object = NULL;
9098 	}
9099 
9100 	vm_page_unlock_queues();
9101 
9102 	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);
9103 
9104 	return retval;
9105 }
9106 
9107 
9108 static int
9109 hibernate_flush_dirty_pages(int pass)
9110 {
9111 	struct vm_speculative_age_q     *aq;
9112 	uint32_t        i;
9113 
9114 	if (vm_page_local_q) {
9115 		zpercpu_foreach_cpu(lid) {
9116 			vm_page_reactivate_local(lid, TRUE, FALSE);
9117 		}
9118 	}
9119 
9120 	for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
9121 		int             qcount;
9122 		vm_page_t       m;
9123 
9124 		aq = &vm_page_queue_speculative[i];
9125 
9126 		if (vm_page_queue_empty(&aq->age_q)) {
9127 			continue;
9128 		}
9129 		qcount = 0;
9130 
9131 		vm_page_lockspin_queues();
9132 
9133 		vm_page_queue_iterate(&aq->age_q, m, vmp_pageq) {
9134 			qcount++;
9135 		}
9136 		vm_page_unlock_queues();
9137 
9138 		if (qcount) {
9139 			if (hibernate_flush_queue(&aq->age_q, qcount)) {
9140 				return 1;
9141 			}
9142 		}
9143 	}
9144 	if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) {
9145 		return 1;
9146 	}
9147 	/* XXX FBDP TODO: flush secluded queue */
9148 	if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) {
9149 		return 1;
9150 	}
9151 	if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) {
9152 		return 1;
9153 	}
9154 	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
9155 		return 1;
9156 	}
9157 
9158 	if (pass == 1) {
9159 		vm_compressor_record_warmup_start();
9160 	}
9161 
9162 	if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
9163 		if (pass == 1) {
9164 			vm_compressor_record_warmup_end();
9165 		}
9166 		return 1;
9167 	}
9168 	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
9169 		if (pass == 1) {
9170 			vm_compressor_record_warmup_end();
9171 		}
9172 		return 1;
9173 	}
9174 	if (pass == 1) {
9175 		vm_compressor_record_warmup_end();
9176 	}
9177 
9178 	if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) {
9179 		return 1;
9180 	}
9181 
9182 	return 0;
9183 }
9184 
9185 
9186 void
9187 hibernate_reset_stats(void)
9188 {
9189 	bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
9190 }
9191 
9192 
9193 int
9194 hibernate_flush_memory(void)
9195 {
9196 	int     retval;
9197 
9198 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
9199 
9200 	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);
9201 
9202 	hibernate_cleaning_in_progress = TRUE;
9203 	hibernate_skip_external = FALSE;
9204 
9205 	if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
9206 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
9207 
9208 		vm_compressor_flush();
9209 
9210 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
9211 
9212 		if (consider_buffer_cache_collect != NULL) {
9213 			unsigned int orig_wire_count;
9214 
9215 			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
9216 			orig_wire_count = vm_page_wire_count;
9217 
9218 			(void)(*consider_buffer_cache_collect)(1);
9219 			zone_gc(ZONE_GC_DRAIN);
9220 
9221 			HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);
9222 
9223 			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
9224 		}
9225 	}
9226 	hibernate_cleaning_in_progress = FALSE;
9227 
9228 	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);
9229 
9230 	if (retval) {
9231 		HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
9232 	}
9233 
9234 
9235 	HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
9236 	    hibernate_stats.hibernate_considered,
9237 	    hibernate_stats.hibernate_reentered_on_q,
9238 	    hibernate_stats.hibernate_found_dirty);
9239 	HIBPRINT("   skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
9240 	    hibernate_stats.hibernate_skipped_cleaning,
9241 	    hibernate_stats.hibernate_skipped_transient,
9242 	    hibernate_stats.hibernate_skipped_precious,
9243 	    hibernate_stats.hibernate_skipped_external,
9244 	    hibernate_stats.hibernate_queue_nolock);
9245 	HIBPRINT("   queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
9246 	    hibernate_stats.hibernate_queue_paused,
9247 	    hibernate_stats.hibernate_throttled,
9248 	    hibernate_stats.hibernate_throttle_timeout,
9249 	    hibernate_stats.hibernate_drained,
9250 	    hibernate_stats.hibernate_drain_timeout);
9251 
9252 	return retval;
9253 }
9254 
9255 
9256 static void
9257 hibernate_page_list_zero(hibernate_page_list_t *list)
9258 {
9259 	uint32_t             bank;
9260 	hibernate_bitmap_t * bitmap;
9261 
9262 	bitmap = &list->bank_bitmap[0];
9263 	for (bank = 0; bank < list->bank_count; bank++) {
9264 		uint32_t last_bit;
9265 
9266 		bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
9267 		// set out-of-bound bits at end of bitmap.
9268 		last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
9269 		if (last_bit) {
9270 			bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
9271 		}
9272 
9273 		bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
9274 	}
9275 }
9276 
9277 static boolean_t
9278 hibernate_consider_discard(vm_page_t m, boolean_t preflight)
9279 {
9280 	vm_object_t object = NULL;
9281 	int                  refmod_state;
9282 	boolean_t            discard = FALSE;
9283 
9284 	do{
9285 		if (vm_page_is_private(m)) {
9286 			panic("hibernate_consider_discard: private");
9287 		}
9288 
9289 		object = VM_PAGE_OBJECT(m);
9290 
9291 		if (!vm_object_lock_try(object)) {
9292 			object = NULL;
9293 			if (!preflight) {
9294 				hibernate_stats.cd_lock_failed++;
9295 			}
9296 			break;
9297 		}
9298 		if (VM_PAGE_WIRED(m)) {
9299 			if (!preflight) {
9300 				hibernate_stats.cd_found_wired++;
9301 			}
9302 			break;
9303 		}
9304 		if (m->vmp_precious) {
9305 			if (!preflight) {
9306 				hibernate_stats.cd_found_precious++;
9307 			}
9308 			break;
9309 		}
9310 		if (m->vmp_busy || !object->alive) {
9311 			/*
9312 			 *	Somebody is playing with this page.
9313 			 */
9314 			if (!preflight) {
9315 				hibernate_stats.cd_found_busy++;
9316 			}
9317 			break;
9318 		}
9319 		if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
9320 			/*
9321 			 * If it's unusual in any way, ignore it
9322 			 */
9323 			if (!preflight) {
9324 				hibernate_stats.cd_found_unusual++;
9325 			}
9326 			break;
9327 		}
9328 		if (m->vmp_cleaning) {
9329 			if (!preflight) {
9330 				hibernate_stats.cd_found_cleaning++;
9331 			}
9332 			break;
9333 		}
9334 		if (m->vmp_laundry) {
9335 			if (!preflight) {
9336 				hibernate_stats.cd_found_laundry++;
9337 			}
9338 			break;
9339 		}
9340 		if (!m->vmp_dirty) {
9341 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
9342 
9343 			if (refmod_state & VM_MEM_REFERENCED) {
9344 				m->vmp_reference = TRUE;
9345 			}
9346 			if (refmod_state & VM_MEM_MODIFIED) {
9347 				SET_PAGE_DIRTY(m, FALSE);
9348 			}
9349 		}
9350 
9351 		/*
9352 		 * If it's clean or purgeable we can discard the page on wakeup.
9353 		 */
9354 		discard = (!m->vmp_dirty)
9355 		    || (VM_PURGABLE_VOLATILE == object->purgable)
9356 		    || (VM_PURGABLE_EMPTY == object->purgable);
9357 
9358 
9359 		if (discard == FALSE) {
9360 			if (!preflight) {
9361 				hibernate_stats.cd_found_dirty++;
9362 			}
9363 		} else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
9364 			if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
9365 				if (!preflight) {
9366 					hibernate_stats.cd_found_xpmapped++;
9367 				}
9368 				discard = FALSE;
9369 			} else {
9370 				if (!preflight) {
9371 					hibernate_stats.cd_skipped_xpmapped++;
9372 				}
9373 			}
9374 		}
9375 	}while (FALSE);
9376 
9377 	if (object) {
9378 		vm_object_unlock(object);
9379 	}
9380 
9381 	return discard;
9382 }
9383 
9384 
9385 static void
9386 hibernate_discard_page(vm_page_t m)
9387 {
9388 	vm_object_t m_object;
9389 
9390 	if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
9391 		/*
9392 		 * If it's unusual in any way, ignore
9393 		 */
9394 		return;
9395 	}
9396 
9397 	m_object = VM_PAGE_OBJECT(m);
9398 
9399 #if MACH_ASSERT || DEBUG
9400 	if (!vm_object_lock_try(m_object)) {
9401 		panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
9402 	}
9403 #else
9404 	/* No need to lock page queue for token delete, hibernate_vm_unlock()
9405 	 *  makes sure these locks are uncontended before sleep */
9406 #endif /* MACH_ASSERT || DEBUG */
9407 
9408 	if (m->vmp_pmapped == TRUE) {
9409 		__unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
9410 	}
9411 
9412 	if (m->vmp_laundry) {
9413 		panic("hibernate_discard_page(%p) laundry", m);
9414 	}
9415 	if (vm_page_is_private(m)) {
9416 		panic("hibernate_discard_page(%p) private", m);
9417 	}
9418 	if (vm_page_is_fictitious(m)) {
9419 		panic("hibernate_discard_page(%p) fictitious", m);
9420 	}
9421 
9422 	if (VM_PURGABLE_VOLATILE == m_object->purgable) {
9423 		/* object should be on a queue */
9424 		assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL));
9425 		purgeable_q_t old_queue = vm_purgeable_object_remove(m_object);
9426 		assert(old_queue);
9427 		if (m_object->purgeable_when_ripe) {
9428 			vm_purgeable_token_delete_first(old_queue);
9429 		}
9430 		vm_object_lock_assert_exclusive(m_object);
9431 		VM_OBJECT_SET_PURGABLE(m_object, VM_PURGABLE_EMPTY);
9432 
9433 		/*
9434 		 * Purgeable ledgers:  pages of VOLATILE and EMPTY objects are
9435 		 * accounted in the "volatile" ledger, so no change here.
9436 		 * We have to update vm_page_purgeable_count, though, since we're
9437 		 * effectively purging this object.
9438 		 */
9439 		unsigned int delta;
9440 		assert(m_object->resident_page_count >= m_object->wired_page_count);
9441 		delta = (m_object->resident_page_count - m_object->wired_page_count);
9442 		assert(vm_page_purgeable_count >= delta);
9443 		assert(delta > 0);
9444 		OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count);
9445 	}
9446 
9447 	vm_page_free(m);
9448 
9449 #if MACH_ASSERT || DEBUG
9450 	vm_object_unlock(m_object);
9451 #endif  /* MACH_ASSERT || DEBUG */
9452 }
9453 
9454 /*
9455  *  Grab locks for hibernate_page_list_setall()
9456  */
9457 void
9458 hibernate_vm_lock_queues(void)
9459 {
9460 	vm_object_lock(compressor_object);
9461 	vm_page_lock_queues();
9462 	vm_free_page_lock();
9463 	lck_mtx_lock(&vm_purgeable_queue_lock);
9464 
9465 	if (vm_page_local_q) {
9466 		zpercpu_foreach(lq, vm_page_local_q) {
9467 			VPL_LOCK(&lq->vpl_lock);
9468 		}
9469 	}
9470 }
9471 
9472 void
9473 hibernate_vm_unlock_queues(void)
9474 {
9475 	if (vm_page_local_q) {
9476 		zpercpu_foreach(lq, vm_page_local_q) {
9477 			VPL_UNLOCK(&lq->vpl_lock);
9478 		}
9479 	}
9480 	lck_mtx_unlock(&vm_purgeable_queue_lock);
9481 	vm_free_page_unlock();
9482 	vm_page_unlock_queues();
9483 	vm_object_unlock(compressor_object);
9484 }
9485 
9486 #if CONFIG_SPTM
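/*
 * Returns true for pages whose SPTM frame type is XNU_USER_JIT,
 * XNU_USER_DEBUG, or XNU_USER_EXEC backed by an internal object;
 * hibernate_page_list_setall() forces such pages onto the wired page list.
 */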
9487 static bool
9488 hibernate_sptm_should_force_page_to_wired_pagelist(vm_page_t vmp)
9489 {
9490 	const sptm_paddr_t paddr = ptoa_64(VM_PAGE_GET_PHYS_PAGE(vmp));
9491 	const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
9492 	const vm_object_t vmp_objp = VM_PAGE_OBJECT(vmp);
9493 
9494 	return frame_type == XNU_USER_JIT || frame_type == XNU_USER_DEBUG ||
9495 	       (frame_type == XNU_USER_EXEC && vmp_objp->internal == TRUE);
9496 }
9497 #endif
9498 
9499 /*
9500  *  Bits zero in the bitmaps => page needs to be saved. All pages default to be saved,
9501  *  pages known to VM to not need saving are subtracted.
9502  *  Wired pages to be saved are present in page_list_wired, pageable in page_list.
9503  */
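/*
 * Concretely: hibernate_page_bitset(list, TRUE, ppnum) below marks ppnum as
 * *not* needing to be saved in that list; pages whose bits stay zero are the
 * ones the hibernation image must contain.
 */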
9504 
9505 void
9506 hibernate_page_list_setall(hibernate_page_list_t * page_list,
9507     hibernate_page_list_t * page_list_wired,
9508     hibernate_page_list_t * page_list_pal,
9509     boolean_t preflight,
9510     boolean_t will_discard,
9511     uint32_t * pagesOut)
9512 {
9513 	uint64_t start, end, nsec;
9514 	vm_page_t m;
9515 	vm_page_t next;
9516 	__block uint32_t pages = page_list->page_count;
9517 	__block uint32_t count_wire = pages;
9518 	uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
9519 	uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
9520 	uint32_t count_discard_active    = 0;
9521 	uint32_t count_discard_inactive  = 0;
9522 	uint32_t count_retired = 0;
9523 	uint32_t count_discard_cleaned   = 0;
9524 	uint32_t count_discard_purgeable = 0;
9525 	uint32_t count_discard_speculative = 0;
9526 	uint32_t count_discard_vm_struct_pages = 0;
9527 	uint32_t             bank;
9528 	hibernate_bitmap_t * bitmap;
9529 	hibernate_bitmap_t * bitmap_wired;
9530 	boolean_t                    discard_all;
9531 	boolean_t            discard = FALSE;
9532 
9533 	HIBLOG("hibernate_page_list_setall(preflight %d) start\n", preflight);
9534 
9535 	if (preflight) {
9536 		page_list       = NULL;
9537 		page_list_wired = NULL;
9538 		page_list_pal   = NULL;
9539 		discard_all     = FALSE;
9540 	} else {
9541 		discard_all     = will_discard;
9542 	}
9543 
9544 #if MACH_ASSERT || DEBUG
9545 	if (!preflight) {
9546 		assert(hibernate_vm_locks_are_safe());
9547 		vm_page_lock_queues();
9548 		if (vm_page_local_q) {
9549 			zpercpu_foreach(lq, vm_page_local_q) {
9550 				VPL_LOCK(&lq->vpl_lock);
9551 			}
9552 		}
9553 	}
9554 #endif  /* MACH_ASSERT || DEBUG */
9555 
9556 
9557 	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);
9558 
9559 	clock_get_uptime(&start);
9560 
9561 	if (!preflight) {
9562 		hibernate_page_list_zero(page_list);
9563 		hibernate_page_list_zero(page_list_wired);
9564 		hibernate_page_list_zero(page_list_pal);
9565 
9566 		hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
9567 		hibernate_stats.cd_pages = pages;
9568 	}
9569 
9570 	if (vm_page_local_q) {
9571 		zpercpu_foreach_cpu(lid) {
9572 			vm_page_reactivate_local(lid, TRUE, !preflight);
9573 		}
9574 	}
9575 
9576 	if (preflight) {
9577 		vm_object_lock(compressor_object);
9578 		vm_page_lock_queues();
9579 		vm_free_page_lock();
9580 	}
9581 
9582 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
9583 
9584 	hibernation_vmqueues_inspection = TRUE;
9585 
9586 	__auto_type hib_free_boilerplate = ^(vm_page_t page) {
9587 		assert((page->vmp_q_state == VM_PAGE_ON_FREE_Q) ||
9588 #if XNU_VM_HAS_LOPAGE
9589 	    (page->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
9590 #endif /* XNU_VM_HAS_LOPAGE */
9591 	    (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q));
9592 
9593 		pages--;
9594 		count_wire--;
9595 
9596 		if (!preflight) {
9597 			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(page));
9598 			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(page));
9599 
9600 			hibernate_stats.cd_total_free++;
9601 
9602 			if (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) {
9603 				hibernate_stats.cd_local_free++;
9604 			}
9605 		}
9606 	};
9607 
9608 	if (!preflight) {
9609 		percpu_foreach(free_pages_head, free_pages) {
9610 			_vm_page_list_foreach(m, *free_pages_head) {
9611 				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9612 				hib_free_boilerplate(m);
9613 			}
9614 		}
9615 #if HAS_MTE
9616 		percpu_foreach(mte_pcpu, mte_pcpu) {
9617 			_vm_page_list_foreach(m, mte_pcpu->free_tagged_pages) {
9618 				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9619 				hib_free_boilerplate(m);
9620 			}
9621 			vm_page_queue_iterate(&mte_pcpu->free_claimed_pages,
9622 			    m, vmp_pageq) {
9623 				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9624 				hib_free_boilerplate(m);
9625 			}
9626 		}
9627 #endif /* HAS_MTE */
9628 	}
9629 
9630 #if CONFIG_SPTM
9631 	if (vm_pages_free_masks()) {
9632 		uint32_t  bits = vm_pages_free_mask_len() * MAX_COLORS;
9633 		bitmap_t *map  = vm_pages_free_masks_as_bitmap(0);
9634 
9635 		for (int bit = bitmap_first(map, bits);
9636 		    bit >= 0; bit = bitmap_next(map, bit)) {
9637 			ppnum_t   pnum = pmap_first_pnum + bit;
9638 			vm_page_t mem  = vm_page_find_canonical(pnum);
9639 
9640 			hib_free_boilerplate(mem);
9641 		}
9642 	} else
9643 #endif /* CONFIG_SPTM */
9644 	{
9645 		vm_page_free_queue_foreach(&vm_page_queue_free, hib_free_boilerplate);
9646 	}
9647 #if HAS_MTE
9648 	mteinfo_free_queue_foreach(hib_free_boilerplate);
9649 #endif /* HAS_MTE */
9650 #if XNU_VM_HAS_LOPAGE
9651 	vm_page_free_queue_foreach(&vm_lopage_queue_free, hib_free_boilerplate);
9652 #endif /* XNU_VM_HAS_LOPAGE */
9653 
9654 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9655 	while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) {
9656 		assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
9657 
9658 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9659 		discard = FALSE;
9660 		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
9661 		    && hibernate_consider_discard(m, preflight)) {
9662 			if (!preflight) {
9663 				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9664 			}
9665 			count_discard_inactive++;
9666 			discard = discard_all;
9667 		} else {
9668 			count_throttled++;
9669 		}
9670 		count_wire--;
9671 		if (!preflight) {
9672 			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9673 		}
9674 
9675 		if (discard) {
9676 			hibernate_discard_page(m);
9677 		}
9678 		m = next;
9679 	}
9680 
9681 	m = (vm_page_t)vm_page_queue_first(&vm_page_queue_anonymous);
9682 	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
9683 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
9684 		bool force_to_wired_list = false;       /* Default to NOT forcing page into the wired page list */
9685 #if CONFIG_SPTM
9686 		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9687 #endif
9688 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9689 		discard = FALSE;
9690 		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9691 		    hibernate_consider_discard(m, preflight)) {
9692 			if (!preflight) {
9693 				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9694 			}
9695 			if (m->vmp_dirty) {
9696 				count_discard_purgeable++;
9697 			} else {
9698 				count_discard_inactive++;
9699 			}
9700 			discard = discard_all;
9701 		} else {
9702 			/*
9703 			 * If the page must be force-added to the wired page list, prevent it from appearing
9704 			 * in the unwired page list.
9705 			 */
9706 			if (force_to_wired_list) {
9707 				if (!preflight) {
9708 					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9709 				}
9710 			} else {
9711 				count_anonymous++;
9712 			}
9713 		}
9714 		/*
9715 		 * If the page is NOT being forced into the wired page list, remove it from the
9716 		 * wired page list here.
9717 		 */
9718 		if (!force_to_wired_list) {
9719 			count_wire--;
9720 			if (!preflight) {
9721 				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9722 			}
9723 		}
9724 		if (discard) {
9725 			hibernate_discard_page(m);
9726 		}
9727 		m = next;
9728 	}
9729 
9730 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
9731 	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
9732 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
9733 
9734 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9735 		discard = FALSE;
9736 		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9737 		    hibernate_consider_discard(m, preflight)) {
9738 			if (!preflight) {
9739 				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9740 			}
9741 			if (m->vmp_dirty) {
9742 				count_discard_purgeable++;
9743 			} else {
9744 				count_discard_cleaned++;
9745 			}
9746 			discard = discard_all;
9747 		} else {
9748 			count_cleaned++;
9749 		}
9750 		count_wire--;
9751 		if (!preflight) {
9752 			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9753 		}
9754 		if (discard) {
9755 			hibernate_discard_page(m);
9756 		}
9757 		m = next;
9758 	}
9759 
9760 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9761 	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
9762 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
9763 		bool force_to_wired_list = false;       /* Default to NOT forcing page into the wired page list */
9764 #if CONFIG_SPTM
9765 		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9766 #endif
9767 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9768 		discard = FALSE;
9769 		if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) &&
9770 		    hibernate_consider_discard(m, preflight)) {
9771 			if (!preflight) {
9772 				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9773 			}
9774 			if (m->vmp_dirty) {
9775 				count_discard_purgeable++;
9776 			} else {
9777 				count_discard_active++;
9778 			}
9779 			discard = discard_all;
9780 		} else {
9781 			/*
9782 			 * If the page must be force-added to the wired page list, prevent it from appearing
9783 			 * in the unwired page list.
9784 			 */
9785 			if (force_to_wired_list) {
9786 				if (!preflight) {
9787 					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9788 				}
9789 			} else {
9790 				count_active++;
9791 			}
9792 		}
9793 		/*
9794 		 * If the page is NOT being forced into the wired page list, remove it from the
9795 		 * wired page list here.
9796 		 */
9797 		if (!force_to_wired_list) {
9798 			count_wire--;
9799 			if (!preflight) {
9800 				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9801 			}
9802 		}
9803 		if (discard) {
9804 			hibernate_discard_page(m);
9805 		}
9806 		m = next;
9807 	}
9808 
9809 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9810 	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
9811 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
9812 		bool force_to_wired_list = false;        /* Default to NOT forcing page into the wired page list */
9813 #if CONFIG_SPTM
9814 		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9815 #endif
9816 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9817 		discard = FALSE;
9818 		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9819 		    hibernate_consider_discard(m, preflight)) {
9820 			if (!preflight) {
9821 				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9822 			}
9823 			if (m->vmp_dirty) {
9824 				count_discard_purgeable++;
9825 			} else {
9826 				count_discard_inactive++;
9827 			}
9828 			discard = discard_all;
9829 		} else {
9830 			/*
9831 			 * If the page must be force-added to the wired page list, prevent it from appearing
9832 			 * in the unwired page list.
9833 			 */
9834 			if (force_to_wired_list) {
9835 				if (!preflight) {
9836 					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9837 				}
9838 			} else {
9839 				count_inactive++;
9840 			}
9841 		}
9842 		/*
9843 		 * If the page is NOT being forced into the wired page list, remove it from the
9844 		 * wired page list here.
9845 		 */
9846 		if (!force_to_wired_list) {
9847 			count_wire--;
9848 			if (!preflight) {
9849 				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9850 			}
9851 		}
9852 		if (discard) {
9853 			hibernate_discard_page(m);
9854 		}
9855 		m = next;
9856 	}
9857 	/* XXX FBDP TODO: secluded queue */
9858 
9859 	for (uint32_t i = 0; i <= vm_page_max_speculative_age_q; i++) {
9860 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
9861 		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
9862 			assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
9863 			    "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
9864 			    m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);
9865 
9866 			next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9867 			discard = FALSE;
9868 			if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9869 			    hibernate_consider_discard(m, preflight)) {
9870 				if (!preflight) {
9871 					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9872 				}
9873 				count_discard_speculative++;
9874 				discard = discard_all;
9875 			} else {
9876 				count_speculative++;
9877 			}
9878 			count_wire--;
9879 			if (!preflight) {
9880 				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9881 			}
9882 			if (discard) {
9883 				hibernate_discard_page(m);
9884 			}
9885 			m = next;
9886 		}
9887 	}
9888 
9889 	vm_page_queue_iterate(&compressor_object->memq, m, vmp_listq) {
9890 		assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
9891 
9892 		count_compressor++;
9893 		count_wire--;
9894 		if (!preflight) {
9895 			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9896 		}
9897 	}
9898 
9899 
9900 	if (preflight == FALSE && discard_all == TRUE) {
9901 		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START);
9902 
9903 		HIBLOG("hibernate_teardown started\n");
9904 		count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
9905 		HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);
9906 
9907 		pages -= count_discard_vm_struct_pages;
9908 		count_wire -= count_discard_vm_struct_pages;
9909 
9910 		hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;
9911 
9912 		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_END);
9913 	}
9914 
9915 	if (!preflight) {
9916 		// pull wired from hibernate_bitmap
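		// any page still marked "to be saved" (bit clear) in the wired
		// bitmap gets its bit set in the pageable bitmap, so wired pages
		// are only ever saved via page_list_wired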
9917 		bitmap = &page_list->bank_bitmap[0];
9918 		bitmap_wired = &page_list_wired->bank_bitmap[0];
9919 		for (bank = 0; bank < page_list->bank_count; bank++) {
9920 			for (uint32_t i = 0; i < bitmap->bitmapwords; i++) {
9921 				bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
9922 			}
9923 			bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords];
9924 			bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
9925 		}
9926 	}
9927 
9928 	// machine dependent adjustments
9929 	hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);
9930 
9931 	if (!preflight) {
9932 		hibernate_stats.cd_count_wire = count_wire;
9933 		hibernate_stats.cd_discarded = count_discard_active +
9934 		    count_discard_inactive + count_discard_purgeable +
9935 		    count_discard_speculative + count_discard_cleaned +
9936 		    count_discard_vm_struct_pages;
9937 	}
9938 
9939 	clock_get_uptime(&end);
9940 	absolutetime_to_nanoseconds(end - start, &nsec);
9941 	HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);
9942 
9943 	HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, "
9944 	    "zf %d, throt %d, compr %d, xpmapped %d\n"
9945 	    "  %s discard act %d inact %d purgeable %d "
9946 	    "spec %d cleaned %d retired %d\n",
9947 	    pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative,
9948 	    count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
9949 	    discard_all ? "did" : "could",
9950 	    count_discard_active, count_discard_inactive, count_discard_purgeable,
9951 	    count_discard_speculative, count_discard_cleaned, count_retired);
9952 
9953 	if (hibernate_stats.cd_skipped_xpmapped) {
9954 		HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n",
9955 		    hibernate_stats.cd_skipped_xpmapped);
9956 	}
9957 
9958 	*pagesOut = pages - count_discard_active - count_discard_inactive -
9959 	    count_discard_purgeable - count_discard_speculative -
9960 	    count_discard_cleaned - count_retired;
9961 
9962 	if (preflight && will_discard) {
9963 		*pagesOut -= count_compressor + count_throttled +
9964 		    count_anonymous + count_inactive + count_cleaned +
9965 		    count_speculative + count_active;
9966 
9967 		/*
9968 		 * We try to keep max HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image
9969 		 * even if these are clean and so we need to size the hibernation image accordingly.
9970 		 *
9971 		 * NB: We have to assume all HIBERNATE_XPMAPPED_LIMIT pages might show up because 'dirty'
9972 		 * xpmapped pages aren't distinguishable from other 'dirty' pages in preflight. So we might
9973 		 * only see part of the xpmapped pages if we look at 'cd_found_xpmapped' which solely tracks
9974 		 * clean xpmapped pages.
9975 		 *
9976 		 * Since these pages are all cleaned by the time we are in the post-preflight phase, we might
9977 		 * see a much larger number in 'cd_found_xpmapped' now than we did in the preflight phase
9978 		 */
9979 		*pagesOut +=  HIBERNATE_XPMAPPED_LIMIT;
9980 	}
9981 
9982 	hibernation_vmqueues_inspection = FALSE;
9983 
9984 #if MACH_ASSERT || DEBUG
9985 	if (!preflight) {
9986 		if (vm_page_local_q) {
9987 			zpercpu_foreach(lq, vm_page_local_q) {
9988 				VPL_UNLOCK(&lq->vpl_lock);
9989 			}
9990 		}
9991 		vm_page_unlock_queues();
9992 	}
9993 #endif  /* MACH_ASSERT || DEBUG */
9994 
9995 	if (preflight) {
9996 		vm_free_page_unlock();
9997 		vm_page_unlock_queues();
9998 		vm_object_unlock(compressor_object);
9999 	}
10000 
10001 	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
10002 }
10003 
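/*
 * Walk the anonymous, speculative, inactive, active and cleaned queues and
 * free every page whose bit is set in page_list, tallying what was discarded.
 */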
10004 void
10005 hibernate_page_list_discard(hibernate_page_list_t * page_list)
10006 {
10007 	uint64_t  start, end, nsec;
10008 	vm_page_t m;
10009 	vm_page_t next;
10010 	uint32_t  i;
10011 	uint32_t  count_discard_active    = 0;
10012 	uint32_t  count_discard_inactive  = 0;
10013 	uint32_t  count_discard_purgeable = 0;
10014 	uint32_t  count_discard_cleaned   = 0;
10015 	uint32_t  count_discard_speculative = 0;
10016 
10017 
10018 #if MACH_ASSERT || DEBUG
10019 	vm_page_lock_queues();
10020 	if (vm_page_local_q) {
10021 		zpercpu_foreach(lq, vm_page_local_q) {
10022 			VPL_LOCK(&lq->vpl_lock);
10023 		}
10024 	}
10025 #endif  /* MACH_ASSERT || DEBUG */
10026 
10027 	clock_get_uptime(&start);
10028 
10029 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10030 	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
10031 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
10032 
10033 		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10034 		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10035 			if (m->vmp_dirty) {
10036 				count_discard_purgeable++;
10037 			} else {
10038 				count_discard_inactive++;
10039 			}
10040 			hibernate_discard_page(m);
10041 		}
10042 		m = next;
10043 	}
10044 
10045 	for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
10046 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
10047 		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
10048 			assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
10049 
10050 			next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10051 			if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10052 				count_discard_speculative++;
10053 				hibernate_discard_page(m);
10054 			}
10055 			m = next;
10056 		}
10057 	}
10058 
10059 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10060 	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
10061 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
10062 
10063 		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10064 		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10065 			if (m->vmp_dirty) {
10066 				count_discard_purgeable++;
10067 			} else {
10068 				count_discard_inactive++;
10069 			}
10070 			hibernate_discard_page(m);
10071 		}
10072 		m = next;
10073 	}
10074 	/* XXX FBDP TODO: secluded queue */
10075 
10076 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10077 	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
10078 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
10079 
10080 		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10081 		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10082 			if (m->vmp_dirty) {
10083 				count_discard_purgeable++;
10084 			} else {
10085 				count_discard_active++;
10086 			}
10087 			hibernate_discard_page(m);
10088 		}
10089 		m = next;
10090 	}
10091 
10092 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
10093 	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
10094 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
10095 
10096 		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10097 		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10098 			if (m->vmp_dirty) {
10099 				count_discard_purgeable++;
10100 			} else {
10101 				count_discard_cleaned++;
10102 			}
10103 			hibernate_discard_page(m);
10104 		}
10105 		m = next;
10106 	}
10107 
10108 #if MACH_ASSERT || DEBUG
10109 	if (vm_page_local_q) {
10110 		zpercpu_foreach(lq, vm_page_local_q) {
10111 			VPL_UNLOCK(&lq->vpl_lock);
10112 		}
10113 	}
10114 	vm_page_unlock_queues();
10115 #endif  /* MACH_ASSERT || DEBUG */
10116 
10117 	clock_get_uptime(&end);
10118 	absolutetime_to_nanoseconds(end - start, &nsec);
10119 	HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
10120 	    nsec / 1000000ULL,
10121 	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
10122 }
10123 
10124 boolean_t       hibernate_paddr_map_inited = FALSE;
10125 unsigned int    hibernate_teardown_last_valid_compact_indx = -1;
10126 vm_page_t       hibernate_rebuild_hash_list = NULL;
10127 
10128 unsigned int    hibernate_teardown_found_tabled_pages = 0;
10129 unsigned int    hibernate_teardown_found_created_pages = 0;
10130 unsigned int    hibernate_teardown_found_free_pages = 0;
10131 unsigned int    hibernate_teardown_vm_page_free_count;
10132 
10133 
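/*
 * Describes one run of vm_pages[] entries whose physical page numbers are
 * contiguous: entries [ppnm_sindx, ppnm_eindx) map to physical pages
 * starting at ppnm_base_paddr.  Built by hibernate_create_paddr_map() and
 * consumed by hibernate_lookup_paddr().
 */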
10134 struct ppnum_mapping {
10135 	struct ppnum_mapping    *ppnm_next;
10136 	ppnum_t                 ppnm_base_paddr;
10137 	unsigned int            ppnm_sindx;
10138 	unsigned int            ppnm_eindx;
10139 };
10140 
10141 struct ppnum_mapping    *ppnm_head;
10142 struct ppnum_mapping    *ppnm_last_found = NULL;
10143 
10144 
10145 void
10146 hibernate_create_paddr_map(void)
10147 {
10148 	unsigned int    i;
10149 	ppnum_t         next_ppnum_in_run = 0;
10150 	struct ppnum_mapping *ppnm = NULL;
10151 
10152 	if (hibernate_paddr_map_inited == FALSE) {
10153 		for (i = 0; i < vm_pages_count; i++) {
10154 			if (ppnm) {
10155 				ppnm->ppnm_eindx = i;
10156 			}
10157 
10158 			if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) != next_ppnum_in_run) {
10159 				ppnm = zalloc_permanent_type(struct ppnum_mapping);
10160 
10161 				ppnm->ppnm_next = ppnm_head;
10162 				ppnm_head = ppnm;
10163 
10164 				ppnm->ppnm_sindx = i;
10165 				ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i));
10166 			}
10167 			next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) + 1;
10168 		}
10169 		ppnm->ppnm_eindx = vm_pages_count;
10170 
10171 		hibernate_paddr_map_inited = TRUE;
10172 	}
10173 }
10174 
10175 static ppnum_t
10176 hibernate_lookup_paddr(unsigned int indx)
10177 {
10178 	struct ppnum_mapping *ppnm = NULL;
10179 
10180 	ppnm = ppnm_last_found;
10181 
10182 	if (ppnm) {
10183 		if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10184 			goto done;
10185 		}
10186 	}
10187 	for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
10188 		if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10189 			ppnm_last_found = ppnm;
10190 			break;
10191 		}
10192 	}
10193 	if (ppnm == NULL) {
10194 		panic("hibernate_lookup_paddr of %d failed", indx);
10195 	}
10196 done:
10197 	return ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx);
10198 }
10199 
10200 
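/*
 * Mark every page fully contained in the kernel VA range [saddr, eaddr) as
 * not needing to be saved, in both bitmaps, and return the number of pages
 * so marked.
 */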
10201 static uint32_t
10202 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
10203 {
10204 	addr64_t        saddr_aligned;
10205 	addr64_t        eaddr_aligned;
10206 	addr64_t        addr;
10207 	ppnum_t         paddr;
10208 	unsigned int    mark_as_unneeded_pages = 0;
10209 
10210 	saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
10211 	eaddr_aligned = eaddr & ~PAGE_MASK_64;
10212 
10213 	for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
10214 		paddr = pmap_find_phys(kernel_pmap, addr);
10215 
10216 		assert(paddr);
10217 
10218 		hibernate_page_bitset(page_list, TRUE, paddr);
10219 		hibernate_page_bitset(page_list_wired, TRUE, paddr);
10220 
10221 		mark_as_unneeded_pages++;
10222 	}
10223 	return mark_as_unneeded_pages;
10224 }
10225 
10226 
10227 static void
10228 hibernate_hash_insert_page(vm_page_t mem)
10229 {
10230 	vm_page_bucket_t *bucket;
10231 	int             hash_id;
10232 	vm_object_t     m_object;
10233 
10234 	m_object = VM_PAGE_OBJECT(mem);
10235 
10236 	assert(mem->vmp_hashed);
10237 	assert(m_object);
10238 	assert(mem->vmp_offset != (vm_object_offset_t) -1);
10239 
10240 	/*
10241 	 *	Insert it into the object/offset hash table
10242 	 */
10243 	hash_id = vm_page_hash(m_object, mem->vmp_offset);
10244 	bucket = &vm_page_buckets[hash_id];
10245 
10246 	mem->vmp_next_m = bucket->page_list;
10247 	bucket->page_list = VM_PAGE_PACK_PTR(mem);
10248 }
10249 
10250 
10251 static void
10252 hibernate_free_range_flush(vm_page_list_t *list)
10253 {
10254 	vm_page_free_queue_enter_list(*list, VMP_RELEASE_HIBERNATE);
10255 	*list = (vm_page_list_t){ };
10256 }
10257 
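/*
 * Reinitialize the vm_page_t entries in [sindx, eindx) and batch them onto
 * the free queues, flushing via hibernate_free_range_flush() whenever the
 * batch reaches the 255-entry limit imposed by vmp_free_list_result_t.
 */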
10258 static void
10259 hibernate_free_range(vm_page_list_t *list, int sindx, int eindx)
10260 {
10261 	for (; sindx < eindx; sindx++) {
10262 		vm_page_t mem  = vm_page_get(sindx);
10263 		ppnum_t   pnum = hibernate_lookup_paddr(sindx);
10264 
10265 		vm_page_init(mem, pnum);
10266 #if HAS_MTE
10267 		mem->vmp_using_mte = pmap_is_tagged_page(pnum);
10268 #endif /* HAS_MTE */
10269 		vm_page_list_push(list, mem);
10270 
10271 		/* Max batch size of these lists is 255 due to vmp_free_list_result_t */
10272 		if (list->vmpl_count >= UINT8_MAX) {
10273 			hibernate_free_range_flush(list);
10274 		}
10275 	}
10276 }
10277 
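/*
 * Undo the compaction performed by hibernate_teardown_vm_structs(): walk
 * vm_pages[] backwards from the last valid compacted slot, move each entry
 * back to the original location recorded in vmp_next_m, re-hash any pages
 * that were hashed, and turn the holes in between back into free pages.
 * Fictitious pages stashed on hibernate_rebuild_hash_list are re-hashed at
 * the end.
 */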
10278 void
10279 hibernate_rebuild_vm_structs(void)
10280 {
10281 	int             cindx, sindx, eindx;
10282 	vm_page_list_t  list = { };
10283 	vm_page_t       mem, tmem, mem_next;
10284 	AbsoluteTime    startTime, endTime;
10285 	uint64_t        nsec;
10286 
10287 	if (!hibernate_rebuild_needed) {
10288 		return;
10289 	}
10290 
10291 	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START);
10292 	HIBLOG("hibernate_rebuild started\n");
10293 
10294 	clock_get_uptime(&startTime);
10295 
10296 	pal_hib_rebuild_pmap_structs();
10297 
10298 	bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
10299 	eindx = vm_pages_count;
10300 
10301 	/*
10302 	 * Mark all the vm_pages[] that have not been initialized yet as being
10303 	 * transient. This is needed to ensure that the buddy page search is correct.
10304 	 * Without this, random data in these vm_pages[] can trip up the buddy search
10305 	 */
10306 	for (int i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) {
10307 		vm_page_get(i)->vmp_q_state = VM_PAGE_NOT_ON_Q;
10308 	}
10309 
10310 	for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
10311 		mem = vm_page_get(cindx);
10312 		assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
10313 		/*
10314 		 * hibernate_teardown_vm_structs leaves the location where
10315 		 * this vm_page_t must be located in vmp_next_m.
10316 		 */
10317 		tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10318 		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
10319 		assert(tmem >= mem);
10320 
10321 		sindx = (int)(tmem - vm_page_get(0));
10322 
10323 		if (mem != tmem) {
10324 			/*
10325 			 * this vm_page_t was moved by hibernate_teardown_vm_structs,
10326 			 * so move it back to its real location
10327 			 */
10328 			*tmem = *mem;
10329 			mem = tmem;
10330 		}
10331 		if (mem->vmp_hashed) {
10332 			hibernate_hash_insert_page(mem);
10333 		}
10334 		/*
10335 		 * the 'hole' between this vm_page_t and the previous
10336 		 * vm_page_t we moved needs to be initialized as
10337 		 * a range of free vm_page_t's
10338 		 */
10339 		hibernate_free_range(&list, sindx + 1, eindx);
10340 
10341 		eindx = sindx;
10342 	}
10343 	hibernate_free_range(&list, 0, sindx);
10344 	hibernate_free_range_flush(&list);
10345 
10346 	VM_CHECK_MEMORYSTATUS;
10347 
10348 	assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);
10349 
10350 	/*
10351 	 * process the list of vm_page_t's that were entered in the hash,
10352 	 * but were not located in the vm_pages array... these are
10353 	 * vm_page_t's that were created on the fly (i.e. fictitious)
10354 	 */
10355 	for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
10356 		mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10357 
10358 		mem->vmp_next_m = 0;
10359 		hibernate_hash_insert_page(mem);
10360 	}
10361 	hibernate_rebuild_hash_list = NULL;
10362 
10363 	clock_get_uptime(&endTime);
10364 	SUB_ABSOLUTETIME(&endTime, &startTime);
10365 	absolutetime_to_nanoseconds(endTime, &nsec);
10366 
10367 	HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);
10368 
10369 	hibernate_rebuild_needed = false;
10370 
10371 	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END);
10372 }
10373 
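/*
 * Compact the in-use vm_page_t entries toward the front of vm_pages[] so the
 * tail of the array, the hash buckets, and any pmap scratch space can be
 * reported as unneeded for the hibernation image.  Each page's original slot
 * is recorded in vmp_next_m so hibernate_rebuild_vm_structs() can undo the
 * compaction.  Returns the number of pages marked unneeded.
 */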
10374 static uint32_t
10375 hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
10376 {
10377 	unsigned int    compact_target_indx;
10378 	unsigned int    mark_as_unneeded_pages = 0;
10379 	unsigned int    unneeded_vm_page_bucket_pages = 0;
10380 	unsigned int    unneeded_vm_pages_pages = 0;
10381 	unsigned int    unneeded_pmap_pages = 0;
10382 	addr64_t        start_of_unneeded = 0;
10383 	addr64_t        end_of_unneeded = 0;
10384 
10385 
10386 	if (hibernate_should_abort()) {
10387 		return 0;
10388 	}
10389 
10390 	hibernate_rebuild_needed = true;
10391 
10392 	HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, "
10393 	    "active_pages %d, inactive_pages %d, speculative_pages %d, "
10394 	    "cleaned_pages %d, compressor_pages %d\n",
10395 	    vm_page_wire_count, vm_page_free_count,
10396 	    vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
10397 	    vm_page_cleaned_count, compressor_object->resident_page_count);
10398 
10399 	for (uint32_t i = 0; i < vm_page_bucket_count; i++) {
10400 		vm_page_bucket_t *bucket = &vm_page_buckets[i];
10401 		vm_page_t mem, mem_next;
10402 
10403 		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
10404 			assert(mem->vmp_hashed);
10405 
10406 			mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10407 
10408 			if (!vm_page_in_array(mem)) {
10409 				mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
10410 				hibernate_rebuild_hash_list = mem;
10411 			}
10412 		}
10413 	}
10414 	unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0],
10415 	    (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
10416 	mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;
10417 
10418 	hibernate_teardown_vm_page_free_count = vm_page_free_count;
10419 
10420 	compact_target_indx = 0;
10421 
10422 	vm_free_page_lock();
10423 
10424 	for (uint32_t i = 0; i < vm_pages_count; i++) {
10425 		vm_page_t         mem   = vm_page_get(i);
10426 		ppnum_t           pnum  = VM_PAGE_GET_PHYS_PAGE(mem);
10427 		vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
10428 
10429 		if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
10430 			vm_page_free_queue_remove(class, mem, pnum,
10431 			    VM_PAGE_ON_FREE_Q);
10432 			hibernate_teardown_found_free_pages++;
10433 
10434 			if (vm_page_get(compact_target_indx)->vmp_q_state != VM_PAGE_ON_FREE_Q) {
10435 				compact_target_indx = i;
10436 			}
10437 		} else {
10438 			/*
10439 			 * record this vm_page_t's original location
10440 			 * we need this even if it doesn't get moved
10441 			 * as an indicator to the rebuild function that
10442 			 * we don't have to move it
10443 			 */
10444 			mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);
10445 
10446 			if (vm_page_get(compact_target_indx)->vmp_q_state == VM_PAGE_ON_FREE_Q) {
10447 				/*
10448 				 * we've got a hole to fill, so
10449 				 * move this vm_page_t to its new home
10450 				 */
10451 				*vm_page_get(compact_target_indx) = *mem;
10452 				mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
10453 
10454 				hibernate_teardown_last_valid_compact_indx = compact_target_indx;
10455 				compact_target_indx++;
10456 			} else {
10457 				hibernate_teardown_last_valid_compact_indx = i;
10458 			}
10459 		}
10460 	}
10461 
10462 	vm_free_page_unlock();
10463 
10464 	unneeded_vm_pages_pages = hibernate_mark_as_unneeded(
10465 		(addr64_t)vm_page_get(hibernate_teardown_last_valid_compact_indx + 1),
10466 		(addr64_t)vm_page_get(vm_pages_count - 1),
10467 		page_list, page_list_wired);
10468 	mark_as_unneeded_pages += unneeded_vm_pages_pages;
10469 
10470 	pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);
10471 
10472 	if (start_of_unneeded) {
10473 		unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded,
10474 		    end_of_unneeded, page_list, page_list_wired);
10475 		mark_as_unneeded_pages += unneeded_pmap_pages;
10476 	}
10477 	HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n",
10478 	    unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);
10479 
10480 	return mark_as_unneeded_pages;
10481 }
10482 
10483 #endif /* HIBERNATION */
10484 
10485 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10486 
10487 #include <mach_vm_debug.h>
10488 #if     MACH_VM_DEBUG
10489 
10490 #include <mach_debug/hash_info.h>
10491 #include <vm/vm_debug_internal.h>
10492 
10493 /*
10494  *	Routine:	vm_page_info
10495  *	Purpose:
10496  *		Return information about the global VP table.
10497  *		Fills the buffer with as much information as possible
10498  *		and returns the desired size of the buffer.
10499  *	Conditions:
10500  *		Nothing locked.  The caller should provide
10501  *		possibly-pageable memory.
10502  */
10503 
10504 unsigned int
10505 vm_page_info(
10506 	hash_info_bucket_t *info,
10507 	unsigned int count)
10508 {
10509 	unsigned int i;
10510 	lck_ticket_t *bucket_lock;
10511 
10512 	if (vm_page_bucket_count < count) {
10513 		count = vm_page_bucket_count;
10514 	}
10515 
10516 	for (i = 0; i < count; i++) {
10517 		vm_page_bucket_t *bucket = &vm_page_buckets[i];
10518 		unsigned int bucket_count = 0;
10519 		vm_page_t m;
10520 
10521 		bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10522 		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10523 
10524 		for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10525 		    m != VM_PAGE_NULL;
10526 		    m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) {
10527 			bucket_count++;
10528 		}
10529 
10530 		lck_ticket_unlock(bucket_lock);
10531 
10532 		/* don't touch pageable memory while holding locks */
10533 		info[i].hib_count = bucket_count;
10534 	}
10535 
10536 	return vm_page_bucket_count;
10537 }
10538 #endif  /* MACH_VM_DEBUG */
10539 
10540 #if VM_PAGE_BUCKETS_CHECK
10541 void
10542 vm_page_buckets_check(void)
10543 {
10544 	unsigned int i;
10545 	vm_page_t p;
10546 	unsigned int p_hash;
10547 	vm_page_bucket_t *bucket;
10548 	lck_ticket_t *bucket_lock;
10549 
10550 	if (!vm_page_buckets_check_ready) {
10551 		return;
10552 	}
10553 
10554 #if HIBERNATION
10555 	if (hibernate_rebuild_needed ||
10556 	    hibernate_rebuild_hash_list) {
10557 		panic("BUCKET_CHECK: hibernation in progress: "
10558 		    "rebuild_needed=%d rebuild_hash_list=%p\n",
10559 		    hibernate_rebuild_needed,
10560 		    hibernate_rebuild_hash_list);
10561 	}
10562 #endif /* HIBERNATION */
10563 
10564 #if VM_PAGE_FAKE_BUCKETS
10565 	char *cp;
10566 	for (cp = (char *) vm_page_fake_buckets_start;
10567 	    cp < (char *) vm_page_fake_buckets_end;
10568 	    cp++) {
10569 		if (*cp != 0x5a) {
10570 			panic("BUCKET_CHECK: corruption at %p in fake buckets "
10571 			    "[0x%llx:0x%llx]\n",
10572 			    cp,
10573 			    (uint64_t) vm_page_fake_buckets_start,
10574 			    (uint64_t) vm_page_fake_buckets_end);
10575 		}
10576 	}
10577 #endif /* VM_PAGE_FAKE_BUCKETS */
10578 
10579 	for (i = 0; i < vm_page_bucket_count; i++) {
10580 		vm_object_t     p_object;
10581 
10582 		bucket = &vm_page_buckets[i];
10583 		if (!bucket->page_list) {
10584 			continue;
10585 		}
10586 
10587 		bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10588 		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10589 		p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10590 
10591 		while (p != VM_PAGE_NULL) {
10592 			p_object = VM_PAGE_OBJECT(p);
10593 
10594 			if (!p->vmp_hashed) {
10595 				panic("BUCKET_CHECK: page %p (%p,0x%llx) "
10596 				    "hash %d in bucket %d at %p "
10597 				    "is not hashed\n",
10598 				    p, p_object, p->vmp_offset,
10599 				    p_hash, i, bucket);
10600 			}
10601 			p_hash = vm_page_hash(p_object, p->vmp_offset);
10602 			if (p_hash != i) {
10603 				panic("BUCKET_CHECK: corruption in bucket %d "
10604 				    "at %p: page %p object %p offset 0x%llx "
10605 				    "hash %d\n",
10606 				    i, bucket, p, p_object, p->vmp_offset,
10607 				    p_hash);
10608 			}
10609 			p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
10610 		}
10611 		lck_ticket_unlock(bucket_lock);
10612 	}
10613 
10614 //	printf("BUCKET_CHECK: checked buckets\n");
10615 }
10616 #endif /* VM_PAGE_BUCKETS_CHECK */
10617 
10618 /*
10619  * 'vm_fault_enter' will place newly created pages (zero-fill and COW) onto the
10620  * local queues if they exist... it's the only spot in the system where we add pages
10621  * to those queues...  once on those queues, those pages can only move to one of the
10622  * global page queues or the free queues... they NEVER move from local q to local q.
10623  * the 'local' state is stable when vm_page_queues_remove is called since we're behind
10624  * the global vm_page_queue_lock at this point...  we still need to take the local lock
10625  * in case this operation is being run on a different CPU than the local queue's identity,
10626  * but we don't have to worry about the page moving to a global queue or becoming wired
10627  * while we're grabbing the local lock since those operations would require the global
10628  * vm_page_queue_lock to be held, and we already own it.
10629  *
10630  * this is why it's safe to utilize the wire_count field in the vm_page_t as the local_id...
10631  * 'wired' and local are ALWAYS mutually exclusive conditions.
10632  */
10633 
10634 void
10635 vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_specialq)
10636 {
10637 	boolean_t       was_pageable = TRUE;
10638 	vm_object_t     m_object;
10639 
10640 	m_object = VM_PAGE_OBJECT(mem);
10641 
10642 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10643 
10644 	if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) {
10645 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10646 		if (remove_from_specialq == TRUE) {
10647 			vm_page_remove_from_specialq(mem);
10648 		}
10649 		/*if (mem->vmp_on_specialq != VM_PAGE_SPECIAL_Q_EMPTY) {
10650 		 *       assert(mem->vmp_specialq.next != 0);
10651 		 *       assert(mem->vmp_specialq.prev != 0);
10652 		 *  } else {*/
10653 		if (mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
10654 			assert(mem->vmp_specialq.next == 0);
10655 			assert(mem->vmp_specialq.prev == 0);
10656 		}
10657 		return;
10658 	}
10659 
10660 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
10661 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10662 		assert(mem->vmp_specialq.next == 0 &&
10663 		    mem->vmp_specialq.prev == 0 &&
10664 		    mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
10665 		return;
10666 	}
10667 	if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
10668 		/*
10669 		 * might put these guys on a list for debugging purposes
10670 		 * if we do, we'll need to remove this assert
10671 		 */
10672 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10673 		assert(mem->vmp_specialq.next == 0 &&
10674 		    mem->vmp_specialq.prev == 0);
10675 		/*
10676 		 * Recall that vmp_on_specialq also means a request to put
10677 		 * it on the special Q. So we don't want to reset that bit
10678 		 * just because a wiring request came in. We might want to
10679 		 * put it on the special queue post-unwiring.
10680 		 *
10681 		 * &&
10682 		 * mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
10683 		 */
10684 		return;
10685 	}
10686 
10687 	assert(m_object != compressor_object);
10688 	assert(!is_kernel_object(m_object));
10689 	assert(!vm_page_is_fictitious(mem));
10690 
10691 	switch (mem->vmp_q_state) {
10692 	case VM_PAGE_ON_ACTIVE_LOCAL_Q:
10693 	{
10694 		struct vpl      *lq;
10695 
10696 		lq = zpercpu_get_cpu(vm_page_local_q, mem->vmp_local_id);
10697 		VPL_LOCK(&lq->vpl_lock);
10698 		vm_page_queue_remove(&lq->vpl_queue, mem, vmp_pageq);
10699 		mem->vmp_local_id = 0;
10700 		lq->vpl_count--;
10701 		if (m_object->internal) {
10702 			lq->vpl_internal_count--;
10703 		} else {
10704 			lq->vpl_external_count--;
10705 		}
10706 		VPL_UNLOCK(&lq->vpl_lock);
10707 		was_pageable = FALSE;
10708 		break;
10709 	}
10710 	case VM_PAGE_ON_ACTIVE_Q:
10711 	{
10712 		vm_page_queue_remove(&vm_page_queue_active, mem, vmp_pageq);
10713 		vm_page_active_count--;
10714 		break;
10715 	}
10716 
10717 	case VM_PAGE_ON_INACTIVE_INTERNAL_Q:
10718 	{
10719 		assert(m_object->internal == TRUE);
10720 
10721 		vm_page_inactive_count--;
10722 		vm_page_queue_remove(&vm_page_queue_anonymous, mem, vmp_pageq);
10723 		vm_page_anonymous_count--;
10724 
10725 		vm_purgeable_q_advance_all();
10726 		vm_page_balance_inactive(3);
10727 		break;
10728 	}
10729 
10730 	case VM_PAGE_ON_INACTIVE_EXTERNAL_Q:
10731 	{
10732 		assert(m_object->internal == FALSE);
10733 
10734 		vm_page_inactive_count--;
10735 		vm_page_queue_remove(&vm_page_queue_inactive, mem, vmp_pageq);
10736 		vm_purgeable_q_advance_all();
10737 		vm_page_balance_inactive(3);
10738 		break;
10739 	}
10740 
10741 	case VM_PAGE_ON_INACTIVE_CLEANED_Q:
10742 	{
10743 		assert(m_object->internal == FALSE);
10744 
10745 		vm_page_inactive_count--;
10746 		vm_page_queue_remove(&vm_page_queue_cleaned, mem, vmp_pageq);
10747 		vm_page_cleaned_count--;
10748 		vm_page_balance_inactive(3);
10749 		break;
10750 	}
10751 
10752 	case VM_PAGE_ON_THROTTLED_Q:
10753 	{
10754 		assert(m_object->internal == TRUE);
10755 
10756 		vm_page_queue_remove(&vm_page_queue_throttled, mem, vmp_pageq);
10757 		vm_page_throttled_count--;
10758 		was_pageable = FALSE;
10759 		break;
10760 	}
10761 
10762 	case VM_PAGE_ON_SPECULATIVE_Q:
10763 	{
10764 		assert(m_object->internal == FALSE);
10765 
10766 		vm_page_remque(&mem->vmp_pageq);
10767 		vm_page_speculative_count--;
10768 		vm_page_balance_inactive(3);
10769 		break;
10770 	}
10771 
10772 #if CONFIG_SECLUDED_MEMORY
10773 	case VM_PAGE_ON_SECLUDED_Q:
10774 	{
10775 		vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
10776 		vm_page_secluded_count--;
10777 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
10778 		if (m_object == VM_OBJECT_NULL) {
10779 			vm_page_secluded_count_free--;
10780 			was_pageable = FALSE;
10781 		} else {
10782 			assert(!m_object->internal);
10783 			vm_page_secluded_count_inuse--;
10784 			was_pageable = FALSE;
10785 //			was_pageable = TRUE;
10786 		}
10787 		break;
10788 	}
10789 #endif /* CONFIG_SECLUDED_MEMORY */
10790 
10791 	default:
10792 	{
10793 		/*
10794 		 *	if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
10795 		 *              NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
10796 	 *              the caller is responsible for determining if the page is on that queue, and if so, must
10797 		 *              either first remove it (it needs both the page queues lock and the object lock to do
10798 		 *              this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove
10799 		 *
10800 		 *	we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
10801 		 *	or any of the undefined states
10802 		 */
10803 		panic("vm_page_queues_remove - bad page q_state (%p, %d)", mem, mem->vmp_q_state);
10804 		break;
10805 	}
10806 	}
10807 	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
10808 	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
10809 
10810 	if (remove_from_specialq == TRUE) {
10811 		vm_page_remove_from_specialq(mem);
10812 	}
10813 	if (was_pageable) {
10814 		if (m_object->internal) {
10815 			vm_page_pageable_internal_count--;
10816 		} else {
10817 			vm_page_pageable_external_count--;
10818 		}
10819 	}
10820 }
10821 
10822 void
10823 vm_page_remove_internal(vm_page_t page)
10824 {
10825 	vm_object_t __object = VM_PAGE_OBJECT(page);
10826 	if (page == __object->memq_hint) {
10827 		vm_page_t       __new_hint;
10828 		vm_page_queue_entry_t   __qe;
10829 		__qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
10830 		if (vm_page_queue_end(&__object->memq, __qe)) {
10831 			__qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
10832 			if (vm_page_queue_end(&__object->memq, __qe)) {
10833 				__qe = NULL;
10834 			}
10835 		}
10836 		__new_hint = (vm_page_t)((uintptr_t) __qe);
10837 		__object->memq_hint = __new_hint;
10838 	}
10839 	vm_page_queue_remove(&__object->memq, page, vmp_listq);
10840 #if CONFIG_SECLUDED_MEMORY
10841 	if (__object->eligible_for_secluded) {
10842 		vm_page_secluded.eligible_for_secluded--;
10843 	}
10844 #endif /* CONFIG_SECLUDED_MEMORY */
10845 #if HAS_MTE
10846 	assert_mte_vmo_matches_vmp(__object, page);
10847 #endif /* HAS_MTE */
10848 }
10849 
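/*
 * Place a pageable page on the inactive queue appropriate for its object:
 * anonymous (internal) pages go on vm_page_queue_anonymous, file-backed
 * pages on vm_page_queue_inactive, at the head or tail depending on 'first'.
 */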
10850 void
10851 vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
10852 {
10853 	vm_object_t     m_object;
10854 
10855 	m_object = VM_PAGE_OBJECT(mem);
10856 
10857 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10858 	assert(!vm_page_is_fictitious(mem));
10859 	assert(!mem->vmp_laundry);
10860 	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10861 	vm_page_check_pageable_safe(mem);
10862 
10863 	if (m_object->internal) {
10864 		mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
10865 
10866 		if (first == TRUE) {
10867 			vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vmp_pageq);
10868 		} else {
10869 			vm_page_queue_enter(&vm_page_queue_anonymous, mem, vmp_pageq);
10870 		}
10871 
10872 		vm_page_anonymous_count++;
10873 		vm_page_pageable_internal_count++;
10874 	} else {
10875 		mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
10876 
10877 		if (first == TRUE) {
10878 			vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vmp_pageq);
10879 		} else {
10880 			vm_page_queue_enter(&vm_page_queue_inactive, mem, vmp_pageq);
10881 		}
10882 
10883 		vm_page_pageable_external_count++;
10884 	}
10885 	vm_page_inactive_count++;
10886 	token_new_pagecount++;
10887 
10888 	vm_page_add_to_specialq(mem, FALSE);
10889 }
10890 
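/*
 * Place a pageable page on the active queue (head or tail per 'first') and
 * update the pageable counts for its object type.
 */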
10891 void
10892 vm_page_enqueue_active(vm_page_t mem, boolean_t first)
10893 {
10894 	vm_object_t     m_object;
10895 
10896 	m_object = VM_PAGE_OBJECT(mem);
10897 
10898 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10899 	assert(!vm_page_is_fictitious(mem));
10900 	assert(!mem->vmp_laundry);
10901 	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10902 	vm_page_check_pageable_safe(mem);
10903 
10904 	mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
10905 	if (first == TRUE) {
10906 		vm_page_queue_enter_first(&vm_page_queue_active, mem, vmp_pageq);
10907 	} else {
10908 		vm_page_queue_enter(&vm_page_queue_active, mem, vmp_pageq);
10909 	}
10910 	vm_page_active_count++;
10911 
10912 	if (m_object->internal) {
10913 		vm_page_pageable_internal_count++;
10914 	} else {
10915 		vm_page_pageable_external_count++;
10916 	}
10917 
10918 	vm_page_add_to_specialq(mem, FALSE);
10919 	vm_page_balance_inactive(3);
10920 }
10921 
10922 /*
10923  * Pages from special kernel objects shouldn't
10924  * be placed on pageable queues.
10925  */
10926 void
10927 vm_page_check_pageable_safe(vm_page_t page)
10928 {
10929 	vm_object_t     page_object;
10930 
10931 	page_object = VM_PAGE_OBJECT(page);
10932 
10933 	if (is_kernel_object(page_object)) {
10934 		panic("vm_page_check_pageable_safe: trying to add page"
10935 		    " from a kernel object to pageable queue");
10936 	}
10937 
10938 	if (page_object == compressor_object) {
10939 		panic("vm_page_check_pageable_safe: trying to add page"
10940 		    " from compressor object (%p) to pageable queue", compressor_object);
10941 	}
10942 }
10943 
10944 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
10945 * wired page diagnose
10946 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10947 
10948 #include <libkern/OSKextLibPrivate.h>
10949 
10950 #define KA_SIZE(namelen, subtotalscount)        \
10951 	(sizeof(struct vm_allocation_site) + (namelen) + 1 + ((subtotalscount) * sizeof(struct vm_allocation_total)))
10952 
10953 #define KA_NAME(alloc)  \
10954 	((char *)(&(alloc)->subtotals[(alloc->subtotalscount)]))
10955 
10956 #define KA_NAME_LEN(alloc)      \
10957     (VM_TAG_NAME_LEN_MAX & (alloc->flags >> VM_TAG_NAME_LEN_SHIFT))
10958 
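/*
 * Derive a VM tag for the current allocation.  If the thread has an
 * allocation_name set, its tag is used directly; otherwise walk the kernel
 * stack's frame pointers until a return address in built-in kext text or
 * outside the kernel's own text is found, and ask OSKext for the allocation
 * site registered for that caller.
 */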
10959 vm_tag_t
10960 vm_tag_bt(void)
10961 {
10962 	uintptr_t* frameptr;
10963 	uintptr_t* frameptr_next;
10964 	uintptr_t retaddr;
10965 	uintptr_t kstackb, kstackt;
10966 	const vm_allocation_site_t * site;
10967 	thread_t cthread;
10968 	kern_allocation_name_t name;
10969 
10970 	cthread = current_thread();
10971 	if (__improbable(cthread == NULL)) {
10972 		return VM_KERN_MEMORY_OSFMK;
10973 	}
10974 
10975 	if ((name = thread_get_kernel_state(cthread)->allocation_name)) {
10976 		if (!name->tag) {
10977 			vm_tag_alloc(name);
10978 		}
10979 		return name->tag;
10980 	}
10981 
10982 	kstackb = cthread->kernel_stack;
10983 	kstackt = kstackb + kernel_stack_size;
10984 
10985 	/* Load stack frame pointer (EBP on x86) into frameptr */
10986 	frameptr = __builtin_frame_address(0);
10987 	site = NULL;
10988 	while (frameptr != NULL) {
10989 		/* Verify thread stack bounds */
10990 		if (((uintptr_t)(frameptr + 2) > kstackt) || ((uintptr_t)frameptr < kstackb)) {
10991 			break;
10992 		}
10993 
10994 		/* Next frame pointer is pointed to by the previous one */
10995 		frameptr_next = (uintptr_t*) *frameptr;
10996 #if defined(HAS_APPLE_PAC)
10997 		frameptr_next = ptrauth_strip(frameptr_next, ptrauth_key_frame_pointer);
10998 #endif
10999 
11000 		/* Pull return address from one spot above the frame pointer */
11001 		retaddr = *(frameptr + 1);
11002 
11003 #if defined(HAS_APPLE_PAC)
11004 		retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
11005 #endif
11006 
11007 		if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
11008 		    || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
11009 			site = OSKextGetAllocationSiteForCaller(retaddr);
11010 			break;
11011 		}
11012 		frameptr = frameptr_next;
11013 	}
11014 
11015 	if (site) {
11016 		return site->tag;
11017 	}
11018 
11019 #if MACH_ASSERT
11020 	/*
11021 	 * Kernel tests appear here as unrecognized call sites and would get
11022 	 * no memory tag. Give them a default tag to prevent panics later.
11023 	 */
11024 	if (thread_get_test_option(test_option_vm_prevent_wire_tag_panic)) {
11025 		return VM_KERN_MEMORY_OSFMK;
11026 	}
11027 #endif
11028 
11029 	return VM_KERN_MEMORY_NONE;
11030 }
11031 
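/*
 * Bitmap of dynamic tags that are free for reuse: tag 't' is represented by
 * bit (63 - (t & 63)) of word (t >> 6), matching the __builtin_clzll() scan
 * in vm_tag_alloc_locked().
 */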
11032 static uint64_t free_tag_bits[VM_MAX_TAG_VALUE / 64];
11033 
11034 void
11035 vm_tag_alloc_locked(vm_allocation_site_t * site, vm_allocation_site_t ** releasesiteP)
11036 {
11037 	vm_tag_t tag;
11038 	uint64_t avail;
11039 	uint32_t idx;
11040 	vm_allocation_site_t * prev;
11041 
11042 	if (site->tag) {
11043 		return;
11044 	}
11045 
11046 	idx = 0;
11047 	while (TRUE) {
11048 		avail = free_tag_bits[idx];
11049 		if (avail) {
11050 			tag = (vm_tag_t)__builtin_clzll(avail);
11051 			avail &= ~(1ULL << (63 - tag));
11052 			free_tag_bits[idx] = avail;
11053 			tag += (idx << 6);
11054 			break;
11055 		}
11056 		idx++;
11057 		if (idx >= ARRAY_COUNT(free_tag_bits)) {
11058 			for (idx = 0; idx < ARRAY_COUNT(vm_allocation_sites); idx++) {
11059 				prev = vm_allocation_sites[idx];
11060 				if (!prev) {
11061 					continue;
11062 				}
11063 				if (!KA_NAME_LEN(prev)) {
11064 					continue;
11065 				}
11066 				if (!prev->tag) {
11067 					continue;
11068 				}
11069 				if (prev->total) {
11070 					continue;
11071 				}
11072 				if (1 != prev->refcount) {
11073 					continue;
11074 				}
11075 
11076 				assert(idx == prev->tag);
11077 				tag = (vm_tag_t)idx;
11078 				prev->tag = VM_KERN_MEMORY_NONE;
11079 				*releasesiteP = prev;
11080 				break;
11081 			}
11082 			if (idx >= ARRAY_COUNT(vm_allocation_sites)) {
11083 				tag = VM_KERN_MEMORY_ANY;
11084 			}
11085 			break;
11086 		}
11087 	}
11088 	site->tag = tag;
11089 
11090 	OSAddAtomic16(1, &site->refcount);
11091 
11092 	if (VM_KERN_MEMORY_ANY != tag) {
11093 		vm_allocation_sites[tag] = site;
11094 	}
11095 
11096 	if (tag > vm_allocation_tag_highest) {
11097 		vm_allocation_tag_highest = tag;
11098 	}
11099 }
11100 
11101 static void
11102 vm_tag_free_locked(vm_tag_t tag)
11103 {
11104 	uint64_t avail;
11105 	uint32_t idx;
11106 	uint64_t bit;
11107 
11108 	if (VM_KERN_MEMORY_ANY == tag) {
11109 		return;
11110 	}
11111 
11112 	idx = (tag >> 6);
11113 	avail = free_tag_bits[idx];
11114 	tag &= 63;
11115 	bit = (1ULL << (63 - tag));
11116 	assert(!(avail & bit));
11117 	free_tag_bits[idx] = (avail | bit);
11118 }
11119 
11120 static void
11121 vm_tag_init(void)
11122 {
11123 	vm_tag_t tag;
11124 	for (tag = VM_KERN_MEMORY_FIRST_DYNAMIC; tag < VM_KERN_MEMORY_ANY; tag++) {
11125 		vm_tag_free_locked(tag);
11126 	}
11127 
11128 	for (tag = VM_KERN_MEMORY_ANY + 1; tag < VM_MAX_TAG_VALUE; tag++) {
11129 		vm_tag_free_locked(tag);
11130 	}
11131 }
11132 
11133 vm_tag_t
11134 vm_tag_alloc(vm_allocation_site_t * site)
11135 {
11136 	vm_allocation_site_t * releasesite;
11137 
11138 	if (!site->tag) {
11139 		releasesite = NULL;
11140 		lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11141 		vm_tag_alloc_locked(site, &releasesite);
11142 		lck_ticket_unlock(&vm_allocation_sites_lock);
11143 		if (releasesite) {
11144 			kern_allocation_name_release(releasesite);
11145 		}
11146 	}
11147 
11148 	return site->tag;
11149 }
11150 
11151 #ifndef ARRAY_SIZE
11152 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
11153 #endif /* ARRAY_SIZE */
11154 #define VM_KERN_MEMORY_ELEM(name) [VM_KERN_MEMORY_##name] = "VM_KERN_MEMORY_" #name
11155 const char *vm_kern_memory_names[] = {
11156 	VM_KERN_MEMORY_ELEM(NONE),
11157 	VM_KERN_MEMORY_ELEM(OSFMK),
11158 	VM_KERN_MEMORY_ELEM(BSD),
11159 	VM_KERN_MEMORY_ELEM(IOKIT),
11160 	VM_KERN_MEMORY_ELEM(LIBKERN),
11161 	VM_KERN_MEMORY_ELEM(OSKEXT),
11162 	VM_KERN_MEMORY_ELEM(KEXT),
11163 	VM_KERN_MEMORY_ELEM(IPC),
11164 	VM_KERN_MEMORY_ELEM(STACK),
11165 	VM_KERN_MEMORY_ELEM(CPU),
11166 	VM_KERN_MEMORY_ELEM(PMAP),
11167 	VM_KERN_MEMORY_ELEM(PTE),
11168 	VM_KERN_MEMORY_ELEM(ZONE),
11169 	VM_KERN_MEMORY_ELEM(KALLOC),
11170 	VM_KERN_MEMORY_ELEM(COMPRESSOR),
11171 	VM_KERN_MEMORY_ELEM(COMPRESSED_DATA),
11172 	VM_KERN_MEMORY_ELEM(PHANTOM_CACHE),
11173 	VM_KERN_MEMORY_ELEM(WAITQ),
11174 	VM_KERN_MEMORY_ELEM(DIAG),
11175 	VM_KERN_MEMORY_ELEM(LOG),
11176 	VM_KERN_MEMORY_ELEM(FILE),
11177 	VM_KERN_MEMORY_ELEM(MBUF),
11178 	VM_KERN_MEMORY_ELEM(UBC),
11179 	VM_KERN_MEMORY_ELEM(SECURITY),
11180 	VM_KERN_MEMORY_ELEM(MLOCK),
11181 	VM_KERN_MEMORY_ELEM(REASON),
11182 	VM_KERN_MEMORY_ELEM(SKYWALK),
11183 	VM_KERN_MEMORY_ELEM(LTABLE),
11184 	VM_KERN_MEMORY_ELEM(HV),
11185 	VM_KERN_MEMORY_ELEM(KALLOC_DATA),
11186 	VM_KERN_MEMORY_ELEM(RETIRED),
11187 	VM_KERN_MEMORY_ELEM(KALLOC_TYPE),
11188 	VM_KERN_MEMORY_ELEM(TRIAGE),
11189 	VM_KERN_MEMORY_ELEM(RECOUNT),
11190 	VM_KERN_MEMORY_ELEM(MTAG),
11191 	VM_KERN_MEMORY_ELEM(EXCLAVES),
11192 	VM_KERN_MEMORY_ELEM(EXCLAVES_SHARED),
11193 	VM_KERN_MEMORY_ELEM(KALLOC_SHARED),
11194 	VM_KERN_MEMORY_ELEM(CPUTRACE),
11195 };
11196 
11197 _Static_assert(ARRAY_SIZE(vm_kern_memory_names) == VM_KERN_MEMORY_FIRST_DYNAMIC,
11198     "vm_kern_memory_names must map all counter tags");
11199 
11200 #define VM_KERN_COUNT_ELEM(name) [VM_KERN_COUNT_##name] = "VM_KERN_COUNT_" #name
11201 const char *vm_kern_count_names[] = {
11202 	VM_KERN_COUNT_ELEM(MANAGED),
11203 	VM_KERN_COUNT_ELEM(RESERVED),
11204 	VM_KERN_COUNT_ELEM(WIRED),
11205 	VM_KERN_COUNT_ELEM(WIRED_MANAGED),
11206 	VM_KERN_COUNT_ELEM(STOLEN),
11207 	VM_KERN_COUNT_ELEM(LOPAGE),
11208 	VM_KERN_COUNT_ELEM(MAP_KERNEL),
11209 	VM_KERN_COUNT_ELEM(MAP_ZONE),
11210 	VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE),
11211 	VM_KERN_COUNT_ELEM(WIRED_BOOT),
11212 	VM_KERN_COUNT_ELEM(BOOT_STOLEN),
11213 	VM_KERN_COUNT_ELEM(WIRED_STATIC_KERNELCACHE),
11214 	VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE_DATA),
11215 	VM_KERN_COUNT_ELEM(MAP_KERNEL_DATA),
11216 	VM_KERN_COUNT_ELEM(EXCLAVES_CARVEOUT),
11217 };
11218 
11219 #if VM_BTLOG_TAGS
11220 #define VM_KERN_MEMORY_STR_MAX_LEN (32)
11221 TUNABLE_STR(vmtaglog, VM_KERN_MEMORY_STR_MAX_LEN, "vmtaglog", "");
11222 #define VM_TAG_BTLOG_SIZE (16u << 10)
11223 
11224 btlog_t vmtaglog_btlog;
11225 vm_tag_t vmtaglog_tag;
11226 
11227 static void
11228 vm_tag_log(vm_object_t object, int64_t delta, void *fp)
11229 {
11230 	if (is_kernel_object(object)) {
11231 		/* kernel object backtraces are tracked in vm entries */
11232 		return;
11233 	}
11234 	if (delta > 0) {
11235 		btref_t ref = btref_get(fp, BTREF_GET_NOWAIT);
11236 		btlog_record(vmtaglog_btlog, object, 0, ref);
11237 	} else if (object->wired_page_count == 0) {
11238 		btlog_erase(vmtaglog_btlog, object);
11239 	}
11240 }
11241 
11242 _Static_assert(ARRAY_SIZE(vm_kern_count_names) == VM_KERN_COUNTER_COUNT,
11243     "vm_kern_count_names must map all counter tags");
11244 
11245 static vm_tag_t
11246 vm_tag_str_to_idx(char tagstr[VM_KERN_MEMORY_STR_MAX_LEN])
11247 {
11248 	for (vm_tag_t i = VM_KERN_MEMORY_OSFMK; i < ARRAY_SIZE(vm_kern_memory_names); i++) {
11249 		if (!strncmp(vm_kern_memory_names[i], tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11250 			return i;
11251 		}
11252 	}
11253 
11254 	if (!strncmp("dynamic", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11255 		return VM_KERN_MEMORY_FIRST_DYNAMIC;
11256 	}
11257 
11258 	if (!strncmp("any", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11259 		return VM_KERN_MEMORY_ANY;
11260 	}
11261 
11262 	printf("Unable to find vm tag %s for btlog\n", tagstr);
11263 	return VM_KERN_MEMORY_NONE;
11264 }
11265 
11266 __startup_func
11267 static void
11268 vm_btlog_init(void)
11269 {
11270 	vmtaglog_tag = vm_tag_str_to_idx(vmtaglog);
11271 
11272 	if (vmtaglog_tag != VM_KERN_MEMORY_NONE) {
11273 		vmtaglog_btlog = btlog_create(BTLOG_HASH, VM_TAG_BTLOG_SIZE, 0);
11274 	}
11275 }
11276 STARTUP(ZALLOC, STARTUP_RANK_FIRST, vm_btlog_init);
11277 #endif /* VM_BTLOG_TAGS */
11278 
11279 void
11280 vm_tag_update_size(vm_tag_t tag, int64_t delta, vm_object_t object)
11281 {
11282 	assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
11283 
11284 	kern_allocation_update_size(vm_allocation_sites[tag], delta, object);
11285 }
11286 
11287 uint64_t
11288 vm_tag_get_size(vm_tag_t tag)
11289 {
11290 	vm_allocation_site_t *allocation;
11291 
11292 	assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
11293 
11294 	allocation = vm_allocation_sites[tag];
11295 	return allocation ? os_atomic_load(&allocation->total, relaxed) : 0;
11296 }
11297 
11298 void
11299 kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta, __unused vm_object_t object)
11300 {
11301 	uint64_t value;
11302 
11303 	value = os_atomic_add(&allocation->total, delta, relaxed);
11304 	if (delta < 0) {
11305 		assertf(value + (uint64_t)-delta > value,
11306 		    "tag %d, site %p", allocation->tag, allocation);
11307 	}
11308 
11309 #if DEBUG || DEVELOPMENT
11310 	/* release to publish the new total */
11311 	os_atomic_max(&allocation->peak, value, release);
11312 #endif /* DEBUG || DEVELOPMENT */
11313 
11314 	if (value == (uint64_t)delta && !allocation->tag) {
11315 		vm_tag_alloc(allocation);
11316 	}
11317 
11318 #if VM_BTLOG_TAGS
11319 	if (vmtaglog_matches(allocation->tag) && object) {
11320 		vm_tag_log(object, delta, __builtin_frame_address(0));
11321 	}
11322 #endif /* VM_BTLOG_TAGS */
11323 }
11324 
11325 #if DEBUG || DEVELOPMENT
11326 
11327 void
11328 vm_tag_reset_all_peaks(void)
11329 {
11330 	vm_log("resetting peak size for all kernel tags\n");
11331 	for (vm_tag_t tag = 0; tag <= vm_allocation_tag_highest; tag++) {
11332 		vm_tag_reset_peak(tag);
11333 	}
11334 }
11335 
11336 kern_return_t
11337 vm_tag_reset_peak(vm_tag_t tag)
11338 {
11339 	if (tag > vm_allocation_tag_highest) {
11340 		return KERN_INVALID_ARGUMENT;
11341 	}
11342 
11343 	vm_allocation_site_t *site = vm_allocation_sites[tag];
11344 	vm_log_info("resetting peak size for kernel tag %s\n",
11345 	    KA_NAME(site));
11346 
11347 	uint64_t new_peak = os_atomic_load(&site->total, relaxed);
11348 	/* acquire updates to the total */
11349 	os_atomic_min(&site->peak, new_peak, acquire);
11350 
11351 	return KERN_SUCCESS;
11352 }
11353 
11354 #endif /* DEBUG || DEVELOPMENT */
11355 
11356 #if VM_TAG_SIZECLASSES
11357 
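/*
 * Carve out a single allocation holding the per-tag pointer table
 * (vm_allocation_zone_totals) followed by preallocated per-sizeclass totals
 * for the early tags listed below, so recording zone stats for those tags
 * never has to allocate (and therefore recurse) during early boot.
 */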
11358 void
11359 vm_allocation_zones_init(void)
11360 {
11361 	vm_offset_t   addr;
11362 	vm_size_t     size;
11363 
11364 	const vm_tag_t early_tags[] = {
11365 		VM_KERN_MEMORY_DIAG,
11366 		VM_KERN_MEMORY_KALLOC,
11367 		VM_KERN_MEMORY_KALLOC_DATA,
11368 		VM_KERN_MEMORY_KALLOC_TYPE,
11369 		VM_KERN_MEMORY_LIBKERN,
11370 		VM_KERN_MEMORY_OSFMK,
11371 		VM_KERN_MEMORY_RECOUNT,
11372 	};
11373 
11374 	size = VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *)
11375 	    + ARRAY_COUNT(early_tags) * VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
11376 
11377 	kmem_alloc(kernel_map, &addr, round_page(size),
11378 	    KMA_NOFAIL | KMA_KOBJECT | KMA_ZERO | KMA_PERMANENT,
11379 	    VM_KERN_MEMORY_DIAG);
11380 
11381 	vm_allocation_zone_totals = (vm_allocation_zone_total_t **) addr;
11382 	addr += VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *);
11383 
11384 	// prepopulate early tag ranges so allocations
11385 	// in vm_tag_update_zone_size() and early boot won't recurse
11386 	for (size_t i = 0; i < ARRAY_COUNT(early_tags); i++) {
11387 		vm_allocation_zone_totals[early_tags[i]] = (vm_allocation_zone_total_t *)addr;
11388 		addr += VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
11389 	}
11390 }
11391 
11392 __attribute__((noinline))
11393 static vm_tag_t
11394 vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
11395 {
11396 	vm_allocation_zone_total_t *stats;
11397 	vm_size_t size = sizeof(*stats) * VM_TAG_SIZECLASSES;
11398 
11399 	flags = Z_VM_TAG(Z_ZERO | flags, VM_KERN_MEMORY_DIAG);
11400 	stats = kalloc_data(size, flags);
11401 	if (!stats) {
11402 		return VM_KERN_MEMORY_NONE;
11403 	}
11404 	if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
11405 		kfree_data(stats, size);
11406 	}
11407 	return tag;
11408 }
11409 
11410 vm_tag_t
11411 vm_tag_will_update_zone(vm_tag_t tag, uint32_t zflags)
11412 {
11413 	assert(VM_KERN_MEMORY_NONE != tag);
11414 	assert(tag < VM_MAX_TAG_VALUE);
11415 
11416 	if (__probable(vm_allocation_zone_totals[tag])) {
11417 		return tag;
11418 	}
11419 	return vm_tag_zone_stats_alloc(tag, zflags);
11420 }
11421 
11422 void
11423 vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
11424 {
11425 	vm_allocation_zone_total_t *stats;
11426 	vm_size_t value;
11427 
11428 	assert(VM_KERN_MEMORY_NONE != tag);
11429 	assert(tag < VM_MAX_TAG_VALUE);
11430 
11431 	if (zidx >= VM_TAG_SIZECLASSES) {
11432 		return;
11433 	}
11434 
11435 	stats = vm_allocation_zone_totals[tag];
11436 	assert(stats);
11437 	stats += zidx;
11438 
11439 	value = os_atomic_add(&stats->vazt_total, delta, relaxed);
11440 	if (delta < 0) {
11441 		assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
11442 		return;
11443 	} else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
11444 		os_atomic_max(&stats->vazt_peak, value, relaxed);
11445 	}
11446 }
11447 
11448 #endif /* VM_TAG_SIZECLASSES */
11449 
11450 void
11451 kern_allocation_update_subtotal(kern_allocation_name_t allocation, vm_tag_t subtag, int64_t delta)
11452 {
11453 	kern_allocation_name_t other;
11454 	struct vm_allocation_total * total;
11455 	uint32_t subidx;
11456 
11457 	assert(VM_KERN_MEMORY_NONE != subtag);
11458 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11459 	for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
11460 		total = &allocation->subtotals[subidx];
11461 		if (subtag == total->tag) {
11462 			break;
11463 		}
11464 	}
11465 	if (subidx >= allocation->subtotalscount) {
11466 		for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
11467 			total = &allocation->subtotals[subidx];
11468 			if ((VM_KERN_MEMORY_NONE == total->tag)
11469 			    || !total->total) {
11470 				total->tag = (vm_tag_t)subtag;
11471 				break;
11472 			}
11473 		}
11474 	}
11475 	assert(subidx < allocation->subtotalscount);
11476 	if (subidx >= allocation->subtotalscount) {
11477 		lck_ticket_unlock(&vm_allocation_sites_lock);
11478 		return;
11479 	}
11480 	if (delta < 0) {
11481 		assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
11482 	}
11483 	OSAddAtomic64(delta, &total->total);
11484 	lck_ticket_unlock(&vm_allocation_sites_lock);
11485 
11486 	other = vm_allocation_sites[subtag];
11487 	assert(other);
11488 	if (delta < 0) {
11489 		assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
11490 	}
11491 	OSAddAtomic64(delta, &other->mapped);
11492 }
11493 
11494 const char *
11495 kern_allocation_get_name(kern_allocation_name_t allocation)
11496 {
11497 	return KA_NAME(allocation);
11498 }
11499 
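/*
 * Allocate a named allocation site sized by KA_SIZE(): the site structure,
 * 'subtotalscount' subtotal slots, and the (possibly truncated) name plus a
 * NUL stored after the subtotals (see KA_NAME).  The site starts with a
 * refcount of 1 and is immediately assigned a tag.
 */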
11500 kern_allocation_name_t
11501 kern_allocation_name_allocate(const char * name, uint16_t subtotalscount)
11502 {
11503 	kern_allocation_name_t allocation;
11504 	uint16_t namelen;
11505 
11506 	namelen = (uint16_t)strnlen(name, MACH_MEMORY_INFO_NAME_MAX_LEN - 1);
11507 
11508 	allocation = kalloc_data(KA_SIZE(namelen, subtotalscount), Z_WAITOK | Z_ZERO);
11509 	allocation->refcount       = 1;
11510 	allocation->subtotalscount = subtotalscount;
11511 	allocation->flags          = (uint16_t)(namelen << VM_TAG_NAME_LEN_SHIFT);
11512 	strlcpy(KA_NAME(allocation), name, namelen + 1);
11513 
11514 	vm_tag_alloc(allocation);
11515 	return allocation;
11516 }
11517 
11518 void
11519 kern_allocation_name_release(kern_allocation_name_t allocation)
11520 {
11521 	assert(allocation->refcount > 0);
11522 	if (1 == OSAddAtomic16(-1, &allocation->refcount)) {
11523 		kfree_data(allocation,
11524 		    KA_SIZE(KA_NAME_LEN(allocation), allocation->subtotalscount));
11525 	}
11526 }
11527 
11528 #if !VM_TAG_ACTIVE_UPDATE
11529 static void
11530 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
11531 {
11532 	if (!object->wired_page_count) {
11533 		return;
11534 	}
11535 	if (!is_kernel_object(object)) {
11536 		assert(object->wire_tag < num_info);
11537 		info[object->wire_tag].size += ptoa_64(object->wired_page_count);
11538 	}
11539 }
11540 
11541 typedef void (*vm_page_iterate_proc)(mach_memory_info_t * info,
11542     unsigned int num_info, vm_object_t object);
11543 
11544 static void
11545 vm_page_iterate_purgeable_objects(mach_memory_info_t * info, unsigned int num_info,
11546     vm_page_iterate_proc proc, purgeable_q_t queue,
11547     int group)
11548 {
11549 	vm_object_t object;
11550 
11551 	for (object = (vm_object_t) queue_first(&queue->objq[group]);
11552 	    !queue_end(&queue->objq[group], (queue_entry_t) object);
11553 	    object = (vm_object_t) queue_next(&object->objq)) {
11554 		proc(info, num_info, object);
11555 	}
11556 }
11557 
11558 static void
11559 vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
11560     vm_page_iterate_proc proc)
11561 {
11562 	vm_object_t     object;
11563 
11564 	lck_spin_lock_grp(&vm_objects_wired_lock, &vm_page_lck_grp_bucket);
11565 	queue_iterate(&vm_objects_wired,
11566 	    object,
11567 	    vm_object_t,
11568 	    wired_objq)
11569 	{
11570 		proc(info, num_info, object);
11571 	}
11572 	lck_spin_unlock(&vm_objects_wired_lock);
11573 }
11574 #endif /* ! VM_TAG_ACTIVE_UPDATE */
11575 
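/*
 * Fill mach_memory_info entries from vm_allocation_sites: copy each site's
 * totals, label it with a fixed counter name, tag name, kmod id or kernel
 * address, expand per-sizeclass zone totals into additional rows, and, for
 * sites with subtotals, move the cost they account for out of the individual
 * allocation tags and onto the site itself.
 */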
11576 static uint64_t
11577 process_account(mach_memory_info_t * info, unsigned int num_info,
11578     uint64_t zones_collectable_bytes, boolean_t iterated, bool redact_info __unused)
11579 {
11580 	size_t                 namelen;
11581 	unsigned int           idx, count, nextinfo;
11582 	vm_allocation_site_t * site;
11583 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11584 
11585 	for (idx = 0; idx <= vm_allocation_tag_highest; idx++) {
11586 		site = vm_allocation_sites[idx];
11587 		if (!site) {
11588 			continue;
11589 		}
11590 		info[idx].mapped = site->mapped;
11591 		info[idx].tag    = site->tag;
11592 		if (!iterated) {
11593 			info[idx].size = site->total;
11594 #if DEBUG || DEVELOPMENT
11595 			info[idx].peak = site->peak;
11596 #endif /* DEBUG || DEVELOPMENT */
11597 		} else {
11598 			if (!site->subtotalscount && (site->total != info[idx].size)) {
11599 				printf("tag mismatch[%d] 0x%qx, iter 0x%qx\n", idx, site->total, info[idx].size);
11600 				info[idx].size = site->total;
11601 			}
11602 		}
11603 		info[idx].flags |= VM_KERN_SITE_WIRED;
11604 		if (idx < VM_KERN_MEMORY_FIRST_DYNAMIC) {
11605 			info[idx].site   = idx;
11606 			info[idx].flags |= VM_KERN_SITE_TAG;
11607 			if (VM_KERN_MEMORY_ZONE == idx) {
11608 				info[idx].flags |= VM_KERN_SITE_HIDE;
11609 				info[idx].flags &= ~VM_KERN_SITE_WIRED;
11610 				info[idx].collectable_bytes = zones_collectable_bytes;
11611 			}
11612 			info[idx].flags |= VM_KERN_SITE_NAMED;
11613 			strlcpy(info[idx].name, vm_kern_memory_names[idx], MACH_MEMORY_INFO_NAME_MAX_LEN);
11614 		} else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) {
11615 			info[idx].site   = 0;
11616 			info[idx].flags |= VM_KERN_SITE_NAMED;
11617 			if (namelen > sizeof(info[idx].name)) {
11618 				namelen = sizeof(info[idx].name);
11619 			}
11620 			strncpy(&info[idx].name[0], KA_NAME(site), namelen);
11621 		} else if (VM_TAG_KMOD & site->flags) {
11622 			info[idx].site   = OSKextGetKmodIDForSite(site, NULL, 0);
11623 			info[idx].flags |= VM_KERN_SITE_KMOD;
11624 		} else {
11625 			info[idx].site   = VM_KERNEL_UNSLIDE(site);
11626 			info[idx].flags |= VM_KERN_SITE_KERNEL;
11627 		}
11628 	}
11629 
11630 	nextinfo = (vm_allocation_tag_highest + 1);
11631 	count    = nextinfo;
11632 	if (count >= num_info) {
11633 		count = num_info;
11634 	}
11635 
11636 	for (idx = 0; idx < count; idx++) {
11637 		site = vm_allocation_sites[idx];
11638 		if (!site) {
11639 			continue;
11640 		}
11641 #if VM_TAG_SIZECLASSES
11642 		vm_allocation_zone_total_t * zone;
11643 		unsigned int                 zidx;
11644 
11645 		if (!redact_info
11646 		    && vm_allocation_zone_totals
11647 		    && (zone = vm_allocation_zone_totals[idx])
11648 		    && (nextinfo < num_info)) {
11649 			for (zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
11650 				if (!zone[zidx].vazt_peak) {
11651 					continue;
11652 				}
11653 				info[nextinfo]        = info[idx];
11654 				info[nextinfo].zone   = zone_index_from_tag_index(zidx);
11655 				info[nextinfo].flags  &= ~VM_KERN_SITE_WIRED;
11656 				info[nextinfo].flags  |= VM_KERN_SITE_ZONE;
11657 				info[nextinfo].flags  |= VM_KERN_SITE_KALLOC;
11658 				info[nextinfo].size   = zone[zidx].vazt_total;
11659 				info[nextinfo].peak   = zone[zidx].vazt_peak;
11660 				info[nextinfo].mapped = 0;
11661 				nextinfo++;
11662 			}
11663 		}
11664 #endif /* VM_TAG_SIZECLASSES */
11665 		if (site->subtotalscount) {
11666 			uint64_t mapped, mapcost, take;
11667 			uint32_t sub;
11668 			vm_tag_t alloctag;
11669 
11670 			info[idx].size = site->total;
11671 			mapped = info[idx].size;
11672 			info[idx].mapped = mapped;
11673 			mapcost = 0;
11674 			for (sub = 0; sub < site->subtotalscount; sub++) {
11675 				alloctag = site->subtotals[sub].tag;
11676 				assert(alloctag < num_info);
11677 				if (info[alloctag].name[0] && alloctag >= VM_KERN_MEMORY_FIRST_DYNAMIC) {
11678 					continue;
11679 				}
11680 				take = site->subtotals[sub].total;
11681 				if (take > info[alloctag].size) {
11682 					take = info[alloctag].size;
11683 				}
11684 				if (take > mapped) {
11685 					take = mapped;
11686 				}
11687 				info[alloctag].mapped  -= take;
11688 				info[alloctag].size    -= take;
11689 				mapped                 -= take;
11690 				mapcost                += take;
11691 			}
11692 			info[idx].size = mapcost;
11693 		}
11694 	}
11695 	lck_ticket_unlock(&vm_allocation_sites_lock);
11696 
11697 	return 0;
11698 }
11699 
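/*
 * Return an upper-bound count of mach_memory_info entries vm_page_diagnose()
 * may need: one per allocation site, one per zone sizeclass with a non-zero
 * peak, the fixed counters, the zone views, plus a little slop for tags
 * created in the meantime.
 */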
11700 uint32_t
11701 vm_page_diagnose_estimate(void)
11702 {
11703 	vm_allocation_site_t * site;
11704 	uint32_t               count = zone_view_count;
11705 	uint32_t               idx;
11706 
11707 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11708 	for (idx = 0; idx < VM_MAX_TAG_VALUE; idx++) {
11709 		site = vm_allocation_sites[idx];
11710 		if (!site) {
11711 			continue;
11712 		}
11713 		count++;
11714 #if VM_TAG_SIZECLASSES
11715 		if (vm_allocation_zone_totals) {
11716 			vm_allocation_zone_total_t * zone;
11717 			zone = vm_allocation_zone_totals[idx];
11718 			if (!zone) {
11719 				continue;
11720 			}
11721 			for (uint32_t zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
11722 				count += (zone[zidx].vazt_peak != 0);
11723 			}
11724 		}
11725 #endif
11726 	}
11727 	lck_ticket_unlock(&vm_allocation_sites_lock);
11728 
11729 	/* some slop for new tags created */
11730 	count += 8;
11731 	count += VM_KERN_COUNTER_COUNT;
11732 
11733 	return count;
11734 }
11735 
11736 static void
11737 vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
11738     bool percpu)
11739 {
11740 	zpercpu_foreach(zs, zstats) {
11741 		info->size += zs->zs_mem_allocated - zs->zs_mem_freed;
11742 	}
11743 	if (percpu) {
11744 		info->size *= zpercpu_count();
11745 	}
11746 	info->flags |= VM_KERN_SITE_NAMED | VM_KERN_SITE_ZONE_VIEW;
11747 }
11748 
11749 static void
11750 vm_page_add_info(
11751 	mach_memory_info_t     *info,
11752 	zone_stats_t            stats,
11753 	bool                    per_cpu,
11754 	const char             *parent_heap_name,
11755 	const char             *parent_zone_name,
11756 	const char             *view_name)
11757 {
11758 	vm_page_diagnose_zone_stats(info, stats, per_cpu);
11759 	snprintf(info->name, sizeof(info->name),
11760 	    "%s%s[%s]", parent_heap_name, parent_zone_name, view_name);
11761 }
11762 
11763 static void
11764 vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
11765 {
11766 	vm_page_add_info(info, z->z_stats, z->z_percpu, zone_heap_name(z),
11767 	    z->z_name, "raw");
11768 }
11769 
11770 static void
11771 vm_page_add_view(
11772 	mach_memory_info_t     *info,
11773 	zone_stats_t            stats,
11774 	const char             *parent_heap_name,
11775 	const char             *parent_zone_name,
11776 	const char             *view_name)
11777 {
11778 	vm_page_add_info(info, stats, false, parent_heap_name, parent_zone_name,
11779 	    view_name);
11780 }
11781 
11782 static uint32_t
11783 vm_page_diagnose_heap_views(
11784 	mach_memory_info_t     *info,
11785 	kalloc_heap_t           kh,
11786 	const char             *parent_heap_name,
11787 	const char             *parent_zone_name)
11788 {
11789 	uint32_t i = 0;
11790 
11791 	while (kh) {
11792 		vm_page_add_view(info + i, kh->kh_stats, parent_heap_name,
11793 		    parent_zone_name, kh->kh_name);
11794 		kh = kh->kh_views;
11795 		i++;
11796 	}
11797 	return i;
11798 }
11799 
11800 static uint32_t
11801 vm_page_diagnose_heap(mach_memory_info_t *info, kalloc_heap_t kheap)
11802 {
11803 	uint32_t i = 0;
11804 
11805 	for (; i < KHEAP_NUM_ZONES; i++) {
11806 		vm_page_diagnose_zone(info + i, zone_by_id(kheap->kh_zstart + i));
11807 	}
11808 
11809 	i += vm_page_diagnose_heap_views(info + i, kheap->kh_views, kheap->kh_name,
11810 	    NULL);
11811 	return i;
11812 }
11813 
11814 static int
11815 vm_page_diagnose_kt_heaps(mach_memory_info_t *info)
11816 {
11817 	uint32_t idx = 0;
11818 	vm_page_add_view(info + idx, KHEAP_KT_VAR->kh_stats, KHEAP_KT_VAR->kh_name,
11819 	    "", "raw");
11820 	idx++;
11821 
11822 	for (uint32_t i = 0; i < KT_VAR_MAX_HEAPS; i++) {
11823 		struct kheap_info heap = kalloc_type_heap_array[i];
11824 		char heap_num_tmp[MAX_ZONE_NAME] = "";
11825 		const char *heap_num;
11826 
11827 		snprintf(&heap_num_tmp[0], MAX_ZONE_NAME, "%u", i);
11828 		heap_num = &heap_num_tmp[0];
11829 
11830 		for (kalloc_type_var_view_t ktv = heap.kt_views; ktv;
11831 		    ktv = (kalloc_type_var_view_t) ktv->kt_next) {
11832 			if (ktv->kt_stats && ktv->kt_stats != KHEAP_KT_VAR->kh_stats) {
11833 				vm_page_add_view(info + idx, ktv->kt_stats, KHEAP_KT_VAR->kh_name,
11834 				    heap_num, ktv->kt_name);
11835 				idx++;
11836 			}
11837 		}
11838 
11839 		idx += vm_page_diagnose_heap_views(info + idx, heap.kh_views,
11840 		    KHEAP_KT_VAR->kh_name, heap_num);
11841 	}
11842 
11843 	return idx;
11844 }
11845 
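/*
 * Populate 'info' with kernel memory diagnostics: zero the array, emit the
 * fixed VM_KERN_COUNT_* counters (managed, wired, reserved, stolen, map
 * sizes, ...), then, unless redacted, the per-heap and per-zone view entries.
 */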
11846 kern_return_t
11847 vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, bool redact_info)
11848 {
11849 	uint64_t                 wired_size;
11850 	uint64_t                 wired_managed_size;
11851 	uint64_t                 wired_reserved_size;
11852 	boolean_t                iterate;
11853 	mach_memory_info_t     * counts;
11854 	uint32_t                 i;
11855 
11856 	vmlp_api_start(VM_PAGE_DIAGNOSE);
11857 
11858 	bzero(info, num_info * sizeof(mach_memory_info_t));
11859 
11860 	if (!vm_page_wire_count_initial) {
11861 		vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_ABORTED);
11862 		return KERN_ABORTED;
11863 	}
11864 
11865 	wired_size          = ptoa_64(vm_page_wire_count);
11866 	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
11867 #if XNU_TARGET_OS_OSX
11868 	wired_size          += ptoa_64(vm_lopage_free_count + vm_page_throttled_count);
11869 	wired_reserved_size += ptoa_64(vm_page_throttled_count);
11870 #endif /* XNU_TARGET_OS_OSX */
11871 #if CONFIG_EXCLAVES
11872 	wired_reserved_size -= exclaves_carveout_size;
11873 #endif /* CONFIG_EXCLAVES */
11874 	wired_managed_size  = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);
11875 
11876 	wired_size += booter_size;
11877 
11878 	assert(num_info >= VM_KERN_COUNTER_COUNT);
11879 	num_info -= VM_KERN_COUNTER_COUNT;
11880 	counts = &info[num_info];
11881 
11882 #define SET_COUNT(xcount, xsize, xflags) MACRO_BEGIN \
11883     counts[xcount].tag   = VM_MAX_TAG_VALUE + xcount;   \
11884     counts[xcount].site  = (xcount);                            \
11885     counts[xcount].size  = (xsize);                                 \
11886     counts[xcount].mapped  = (xsize);                           \
11887     counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED | xflags; \
11888     strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
11889     MACRO_END;
11890 
11891 	SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0);
11892 	SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0);
11893 	SET_COUNT(VM_KERN_COUNT_WIRED_MANAGED, wired_managed_size, 0);
11894 	SET_COUNT(VM_KERN_COUNT_RESERVED, wired_reserved_size, VM_KERN_SITE_WIRED);
11895 	SET_COUNT(VM_KERN_COUNT_STOLEN, ptoa_64(vm_page_stolen_count), VM_KERN_SITE_WIRED);
11896 	SET_COUNT(VM_KERN_COUNT_LOPAGE, ptoa_64(vm_lopage_free_count), VM_KERN_SITE_WIRED);
11897 	SET_COUNT(VM_KERN_COUNT_WIRED_BOOT, ptoa_64(vm_page_wire_count_on_boot), 0);
11898 	SET_COUNT(VM_KERN_COUNT_BOOT_STOLEN, booter_size, VM_KERN_SITE_WIRED);
11899 	SET_COUNT(VM_KERN_COUNT_WIRED_STATIC_KERNELCACHE, ptoa_64(vm_page_kernelcache_count), 0);
11900 #if CONFIG_EXCLAVES
11901 	SET_COUNT(VM_KERN_COUNT_EXCLAVES_CARVEOUT, exclaves_carveout_size + exclaves_bundle_size, VM_KERN_SITE_WIRED);
11902 #endif /* CONFIG_EXCLAVES */
11903 
11904 #define SET_MAP(xcount, xsize, xfree, xlargest) MACRO_BEGIN \
11905     counts[xcount].site    = (xcount);                  \
11906     counts[xcount].size    = (xsize);                   \
11907     counts[xcount].mapped  = (xsize);                   \
11908     counts[xcount].free    = (xfree);                   \
11909     counts[xcount].largest = (xlargest);                \
11910     counts[xcount].flags   = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED; \
11911     strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
11912     MACRO_END;
11913 
11914 	vm_map_size_t map_size, map_free, map_largest;
11915 
11916 	vm_map_sizes(kernel_map, &map_size, &map_free, &map_largest);
11917 	SET_MAP(VM_KERN_COUNT_MAP_KERNEL, map_size, map_free, map_largest);
11918 
11919 	zone_map_sizes(&map_size, &map_free, &map_largest);
11920 	SET_MAP(VM_KERN_COUNT_MAP_ZONE, map_size, map_free, map_largest);
11921 
11922 	assert(num_info >= zone_view_count);
11923 	num_info -= zone_view_count;
11924 	counts = &info[num_info];
11925 	i = 0;
11926 
11927 	if (!redact_info) {
11928 		if (zone_is_data_buffers_kheap(KHEAP_DATA_BUFFERS->kh_heap_id)) {
11929 			i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS);
11930 		}
11931 		if (zone_is_data_shared_kheap(KHEAP_DATA_SHARED->kh_heap_id)) {
11932 			i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_SHARED);
11933 		}
11934 
11935 		if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) {
11936 			i += vm_page_diagnose_kt_heaps(counts + i);
11937 		}
11938 		assert(i <= zone_view_count);
11939 
11940 		zone_index_foreach(zidx) {
11941 			zone_t z = &zone_array[zidx];
11942 			zone_security_flags_t zsflags = zone_security_array[zidx];
11943 			zone_view_t zv = z->z_views;
11944 
11945 			if (zv == NULL) {
11946 				continue;
11947 			}
11948 
11949 			zone_stats_t zv_stats_head = z->z_stats;
11950 			bool has_raw_view = false;
11951 
11952 			for (; zv; zv = zv->zv_next) {
11953 				/*
11954 				 * kalloc_types that allocate from the same zone are linked
11955 				 * as views. Only print the ones that have their own stats.
11956 				 */
11957 				if (zv->zv_stats == zv_stats_head) {
11958 					continue;
11959 				}
11960 				has_raw_view = true;
11961 				vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
11962 				    z->z_percpu);
11963 				snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
11964 				    zone_heap_name(z), z->z_name, zv->zv_name);
11965 				i++;
11966 				assert(i <= zone_view_count);
11967 			}
11968 
11969 			/*
11970 			 * Print raw views for non kalloc or kalloc_type zones
11971 			 */
11972 			bool kalloc_type = zsflags.z_kalloc_type;
11973 			if ((zsflags.z_kheap_id == KHEAP_ID_NONE && !kalloc_type) ||
11974 			    (kalloc_type && has_raw_view)) {
11975 				vm_page_diagnose_zone(counts + i, z);
11976 				i++;
11977 				assert(i <= zone_view_count);
11978 			}
11979 		}
11980 	}
11981 
11982 	iterate = !VM_TAG_ACTIVE_UPDATE;
11983 	if (iterate) {
11984 		enum                       { kMaxKernelDepth = 1 };
11985 		vm_map_t                     maps[kMaxKernelDepth];
11986 		vm_map_entry_t               entries[kMaxKernelDepth];
11987 		vm_map_t                     map;
11988 		vm_map_entry_t               entry;
11989 		vm_object_offset_t           offset;
11990 		vm_page_t                    page;
11991 		int                          stackIdx, count;
11992 
11993 #if !VM_TAG_ACTIVE_UPDATE
11994 		vm_page_iterate_objects(info, num_info, &vm_page_count_object);
11995 #endif /* ! VM_TAG_ACTIVE_UPDATE */
11996 
11997 		map = kernel_map;
11998 		stackIdx = 0;
11999 		while (map) {
12000 			vm_map_lock(map);
12001 			for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
12002 				if (entry->is_sub_map) {
12003 					assert(stackIdx < kMaxKernelDepth);
12004 					maps[stackIdx] = map;
12005 					entries[stackIdx] = entry;
12006 					stackIdx++;
12007 					map = VME_SUBMAP(entry);
12008 					entry = NULL;
12009 					break;
12010 				}
12011 
12012 				vmlp_range_event_entry(map, entry);
12013 
12014 				if (is_kernel_object(VME_OBJECT(entry))) {
12015 					count = 0;
12016 					vm_object_lock(VME_OBJECT(entry));
12017 					for (offset = entry->vme_start; offset < entry->vme_end; offset += page_size) {
12018 						page = vm_page_lookup(VME_OBJECT(entry), offset);
12019 						if (page && VM_PAGE_WIRED(page)) {
12020 							count++;
12021 						}
12022 					}
12023 					vm_object_unlock(VME_OBJECT(entry));
12024 
12025 					if (count) {
12026 						assert(VME_ALIAS(entry) != VM_KERN_MEMORY_NONE);
12027 						assert(VME_ALIAS(entry) < num_info);
12028 						info[VME_ALIAS(entry)].size += ptoa_64(count);
12029 					}
12030 				}
12031 				while (map && (entry == vm_map_last_entry(map))) {
12032 					vm_map_unlock(map);
12033 					if (!stackIdx) {
12034 						map = NULL;
12035 					} else {
12036 						--stackIdx;
12037 						map = maps[stackIdx];
12038 						entry = entries[stackIdx];
12039 					}
12040 				}
12041 			}
12042 		}
12043 	}
12044 
12045 	process_account(info, num_info, zones_collectable_bytes, iterate, redact_info);
12046 
12047 	vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_SUCCESS);
12048 	return KERN_SUCCESS;
12049 }
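
/*
 * Illustrative sketch (hypothetical, not compiled): how a caller might size
 * and pass the info array.  vm_page_diagnose() carves the named counters and
 * the zone views off the tail of the array (see the num_info asserts above),
 * so the caller has to budget slots for the per-tag entries plus those two
 * regions.  The slot arithmetic and allocation calls below are assumptions
 * for illustration only, not the actual in-tree caller.
 */
#if 0
static kern_return_t
example_collect_memory_info(void)
{
	unsigned int         num_info;
	mach_memory_info_t  *info;
	kern_return_t        kr;

	/* assumed budget: per-tag slots + named counters + zone views */
	num_info = VM_MAX_TAG_VALUE + VM_KERN_COUNTER_COUNT + zone_view_count;
	info = kalloc_data(num_info * sizeof(*info), Z_WAITOK | Z_ZERO);
	if (info == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	kr = vm_page_diagnose(info, num_info, 0, true /* redact_info */);
	/* ... consume the per-tag sizes and named counters in info[] ... */

	kfree_data(info, num_info * sizeof(*info));
	return kr;
}
#endif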
12050 
12051 #if DEBUG || DEVELOPMENT
12052 
12053 kern_return_t
12054 vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_size_t * zone_size)
12055 {
12056 	kern_return_t  ret;
12057 	vm_size_t      zsize;
12058 	vm_map_t       map;
12059 	vm_map_entry_t entry;
12060 
12061 	vmlp_api_start(VM_KERN_ALLOCATION_INFO);
12062 
12063 	zsize = zone_element_info((void *) addr, tag);
12064 	if (zsize) {
12065 		*zone_size = *size = zsize;
12066 		vmlp_api_end(VM_KERN_ALLOCATION_INFO, KERN_SUCCESS);
12067 		return KERN_SUCCESS;
12068 	}
12069 
12070 	*zone_size = 0;
12071 	ret = KERN_INVALID_ADDRESS;
12072 	for (map = kernel_map; map;) {
12073 		vm_map_lock(map);
12074 		if (!vm_map_lookup_entry(map, addr, &entry)) {
12075 			break;
12076 		}
12077 		if (entry->is_sub_map) {
12078 			if (map != kernel_map) {
12079 				break;
12080 			}
12081 			map = VME_SUBMAP(entry);
12082 			continue;
12083 		}
12084 		if (entry->vme_start != addr) {
12085 			break;
12086 		}
12087 
12088 		vmlp_range_event_entry(map, entry);
12089 
12090 		*tag = (vm_tag_t)VME_ALIAS(entry);
12091 		*size = (entry->vme_end - addr);
12092 		ret = KERN_SUCCESS;
12093 		break;
12094 	}
12095 	if (map != kernel_map) {
12096 		vm_map_unlock(map);
12097 	}
12098 	vm_map_unlock(kernel_map);
12099 
12100 	vmlp_api_end(VM_KERN_ALLOCATION_INFO, ret);
12101 	return ret;
12102 }
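
/*
 * Illustrative sketch (hypothetical, not compiled): resolving an address
 * with vm_kern_allocation_info().  A non-zero zone_size means the address
 * was resolved as a zone element; otherwise size/tag describe the kernel_map
 * (or first-level submap) entry that starts at the address.
 */
#if 0
static void
example_describe_allocation(uintptr_t addr)
{
	vm_size_t size = 0, zone_size = 0;
	vm_tag_t  tag  = VM_KERN_MEMORY_NONE;

	if (vm_kern_allocation_info(addr, &size, &tag, &zone_size) == KERN_SUCCESS) {
		printf("addr 0x%lx: size %lu, tag %u, zone_size %lu\n",
		    (unsigned long)addr, (unsigned long)size,
		    (unsigned int)tag, (unsigned long)zone_size);
	}
}
#endif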
12103 
12104 // some DEBUG/DEVELOPMENT code to get a process to page out its shared cache TEXT pages,
12105 // only used for DK driver in LPW testing
12106 uint64_t
12107 vm_task_evict_shared_cache(task_t task)
12108 {
12109 	enum                       { kMaxKernelDepth = 3 };
12110 	vm_map_t                     maps[kMaxKernelDepth];
12111 	vm_map_entry_t               entries[kMaxKernelDepth];
12112 	vm_map_t                     map;
12113 	vm_object_t                  textObject, shadow;
12114 	vm_map_entry_t               entry;
12115 	vm_object_offset_t           textOffset, textSize;
12116 	int                          stackIdx;
12117 	uint64_t                     count;
12118 
12119 	count = counter_load(&task->pageins);
12120 	map = get_task_map(task);
12121 	textObject = NULL;
12122 	stackIdx = 0;
12123 	while (map) {
12124 		vm_map_lock_read(map);
12125 		for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
12126 			if (entry->is_sub_map) {
12127 				assert(stackIdx < kMaxKernelDepth);
12128 				maps[stackIdx] = map;
12129 				entries[stackIdx] = entry;
12130 				stackIdx++;
12131 				map = VME_SUBMAP(entry);
12132 				entry = NULL;
12133 				break;
12134 			}
12135 			if (stackIdx && (VM_PROT_EXECUTE | VM_PROT_READ) == entry->protection) {
12136 				textObject = VME_OBJECT(entry);
12137 				vm_object_lock(textObject);
12138 				while ((shadow = textObject->shadow)) {
12139 					vm_object_lock(shadow);
12140 					vm_object_unlock(textObject);
12141 					textObject = shadow;
12142 				}
12143 				vm_object_reference_locked(textObject);
12144 				vm_object_unlock(textObject);
12145 				textOffset = VME_OFFSET(entry);
12146 				textSize   = entry->vme_end - entry->vme_start;
12147 				entry = vm_map_last_entry(map);
12148 			}
12149 			while (map && (entry == vm_map_last_entry(map))) {
12150 				vm_map_unlock_read(map);
12151 				if (!stackIdx) {
12152 					map = NULL;
12153 				} else {
12154 					--stackIdx;
12155 					map = maps[stackIdx];
12156 					entry = entries[stackIdx];
12157 					if (textObject) {
12158 						entry = vm_map_last_entry(map);
12159 					}
12160 				}
12161 			}
12162 		}
12163 	}
12164 
12165 	if (textObject) {
12166 		vm_object_sync(textObject, textOffset, textSize, true, false, false);
12167 		vm_object_deallocate(textObject);
12168 	}
12169 	return count;
12170 }
12171 
12172 uint64_t
12173 vm_task_pageins(task_t task)
12174 {
12175 	return counter_load(&task->pageins);
12176 }
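
/*
 * Illustrative sketch (hypothetical, not compiled): pairing the two calls
 * above in an LPW-style test.  The pageins delta observed after evicting the
 * shared cache TEXT and re-running the task approximates how many TEXT pages
 * had to be faulted back in.
 */
#if 0
static uint64_t
example_measure_text_refaults(task_t task)
{
	uint64_t before, after;

	before = vm_task_evict_shared_cache(task);
	/* ... let the task run and touch its shared cache TEXT again ... */
	after = vm_task_pageins(task);

	return after - before;
}
#endif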
12177 
12178 #endif /* DEBUG || DEVELOPMENT */
12179 
12180 uint32_t
12181 vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
12182 {
12183 	vm_allocation_site_t * site;
12184 	uint32_t               kmodId;
12185 
12186 	kmodId = 0;
12187 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
12188 	if ((site = vm_allocation_sites[tag])) {
12189 		if (VM_TAG_KMOD & site->flags) {
12190 			kmodId = OSKextGetKmodIDForSite(site, name, namelen);
12191 		}
12192 	}
12193 	lck_ticket_unlock(&vm_allocation_sites_lock);
12194 
12195 	return kmodId;
12196 }
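
/*
 * Illustrative sketch (hypothetical, not compiled): mapping a tag back to
 * the kext it was charged to.  A zero return means the tag's allocation
 * site is not a kext site.
 */
#if 0
static void
example_print_tag_owner(vm_tag_t tag)
{
	char     name[MACH_MEMORY_INFO_NAME_MAX_LEN];
	uint32_t kmod_id;

	kmod_id = vm_tag_get_kext(tag, name, sizeof(name));
	if (kmod_id != 0) {
		printf("tag %u -> kmod %u (%s)\n", (unsigned int)tag, kmod_id, name);
	}
}
#endif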
12197 
12198 
12199 #if CONFIG_SECLUDED_MEMORY
12200 /*
12201  * Note that there's no locking around other accesses to vm_page_secluded_target.
12202  * That should be OK, since these are the only places where it can be changed after
12203  * initialization. Other users (like vm_pageout) may see the wrong value briefly,
12204  * but will eventually get the correct value. This brief mismatch is OK as pageout
12205  * and page freeing will auto-adjust the vm_page_secluded_count to match the target
12206  * over time.
12207  */
12208 unsigned int vm_page_secluded_suppress_cnt = 0;
12209 unsigned int vm_page_secluded_save_target;
12210 
12211 LCK_GRP_DECLARE(secluded_suppress_slock_grp, "secluded_suppress_slock");
12212 LCK_SPIN_DECLARE(secluded_suppress_slock, &secluded_suppress_slock_grp);
12213 
12214 void
12215 start_secluded_suppression(task_t task)
12216 {
12217 	if (task->task_suppressed_secluded) {
12218 		return;
12219 	}
12220 	lck_spin_lock(&secluded_suppress_slock);
12221 	if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
12222 		task->task_suppressed_secluded = TRUE;
12223 		vm_page_secluded_save_target = vm_page_secluded_target;
12224 		vm_page_secluded_target = 0;
12225 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
12226 	}
12227 	lck_spin_unlock(&secluded_suppress_slock);
12228 }
12229 
12230 void
12231 stop_secluded_suppression(task_t task)
12232 {
12233 	lck_spin_lock(&secluded_suppress_slock);
12234 	if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
12235 		task->task_suppressed_secluded = FALSE;
12236 		vm_page_secluded_target = vm_page_secluded_save_target;
12237 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
12238 	}
12239 	lck_spin_unlock(&secluded_suppress_slock);
12240 }
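
/*
 * Illustrative sketch (hypothetical, not compiled): the intended pairing of
 * the suppression calls.  Suppression is reference counted across tasks; the
 * first starter zeroes vm_page_secluded_target and the last stopper restores
 * the saved value, so each start must be balanced by a stop.
 */
#if 0
static void
example_run_without_secluded_target(task_t task)
{
	start_secluded_suppression(task);
	/* ... work that should not compete with the secluded pool target ... */
	stop_secluded_suppression(task);
}
#endif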
12241 
12242 #endif /* CONFIG_SECLUDED_MEMORY */
12243 
12244 /*
12245  * Move the list of retired pages on the vm_page_queue_retired to
12246  * their final resting place on retired_pages_object.
12247  */
12248 void
12249 vm_retire_boot_pages(void)
12250 {
12251 }
12252 
12253 /*
12254  * This holds the reported physical address if an ECC error leads to a panic.
12255  * SMC will store it in PMU SRAM under the 'sECC' key.
12256  */
12257 uint64_t ecc_panic_physical_address = 0;
12258 
12259