1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_page.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Resident memory management module.
63 */
64 #include <debug.h>
65 #include <libkern/OSDebug.h>
66
67 #include <mach/clock_types.h>
68 #include <mach/vm_prot.h>
69 #include <mach/vm_statistics.h>
70 #include <mach/sdt.h>
71 #include <kern/counter.h>
72 #include <kern/host_statistics.h>
73 #include <kern/sched_prim.h>
74 #include <kern/policy_internal.h>
75 #include <kern/task.h>
76 #include <kern/thread.h>
77 #include <kern/kalloc.h>
78 #include <kern/zalloc_internal.h>
79 #include <kern/ledger.h>
80 #include <kern/ecc.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_init_xnu.h>
83 #include <vm/vm_map_internal.h>
84 #include <vm/vm_page_internal.h>
85 #include <vm/vm_pageout_internal.h>
86 #include <vm/vm_kern_xnu.h> /* kmem_alloc() */
87 #include <vm/vm_compressor_pager_internal.h>
88 #include <kern/misc_protos.h>
89 #include <mach_debug/zone_info.h>
90 #include <vm/cpm_internal.h>
91 #include <pexpert/pexpert.h>
92 #include <pexpert/device_tree.h>
93 #include <san/kasan.h>
94 #include <vm/vm_log.h>
95
96 #include <libkern/coreanalytics/coreanalytics.h>
97 #include <kern/backtrace.h>
98 #include <kern/telemetry.h>
99
100 #include <vm/vm_protos_internal.h>
101 #include <vm/memory_object.h>
102 #include <vm/vm_purgeable_internal.h>
103 #include <vm/vm_compressor_internal.h>
104 #include <vm/vm_iokit.h>
105 #include <vm/vm_object_internal.h>
106
107 #if HAS_MTE
108 #include <vm/vm_mteinfo_internal.h>
109 #endif /* HAS_MTE */
110
111 #if defined (__x86_64__)
112 #include <i386/misc_protos.h>
113 #endif
114
115 #if CONFIG_SPTM
116 #include <arm64/sptm/sptm.h>
117 #endif
118
119 #if CONFIG_PHANTOM_CACHE
120 #include <vm/vm_phantom_cache_internal.h>
121 #endif
122
123 #if HIBERNATION
124 #include <IOKit/IOHibernatePrivate.h>
125 #include <machine/pal_hibernate.h>
126 #endif /* HIBERNATION */
127
128 #if CONFIG_SECLUDED_MEMORY
129 static_assert(!XNU_VM_HAS_LOPAGE,
130 "VM_PAGE_ON_SECLUDED_Q and VM_PAGE_ON_FREE_LOPAGE_Q alias");
131 #endif
132
133 #include <sys/kdebug.h>
134
135 #if defined(HAS_APPLE_PAC)
136 #include <ptrauth.h>
137 #endif
138 #if defined(__arm64__)
139 #include <arm/cpu_internal.h>
140 #endif /* defined(__arm64__) */
141
142 /*
143 * During single threaded early boot we don't initialize all pages.
144 * This avoids some delay during boot. They'll be initialized and
145 * added to the free list as needed or after we are multithreaded by
146 * what becomes the pageout thread.
147 *
148 * This slows down booting the DEBUG kernel, particularly on
149 * large memory systems, but is worthwhile in deterministically
150 * trapping uninitialized memory usage.
151 */
152 #if DEBUG
153 static TUNABLE(uint32_t, fillval, "fill", 0xDEB8F177);
154 #else
155 static TUNABLE(uint32_t, fillval, "fill", 0);
156 #endif
157
158 #if MACH_ASSERT
159
160 TUNABLE(bool, vm_check_refs_on_alloc, "vm_check_refs_on_alloc", false);
161
162 #endif /* MACH_ASSERT */
163
164 extern boolean_t vm_pageout_running;
165 extern thread_t vm_pageout_scan_thread;
166 extern bool vps_dynamic_priority_enabled;
167
168 const uint16_t vm_page_inactive_states =
169 BIT(VM_PAGE_ON_INACTIVE_INTERNAL_Q) |
170 BIT(VM_PAGE_ON_INACTIVE_EXTERNAL_Q) |
171 BIT(VM_PAGE_ON_INACTIVE_CLEANED_Q);
172
173 const uint16_t vm_page_active_or_inactive_states =
174 vm_page_inactive_states |
175 #if CONFIG_SECLUDED_MEMORY
176 BIT(VM_PAGE_ON_SECLUDED_Q) |
177 #endif /* CONFIG_SECLUDED_MEMORY */
178 BIT(VM_PAGE_ON_ACTIVE_Q);
179
180 const uint16_t vm_page_non_speculative_pageable_states =
181 vm_page_active_or_inactive_states |
182 BIT(VM_PAGE_ON_THROTTLED_Q);
183
184 const uint16_t vm_page_pageable_states =
185 vm_page_non_speculative_pageable_states |
186 BIT(VM_PAGE_ON_SPECULATIVE_Q);
187
188 #if CONFIG_SECLUDED_MEMORY
189 struct vm_page_secluded_data vm_page_secluded;
190 #endif /* CONFIG_SECLUDED_MEMORY */
191 #if HIBERNATION
192 static bool hibernate_rebuild_needed = false;
193 #endif /* HIBERNATION */
194
195 #if DEVELOPMENT || DEBUG
196 extern struct memory_object_pager_ops shared_region_pager_ops;
197 unsigned int shared_region_pagers_resident_count = 0;
198 unsigned int shared_region_pagers_resident_peak = 0;
199 #endif /* DEVELOPMENT || DEBUG */
200
201
202
203 unsigned int PERCPU_DATA(start_color);
204 vm_page_t PERCPU_DATA(free_pages);
205 SCALABLE_COUNTER_DEFINE(vm_cpu_free_count);
206 boolean_t hibernate_cleaning_in_progress = FALSE;
207
208 atomic_counter_t vm_guard_count;
209
210 #if XNU_VM_HAS_LOPAGE
211 /*
212 * this interface exists to support hardware controllers
213 * incapable of generating DMAs with more than 32 bits
214 * of address on platforms with physical memory > 4G...
215 */
216 vm_page_queue_head_t vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED;
217 uint32_t vm_lopage_free_count = 0;
218 uint32_t vm_lopage_free_limit = 0;
219 uint32_t vm_lopage_lowater = 0;
220 bool vm_lopage_refill = false;
221 bool vm_lopage_needed = false;
222 unsigned int vm_lopages_allocated_q = 0;
223 unsigned int vm_lopages_allocated_cpm_success = 0;
224 unsigned int vm_lopages_allocated_cpm_failed = 0;
225 #endif /* XNU_VM_HAS_LOPAGE */
226
227
228 int speculative_age_index = 0;
229 int speculative_steal_index = 0;
230 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_RESERVED_SPECULATIVE_AGE_Q + 1];
231
232 boolean_t hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibernation code is looking at the VM queues.
233 * Updated and checked behind the vm_page_queues_lock. */
234
235 static void vm_page_free_prepare(vm_page_t page);
236
237 #if HAS_MTE
238 void vm_page_wire_boot_tags(void);
239 #endif /* HAS_MTE */
240
241 static void vm_tag_init(void);
242
243 /* for debugging purposes */
244 SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
245 VM_PAGE_PACKED_FROM_ARRAY;
246 #ifndef __BUILDING_XNU_LIB_UNITTEST__ /* This is not a compile-time constant when building unit-test */
247 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
248 VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
249 #endif /* __BUILDING_XNU_LIB_UNITTEST__ */
250
251 /*
252 * Associated with page of user-allocatable memory is a
253 * page structure.
254 */
255
256 /*
257 * These variables record the values returned by vm_page_bootstrap,
258 * for debugging purposes. The implementation of pmap_steal_memory
259 * and pmap_startup here also uses them internally.
260 */
261
262 vm_offset_t virtual_space_start;
263 vm_offset_t virtual_space_end;
264 uint32_t vm_page_pages;
265
266 /*
267 * The vm_page_lookup() routine, which provides for fast
268 * (virtual memory object, offset) to page lookup, employs
269 * the following hash table. The vm_page_{insert,remove}
270 * routines install and remove associations in the table.
271 * [This table is often called the virtual-to-physical,
272 * or VP, table.]
273 */
274 typedef struct {
275 vm_page_packed_t page_list;
276 #if MACH_PAGE_HASH_STATS
277 int cur_count; /* current count */
278 int hi_count; /* high water mark */
279 #endif /* MACH_PAGE_HASH_STATS */
280 } vm_page_bucket_t;
281
282
283 #define BUCKETS_PER_LOCK 16
284
285 SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets; /* Array of buckets */
286 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_count = 0; /* How big is array? */
287 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_mask; /* Mask for hash function */
288 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_shift; /* Shift for hash function */
289 SECURITY_READ_ONLY_LATE(uint32_t) vm_page_bucket_hash; /* Basic bucket hash */
290 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_lock_count = 0; /* How big is array of locks? */
291
292 #ifndef VM_TAG_ACTIVE_UPDATE
293 #error VM_TAG_ACTIVE_UPDATE
294 #endif
295 #ifndef VM_TAG_SIZECLASSES
296 #error VM_TAG_SIZECLASSES
297 #endif
298
299 /* for debugging */
300 SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
301 SECURITY_READ_ONLY_LATE(lck_ticket_t *) vm_page_bucket_locks;
302
303 vm_allocation_site_t vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
304 vm_allocation_site_t * vm_allocation_sites[VM_MAX_TAG_VALUE];
305 #if VM_TAG_SIZECLASSES
306 static vm_allocation_zone_total_t **vm_allocation_zone_totals;
307 #endif /* VM_TAG_SIZECLASSES */
308
309 vm_tag_t vm_allocation_tag_highest;
310
311 #if VM_PAGE_BUCKETS_CHECK
312 boolean_t vm_page_buckets_check_ready = FALSE;
313 #if VM_PAGE_FAKE_BUCKETS
314 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
315 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
316 #endif /* VM_PAGE_FAKE_BUCKETS */
317 #endif /* VM_PAGE_BUCKETS_CHECK */
318
319 #if MACH_PAGE_HASH_STATS
320 /* This routine is only for debug. It is intended to be called by
321 * hand by a developer using a kernel debugger. This routine prints
322 * out vm_page_hash table statistics to the kernel debug console.
323 */
324 void
hash_debug(void)325 hash_debug(void)
326 {
327 int i;
328 int numbuckets = 0;
329 int highsum = 0;
330 int maxdepth = 0;
331
332 for (i = 0; i < vm_page_bucket_count; i++) {
333 if (vm_page_buckets[i].hi_count) {
334 numbuckets++;
335 highsum += vm_page_buckets[i].hi_count;
336 if (vm_page_buckets[i].hi_count > maxdepth) {
337 maxdepth = vm_page_buckets[i].hi_count;
338 }
339 }
340 }
341 printf("Total number of buckets: %d\n", vm_page_bucket_count);
342 printf("Number used buckets: %d = %d%%\n",
343 numbuckets, 100 * numbuckets / vm_page_bucket_count);
344 printf("Number unused buckets: %d = %d%%\n",
345 vm_page_bucket_count - numbuckets,
346 100 * (vm_page_bucket_count - numbuckets) / vm_page_bucket_count);
347 printf("Sum of bucket max depth: %d\n", highsum);
348 printf("Average bucket depth: %d.%2d\n",
349 highsum / vm_page_bucket_count,
350 highsum % vm_page_bucket_count);
351 printf("Maximum bucket depth: %d\n", maxdepth);
352 }
353 #endif /* MACH_PAGE_HASH_STATS */
354
355 /*
356 * The virtual page size is currently implemented as a runtime
357 * variable, but is constant once initialized using vm_set_page_size.
358 * This initialization must be done in the machine-dependent
359 * bootstrap sequence, before calling other machine-independent
360 * initializations.
361 *
362 * All references to the virtual page size outside this
363 * module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
364 * constants.
365 */
366 #if defined(__arm64__)
367 vm_size_t page_size;
368 vm_size_t page_mask;
369 int page_shift;
370 #else
371 vm_size_t page_size = PAGE_SIZE;
372 vm_size_t page_mask = PAGE_MASK;
373 int page_shift = PAGE_SHIFT;
374 #endif
375
376 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages;
377 #if XNU_VM_HAS_DELAYED_PAGES
378 vm_page_t vm_pages_end;
379 uint32_t vm_pages_count;
380 #else
381 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_end;
382 SECURITY_READ_ONLY_LATE(uint32_t) vm_pages_count;
383 #endif /* XNU_VM_HAS_DELAYED_PAGES */
384 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
385 SECURITY_READ_ONLY_LATE(ppnum_t) vm_pages_first_pnum;
386 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
387 #if HAS_MTE
388 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage;
389 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage_end;
390 #endif /* HAS_MTE */
391 #if CONFIG_SPTM
392 /*
393 * When used, these 128bit (MAX_COLORS bits) masks represent a "cluster"
394 * of contiguous free physical pages.
395 *
396 * For each cluster, there is an enqueue "index", which is -1 when there is no
397 * free page in the cluster, or the index in [0, 128) of the page that is
398 * enqueued on the vm_page_free_queue to represent the entire cluster.
399 *
400 * Grouping pages this way has the double nice effect to reduce doubly linked
401 * list (the worst data structure known to man when considering cache misses)
402 * manipulations, and also to mechanically make the VM serve more "contiguous"
403 * pages naturally.
404 */
405 static_assert(XNU_VM_HAS_LINEAR_PAGES_ARRAY);
406 SECURITY_READ_ONLY_LATE(__uint128_t *) _vm_pages_free_masks;
407 SECURITY_READ_ONLY_LATE(int8_t *) _vm_pages_free_enqueue_idx;
408 #endif /* CONFIG_SPTM */
409
410
411 /*
412 * Resident pages that represent real memory
413 * are allocated from a set of free lists,
414 * one per color.
415 */
416 SECURITY_READ_ONLY_LATE(unsigned int) vm_colors;
417 SECURITY_READ_ONLY_LATE(unsigned int) vm_color_mask; /* mask is == (vm_colors-1) */
418 unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */
419 unsigned int vm_free_magazine_refill_limit = 0;
420
421 struct vm_page_free_queue vm_page_queue_free;
422
423 unsigned int vm_page_free_wanted;
424 unsigned int vm_page_free_wanted_privileged;
425 #if CONFIG_SECLUDED_MEMORY
426 unsigned int vm_page_free_wanted_secluded;
427 #endif /* CONFIG_SECLUDED_MEMORY */
428 unsigned int vm_page_free_count;
429
430 unsigned int vm_page_realtime_count;
431
432 /*
433 * Occasionally, the virtual memory system uses
434 * resident page structures that do not refer to
435 * real pages, for example to leave a page with
436 * important state information in the VP table.
437 *
438 * These page structures are allocated the way
439 * most other kernel structures are.
440 */
441 SECURITY_READ_ONLY_LATE(zone_t) vm_page_zone;
442 vm_locks_array_t vm_page_locks;
443
444 LCK_ATTR_DECLARE(vm_page_lck_attr, 0, 0);
445 LCK_GRP_DECLARE(vm_page_lck_grp_free, "vm_page_free");
446 LCK_GRP_DECLARE(vm_page_lck_grp_queue, "vm_page_queue");
447 LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
448 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
449 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
450 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
451 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
452 LCK_TICKET_DECLARE(vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
453
454 unsigned int vm_page_local_q_soft_limit = 250;
455 unsigned int vm_page_local_q_hard_limit = 500;
456 struct vpl *__zpercpu vm_page_local_q;
457
458 /* N.B. Guard and fictitious pages must not
459 * be assigned a zero phys_page value.
460 */
461 /*
462 * Fictitious pages don't have a physical address,
463 * but we must initialize phys_page to something.
464 * For debugging, this should be a strange value
465 * that the pmap module can recognize in assertions.
466 */
467 const ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
468
469 /*
470 * Guard pages are not accessible so they don't
471 * need a physical address, but we need to enter
472 * one in the pmap.
473 * Let's make it recognizable and make sure that
474 * we don't use a real physical page with that
475 * physical address.
476 */
477 const ppnum_t vm_page_guard_addr = (ppnum_t) -2;
478
479 /*
480 * Resident page structures are also chained on
481 * queues that are used by the page replacement
482 * system (pageout daemon). These queues are
483 * defined here, but are shared by the pageout
484 * module. The inactive queue is broken into
485 * file backed and anonymous for convenience as the
 * pageout daemon often assigns a higher
487 * importance to anonymous pages (less likely to pick)
488 */
489 vm_page_queue_head_t vm_page_queue_active VM_PAGE_PACKED_ALIGNED;
490 vm_page_queue_head_t vm_page_queue_inactive VM_PAGE_PACKED_ALIGNED;
491 #if CONFIG_SECLUDED_MEMORY
492 vm_page_queue_head_t vm_page_queue_secluded VM_PAGE_PACKED_ALIGNED;
493 #endif /* CONFIG_SECLUDED_MEMORY */
494 vm_page_queue_head_t vm_page_queue_anonymous VM_PAGE_PACKED_ALIGNED; /* inactive memory queue for anonymous pages */
495 vm_page_queue_head_t vm_page_queue_throttled VM_PAGE_PACKED_ALIGNED;
496
497 queue_head_t vm_objects_wired;
498
499 vm_page_queue_head_t vm_page_queue_donate VM_PAGE_PACKED_ALIGNED;
500 uint32_t vm_page_donate_mode;
501 uint32_t vm_page_donate_target, vm_page_donate_target_high, vm_page_donate_target_low;
502 uint32_t vm_page_donate_count;
503 bool vm_page_donate_queue_ripe;
504
505
506 vm_page_queue_head_t vm_page_queue_background VM_PAGE_PACKED_ALIGNED;
507 uint32_t vm_page_background_target;
508 uint32_t vm_page_background_target_snapshot;
509 uint32_t vm_page_background_count;
510 uint64_t vm_page_background_promoted_count;
511
512 uint32_t vm_page_background_internal_count;
513 uint32_t vm_page_background_external_count;
514
515 uint32_t vm_page_background_mode;
516 uint32_t vm_page_background_exclude_external;
517
518 unsigned int vm_page_active_count;
519 unsigned int vm_page_inactive_count;
520 unsigned int vm_page_kernelcache_count;
521 #if CONFIG_SECLUDED_MEMORY
522 unsigned int vm_page_secluded_count;
523 unsigned int vm_page_secluded_count_free;
524 unsigned int vm_page_secluded_count_inuse;
525 unsigned int vm_page_secluded_count_over_target;
526 #endif /* CONFIG_SECLUDED_MEMORY */
527 unsigned int vm_page_anonymous_count;
528 unsigned int vm_page_throttled_count;
529 unsigned int vm_page_speculative_count;
530
531 unsigned int vm_page_wire_count;
532 unsigned int vm_page_wire_count_on_boot = 0;
533 unsigned int vm_page_stolen_count = 0;
534 unsigned int vm_page_wire_count_initial;
535 unsigned int vm_page_gobble_count = 0;
536 unsigned int vm_page_kern_lpage_count = 0;
537
538 uint64_t booter_size; /* external so it can be found in core dumps */
539
540 #define VM_PAGE_WIRE_COUNT_WARNING 0
541 #define VM_PAGE_GOBBLE_COUNT_WARNING 0
542
543 unsigned int vm_page_purgeable_count = 0; /* # of pages purgeable now */
544 unsigned int vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
545 uint64_t vm_page_purged_count = 0; /* total count of purged pages */
546
547 unsigned int vm_page_xpmapped_external_count = 0;
548 unsigned int vm_page_external_count = 0;
549 unsigned int vm_page_internal_count = 0;
550 unsigned int vm_page_pageable_external_count = 0;
551 unsigned int vm_page_pageable_internal_count = 0;
552
553 #if DEVELOPMENT || DEBUG
554 unsigned int vm_page_speculative_recreated = 0;
555 unsigned int vm_page_speculative_created = 0;
556 unsigned int vm_page_speculative_used = 0;
557 #endif
558
559 _Atomic unsigned int vm_page_swapped_count = 0;
560
561 vm_page_queue_head_t vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED;
562
563 unsigned int vm_page_cleaned_count = 0;
564
565 uint64_t max_valid_dma_address = 0xffffffffffffffffULL;
566 ppnum_t max_valid_low_ppnum = PPNUM_MAX;
567
568
569 /*
570 * Several page replacement parameters are also
571 * shared with this module, so that page allocation
572 * (done here in vm_page_alloc) can trigger the
573 * pageout daemon.
574 */
575 unsigned int vm_page_free_target = 0;
576 unsigned int vm_page_free_min = 0;
577 unsigned int vm_page_throttle_limit = 0;
578 unsigned int vm_page_inactive_target = 0;
579 #if CONFIG_SECLUDED_MEMORY
580 unsigned int vm_page_secluded_target = 0;
581 #endif /* CONFIG_SECLUDED_MEMORY */
582 unsigned int vm_page_anonymous_min = 0;
583 unsigned int vm_page_free_reserved = 0;
584
585
586 /*
587 * The VM system has a couple of heuristics for deciding
588 * that pages are "uninteresting" and should be placed
589 * on the inactive queue as likely candidates for replacement.
590 * These variables let the heuristics be controlled at run-time
591 * to make experimentation easier.
592 */
593
594 boolean_t vm_page_deactivate_hint = TRUE;
595
596 struct vm_page_stats_reusable vm_page_stats_reusable;
597
598 /*
599 * vm_set_page_size:
600 *
601 * Sets the page size, perhaps based upon the memory
602 * size. Must be called before any use of page-size
603 * dependent functions.
604 *
605 * Sets page_shift and page_mask from page_size.
606 */
void
vm_set_page_size(void)
{
	/* Latch the build-time constants into the runtime variables. */
	page_size = PAGE_SIZE;
	page_mask = PAGE_MASK;
	page_shift = PAGE_SHIFT;

	/*
	 * For a power-of-two page_size with page_mask == page_size - 1,
	 * the two values share no set bits; any overlap means the page
	 * size is misconfigured.
	 */
	if ((page_mask & page_size) != 0) {
		panic("vm_set_page_size: page size not a power of two");
	}

	/*
	 * Recompute page_shift as log2(page_size).  The loop has no bound
	 * check of its own: it relies on the power-of-two panic above to
	 * guarantee that some shift matches exactly.
	 */
	for (page_shift = 0;; page_shift++) {
		if ((1U << page_shift) == page_size) {
			break;
		}
	}
}
624
625 #if HAS_MTE
626
627 bool
vm_page_is_tag_storage_pnum(vm_page_t mem,ppnum_t pnum)628 vm_page_is_tag_storage_pnum(vm_page_t mem, ppnum_t pnum)
629 {
630 return pmap_in_tag_storage_range(pnum) &&
631 !mteinfo_tag_storage_disabled(mem);
632 }
633
634 #endif
635
636 /*
637 * @abstract
638 * Given a page, returns the memory class of that page.
639 */
static vm_memory_class_t
vm_page_get_memory_class(vm_page_t mem __unused, ppnum_t pnum __unused)
{
	/* Fictitious pages have no backing physical memory, hence no class. */
	assert(!vm_page_is_fictitious(mem));

#if XNU_VM_HAS_LOPAGE
	/* Pages reserved for DMA engines limited to 32-bit addresses. */
	if (mem->vmp_lopage) {
		return VM_MEMORY_CLASS_LOPAGE;
	}
#endif /* XNU_VM_HAS_LOPAGE */
#if HAS_MTE
	if (mem->vmp_using_mte) {
		/* Page currently backing MTE-tagged memory. */
		return VM_MEMORY_CLASS_TAGGED;
	} else if (!is_mte_enabled || !pmap_in_tag_storage_range(pnum)) {
		/* MTE off, or outside the tag-storage pnum range: ordinary page. */
		return VM_MEMORY_CLASS_REGULAR;
	} else if (mteinfo_tag_storage_disabled(mem)) {
		/* In the tag-storage range but its tag storage was disabled. */
		return VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
	} else {
		return VM_MEMORY_CLASS_TAG_STORAGE;
	}
#else /* !HAS_MTE */
	return VM_MEMORY_CLASS_REGULAR;
#endif /* !HAS_MTE */
}
664
665 /*
666 * vm_page_is_restricted:
667 *
668 * Checks if a given vm_page_t is a restricted page.
669 */
670 inline bool
vm_page_is_restricted(vm_page_t mem)671 vm_page_is_restricted(vm_page_t mem)
672 {
673 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(mem);
674 return pmap_is_page_restricted(pn);
675 }
676
677 #ifdef __x86_64__
678
679 #define MAX_CLUMP_SIZE 16
680 #define DEFAULT_CLUMP_SIZE 4
681
682 unsigned int vm_clump_size, vm_clump_mask, vm_clump_shift, vm_clump_promote_threshold;
683
684 #if DEVELOPMENT || DEBUG
685 unsigned long vm_clump_stats[MAX_CLUMP_SIZE + 1];
686 unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
687
/*
 * Record a clump allocation of 'c' pages.  The histogram bucket for
 * run-length c is weighted by c (pages, not events), so the buckets
 * sum to vm_clump_allocs.
 */
static inline void
vm_clump_update_stats(unsigned int c)
{
	assert(c <= vm_clump_size);
	/* re-check the range so a bad 'c' can't index out of bounds on release builds */
	if (c > 0 && c <= vm_clump_size) {
		vm_clump_stats[c] += c;
	}
	vm_clump_allocs += c;
}
697 #endif /* if DEVELOPMENT || DEBUG */
698
699 /* Called once to setup the VM clump knobs */
700 static void
vm_page_setup_clump(void)701 vm_page_setup_clump( void )
702 {
703 unsigned int override, n;
704
705 vm_clump_size = DEFAULT_CLUMP_SIZE;
706 if (PE_parse_boot_argn("clump_size", &override, sizeof(override))) {
707 vm_clump_size = override;
708 }
709
710 if (vm_clump_size > MAX_CLUMP_SIZE) {
711 panic("vm_page_setup_clump:: clump_size is too large!");
712 }
713 if (vm_clump_size < 1) {
714 panic("vm_page_setup_clump:: clump_size must be >= 1");
715 }
716 if ((vm_clump_size & (vm_clump_size - 1)) != 0) {
717 panic("vm_page_setup_clump:: clump_size must be a power of 2");
718 }
719
720 vm_clump_promote_threshold = vm_clump_size;
721 vm_clump_mask = vm_clump_size - 1;
722 for (vm_clump_shift = 0, n = vm_clump_size; n > 1; n >>= 1, vm_clump_shift++) {
723 ;
724 }
725
726 #if DEVELOPMENT || DEBUG
727 bzero(vm_clump_stats, sizeof(vm_clump_stats));
728 vm_clump_allocs = vm_clump_inserts = vm_clump_inrange = vm_clump_promotes = 0;
729 #endif /* if DEVELOPMENT || DEBUG */
730 }
731
732 #endif /* __x86_64__ */
733
734 void
vm_page_free_queue_init(vm_page_free_queue_t free_queue)735 vm_page_free_queue_init(vm_page_free_queue_t free_queue)
736 {
737 for (unsigned int color = 0; color < MAX_COLORS; color++) {
738 vm_page_queue_init(&free_queue->vmpfq_queues[color].qhead);
739 }
740 }
741
/*!
 * @function vm_page_free_queue_for_class()
 *
 * @abstract
 * Returns the appropriate free queue for the given class and page color.
 */
__pure2
static vm_page_queue_t
vm_page_free_queue_for_class(vm_memory_class_t mem_class, unsigned int color)
{
	switch (mem_class) {
	case VM_MEMORY_CLASS_REGULAR:
	/* FALLTHROUGH: with MTE configured in, REGULAR shares the path below */
#if HAS_MTE
	case VM_MEMORY_CLASS_TAGGED:
	case VM_MEMORY_CLASS_TAG_STORAGE:
		if (is_mte_enabled) {
			/*
			 * NOTE(review): with MTE enabled these classes have no
			 * queue here — presumably handled by the mteinfo layer;
			 * confirm callers check for NULL.
			 */
			return NULL;
		}
	/* FALLTHROUGH: MTE disabled, so these classes use the regular queues */
	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
#endif
		return &vm_page_queue_free.vmpfq_queues[color].qhead;
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		/* single queue: lopages are not color-sorted */
		return &vm_lopage_queue_free;
#endif /* XNU_VM_HAS_LOPAGE */
#if CONFIG_SECLUDED_MEMORY
	case VM_MEMORY_CLASS_SECLUDED:
		/* single queue: secluded pages are not color-sorted */
		return &vm_page_queue_secluded;
#endif
	}
	/* no default: the switch is expected to cover every vm_memory_class_t */
}
773
/*
 * Returns whether the free queue for the given memory class is organized
 * as per-color queues (true) or a single flat queue (false).  Must agree
 * with vm_page_free_queue_for_class() above.
 */
__pure2
static bool
vm_page_free_queue_has_colors(vm_memory_class_t mem_class)
{
	switch (mem_class) {
	case VM_MEMORY_CLASS_REGULAR:
#if HAS_MTE
	case VM_MEMORY_CLASS_TAGGED:
	case VM_MEMORY_CLASS_TAG_STORAGE:
	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
#endif
		/* these classes are served out of the per-color vm_page_queue_free */
		return true;
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		return false;
#endif /* XNU_VM_HAS_LOPAGE */
#if CONFIG_SECLUDED_MEMORY
	case VM_MEMORY_CLASS_SECLUDED:
		return false;
#endif
	}
	/* no default: the switch is expected to cover every vm_memory_class_t */
}
796
797
798 #if CONFIG_SECLUDED_MEMORY
799
800 static bool
vm_page_secluded_pool_eligible(vm_memory_class_t class)801 vm_page_secluded_pool_eligible(vm_memory_class_t class)
802 {
803 switch (class) {
804 #if XNU_VM_HAS_LOPAGE
805 case VM_MEMORY_CLASS_LOPAGE:
806 return false;
807 #endif /* XNU_VM_HAS_LOPAGE */
808 #if HAS_MTE
809 case VM_MEMORY_CLASS_TAG_STORAGE:
810 case VM_MEMORY_CLASS_TAGGED:
811 return false;
812 #endif /* HAS_MTE */
813 default:
814 return true;
815 }
816 }
817
818 static bool
vm_page_secluded_pool_depleted(void)819 vm_page_secluded_pool_depleted(void)
820 {
821 if (vm_page_free_count <= vm_page_free_reserved) {
822 return false;
823 }
824 if (num_tasks_can_use_secluded_mem) {
825 return false;
826 }
827 return vm_page_secluded_count < vm_page_secluded_target;
828 }
829
830 #endif /* CONFIG_SECLUDED_MEMORY */
831 #if HIBERNATION
832
/* Apply 'block' to every page linked on this free queue (vmp_pageq linkage). */
__attribute__((overloadable))
static void
vm_page_free_queue_foreach(vm_page_queue_t queue, void (^block)(vm_page_t))
{
	vm_page_t page;

	vm_page_queue_iterate(queue, page, vmp_pageq) {
		block(page);
	}
}
843
844 __attribute__((overloadable))
845 static void
846 vm_page_free_queue_foreach(vm_page_free_queue_t queue, void (^block)(vm_page_t))
847 {
848 for (unsigned int color = 0; color < vm_colors; color++) {
849 vm_page_free_queue_foreach(&queue->vmpfq_queues[color].qhead, block);
850 }
851 }
852
853 #endif /* HIBERNATION */
854 #if CONFIG_SPTM
855
856 static inline uint32_t
vm_pages_free_mask_len(void)857 vm_pages_free_mask_len(void)
858 {
859 extern pmap_paddr_t real_avail_end;
860
861 uint64_t pnums = atop(real_avail_end) - pmap_first_pnum;
862 static_assert(8 * sizeof(__uint128_t) == MAX_COLORS);
863 return (uint32_t)((pnums + MAX_COLORS - 1) / MAX_COLORS);
864 }
865
866 static inline int8_t
vm_pages_free_mask_bit(ppnum_t pnum)867 vm_pages_free_mask_bit(ppnum_t pnum)
868 {
869 return (int8_t)(pnum & (MAX_COLORS - 1));
870 }
871
872 static inline uint32_t
vm_pages_free_mask_index(ppnum_t pnum)873 vm_pages_free_mask_index(ppnum_t pnum)
874 {
875 return (pnum - pmap_first_pnum) / MAX_COLORS;
876 }
877
/* Accessor for the free-page cluster mask array (see comment at its
 * declaration above). */
__pure2
static inline __uint128_t *
vm_pages_free_masks(void)
{
	return _vm_pages_free_masks;
}
884
/* View cluster mask 'index' as an array of bitmap_t words for bit ops. */
__pure2
static inline bitmap_t *
vm_pages_free_masks_as_bitmap(uint32_t index)
{
	/*
	 * this conversion is gross but helps with codegen for bit-wise
	 * accesses where the __uint128_t type is really yielding poor code.
	 *
	 * This conversion is only legal on little endian architectures.
	 */
#ifndef __LITTLE_ENDIAN__
#error unsupported configuration
#endif
	return (bitmap_t *)(_vm_pages_free_masks + index);
}
900
/* Pointer to the enqueue index slot for cluster 'index': -1 when the
 * cluster has no free page, else the bit [0, 128) of the page that
 * represents the cluster on the free queue (see comment at declaration). */
__pure2
static inline int8_t *
vm_pages_free_enqueue_idx(uint32_t index)
{
	return &_vm_pages_free_enqueue_idx[index];
}
907
/*!
 * @brief
 * Return the position of the next bit in "circular" order for a given cluster
 * of pages, starting at and including @c bit.
 */
static inline int8_t
vm_pages_free_mask_next_bit(uint32_t index, int8_t bit)
{
	__uint128_t value = vm_pages_free_masks()[index];
	/* mask of all bit positions strictly below 'bit' */
	__uint128_t mask = ((__uint128_t)1 << bit) - 1;

	/* no free page anywhere in this cluster */
	if (value == 0) {
		return -1;
	}

	/*
	 * Prefer a set bit at or above 'bit'; if none exists, leave the
	 * full value in place so the search wraps around to bit 0.
	 */
	if (value & ~mask) {
		value &= ~mask;
	}
	/* locate the lowest remaining set bit, 64 bits at a time */
	if ((uint64_t)value) {
		return (int8_t)__builtin_ctzll((uint64_t)value);
	}
	return 64 + (int8_t)__builtin_ctzll((uint64_t)(value >> 64));
}
931
/* Returns whether the free bit for (cluster index, bit) is set. */
static inline bool
vm_pages_free_mask_test(uint32_t index, int8_t bit)
{
	return bitmap_test(vm_pages_free_masks_as_bitmap(index), bit);
}
937
/* Mark a page free in its cluster mask; asserts it wasn't already free. */
static inline void
vm_pages_free_mask_set(uint32_t index, int8_t bit)
{
	assert(!vm_pages_free_mask_test(index, bit));
	bitmap_set(vm_pages_free_masks_as_bitmap(index), bit);
}
944
/* Mark a page no longer free in its cluster mask; asserts it was free. */
static inline void
vm_pages_free_mask_clear(uint32_t index, int8_t bit)
{
	assert(vm_pages_free_mask_test(index, bit));
	bitmap_clear(vm_pages_free_masks_as_bitmap(index), bit);
}
951
952 #endif /* CONFIG_SPTM */
953
/*!
 * @brief
 * Enqueue a single free page onto the free queue matching its memory class
 * and update that class's accounting.
 *
 * @discussion
 * Past early boot (STARTUP_SUB_KMEM), the caller must hold the page free
 * queue lock, and the page queues lock too when freeing to the secluded
 * pool (both asserted below).
 *
 * The page must be busy and fully disconnected: no queue links, no object,
 * not wired, not tabled/laundry/pmapped (asserted below).
 *
 * @param class  the memory class whose free queue receives the page.
 * @param mem    the page being freed.
 * @param pnum   the page's physical page number.
 */
__attribute__((always_inline))
void
vm_page_free_queue_enter(vm_memory_class_t class, vm_page_t mem, ppnum_t pnum)
{
	bool enter_first;
	unsigned int color;
	vm_page_queue_t queue;

	/* early boot is single-threaded; locking only matters afterwards */
	if (startup_phase >= STARTUP_SUB_KMEM) {
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
	}

	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
	/* the page must be fully disconnected before it can be freed */
	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
	    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
	    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
	    mem->vmp_next_m == 0 &&
	    mem->vmp_object == 0 &&
	    mem->vmp_wire_count == 0 &&
	    mem->vmp_busy &&
	    !mem->vmp_tabled &&
	    !mem->vmp_laundry &&
	    !mem->vmp_pmapped &&
	    !mem->vmp_wpmapped &&
	    !mem->vmp_realtime);

	/* set the page's queue state and decide head vs tail insertion */
	switch (class) {
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
		mem->vmp_lopage = true;
		mem->vmp_canonical = true;
		enter_first = true;
		break;
#endif /* XNU_VM_HAS_LOPAGE */
#if CONFIG_SECLUDED_MEMORY
	case VM_MEMORY_CLASS_SECLUDED:
		/* the secluded pool additionally requires the page queues lock */
		if (startup_phase >= STARTUP_SUB_KMEM) {
			LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
		}
		mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
		mem->vmp_lopage = false;
		mem->vmp_canonical = true;
		enter_first = true;
		break;
#endif
	default:
		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
		mem->vmp_lopage = false;
		mem->vmp_canonical = true;
		enter_first = false;
		break;
	}

#if HAS_MTE
	if (is_mte_enabled) {
		/*
		 * With MTE enabled, these classes are handed to the mteinfo
		 * layer, which presumably maintains its own queues and
		 * accounting — note the early returns skip the counter
		 * updates below.
		 */
		switch (class) {
		case VM_MEMORY_CLASS_REGULAR:
			return mteinfo_covered_page_set_free(pnum, false);
		case VM_MEMORY_CLASS_TAGGED:
			return mteinfo_covered_page_set_free(pnum, true);
		case VM_MEMORY_CLASS_TAG_STORAGE:
			return mteinfo_tag_storage_set_inactive(mem, false);
		default:
			break;
		}
	}
#endif /* HAS_MTE */

	color = VM_PAGE_GET_COLOR_PNUM(pnum);
	queue = vm_page_free_queue_for_class(class, color);
#if CONFIG_SPTM
	if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
		/*
		 * Free-mask mode: at most one page per cluster sits on the
		 * free queue; the mask tracks the other free pages and the
		 * enqueue index remembers which bit is the enqueued one.
		 */
		uint32_t index = vm_pages_free_mask_index(pnum);
		int8_t bit = vm_pages_free_mask_bit(pnum);

		if (vm_pages_free_masks()[index] == 0) {
			/* first free page of this cluster: it becomes the representative */
			vm_page_queue_enter(queue, mem, vmp_pageq);
			*vm_pages_free_enqueue_idx(index) = bit;
		}
		vm_pages_free_mask_set(index, bit);
	} else
#endif /* CONFIG_SPTM */
	if (enter_first) {
		vm_page_queue_enter_first(queue, mem, vmp_pageq);
	} else {
#if defined(__x86_64__)
		/* x86_64 clusters contiguous pages into clumps on the free queue */
		vm_page_queue_enter_clump(queue, mem);
#else
		vm_page_queue_enter(queue, mem, vmp_pageq);
#endif
	}

	/* update the accounting for the class the page landed in */
	switch (class) {
	case VM_MEMORY_CLASS_REGULAR:
		VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
		VM_COUNTER_INC(&vm_page_free_count);
		break;
#if HAS_MTE
	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
		VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
		VM_COUNTER_INC(&vm_page_free_unmanaged_tag_storage_count);
		/* these do not participate to the vm page free count */
		break;
#endif
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		VM_COUNTER_INC(&vm_lopage_free_count);
		if (vm_lopage_free_count >= vm_lopage_free_limit) {
			/* the low-memory pool is full, stop refilling it */
			vm_lopage_refill = false;
		}
		break;
#endif /* XNU_VM_HAS_LOPAGE */
#if CONFIG_SECLUDED_MEMORY
	case VM_MEMORY_CLASS_SECLUDED:
		vm_page_secluded_count++;
		vm_page_secluded_count_free++;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		break;
#endif /* CONFIG_SECLUDED_MEMORY */
	default:
		__builtin_unreachable();
	}
}
1081
1082 /*!
1083 * @typedef vmp_free_list_result_t
1084 *
1085 * @discussion
1086 * This data structure is used by vm_page_free_queue_add_list to track
1087 * how many pages were freed to which free lists, so that it can then drive
1088 * which waiters we are going to wake up.
1089 *
1090 * uint8_t counters are enough because we never free more than 64 pages at
1091 * a time, and this allows for the data structure to be passed by register.
1092 */
1093 typedef struct {
1094 uint8_t vmpr_regular;
1095 #if HAS_MTE
1096 uint8_t vmpr_taggable;
1097 uint8_t vmpr_tag_storage;
1098 #endif /* HAS_MTE */
1099 uint8_t vmpr_lopage;
1100 #if CONFIG_SECLUDED_MEMORY
1101 uint8_t vmpr_secluded;
1102 #endif /* CONFIG_SECLUDED_MEMORY */
1103 } vmp_free_list_result_t;
1104
1105 /*!
1106 * @abstract
1107 * Returns whether there are any threads blocked in VM_PAGE_WAIT().
1108 *
1109 * @discussion
1110 * The page free queue lock must be held.
1111 */
1112 static bool
vm_page_free_queue_has_any_waiters(void)1113 vm_page_free_queue_has_any_waiters(void)
1114 {
1115 uint32_t result = 0;
1116
1117 result |= vm_page_free_wanted;
1118 result |= vm_page_free_wanted_privileged;
1119 #if HAS_MTE
1120 result |= vm_page_free_wanted_tagged;
1121 result |= vm_page_free_wanted_tagged_privileged;
1122 #endif /* HAS_MTE */
1123 #if CONFIG_SECLUDED_MEMORY
1124 result |= vm_page_free_wanted_secluded;
1125 #endif /* CONFIG_SECLUDED_MEMORY */
1126
1127 return result != 0;
1128 }
1129
1130 void
vm_page_free_wakeup(event_t event,uint32_t n)1131 vm_page_free_wakeup(event_t event, uint32_t n)
1132 {
1133 if (vps_dynamic_priority_enabled) {
1134 if (n == UINT32_MAX) {
1135 wakeup_all_with_inheritor(event, THREAD_AWAKENED);
1136 } else {
1137 while (n-- > 0) {
1138 wakeup_one_with_inheritor(event, THREAD_AWAKENED,
1139 LCK_WAKE_DO_NOT_TRANSFER_PUSH, NULL);
1140 }
1141 }
1142 } else {
1143 thread_wakeup_nthreads(event, n);
1144 }
1145 }
1146
1147 /*!
1148 * @abstract
1149 * Helper to wakeup threads in VM_PAGE_WAIT() given
1150 * a vm_page_free_queue_enter_list() result.
1151 *
1152 * @discussion
1153 * The page free queue lock must be held, and is unlocked on return.
1154 *
1155 * @param vmpr The result of a vm_page_free_queue_enter_list() call.
1156 */
1157 __attribute__((noinline))
1158 static void
vm_page_free_queue_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr)1159 vm_page_free_queue_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr)
1160 {
1161 unsigned int need_wakeup = 0;
1162 unsigned int need_priv_wakeup = 0;
1163 #if CONFIG_SECLUDED_MEMORY
1164 unsigned int need_wakeup_secluded = 0;
1165 #endif /* CONFIG_SECLUDED_MEMORY */
1166 unsigned int unpriv_limit;
1167 #if HAS_MTE
1168 unsigned int need_tagged_wakeup = 0;
1169 unsigned int need_priv_tagged_wakeup = 0;
1170 unsigned int unpriv_tagged_limit;
1171 unsigned int n;
1172 bool wakeup_refill_thread = false;
1173 #endif /* HAS_MTE */
1174
1175 #define DONATE_TO_WAITERS(wake, count, waiters_count, limit) ({ \
1176 uint32_t __n = MIN(MIN(waiters_count, vmpr.count), limit); \
1177 waiters_count -= __n; \
1178 vmpr.count -= __n; \
1179 wake += __n; \
1180 __n; \
1181 })
1182
1183 /*
1184 * Step 1: privileged waiters get to be satisfied first
1185 */
1186 #if HAS_MTE
1187 if (vm_page_free_wanted_tagged_privileged) {
1188 DONATE_TO_WAITERS(need_priv_tagged_wakeup,
1189 vmpr_taggable, vm_page_free_wanted_tagged_privileged,
1190 UINT32_MAX);
1191
1192 /*
1193 * If we will not wake up privileged threads, and there are
1194 * tagged privileged waiters, we need the refill thread to do
1195 * an emergency activation or reclaim to fulfill this need.
1196 *
1197 * We need to at least have 2 extra free pages because the
1198 * reclaim path might require to relocate a page to give us one.
1199 */
1200 if (!need_priv_tagged_wakeup &&
1201 vm_page_free_count >= vm_page_free_taggable_count + 2) {
1202 wakeup_refill_thread = true;
1203 }
1204 }
1205 #endif /* HAS_MTE */
1206 if (vm_page_free_wanted_privileged) {
1207 DONATE_TO_WAITERS(need_priv_wakeup,
1208 vmpr_regular, vm_page_free_wanted_privileged,
1209 UINT32_MAX);
1210 #if HAS_MTE
1211 DONATE_TO_WAITERS(need_priv_wakeup,
1212 vmpr_taggable, vm_page_free_wanted_privileged,
1213 UINT32_MAX);
1214 #endif /* HAS_MTE */
1215 }
1216
1217
1218 /*
1219 * Step 2: the privileged reserve needs to be replenished
1220 *
1221 * Let's make sure that we only wake up regular threads
1222 * for free pages above the reserve threshold.
1223 */
1224 if (vm_page_free_count <= vm_page_free_reserved) {
1225 unpriv_limit = 0;
1226 } else {
1227 unpriv_limit = vm_page_free_count - vm_page_free_reserved;
1228 }
1229 #if HAS_MTE
1230 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
1231 unpriv_tagged_limit = 0;
1232 } else {
1233 unpriv_tagged_limit = vm_page_free_taggable_count -
1234 vm_page_free_reserved;
1235 }
1236 #endif /* HAS_MTE */
1237
1238 /*
1239 * Step 3: satisfy secluded waiters, using the secluded pool first,
1240 * regular pages second.
1241 */
1242 #if CONFIG_SECLUDED_MEMORY
1243 if (vm_page_free_wanted_secluded) {
1244 DONATE_TO_WAITERS(need_wakeup_secluded,
1245 vmpr_secluded, vm_page_free_wanted_secluded,
1246 UINT32_MAX);
1247 unpriv_limit -= DONATE_TO_WAITERS(need_wakeup_secluded,
1248 vmpr_regular, vm_page_free_wanted_secluded,
1249 unpriv_limit);
1250
1251 if (vm_page_free_wanted_secluded == 0) {
1252 need_wakeup_secluded = UINT32_MAX;
1253 }
1254 }
1255 #endif /* CONFIG_SECLUDED_MEMORY */
1256
1257 /*
1258 * Step 4: satisfy regular demand last.
1259 */
1260 #if HAS_MTE
1261 if (vm_page_free_wanted_tagged) {
1262 n = DONATE_TO_WAITERS(need_tagged_wakeup,
1263 vmpr_taggable, vm_page_free_wanted_tagged,
1264 MIN(unpriv_limit, unpriv_tagged_limit));
1265
1266 unpriv_limit -= n;
1267 unpriv_tagged_limit -= n;
1268
1269 if (vm_page_free_wanted_tagged == 0) {
1270 need_tagged_wakeup = UINT32_MAX;
1271 } else if (vm_page_free_count >=
1272 MAX(vm_page_free_taggable_count + 2, vm_page_free_min)) {
1273 /*
1274 * If we still have tagged waiters, and that rebalancing
1275 * pages would get us above vm_page_free_min, then wake
1276 * up the refill thread to help do that rebalance.
1277 */
1278 wakeup_refill_thread = true;
1279 }
1280 }
1281 #endif /* HAS_MTE */
1282 if (vm_page_free_wanted) {
1283 unpriv_limit -= DONATE_TO_WAITERS(need_wakeup,
1284 vmpr_regular, vm_page_free_wanted,
1285 unpriv_limit);
1286 #if HAS_MTE
1287 n = DONATE_TO_WAITERS(need_wakeup,
1288 vmpr_taggable, vm_page_free_wanted,
1289 MIN(unpriv_limit, unpriv_tagged_limit));
1290
1291 unpriv_limit -= n;
1292 unpriv_tagged_limit -= n;
1293 #endif /* HAS_MTE */
1294 if (vm_page_free_wanted == 0) {
1295 need_wakeup = UINT32_MAX;
1296 }
1297 }
1298
1299 /*
1300 * We have updated waiter counts, and if that release page happens
1301 * from the context of a thread that's super low priority we might
1302 * starve waking up privileged threads.
1303 *
1304 * While we hold the free page lock, such threads would wake us up via
1305 * the mutex priority inheritance mechanism, but as soon as we drop the
1306 * lock all bets are off.
1307 *
1308 * To avoid this priority inversion that could really hurt the VM,
1309 * disable preemption until we've woken up everyone.
1310 */
1311 disable_preemption();
1312 vm_free_page_unlock();
1313
1314 /*
1315 * Dispatch privileged wakeups
1316 *
1317 * There shouldn't be that many VM-privileged threads,
1318 * so let's wake them all up, even if we don't quite
1319 * have enough pages to satisfy them all.
1320 */
1321 if (need_priv_wakeup) {
1322 vm_page_free_wakeup(&vm_page_free_wanted_privileged,
1323 UINT32_MAX);
1324 }
1325 if (need_wakeup) {
1326 vm_page_free_wakeup(&vm_page_free_count, need_wakeup);
1327 }
1328 #if HAS_MTE
1329 if (need_priv_tagged_wakeup) {
1330 vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1331 UINT32_MAX);
1332 }
1333 if (need_tagged_wakeup) {
1334 vm_page_free_wakeup(&vm_page_free_wanted_tagged,
1335 need_tagged_wakeup);
1336 }
1337 if (wakeup_refill_thread) {
1338 mteinfo_wake_fill_thread();
1339 }
1340 #endif /* HAS_MTE */
1341 #if CONFIG_SECLUDED_MEMORY
1342 if (need_wakeup_secluded) {
1343 vm_page_free_wakeup(&vm_page_free_wanted_secluded,
1344 need_wakeup_secluded);
1345 }
1346 #endif /* CONFIG_SECLUDED_MEMORY */
1347
1348 enable_preemption();
1349
1350 #undef DONATE_TO_WAITERS
1351 }
1352
1353 /*
1354 * @abstract
1355 * Given a list of pages, put each page on whichever global free queue is
1356 * appropriate.
1357 *
1358 * @discussion
1359 * Must be called with the VM free page lock unlocked.
1360 *
 * The list must contain fewer than 255 elements.
1362 */
1363 #if HAS_MTE
1364 /*
1365 * To put it more bluntly: this will demux pages onto the free tag storage
1366 * queue or the global free queue, as appropriate. If we start freeing tagged
1367 * pages onto the free tagged queue, this function should be updated to deal
1368 * with that too.
1369 */
1370 #endif /* HAS_MTE */
static void
vm_page_free_queue_enter_list(vm_page_list_t list, vmp_release_options_t opts)
{
	bool page_queues_unlock = false;
	bool page_queues_locked = false;
	bool do_secluded = false;
	vmp_free_list_result_t result = { };
	vm_page_t mem;

	LCK_MTX_ASSERT(&vm_page_queue_lock,
	    (opts & VMP_RELEASE_Q_LOCKED)
	    ? LCK_MTX_ASSERT_OWNED
	    : LCK_MTX_ASSERT_NOTOWNED);

	/*
	 * Hibernation and startup do not really need the lock because
	 * these are single threaded paths, so from the PoV of that function,
	 * it's as if VMP_RELEASE_Q_LOCKED was passed.
	 */
	page_queues_locked = (opts & (VMP_RELEASE_STARTUP |
	    VMP_RELEASE_HIBERNATE |
	    VMP_RELEASE_Q_LOCKED));

#if CONFIG_SECLUDED_MEMORY
	do_secluded = vm_page_secluded_pool_depleted();
#if HAS_MTE
	if (do_secluded && list.vmpl_has_tagged &&
	    (opts & VMP_RELEASE_Q_LOCKED) == 0) {
		/*
		 * Try to do the untagging so that pages become eligible
		 * for the secluded pool while holding the least amount
		 * of locks possible.
		 *
		 * This does mean we shouldn't do this retyping if the page
		 * queue lock is held for real. The only path doing this
		 * right now is vm_page_free() which is one page at a time,
		 * so it's probably "fine" to not contribute these to the
		 * secluded pool.
		 */
		const unified_page_list_t pmap_batch_list = {
			.page_slist = list.vmpl_head,
			.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
		};

		pmap_unmake_tagged_pages(&pmap_batch_list);
		vm_page_list_foreach(mem, list) {
			mem->vmp_using_mte = false;
		}
		list.vmpl_has_tagged = false;
		list.vmpl_has_untagged = true;
	}
#endif /* HAS_MTE */
#endif /* CONFIG_SECLUDED_MEMORY */

	/* realtime accounting and secluded refill require the page queues lock */
	if (!page_queues_locked && (list.vmpl_has_realtime || do_secluded)) {
		vm_page_lock_queues();
		page_queues_locked = true;
		page_queues_unlock = true;
	}

	if (opts & VMP_RELEASE_STARTUP) {
		/* at startup the caller already holds the free queue lock */
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		vm_free_page_lock_spin();
	}

	vm_page_list_foreach_consume(mem, &list) {
		ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
		vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);

		if (mem->vmp_realtime) {
			mem->vmp_realtime = false;
			VM_COUNTER_DEC(&vm_page_realtime_count);
		}

#if XNU_VM_HAS_LOPAGE
		/* divert low physical pages to the lopage pool while it refills */
		if ((class == VM_MEMORY_CLASS_REGULAR ||
		    class == VM_MEMORY_CLASS_LOPAGE) &&
		    vm_lopage_refill &&
		    vm_lopage_free_count < vm_lopage_free_limit &&
		    pnum < max_valid_low_ppnum) {
			class = VM_MEMORY_CLASS_LOPAGE;
		} else {
			class = VM_MEMORY_CLASS_REGULAR;
		}
#endif /* XNU_VM_HAS_LOPAGE */

#if CONFIG_SECLUDED_MEMORY
		/*
		 * XXX FBDP TODO: also avoid refilling secluded queue
		 * when some IOKit objects are already grabbing from it...
		 */
		if (page_queues_locked &&
		    vm_page_secluded_pool_eligible(class) &&
		    vm_page_secluded_pool_depleted()) {
			class = VM_MEMORY_CLASS_SECLUDED;
		}
#endif /* CONFIG_SECLUDED_MEMORY */

		vm_page_free_queue_enter(class, mem, pnum);

		/* tally which free list the page went to, for wakeups below */
		switch (class) {
		case VM_MEMORY_CLASS_REGULAR:
#if HAS_MTE
			if (is_mte_enabled && mteinfo_covered_page_taggable(pnum)) {
				result.vmpr_taggable++;
				break;
			}
			OS_FALLTHROUGH;
		case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
#endif /* HAS_MTE */
			result.vmpr_regular++;
			break;
#if HAS_MTE
		case VM_MEMORY_CLASS_TAGGED:
			result.vmpr_taggable++;
			break;
		case VM_MEMORY_CLASS_TAG_STORAGE:
			result.vmpr_tag_storage++;
			break;
#endif /* HAS_MTE */
#if XNU_VM_HAS_LOPAGE
		case VM_MEMORY_CLASS_LOPAGE:
			result.vmpr_lopage++;
			break;
#endif /* XNU_VM_HAS_LOPAGE */
#if CONFIG_SECLUDED_MEMORY
		case VM_MEMORY_CLASS_SECLUDED:
			result.vmpr_secluded++;
			continue;
#endif /* CONFIG_SECLUDED_MEMORY */
		}
	}

	if (page_queues_unlock) {
		vm_page_unlock_queues();
	}

	vm_pageout_vminfo.vm_page_pages_freed += list.vmpl_count;
	VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE,
	    DBG_FUNC_NONE, list.vmpl_count, 0, 0, 0);

	if (opts & VMP_RELEASE_STARTUP) {
		/*
		 * On purpose skip the VM_CHECK_MEMORYSTATUS,
		 * pmap_startup() will do it,
		 * and the caller holds the free queue lock the whole time.
		 */
		return;
	}

	if (vm_page_free_queue_has_any_waiters()) {
		/* drops the free queue lock after computing wakeups */
		vm_page_free_queue_handle_wakeups_and_unlock(result);
	} else {
		vm_free_page_unlock();
	}

	if ((opts & VMP_RELEASE_HIBERNATE) == 0) {
		/*
		 * The hibernate path skips VM_CHECK_MEMORYSTATUS:
		 * hibernate_rebuild_vm_structs() will run it after
		 * the last flush.
		 */
		VM_CHECK_MEMORYSTATUS;
	}
}
1536
/*!
 * @brief
 * Remove a specific page from the free queue of its memory class and
 * update that class's accounting.
 *
 * @discussion
 * Past early boot, the page free queue lock must be held (asserted below).
 *
 * NOTE(review): the accounting switch below has no secluded case and
 * treats unknown classes as unreachable — presumably secluded pages are
 * removed through a different path; confirm against callers.
 *
 * @param class    the memory class the page currently belongs to.
 * @param mem      the page being removed from its free queue.
 * @param pnum     the page's physical page number.
 * @param q_state  the queue state to assign to the page.
 */
__attribute__((always_inline))
void
vm_page_free_queue_remove(
	vm_memory_class_t class,
	vm_page_t mem,
	ppnum_t pnum,
	vm_page_q_state_t q_state)
{
	unsigned int color;
	vm_page_queue_t queue;

	/* early boot is single-threaded; locking only matters afterwards */
	if (startup_phase >= STARTUP_SUB_KMEM) {
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
	}

	mem->vmp_q_state = q_state;

#if HAS_MTE
	if (is_mte_enabled) {
		/*
		 * With MTE enabled, these classes are handed to the mteinfo
		 * layer — note the early returns skip the queue manipulation
		 * and accounting below.
		 */
		switch (class) {
		case VM_MEMORY_CLASS_REGULAR:
			return mteinfo_covered_page_set_used(pnum, false);
		case VM_MEMORY_CLASS_TAGGED:
			return mteinfo_covered_page_set_used(pnum, true);
		case VM_MEMORY_CLASS_TAG_STORAGE:
			return mteinfo_tag_storage_set_claimed(mem);
		default:
			break;
		}
	}
#endif /* HAS_MTE */

	color = VM_PAGE_GET_COLOR_PNUM(pnum);
	queue = vm_page_free_queue_for_class(class, color);
#if CONFIG_SPTM
	if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
		uint32_t index = vm_pages_free_mask_index(pnum);
		int8_t bit = vm_pages_free_mask_bit(pnum);

		vm_pages_free_mask_clear(index, bit);
		if (*vm_pages_free_enqueue_idx(index) == bit) {
			/*
			 * The removed page was the cluster's enqueued
			 * representative: take it off the queue, and enqueue
			 * the cluster's next free page (if any) in its place.
			 */
			vm_page_queue_remove(queue, mem, vmp_pageq);
			bit = vm_pages_free_mask_next_bit(index, bit);
			*vm_pages_free_enqueue_idx(index) = bit;

			if (bit != -1) {
				assert(vm_pages_free_mask_test(index, bit));
				/* rebase to the cluster start plus the new bit */
				pnum = (pnum & -MAX_COLORS) + bit;
				mem = vm_page_find_canonical(pnum);
				color = VM_PAGE_GET_COLOR_PNUM(pnum);
				queue = vm_page_free_queue_for_class(class, color);
				vm_page_queue_enter(queue, mem, vmp_pageq);
			}
		}
	} else
#endif /* CONFIG_SPTM */
	{
		vm_page_queue_remove(queue, mem, vmp_pageq);
	}

	/* update the accounting for the class the page came from */
	switch (class) {
	case VM_MEMORY_CLASS_REGULAR:
		VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
		VM_COUNTER_DEC(&vm_page_free_count);
		break;
#if HAS_MTE
	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
		VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
		VM_COUNTER_DEC(&vm_page_free_unmanaged_tag_storage_count);
		/* these do not participate to the vm page free count */
		break;
#endif /* HAS_MTE */
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		VM_COUNTER_DEC(&vm_lopage_free_count);
		vm_lopages_allocated_q += 1;
		if (vm_lopage_free_count < vm_lopage_lowater) {
			/* the pool fell below its low-water mark: refill it */
			vm_lopage_refill = true;
		}
		break;
#endif /* XNU_VM_HAS_LOPAGE */
	default:
		__builtin_unreachable();
	}
}
1622
/*!
 * @brief
 * Grab @c num_pages pages from the free queues of the given memory class,
 * set them to @c q_state, and return them as a page list.
 *
 * @discussion
 * Past early boot, the page free queue lock must be held, and preemption
 * must be disabled (both asserted below). The caller must guarantee the
 * class has at least @c num_pages free pages: the inner color scan loops
 * until it finds non-empty queues.
 */
vm_page_list_t
vm_page_free_queue_grab(
	vm_grab_options_t options __unused,
	vm_memory_class_t class,
	unsigned int num_pages,
	vm_page_q_state_t q_state)
{
	unsigned int *colorp;
	unsigned int color;
#if defined(__x86_64__)
	unsigned int clump_end = 1;
	unsigned int sub_count = 0;
#endif /* __x86_64__ */
	vm_page_list_t list = { };

	if (startup_phase >= STARTUP_SUB_KMEM) {
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
	}
	assert(get_preemption_level() != 0);
	assert(q_state <= VM_PAGE_Q_STATE_LAST_VALID_VALUE);

#if HAS_MTE
	/* with MTE on, everything but dead tag storage is grabbed via mteinfo */
	if (is_mte_enabled && class != VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
		return mteinfo_free_queue_grab(options, class, num_pages, q_state);
	}
#endif /* HAS_MTE */

	/* per-CPU hint of which color to start searching from */
	colorp = PERCPU_GET(start_color);
	color = *colorp;

	/* Get the pages. */
	while (list.vmpl_count < num_pages) {
		uint32_t color_offset = 1;
		vm_page_queue_t queue;
		vm_page_t mem;

		queue = vm_page_free_queue_for_class(class, color);
		if (!vm_page_free_queue_has_colors(class)) {
			/* single-queue class: color rotation is moot */
			assert(!vm_page_queue_empty(queue));
			color_offset = 0;
		}
		/* skip over empty colors until a populated queue is found */
		while (vm_page_queue_empty(queue)) {
			color = (color + 1) & vm_color_mask;
			queue = vm_page_free_queue_for_class(class, color);
		}

#if defined(__x86_64__)
		if (class == VM_MEMORY_CLASS_REGULAR) {
			/*
			 * x86_64 uses a bespoke free queue scheme, where the free path
			 * tries to cluster clumps of contiguous pages together on
			 * the free queue to optimize for the platform's memory
			 * controller.
			 */
			vm_page_queue_remove_first_with_clump(queue, mem, clump_end);
			sub_count++;
			if (clump_end) {
#if DEVELOPMENT || DEBUG
				vm_clump_update_stats(sub_count);
#endif /* DEVELOPMENT || DEBUG */
				sub_count = 0;
			} else {
				/* Only change colors at the end of a clump. */
				color_offset = 0;
			}
		} else
#endif /* defined(__x86_64__) */
		{
			/* Other targets default to rotating colors after each pop. */
			vm_page_queue_remove_first(queue, mem, vmp_pageq);
		}

#if CONFIG_SPTM
		if (vm_pages_free_masks()) {
			/*
			 * Free-mask mode: the popped page represents a whole
			 * cluster; harvest adjacent free pages from the mask
			 * directly, without touching the queues.
			 */
			ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
			ppnum_t first_pnum = pnum & -MAX_COLORS;
			uint32_t index = vm_pages_free_mask_index(pnum);
			int8_t bit = vm_pages_free_mask_bit(pnum);

			for (;;) {
				vm_pages_free_mask_clear(index, bit);
				mem->vmp_q_state = q_state;
				vm_page_list_push(&list, mem);

				/* advance circularly within the cluster */
				bit = (bit + 1) & (MAX_COLORS - 1);

				if (!vm_pages_free_mask_test(index, bit) ||
				    num_pages <= list.vmpl_count) {
					break;
				}
				mem = vm_page_find_canonical(first_pnum + bit);
			}

			color = bit & vm_color_mask;

			/* re-enqueue a representative if the cluster still has free pages */
			bit = vm_pages_free_mask_next_bit(index, bit);
			*vm_pages_free_enqueue_idx(index) = bit;

			if (bit != -1) {
				assert(vm_pages_free_mask_test(index, bit));
				mem = vm_page_find_canonical(first_pnum + bit);
				queue = vm_page_free_queue_for_class(class,
				    bit & vm_color_mask);
				vm_page_queue_enter_first(queue, mem, vmp_pageq);
			}
		} else
#endif /* CONFIG_SPTM */
		{
			/* Set the page to the client's desired queue state. */
			mem->vmp_q_state = q_state;
			vm_page_list_push(&list, mem);

			color = (color + color_offset) & vm_color_mask;
		}
	}

	/* update the accounting for the class the pages came from */
	switch (class) {
	case VM_MEMORY_CLASS_REGULAR:
		VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
		VM_COUNTER_SUB(&vm_page_free_count, list.vmpl_count);
		break;
#if HAS_MTE
	case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
		VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
		VM_COUNTER_SUB(&vm_page_free_unmanaged_tag_storage_count, list.vmpl_count);
		/* these do not participate to the vm page free count */
		break;
#endif /* HAS_MTE */
#if XNU_VM_HAS_LOPAGE
	case VM_MEMORY_CLASS_LOPAGE:
		VM_COUNTER_SUB(&vm_lopage_free_count, list.vmpl_count);
		vm_lopages_allocated_q += list.vmpl_count;
		if (vm_lopage_free_count < vm_lopage_lowater) {
			/* the pool fell below its low-water mark: refill it */
			vm_lopage_refill = true;
		}
		break;
#endif /* XNU_VM_HAS_LOPAGE */
	default:
		__builtin_unreachable();
	}

	/* Record the next page color the CPU should try to get. */
	*colorp = color;
#if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
	vm_clump_update_stats(sub_count);
#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */

	return list;
}
1772
1773
1774 #define COLOR_GROUPS_TO_STEAL 4
1775
/*
 * Called once during startup, once the cache geometry is known.
 */
1778 static void
vm_page_set_colors(void)1779 vm_page_set_colors( void )
1780 {
1781 unsigned int n, override;
1782
1783 #if defined (__x86_64__)
1784 /* adjust #colors because we need to color outside the clump boundary */
1785 vm_cache_geometry_colors >>= vm_clump_shift;
1786 #endif
1787 if (PE_parse_boot_argn("colors", &override, sizeof(override))) { /* colors specified as a boot-arg? */
1788 n = override;
1789 } else if (vm_cache_geometry_colors) { /* do we know what the cache geometry is? */
1790 n = vm_cache_geometry_colors;
1791 } else {
1792 n = DEFAULT_COLORS; /* use default if all else fails */
1793 }
1794 if (n == 0) {
1795 n = 1;
1796 }
1797 if (n > MAX_COLORS) {
1798 n = MAX_COLORS;
1799 }
1800
1801 /* the count must be a power of 2 */
1802 if ((n & (n - 1)) != 0) {
1803 n = DEFAULT_COLORS; /* use default if all else fails */
1804 }
1805 vm_colors = n;
1806 vm_color_mask = n - 1;
1807
1808 vm_free_magazine_refill_limit = vm_colors * COLOR_GROUPS_TO_STEAL;
1809
1810 #if defined (__x86_64__)
1811 /* adjust for reduction in colors due to clumping and multiple cores */
1812 if (real_ncpus) {
1813 vm_free_magazine_refill_limit *= (vm_clump_size * real_ncpus);
1814 }
1815 #endif
1816 }
1817
1818 #if XNU_VM_HAS_DELAYED_PAGES
1819
1820 static uint32_t vm_delayed_count = 0; /* when non-zero, indicates we may have more pages to init */
1821 static ppnum_t delay_above_pnum = PPNUM_MAX;
1822
1823 /*
1824 * For x86 first 8 Gig initializes quickly and gives us lots of lowmem + mem above to start off with.
1825 * If ARM ever uses delayed page initialization, this value may need to be quite different.
1826 */
1827 #define DEFAULT_DELAY_ABOVE_PHYS_GB (8)
1828
1829 /*
1830 * When we have to dip into more delayed pages due to low memory, free up
1831 * a large chunk to get things back to normal. This avoids contention on the
1832 * delayed code allocating page by page.
1833 */
1834 #define VM_DELAY_PAGE_CHUNK ((1024 * 1024 * 1024) / PAGE_SIZE)
1835
1836 /*
1837 * Get and initialize the next delayed page.
1838 */
/*
 * Get and initialize the next delayed page.
 *
 * Returns NULL once the delayed pool is exhausted or the pmap has no more
 * pages. Must be called with the free page lock NOT held; takes and drops
 * it internally, and also takes the page queues lock unless the caller
 * passed VM_PAGE_GRAB_Q_LOCK_HELD.
 */
__attribute__((noinline))
static vm_page_t
vm_get_delayed_page(vm_grab_options_t grab_options)
{
	vm_page_t p;
	ppnum_t pnum;

	/*
	 * Get a new page if we have one.
	 */
	vm_free_page_lock();
	if (vm_delayed_count == 0) {
		vm_free_page_unlock();
		return NULL;
	}

	/* the pmap ran dry: remember that so future calls bail out quickly */
	if (!pmap_next_page(&pnum)) {
		vm_delayed_count = 0;
		vm_free_page_unlock();
		return NULL;
	}


	assert(vm_delayed_count > 0);
	--vm_delayed_count;

#if defined(__x86_64__)
	/* x86 cluster code requires increasing phys_page in vm_pages[] */
	if (vm_pages_count > 0) {
		assert(pnum > vm_page_get(vm_pages_count - 1)->vmp_phys_page);
	}
#endif
	/* append the new page to the vm_pages[] array */
	p = vm_page_get(vm_pages_count);
	assert(p < vm_pages_end);
	vm_page_init(p, pnum);
	++vm_pages_count;
	++vm_page_pages;
	vm_free_page_unlock();

	/*
	 * These pages were initially counted as wired, undo that now.
	 */
	if (grab_options & VM_PAGE_GRAB_Q_LOCK_HELD) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
		vm_page_lockspin_queues();
	}
	--vm_page_wire_count;
	--vm_page_wire_count_initial;
	if (vm_page_wire_count_on_boot != 0) {
		--vm_page_wire_count_on_boot;
	}
	if (!(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
		vm_page_unlock_queues();
	}


	/*
	 * Scribble the debug fill pattern over the page when one is set.
	 * NOTE(review): fillval/fillPage are defined elsewhere in this file —
	 * presumably a boot-arg-driven debug fill; confirm.
	 */
	if (fillval) {
		fillPage(pnum, fillval);
	}
	return p;
}
1902
1903 /*
1904 * Free all remaining delayed pages to the free lists.
1905 */
1906 void
vm_free_delayed_pages(void)1907 vm_free_delayed_pages(void)
1908 {
1909 vm_page_t p;
1910 vm_page_t list = NULL;
1911 uint_t cnt = 0;
1912 vm_offset_t start_free_va;
1913 int64_t free_size;
1914
1915 while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
1916 if (vm_himemory_mode) {
1917 vm_page_release(p, VMP_RELEASE_NONE);
1918 } else {
1919 p->vmp_snext = list;
1920 list = p;
1921 }
1922 ++cnt;
1923 }
1924
1925 /*
1926 * Free the pages in reverse order if not himemory mode.
1927 * Hence the low memory pages will be first on free lists. (LIFO)
1928 */
1929 while (list != NULL) {
1930 p = list;
1931 list = p->vmp_snext;
1932 p->vmp_snext = NULL;
1933 vm_page_release(p, VMP_RELEASE_NONE);
1934 }
1935 #if DEVELOPMENT || DEBUG
1936 kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
1937 #endif
1938
1939 /*
1940 * Free up any unused full pages at the end of the vm_pages[] array
1941 */
1942 start_free_va = round_page((vm_offset_t)vm_page_get(vm_pages_count));
1943
1944 #if defined(__x86_64__)
1945 /*
1946 * Since x86 might have used large pages for vm_pages[], we can't
1947 * free starting in the middle of a partially used large page.
1948 */
1949 if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
1950 start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
1951 }
1952 #endif
1953 if (start_free_va < (vm_offset_t)vm_pages_end) {
1954 free_size = trunc_page((vm_offset_t)vm_pages_end - start_free_va);
1955 if (free_size > 0) {
1956 ml_static_mfree(start_free_va, (vm_offset_t)free_size);
1957 vm_pages_end = (void *)start_free_va;
1958
1959 /*
1960 * Note there's no locking here, as only this thread will ever change this value.
1961 * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
1962 */
1963 vm_page_stolen_count -= (free_size >> PAGE_SHIFT);
1964
1965 #if DEVELOPMENT || DEBUG
1966 kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
1967 (long)free_size, (long)start_free_va);
1968 #endif
1969 }
1970 }
1971 }
1972
1973 /*
1974 * Try and free up enough delayed pages to match a contig memory allocation.
1975 */
1976 static void
vm_free_delayed_pages_contig(uint_t npages,ppnum_t max_pnum,ppnum_t pnum_mask)1977 vm_free_delayed_pages_contig(
1978 uint_t npages,
1979 ppnum_t max_pnum,
1980 ppnum_t pnum_mask)
1981 {
1982 vm_page_t p;
1983 ppnum_t pnum;
1984 uint_t cnt = 0;
1985
1986 /*
1987 * Treat 0 as the absolute max page number.
1988 */
1989 if (max_pnum == 0) {
1990 max_pnum = PPNUM_MAX;
1991 }
1992
1993 /*
1994 * Free till we get a properly aligned start page
1995 */
1996 for (;;) {
1997 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
1998 if (p == NULL) {
1999 return;
2000 }
2001 pnum = VM_PAGE_GET_PHYS_PAGE(p);
2002 vm_page_release(p, VMP_RELEASE_NONE);
2003 if (pnum >= max_pnum) {
2004 return;
2005 }
2006 if ((pnum & pnum_mask) == 0) {
2007 break;
2008 }
2009 }
2010
2011 /*
2012 * Having a healthy pool of free pages will help performance. We don't
2013 * want to fall back to the delayed code for every page allocation.
2014 */
2015 if (vm_page_free_count < VM_DELAY_PAGE_CHUNK) {
2016 npages += VM_DELAY_PAGE_CHUNK;
2017 }
2018
2019 /*
2020 * Now free up the pages
2021 */
2022 for (cnt = 1; cnt < npages; ++cnt) {
2023 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
2024 if (p == NULL) {
2025 return;
2026 }
2027 vm_page_release(p, VMP_RELEASE_NONE);
2028 }
2029 }
2030
2031 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2032
2033 #define ROUNDUP_NEXTP2(X) (1U << (32 - __builtin_clz((X) - 1)))
2034
2035 void
vm_page_init_local_q(unsigned int num_cpus)2036 vm_page_init_local_q(unsigned int num_cpus)
2037 {
2038 struct vpl *t_local_q;
2039
2040 /*
2041 * no point in this for a uni-processor system
2042 */
2043 if (num_cpus >= 2) {
2044 ml_cpu_info_t cpu_info;
2045
2046 /*
2047 * Force the allocation alignment to a cacheline,
2048 * because the `vpl` struct has a lock and will be taken
2049 * cross CPU so we want to isolate the rest of the per-CPU
2050 * data to avoid false sharing due to this lock being taken.
2051 */
2052
2053 ml_cpu_get_info(&cpu_info);
2054
2055 t_local_q = zalloc_percpu_permanent(sizeof(struct vpl),
2056 cpu_info.cache_line_size - 1);
2057
2058 zpercpu_foreach(lq, t_local_q) {
2059 VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
2060 vm_page_queue_init(&lq->vpl_queue);
2061 }
2062
2063 /* make the initialization visible to all cores */
2064 os_atomic_store(&vm_page_local_q, t_local_q, release);
2065 }
2066 }
2067
/*
 * vm_init_before_launchd
 *
 * This should be called right before launchd is loaded.
 *
 * Snapshots the current wired page count into
 * vm_page_wire_count_on_boot under the page-queues lock, so that
 * later diagnostics can distinguish boot-time wiring from wiring
 * done after userspace starts.
 */
void
vm_init_before_launchd(void)
{
	vm_page_lockspin_queues();
	vm_page_wire_count_on_boot = vm_page_wire_count;
	vm_page_unlock_queues();
}
2080
2081
2082 /*
2083 * vm_page_bootstrap:
2084 *
2085 * Initializes the resident memory module.
2086 *
2087 * Allocates memory for the page cells, and
2088 * for the object/offset-to-page hash table headers.
2089 * Each page cell is initialized and placed on the free list.
2090 * Returns the range of available kernel virtual memory.
2091 */
2092 __startup_func
2093 void
vm_page_bootstrap(vm_offset_t * startp,vm_offset_t * endp)2094 vm_page_bootstrap(
2095 vm_offset_t *startp,
2096 vm_offset_t *endp)
2097 {
2098 unsigned int i;
2099 unsigned int log1;
2100 unsigned int log2;
2101 unsigned int size;
2102
2103 /*
2104 * Initialize the page queues.
2105 */
2106
2107 lck_mtx_init(&vm_page_queue_free_lock, &vm_page_lck_grp_free, &vm_page_lck_attr);
2108 lck_mtx_init(&vm_page_queue_lock, &vm_page_lck_grp_queue, &vm_page_lck_attr);
2109 lck_mtx_init(&vm_purgeable_queue_lock, &vm_page_lck_grp_purge, &vm_page_lck_attr);
2110
2111 for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
2112 int group;
2113
2114 purgeable_queues[i].token_q_head = 0;
2115 purgeable_queues[i].token_q_tail = 0;
2116 for (group = 0; group < NUM_VOLATILE_GROUPS; group++) {
2117 queue_init(&purgeable_queues[i].objq[group]);
2118 }
2119
2120 purgeable_queues[i].type = i;
2121 purgeable_queues[i].new_pages = 0;
2122 #if MACH_ASSERT
2123 purgeable_queues[i].debug_count_tokens = 0;
2124 purgeable_queues[i].debug_count_objects = 0;
2125 #endif
2126 }
2127 ;
2128 purgeable_nonvolatile_count = 0;
2129 queue_init(&purgeable_nonvolatile_queue);
2130
2131 vm_page_free_queue_init(&vm_page_queue_free);
2132 #if XNU_VM_HAS_LOPAGE
2133 vm_page_queue_init(&vm_lopage_queue_free);
2134 #endif /* XNU_VM_HAS_LOPAGE */
2135 vm_page_queue_init(&vm_page_queue_active);
2136 vm_page_queue_init(&vm_page_queue_inactive);
2137 #if CONFIG_SECLUDED_MEMORY
2138 vm_page_queue_init(&vm_page_queue_secluded);
2139 #endif /* CONFIG_SECLUDED_MEMORY */
2140 vm_page_queue_init(&vm_page_queue_cleaned);
2141 vm_page_queue_init(&vm_page_queue_throttled);
2142 vm_page_queue_init(&vm_page_queue_anonymous);
2143 queue_init(&vm_objects_wired);
2144
2145 for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
2146 vm_page_queue_init(&vm_page_queue_speculative[i].age_q);
2147
2148 vm_page_queue_speculative[i].age_ts.tv_sec = 0;
2149 vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
2150 }
2151
2152 vm_page_queue_init(&vm_page_queue_donate);
2153 vm_page_queue_init(&vm_page_queue_background);
2154
2155 vm_page_background_count = 0;
2156 vm_page_background_internal_count = 0;
2157 vm_page_background_external_count = 0;
2158 vm_page_background_promoted_count = 0;
2159
2160 vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25);
2161
2162 if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) {
2163 vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX;
2164 }
2165
2166 #if defined(__LP64__)
2167 vm_page_background_mode = VM_PAGE_BG_ENABLED;
2168 vm_page_donate_mode = VM_PAGE_DONATE_ENABLED;
2169 #else
2170 vm_page_background_mode = VM_PAGE_BG_DISABLED;
2171 vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2172 #endif
2173 vm_page_background_exclude_external = 0;
2174
2175 PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode));
2176 PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external));
2177 PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target));
2178
2179 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && vm_page_background_mode != VM_PAGE_BG_ENABLED) {
2180 vm_page_background_mode = VM_PAGE_BG_DISABLED;
2181 }
2182
2183 PE_parse_boot_argn("vm_page_donate_mode", &vm_page_donate_mode, sizeof(vm_page_donate_mode));
2184 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED && vm_page_donate_mode != VM_PAGE_DONATE_ENABLED) {
2185 vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2186 }
2187
2188 vm_page_donate_target_high = VM_PAGE_DONATE_TARGET_HIGHWATER;
2189 vm_page_donate_target_low = VM_PAGE_DONATE_TARGET_LOWWATER;
2190 vm_page_donate_target = vm_page_donate_target_high;
2191 vm_page_donate_count = 0;
2192
2193 vm_page_free_wanted = 0;
2194 vm_page_free_wanted_privileged = 0;
2195 #if CONFIG_SECLUDED_MEMORY
2196 vm_page_free_wanted_secluded = 0;
2197 #endif /* CONFIG_SECLUDED_MEMORY */
2198
2199 #if defined (__x86_64__)
2200 /* this must be called before vm_page_set_colors() */
2201 vm_page_setup_clump();
2202 #endif
2203
2204 vm_page_set_colors();
2205
2206 for (vm_tag_t t = 0; t < VM_KERN_MEMORY_FIRST_DYNAMIC; t++) {
2207 vm_allocation_sites_static[t].refcount = 2;
2208 vm_allocation_sites_static[t].tag = t;
2209 vm_allocation_sites[t] = &vm_allocation_sites_static[t];
2210 }
2211 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].refcount = 2;
2212 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].tag = VM_KERN_MEMORY_ANY;
2213 vm_allocation_sites[VM_KERN_MEMORY_ANY] = &vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC];
2214
2215 /*
2216 * Steal memory for the map and zone subsystems.
2217 *
2218 * make sure initialize_ram_ranges() has run before we steal pages for the first time on arm
2219 */
2220 (void)pmap_free_pages();
2221
2222 kernel_startup_initialize_upto(STARTUP_SUB_PMAP_STEAL);
2223
2224 /*
2225 * Allocate (and initialize) the virtual-to-physical
2226 * table hash buckets.
2227 *
2228 * The number of buckets should be a power of two to
2229 * get a good hash function. The following computation
2230 * chooses the first power of two that is greater
2231 * than the number of physical pages in the system.
2232 */
2233
2234 if (vm_page_bucket_count == 0) {
2235 unsigned int npages = pmap_free_pages();
2236
2237 vm_page_bucket_count = 1;
2238 while (vm_page_bucket_count < npages) {
2239 vm_page_bucket_count <<= 1;
2240 }
2241 }
2242 vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
2243
2244 vm_page_hash_mask = vm_page_bucket_count - 1;
2245
2246 /*
2247 * Calculate object shift value for hashing algorithm:
2248 * O = log2(sizeof(struct vm_object))
2249 * B = log2(vm_page_bucket_count)
2250 * hash shifts the object left by
2251 * B/2 - O
2252 */
2253 size = vm_page_bucket_count;
2254 for (log1 = 0; size > 1; log1++) {
2255 size /= 2;
2256 }
2257 size = sizeof(struct vm_object);
2258 for (log2 = 0; size > 1; log2++) {
2259 size /= 2;
2260 }
2261 vm_page_hash_shift = log1 / 2 - log2 + 1;
2262
2263 vm_page_bucket_hash = 1 << ((log1 + 1) >> 1); /* Get (ceiling of sqrt of table size) */
2264 vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2); /* Get (ceiling of quadroot of table size) */
2265 vm_page_bucket_hash |= 1; /* Set bit and add 1 - always must be 1 to insure unique series */
2266
2267 if (vm_page_hash_mask & vm_page_bucket_count) {
2268 printf("vm_page_bootstrap: WARNING -- strange page hash\n");
2269 }
2270
2271 #if VM_PAGE_BUCKETS_CHECK
2272 #if VM_PAGE_FAKE_BUCKETS
2273 /*
2274 * Allocate a decoy set of page buckets, to detect
2275 * any stomping there.
2276 */
2277 vm_page_fake_buckets = (vm_page_bucket_t *)
2278 pmap_steal_memory(vm_page_bucket_count *
2279 sizeof(vm_page_bucket_t), 0);
2280 vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
2281 vm_page_fake_buckets_end =
2282 vm_map_round_page((vm_page_fake_buckets_start +
2283 (vm_page_bucket_count *
2284 sizeof(vm_page_bucket_t))),
2285 PAGE_MASK);
2286 char *cp;
2287 for (cp = (char *)vm_page_fake_buckets_start;
2288 cp < (char *)vm_page_fake_buckets_end;
2289 cp++) {
2290 *cp = 0x5a;
2291 }
2292 #endif /* VM_PAGE_FAKE_BUCKETS */
2293 #endif /* VM_PAGE_BUCKETS_CHECK */
2294
2295 kernel_debug_string_early("vm_page_buckets");
2296 vm_page_buckets = (vm_page_bucket_t *)
2297 pmap_steal_memory(vm_page_bucket_count *
2298 sizeof(vm_page_bucket_t), 0);
2299
2300 kernel_debug_string_early("vm_page_bucket_locks");
2301 vm_page_bucket_locks = (lck_ticket_t *)
2302 pmap_steal_memory(vm_page_bucket_lock_count *
2303 sizeof(lck_ticket_t), 0);
2304
2305 for (i = 0; i < vm_page_bucket_count; i++) {
2306 vm_page_bucket_t *bucket = &vm_page_buckets[i];
2307
2308 bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL);
2309 #if MACH_PAGE_HASH_STATS
2310 bucket->cur_count = 0;
2311 bucket->hi_count = 0;
2312 #endif /* MACH_PAGE_HASH_STATS */
2313 }
2314
2315 for (i = 0; i < vm_page_bucket_lock_count; i++) {
2316 lck_ticket_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket);
2317 }
2318
2319 vm_tag_init();
2320
2321 #if VM_PAGE_BUCKETS_CHECK
2322 vm_page_buckets_check_ready = TRUE;
2323 #endif /* VM_PAGE_BUCKETS_CHECK */
2324
2325 /*
2326 * Machine-dependent code allocates the resident page table.
2327 * It uses vm_page_init to initialize the page frames.
2328 * The code also returns to us the virtual space available
2329 * to the kernel. We don't trust the pmap module
2330 * to get the alignment right.
2331 */
2332
2333 kernel_debug_string_early("pmap_startup");
2334 pmap_startup(&virtual_space_start, &virtual_space_end);
2335 virtual_space_start = round_page(virtual_space_start);
2336 virtual_space_end = trunc_page(virtual_space_end);
2337
2338 *startp = virtual_space_start;
2339 *endp = virtual_space_end;
2340
2341 /*
2342 * Compute the initial "wire" count.
2343 * Up until now, the pages which have been set aside are not under
2344 * the VM system's control, so although they aren't explicitly
2345 * wired, they nonetheless can't be moved. At this moment,
2346 * all VM managed pages are "free", courtesy of pmap_startup.
2347 */
2348 assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
2349 vm_page_wire_count = ((unsigned int) atop_64(max_mem)) -
2350 vm_page_free_count - vm_lopage_free_count;
2351 #if CONFIG_SECLUDED_MEMORY
2352 vm_page_wire_count -= vm_page_secluded_count;
2353 #endif
2354 #if HAS_MTE
2355 /*
2356 * Discount any tag storage pages that we have set aside in
2357 * vm_page_release_startup().
2358 */
2359 vm_page_wire_count -= mte_tag_storage_count;
2360 #endif
2361 vm_page_wire_count_initial = vm_page_wire_count;
2362
2363 /* capture this for later use */
2364 booter_size = ml_get_booter_memory_size();
2365
2366 printf("vm_page_bootstrap: %d free pages, %d wired pages"
2367 #if XNU_VM_HAS_DELAYED_PAGES
2368 ", (up to %d of which are delayed free)"
2369 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2370 "%c",
2371 vm_page_free_count,
2372 vm_page_wire_count,
2373 #if XNU_VM_HAS_DELAYED_PAGES
2374 vm_delayed_count,
2375 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2376 '\n');
2377
2378 kernel_debug_string_early("vm_page_bootstrap complete");
2379 }
2380
2381 #ifndef MACHINE_PAGES
/*
 * This is the early boot time allocator for data structures needed to bootstrap the VM system.
 * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
 * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
 *
 * Returns the (word-aligned) virtual address of the stolen memory.
 * Panics on any failure; there is no error return at this stage of boot.
 *
 * @param size          number of bytes requested (rounded up to word size)
 * @param alignment     required alignment; 0 means word alignment; must be
 *                      a power of two no greater than a page
 * @param might_free    TRUE if this memory may later be released back via
 *                      ml_static_mfree() (affects which physical pages are used)
 * @param flags         pmap_enter() WIMG/flags override; 0 selects
 *                      VM_WIMG_USE_DEFAULT
 * @param mapping_type  mapping type passed through to pmap_enter()
 */
__static_testable void *
pmap_steal_memory_internal(
	vm_size_t size,
	vm_size_t alignment,
	boolean_t might_free,
	unsigned int flags,
	pmap_mapping_type_t mapping_type)
{
	kern_return_t kr;
	vm_offset_t addr;
	vm_offset_t end = 0;
	vm_offset_t map_addr;
	ppnum_t phys_page;
	unsigned int pmap_flags;

	/* Reject sizes that would overflow the word-size round-up below. */
	if (size > UINT64_MAX - sizeof(void *)) {
		panic("pmap_steal_memory(): size: 0x%lx", size);
	}
	/*
	 * Size needs to be aligned to word size.
	 */
	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

	/*
	 * Alignment defaults to word size if not specified.
	 */
	if (alignment == 0) {
		alignment = sizeof(void*);
	}

	/*
	 * Alignment must be no greater than a page and must be a power of two.
	 */
	assert(alignment <= PAGE_SIZE);
	assert((alignment & (alignment - 1)) == 0);

	/*
	 * On the first call, get the initial values for virtual address space
	 * and page align them.
	 */
	if (virtual_space_start == virtual_space_end) {
		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
		virtual_space_start = round_page(virtual_space_start);
		virtual_space_end = trunc_page(virtual_space_end);

#if defined(__x86_64__)
		/*
		 * Release remaining unused section of preallocated KVA and the 4K page tables
		 * that map it. This makes the VA available for large page mappings.
		 */
		Idle_PTs_release(virtual_space_start, virtual_space_end);
#endif
	}

	/*
	 * Allocate the virtual space for this request. On x86, we'll align to a large page
	 * address if the size is big enough to back with at least 1 large page.
	 */
#if defined(__x86_64__)
	if (size >= I386_LPGBYTES) {
		virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
	}
#endif
	virtual_space_start = (virtual_space_start + (alignment - 1)) & ~(alignment - 1);
	addr = virtual_space_start;
	virtual_space_start += size;

	//kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size);	/* (TEST/DEBUG) */

	/*
	 * Allocate and map physical pages to back the new virtual space.
	 */
	map_addr = round_page(addr);
	if (os_add_overflow(addr, size, &end)) {
		panic("pmap_steal_memory() overflow, addr: %lx, size: 0x%lx", addr, size);
	}
	while (map_addr < end) {
#if defined(__x86_64__)
		/*
		 * Back with a large page if properly aligned on x86
		 */
		if ((map_addr & I386_LPGMASK) == 0 &&
		    map_addr + I386_LPGBYTES <= addr + size &&
		    pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
		    pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
			kr = pmap_enter(kernel_pmap, map_addr, phys_page,
			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
			    VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE, mapping_type);

			if (kr != KERN_SUCCESS) {
				panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
				    (unsigned long)map_addr, phys_page);
			}
			map_addr += I386_LPGBYTES;
			/* Account for the whole large page in 4K-page units. */
			vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_kern_lpage_count++;
			continue;
		}
#endif

		if (!pmap_next_page_hi(&phys_page, might_free)) {
			panic("pmap_steal_memory() size: 0x%llx", (uint64_t)size);
		}

#if defined(__x86_64__)
		pmap_pre_expand(kernel_pmap, map_addr);
#endif
		pmap_flags = flags ? flags : VM_WIMG_USE_DEFAULT;

#if HAS_MTE
		/* Mark and prepare the page for memory tagging when requested. */
		if (pmap_flags & VM_MEM_MAP_MTE) {
			mteinfo_covered_page_set_stolen_tagged(phys_page);
			pmap_make_tagged_page(phys_page);
		}
#endif /* HAS_MTE */

		kr = pmap_enter(kernel_pmap, map_addr, phys_page,
		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
		    pmap_flags, FALSE, mapping_type);

		if (kr != KERN_SUCCESS) {
			panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
			    (unsigned long)map_addr, phys_page);
		}
		map_addr += PAGE_SIZE;

		/*
		 * Account for newly stolen memory
		 */
		vm_page_wire_count++;
		vm_page_stolen_count++;
	}

#if defined(__x86_64__)
	/*
	 * The call with might_free is currently the last use of pmap_steal_memory*().
	 * Notify the pmap layer to record which high pages were allocated so far.
	 */
	if (might_free) {
		pmap_hi_pages_done();
	}
#endif
#if KASAN
	kasan_notify_address(round_page(addr), size);
#endif
	return (void *) addr;
}
2535
/*
 * Steal boot-time memory that will never be freed back to the VM.
 * Thin wrapper over pmap_steal_memory_internal() with no extra
 * mapping flags and a restricted mapping type.
 */
__mockable void *
pmap_steal_memory(
	vm_size_t size,
	vm_size_t alignment)
{
	return pmap_steal_memory_internal(size, alignment, FALSE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
}
2543
/*
 * Steal boot-time memory that may later be released back to the VM
 * (might_free == TRUE). Uses default (word) alignment.
 */
void *
pmap_steal_freeable_memory(
	vm_size_t size)
{
	return pmap_steal_memory_internal(size, 0, TRUE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
}
2550
2551 #if HAS_MTE
/*
 * Steal boot-time memory for the zone subsystem with MTE tagging
 * requested (VM_MEM_MAP_MTE). Never freed back to the VM.
 */
void *
pmap_steal_zone_memory(
	vm_size_t size,
	vm_size_t alignment)
{
	return pmap_steal_memory_internal(size, alignment, FALSE, VM_MEM_MAP_MTE, PMAP_MAPPING_TYPE_RESTRICTED);
}
2559 #endif /* HAS_MTE */
2560
2561
2562 #if CONFIG_SECLUDED_MEMORY
2563 /* boot-args to control secluded memory */
2564 TUNABLE_DT(unsigned int, secluded_mem_mb, "/defaults", "kern.secluded_mem_mb", "secluded_mem_mb", 0, TUNABLE_DT_NONE);
2565 /* IOKit can use secluded memory */
2566 TUNABLE(bool, secluded_for_iokit, "secluded_for_iokit", true);
2567 /* apps can use secluded memory */
2568 TUNABLE(bool, secluded_for_apps, "secluded_for_apps", true);
/* filecache can use secluded memory */
2570 TUNABLE(secluded_filecache_mode_t, secluded_for_filecache, "secluded_for_filecache", SECLUDED_FILECACHE_RDONLY);
2571 uint64_t secluded_shutoff_trigger = 0;
2572 uint64_t secluded_shutoff_headroom = 150 * 1024 * 1024; /* original value from N56 */
2573 #endif /* CONFIG_SECLUDED_MEMORY */
2574
2575
2576 #if defined(__arm64__)
2577 extern void patch_low_glo_vm_page_info(void *, void *, uint32_t);
2578 #endif
2579
void vm_page_release_startup(vm_page_t mem);
/*
 * pmap_startup:
 *
 * Allocates and initializes the vm_pages[] array of page structures,
 * then releases the corresponding physical pages to the free lists.
 * Returns (via startp/endp) the kernel virtual space still available
 * after boot-time stealing is finished.
 */
__mockable void
pmap_startup(
	vm_offset_t *startp,
	vm_offset_t *endp)
{
	unsigned int npages;
	ppnum_t phys_page;
	uint64_t mem_sz;
	uint64_t start_ns;
	uint64_t now_ns;
	uint32_t divisor;
#if XNU_VM_HAS_DELAYED_PAGES
	uint_t low_page_count = 0;
#endif /* XNU_VM_HAS_DELAYED_PAGES */

	/*
	 * make sure we are aligned on a 64 byte boundary
	 * for VM_PAGE_PACK_PTR (it clips off the low-order
	 * 6 bits of the pointer)
	 */
	if (virtual_space_start != virtual_space_end) {
		virtual_space_start = round_page(virtual_space_start);
	}

	/*
	 * We calculate how many page frames we will have
	 * and then allocate the page structures in one chunk.
	 *
	 * Note that the calculation here doesn't take into account
	 * the memory needed to map what's being allocated, i.e. the page
	 * table entries. So the actual number of pages we get will be
	 * less than this. To do someday: include that in the computation.
	 *
	 * Also for ARM, we don't use the count of free_pages, but rather the
	 * range from last page to first page (ignore holes due to retired pages).
	 */

	/*
	 * Initialize and release the page frames.
	 */
	kernel_debug_string_early("page_frame_init");
	absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
	if (fillval) {
		kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
	}

#if XNU_VM_HAS_LINEAR_PAGES_ARRAY
	mem_sz = ptoa(pmap_free_pages_span());
#if HAS_MTE
	/* NOTE: with MTE enabled the SPTM free-mask setup below is skipped. */
	if (!is_mte_enabled)
#endif /* HAS_MTE */
#if CONFIG_SPTM
	{
		uint32_t count = vm_pages_free_mask_len();

		/* Bitmask + enqueue-index arrays used to track free vm_pages. */
		_vm_pages_free_masks = pmap_steal_memory(count *
		    sizeof(__uint128_t), sizeof(__uint128_t));
		_vm_pages_free_enqueue_idx = pmap_steal_memory(count, sizeof(uint8_t));
		bzero(_vm_pages_free_masks, count * sizeof(__uint128_t));
		memset(_vm_pages_free_enqueue_idx, 0xff, count);
	}
#endif /* CONFIG_SPTM */
#else
	mem_sz = ptoa(pmap_free_pages());
#endif
	mem_sz += round_page(virtual_space_start) - virtual_space_start;        /* Account for any slop */
	/* Each page of RAM needs one struct vm_page in addition to itself. */
	divisor = PAGE_SIZE + sizeof(struct vm_page);
	npages = (uint32_t)((mem_sz + divisor - 1) / divisor);   /* scaled to include the vm_page_ts */


	vm_pages = pmap_steal_freeable_memory(npages * sizeof(struct vm_page));
	vm_pages_end = vm_page_get(npages);

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Figure out how much secluded memory to have before we start
	 * release pages to free lists.
	 * The default, if specified nowhere else, is no secluded mem.
	 */
	vm_page_secluded_target = (unsigned int)atop_64(secluded_mem_mb * 1024ULL * 1024ULL);

	/*
	 * Allow a really large app to effectively use secluded memory until it exits.
	 */
	if (vm_page_secluded_target != 0) {
		/*
		 * Get an amount from boot-args, else use 1/2 of max_mem.
		 * 1/2 max_mem was chosen from a Peace daemon tentpole test which
		 * used munch to induce jetsam thrashing of false idle daemons on N56.
		 */
		int secluded_shutoff_mb;
		if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
		    sizeof(secluded_shutoff_mb))) {
			secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
		} else {
			secluded_shutoff_trigger = max_mem / 2;
		}

		/* ensure the headroom value is sensible and avoid underflows */
		assert(secluded_shutoff_trigger == 0 || secluded_shutoff_trigger > secluded_shutoff_headroom);
	}
#endif /* CONFIG_SECLUDED_MEMORY */

#if defined(__x86_64__)

	/*
	 * Decide how much memory we delay freeing at boot time.
	 */
	uint32_t delay_above_gb;
	if (!PE_parse_boot_argn("delay_above_gb", &delay_above_gb, sizeof(delay_above_gb))) {
		delay_above_gb = DEFAULT_DELAY_ABOVE_PHYS_GB;
	}

	if (delay_above_gb == 0) {
		delay_above_pnum = PPNUM_MAX;
	} else {
		delay_above_pnum = delay_above_gb * (1024 * 1024 * 1024 / PAGE_SIZE);
	}

	/* make sure we have sane breathing room: 1G above low memory */
	if (delay_above_pnum <= max_valid_low_ppnum) {
		delay_above_pnum = max_valid_low_ppnum + ((1024 * 1024 * 1024) >> PAGE_SHIFT);
	}

	if (delay_above_pnum < PPNUM_MAX) {
		printf("pmap_startup() delaying init/free of page nums > 0x%x\n", delay_above_pnum);
	}

#endif /* defined(__x86_64__) */


	vm_free_page_lock();

	for (uint32_t i = 0; i < npages && pmap_next_page(&phys_page); i++) {
#if XNU_VM_HAS_DELAYED_PAGES
		if (phys_page < max_valid_low_ppnum) {
			++low_page_count;
		}

		/* Are we at high enough pages to delay the rest? */
		if (low_page_count > vm_lopage_free_limit &&
		    phys_page > delay_above_pnum) {
			vm_delayed_count = pmap_free_pages();
			assert3u(vm_pages_count + vm_delayed_count, <=, npages);
			break;
		}
#endif /* XNU_VM_HAS_DELAYED_PAGES */

#if XNU_VM_HAS_LINEAR_PAGES_ARRAY
		if (i == 0) {
			/* Record the pnum of vm_pages[0] and publish to low globals. */
			vm_pages_first_pnum = phys_page;
			patch_low_glo_vm_page_info(vm_pages, vm_pages_end,
			    vm_pages_first_pnum);
#if HAS_MTE
			if (is_mte_enabled) {
				vm_pages_tag_storage = vm_page_get(
					(mte_tag_storage_start_pnum - vm_pages_first_pnum));
				vm_pages_tag_storage_end = vm_tag_storage_page_get(mte_tag_storage_count);
				assert3p(vm_pages_tag_storage_end, <=, vm_pages_end);
			}
#endif /* HAS_MTE */
		}
#else
		/* The x86 clump freeing code requires increasing ppn's to work correctly */
		if (i > 0) {
			assert(phys_page > vm_page_get(i - 1)->vmp_phys_page);
		}
#endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */

		++vm_pages_count;
		vm_page_init(vm_page_get(i), phys_page);
		if (fillval) {
			fillPage(phys_page, fillval);
		}
		if (vm_himemory_mode) {
			vm_page_release_startup(vm_page_get(i));
		}
	}

	vm_page_pages = vm_pages_count; /* used to report to user space */

	/* In low-memory-first mode, release in reverse order instead. */
	if (!vm_himemory_mode) {
		for (uint32_t i = npages; i-- > 0;) {
			/* skip retired pages */
			if (!VMP_ERROR_GET(vm_page_get(i))) {
				vm_page_release_startup(vm_page_get(i));
			}
		}
	}

	vm_free_page_unlock();

	absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns);
	printf("pmap_startup() init/release time: %lld microsec\n",
	    (now_ns - start_ns) / NSEC_PER_USEC);
#if XNU_VM_HAS_DELAYED_PAGES
	printf("pmap_startup() delayed init/release of %d pages\n",
	    vm_delayed_count);
#endif /* XNU_VM_HAS_DELAYED_PAGES */

	/*
	 * Validate packing will work properly. This needs to be done last
	 * after vm_pages_count has been computed.
	 */
	if (npages >= VM_PAGE_PACKED_FROM_ARRAY) {
		panic("pmap_startup(): too many pages to support vm_page packing");
	}
	if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_pages)) != vm_pages) {
		panic("VM_PAGE_PACK_PTR failed on vm_pages - %p", vm_pages);
	}
	if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_page_get(vm_pages_count - 1))) !=
	    vm_page_get(vm_pages_count - 1)) {
		panic("VM_PAGE_PACK_PTR failed on vm_pages_end - %p",
		    vm_page_get(vm_pages_count - 1));
	}

	VM_CHECK_MEMORYSTATUS;

	/*
	 * We have to re-align virtual_space_start,
	 * because pmap_steal_memory has been using it.
	 */
	virtual_space_start = round_page(virtual_space_start);
	*startp = virtual_space_start;
	*endp = virtual_space_end;
}
2807 #endif /* MACHINE_PAGES */
2808
/*
 * Create the zone that represents the vm_pages[] array. Nothing ever allocates
 * or frees to this zone. It's just here for reporting purposes via zprint command.
 * This needs to be done after all initially delayed pages are put on the free lists.
 */
void
vm_pages_array_finalize(void)
{
	(void)zone_create_ext("vm pages array", sizeof(struct vm_page),
	    ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, ZONE_ID_VM_PAGES, ^(zone_t z) {
		uint64_t vm_page_zone_pages, vm_page_array_zone_data_size;

		zone_set_exhaustible(z, 0, true);
		/*
		 * Reflect size and usage information for vm_pages[].
		 */

		/* capacity = full array span; free = unused tail of the array */
		z->z_elems_avail = (uint32_t)(vm_pages_end - vm_pages);
		z->z_elems_free = z->z_elems_avail - vm_pages_count;
		zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
		vm_pages_count * sizeof(struct vm_page);
		vm_page_array_zone_data_size = (uint64_t)vm_pages_end - (uint64_t)vm_pages;
		vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
		z->z_wired_cur += vm_page_zone_pages;
		z->z_wired_hwm = z->z_wired_cur;
		z->z_va_cur = z->z_wired_cur;
		/* since zone accounts for these, take them out of stolen */
		VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
	});
}
2839
/*
 * Create the vm_pages zone. This is used for the vm_page structures for the pages
 * that are scavanged from other boot time usages by ml_static_mfree(). As such,
 * this needs to happen in early VM bootstrap.
 */

__startup_func
static void
vm_page_module_init(void)
{
	vm_size_t vm_page_with_ppnum_size;

	/*
	 * Since the pointers to elements in this zone will be packed, they
	 * must have appropriate size. Not strictly what sizeof() reports.
	 */
	/* Round the element size up to the packed-pointer alignment. */
	vm_page_with_ppnum_size =
	    (sizeof(struct vm_page_with_ppnum) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
	    ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);

	vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
	    ZC_ALIGNMENT_REQUIRED | ZC_VM,
	    ZONE_ID_ANY, ^(zone_t z) {
		/*
		 * The number "10" is a small number that is larger than the number
		 * of fictitious pages that any single caller will attempt to allocate
		 * without blocking.
		 *
		 * The largest such number at the moment is kmem_alloc()
		 * when 2 guard pages are asked. 10 is simply a somewhat larger number,
		 * taking into account the 50% hysteresis the zone allocator uses.
		 *
		 * Note: this works at all because the zone allocator
		 * doesn't ever allocate fictitious pages.
		 */
		zone_raise_reserve(z, 10);
	});
}
2878 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
2879
2880 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2881 /*
2882 * Radix tree of pages within the [pmap_first_pnum, vm_pages_first_pnum) range,
2883 * in order to support page lookup by pnum (@see vm_page_find_canonical()),
2884 * which corresponds to pages returned to the VM via @c ml_static_mfree().
2885 *
2886 * Kernel vm pages are never freed, which means that this data structure
2887 * is insert only.
2888 *
2889 * Empirically we have about 4-5k such pages, typically in only few rather dense
2890 * contiguous spans, inside a range of roughly 32k pnums.
2891 *
2892 * A radix tree works well with the distribution of keys, but also allows for
2893 * a straightforward lockless lookup path.
2894 */
2895
2896 #define VM_PAGE_RADIX_FANOUT_SHIFT 8
2897 #define VM_PAGE_RADIX_FANOUT (1u << VM_PAGE_RADIX_FANOUT_SHIFT)
2898
2899 typedef uint32_t vm_page_radix_ptr_t;
2900
2901 typedef struct vm_page_radix_node {
2902 vm_page_radix_ptr_t vmpr_array[VM_PAGE_RADIX_FANOUT];
2903 } *vm_page_radix_node_t;
2904
2905 static LCK_GRP_DECLARE(vm_pages_radix_lock_grp, "VM pages radix");
2906 static LCK_MTX_DECLARE(vm_pages_radix_lock, &vm_pages_radix_lock_grp);
2907
2908 static SECURITY_READ_ONLY_LATE(uintptr_t) vm_pages_radix_root;
2909 static uint32_t vm_pages_radix_count;
2910
2911 static vm_page_radix_node_t
vm_page_radix_node_unpack(vm_page_radix_ptr_t ptr)2912 vm_page_radix_node_unpack(vm_page_radix_ptr_t ptr)
2913 {
2914 return (vm_page_radix_node_t)VM_UNPACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2915 }
2916
2917 static vm_page_radix_ptr_t
vm_page_radix_node_pack(vm_page_radix_node_t node)2918 vm_page_radix_node_pack(vm_page_radix_node_t node)
2919 {
2920 vm_offset_t ptr = (vm_offset_t)node;
2921
2922 VM_ASSERT_POINTER_PACKABLE(ptr, VM_PAGE_PACKED_PTR);
2923 return (vm_page_radix_ptr_t)VM_PACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2924 }
2925
2926 static uint32_t
vm_page_radix_key(uint32_t level,uint32_t index)2927 vm_page_radix_key(uint32_t level, uint32_t index)
2928 {
2929 uint32_t key = index >> (VM_PAGE_RADIX_FANOUT_SHIFT * level);
2930
2931 return key & (VM_PAGE_RADIX_FANOUT - 1);
2932 }
2933
2934 static vm_page_radix_ptr_t *
vm_page_radix_slot(vm_page_radix_node_t node,uint32_t level,uint32_t index)2935 vm_page_radix_slot(vm_page_radix_node_t node, uint32_t level, uint32_t index)
2936 {
2937 return node->vmpr_array + vm_page_radix_key(level, index);
2938 }
2939
/*
 * Lazily allocate and publish the top-level radix node.
 *
 * The root is sized to cover the largest possible index,
 * vm_pages_first_pnum - pmap_first_pnum - 1, so it may be a partial
 * (truncated) node: only the slots actually reachable are allocated.
 * The tree level is derived from the position of the highest set bit of
 * that maximum index.  Returns the root node and stores the level via
 * @c levelp.
 *
 * Called with vm_pages_radix_lock held, from the insert path, on first
 * insertion (startup-only, see __startup_func).
 */
__startup_func
__attribute__((noinline))
static vm_page_radix_node_t
vm_pages_radix_init_root(uint32_t *levelp)
{
	uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum - 1;
	vm_page_radix_node_t root;
	uint32_t level;
	vm_size_t size;

	/*
	 * Init a top-level node right away, to cover any index within
	 * [0, vm_pages_first_pnum - pmap_first_pnum)
	 */
	/* "| 1" guards fls() against a zero max_index */
	level = (fls(max_index | 1) - 1) / VM_PAGE_RADIX_FANOUT_SHIFT;
	size = (vm_page_radix_key(level, max_index) + 1) *
	    sizeof(vm_page_radix_ptr_t);

	root = zalloc_permanent(size, ZALIGN_64);

	/*
	 * Pack the level into the root pointer low bits,
	 * so that pointer and level can be read atomically.
	 *
	 * See vm_pages_radix_load_root().
	 */
	os_atomic_store(&vm_pages_radix_root, (uintptr_t)root | level, release);

	*levelp = level;
	return root;
}
2971
/*
 * Allocate a new full-fanout radix node and publish it into @c slot.
 *
 * The store uses release ordering so that lockless readers that observe
 * the packed pointer also observe the node's initialized (empty)
 * contents.  Nodes are permanent: the tree is insert-only and never
 * frees memory.  (NOTE(review): empty slots are assumed to read as 0,
 * i.e. zalloc_permanent memory is assumed zero-filled — confirm.)
 */
static vm_page_radix_node_t
vm_pages_radix_node_alloc(vm_page_radix_ptr_t *slot)
{
	vm_page_radix_node_t node;

	/* alignment mask chosen so the node pointer stays packable */
	node = zalloc_permanent(sizeof(struct vm_page_radix_node),
	    VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
	os_atomic_store(slot, vm_page_radix_node_pack(node), release);
	return node;
}
2982
/*
 * Atomically load the radix root and decode the tree level from the
 * pointer's low bits (packed together by vm_pages_radix_init_root() so
 * both can be read in a single atomic load).
 *
 * Uses a dependency-ordered load, pairing with the release stores on
 * the publication side, so lockless readers may safely dereference the
 * returned node.  Returns NULL if the tree has not been initialized.
 */
static vm_page_radix_node_t
vm_pages_radix_load_root(uint32_t *level)
{
	const uintptr_t VM_PAGE_RADIX_LEVEL_MASK = 0x7ul;

	uintptr_t root = os_atomic_load(&vm_pages_radix_root, dependency);

	*level = root & VM_PAGE_RADIX_LEVEL_MASK;
	root &= ~VM_PAGE_RADIX_LEVEL_MASK;
	return (vm_page_radix_node_t)root;
}
2994
/*
 * Lockless iterator over the radix tree.
 *
 * Returns the next inserted page whose index is at or after *cursor,
 * advancing *cursor just past it so repeated calls enumerate all pages.
 * If @c pnum is non-NULL it receives the page's physical page number
 * (or 0 when the iteration is exhausted).  Returns VM_PAGE_NULL when no
 * further page exists below vm_pages_first_pnum.
 */
vm_page_t
vm_pages_radix_next(uint32_t *cursor, ppnum_t *pnum)
{
	const uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum;
	vm_page_radix_node_t node;
	uint32_t level, index;

	index = *cursor;
	node = vm_pages_radix_load_root(&level);

	if (node == NULL) {
		/* tree never initialized: nothing was ever inserted */
		return VM_PAGE_NULL;
	}

	while (index < max_index) {
		vm_page_radix_ptr_t *slot = vm_page_radix_slot(node, level, index);
		vm_page_radix_ptr_t ptr = os_atomic_load(slot, dependency);

		if (ptr == 0) {
			/*
			 * Empty slot: skip the entire subtree it would
			 * cover by rounding index up to the next multiple
			 * of this level's stride.
			 */
			uint32_t stride = 1 << (VM_PAGE_RADIX_FANOUT_SHIFT * level);

			index = (index + stride) & -stride;
			if (vm_page_radix_key(level, index) == 0) {
				/* restart lookup at the top */
				node = vm_pages_radix_load_root(&level);
			}
		} else if (level > 0) {
			/* interior slot: descend one level */
			node = vm_page_radix_node_unpack(ptr);
			level -= 1;
		} else {
			/* leaf slot: found the next page */
			*cursor = index + 1;
			if (pnum) {
				*pnum = pmap_first_pnum + index;
			}
			return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
		}
	}

	if (pnum) {
		*pnum = 0;
	}
	return VM_PAGE_NULL;
}
3038
3039 #if DEBUG || DEVELOPMENT
3040
/*
 * Development-only sysctl test: walk the whole radix tree and verify
 * that every enumerated page can be found again by its physical page
 * number, and that the enumeration count matches the insert count.
 *
 * Holds vm_pages_radix_lock to keep vm_pages_radix_count stable against
 * concurrent inserts during the walk.
 */
static int
vm_page_radix_verify_test(int64_t in __unused, int64_t *out)
{
	uint32_t count = 0;
	vm_page_t mem;

	lck_mtx_lock(&vm_pages_radix_lock);

	vm_pages_radix_for_each(mem) {
		count++;
		/* lookup by pnum must round-trip to the same page */
		assert(mem == vm_page_find_canonical(VM_PAGE_GET_PHYS_PAGE(mem)));
	}

	assert(count == vm_pages_radix_count);

	lck_mtx_unlock(&vm_pages_radix_lock);

	*out = 1;
	return 0;
}
SYSCTL_TEST_REGISTER(vm_page_radix_verify, vm_page_radix_verify_test);
3062
3063 #endif /* DEBUG || DEVELOPMENT */
3064
/*
 * Insert @c page into the radix tree under physical page number @c pnum.
 *
 * Writers are serialized by vm_pages_radix_lock; the root and any
 * missing interior nodes are created lazily on the way down.  The leaf
 * slot is published with release ordering so that lockless readers
 * (vm_page_find_canonical(), vm_pages_radix_next()) observe a fully
 * initialized page.  The structure is insert-only: the target slot must
 * be empty, and pages are never removed.
 */
__attribute__((noinline))
static void
vm_pages_radix_insert(ppnum_t pnum, vm_page_t page)
{
	vm_page_radix_ptr_t *slot;
	vm_page_radix_node_t node;
	uint32_t level, index;

	/* only non-array ("canonical created") pages go in the tree */
	assert(!vm_page_in_array(page));
	index = pnum - pmap_first_pnum;

	lck_mtx_lock(&vm_pages_radix_lock);

	node = vm_pages_radix_load_root(&level);
	if (node == NULL) {
		node = vm_pages_radix_init_root(&level);
	}

	/* walk down to level 0, materializing nodes as needed */
	for (; level > 0; level--) {
		slot = vm_page_radix_slot(node, level, index);
		if (*slot == 0) {
			node = vm_pages_radix_node_alloc(slot);
		} else {
			node = vm_page_radix_node_unpack(*slot);
		}
	}

	slot = vm_page_radix_slot(node, 0, index);
	assert(*slot == 0);
	os_atomic_store(slot, VM_PAGE_PACK_PTR(page), release);
	vm_pages_radix_count++;

	lck_mtx_unlock(&vm_pages_radix_lock);
}
3099
3100 __abortlike
3101 static void
vm_page_for_ppnum_panic(ppnum_t pnum)3102 vm_page_for_ppnum_panic(ppnum_t pnum)
3103 {
3104 if (pnum < pmap_first_pnum) {
3105 panic("physical page is before the start of DRAM: %#x < %#x)",
3106 pnum, pmap_first_pnum);
3107 }
3108 panic("physical page is beyond the end of managed DRAM: %#x >= %#x)",
3109 pnum, vm_pages_first_pnum + vm_pages_count);
3110 }
3111
/*
 * Return the canonical vm_page_t for physical page @c pnum.
 *
 * Pages at or above vm_pages_first_pnum live in the linear vm_pages
 * array (fast path).  Pages in [pmap_first_pnum, vm_pages_first_pnum)
 * are looked up locklessly in the radix tree.  Panics if pnum is below
 * pmap_first_pnum; returns VM_PAGE_NULL for pnums beyond the managed
 * range (callers can hit that, e.g. on ECC errors) or for pages that
 * were never inserted.
 */
vm_page_t
vm_page_find_canonical(ppnum_t pnum)
{
	vm_page_radix_ptr_t *slot;
	vm_page_radix_node_t node;
	vm_page_radix_ptr_t ptr;
	uint32_t level, index;

	if (pnum < pmap_first_pnum) {
		vm_page_for_ppnum_panic(pnum);
	}

	if (pnum >= vm_pages_first_pnum + vm_pages_count) {
		/*
		 * We could receive requests for pages which are beyond the xnu's managed space. (eg: ECC errors)
		 * These need to be handled gracefully, so we return VM_PAGE_NULL here.
		 */
		return VM_PAGE_NULL;
	}

	if (__probable(pnum >= vm_pages_first_pnum)) {
		/* fast path: direct index into the linear vm_pages array */
		return vm_page_get(pnum - vm_pages_first_pnum);
	}

	index = pnum - pmap_first_pnum;
	node = vm_pages_radix_load_root(&level);

	/*
	 * Lockless descent: dependency-ordered loads pair with the
	 * release stores on the insert path.  An empty slot unpacks to
	 * NULL and terminates the walk.
	 */
	for (; node && level > 0; level--) {
		slot = vm_page_radix_slot(node, level, index);
		ptr = os_atomic_load(slot, dependency);
		node = vm_page_radix_node_unpack(ptr);
	}

	if (__probable(node)) {
		slot = vm_page_radix_slot(node, 0, index);
		ptr = os_atomic_load(slot, dependency);
		return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
	}

	return VM_PAGE_NULL;
}
3153
3154 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
3155
3156 /*!
3157 * @function vm_page_create()
3158 *
3159 * @brief
3160 * Common helper for all vm_page_create* functions.
3161 */
vm_page_t
vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags)
{
	vm_page_t m;

	m = zalloc_flags(vm_page_zone, flags);
	if (m) {
		vm_page_init(m, phys_page);
		if (phys_page == vm_page_guard_addr) {
			counter_inc(&vm_guard_count);
		}
	}
	if (canonical) {
		/*
		 * NOTE(review): 'm' is dereferenced below without a NULL
		 * check; this relies on blocking allocations (no
		 * Z_NOWAIT/Z_NOPAGEWAIT, asserted here) never failing —
		 * confirm that invariant holds for vm_page_zone.
		 */
		assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
		m->vmp_canonical = true;
#if HAS_MTE
		/* record whether the pmap reports this physical page as tagged */
		m->vmp_using_mte = pmap_is_tagged_page(phys_page);
#endif /* HAS_MTE */
#if XNU_VM_HAS_LINEAR_PAGES_ARRAY
		/* make the page discoverable via vm_page_find_canonical() */
		vm_pages_radix_insert(phys_page, m);
#endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
		vm_free_page_lock();
		vm_page_pages++;
		vm_free_page_unlock();
	}
	return m;
}
3189
3190 /*
3191 * Routine: vm_page_create_canonical
3192 * Purpose:
3193 * After the VM system is up, machine-dependent code
3194 * may stumble across more physical memory. For example,
3195 * memory that it was reserving for a frame buffer.
3196 * vm_page_create_canonical turns this memory into available pages.
3197 */
3198
3199 void
vm_page_create_canonical(ppnum_t phys_page)3200 vm_page_create_canonical(ppnum_t phys_page)
3201 {
3202 vm_page_t m;
3203
3204 m = vm_page_create(phys_page, true, Z_WAITOK);
3205 vm_page_release(m, VMP_RELEASE_NONE);
3206 }
3207
3208
3209 /*
3210 * vm_page_hash:
3211 *
3212 * Distributes the object/offset key pair among hash buckets.
3213 *
3214 * NOTE: The bucket count must be a power of 2
3215 */
3216 #define vm_page_hash(object, offset) (\
3217 ( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
3218 & vm_page_hash_mask)
3219
3220
3221 /*
3222 * vm_page_insert: [ internal use only ]
3223 *
3224 * Inserts the given mem entry into the object/object-page
3225 * table and object list.
3226 *
3227 * The object must be locked.
3228 */
3229 void
vm_page_insert(vm_page_t mem,vm_object_t object,vm_object_offset_t offset)3230 vm_page_insert(
3231 vm_page_t mem,
3232 vm_object_t object,
3233 vm_object_offset_t offset)
3234 {
3235 vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, FALSE, FALSE, NULL);
3236 }
3237
3238 void
vm_page_insert_wired(vm_page_t mem,vm_object_t object,vm_object_offset_t offset,vm_tag_t tag)3239 vm_page_insert_wired(
3240 vm_page_t mem,
3241 vm_object_t object,
3242 vm_object_offset_t offset,
3243 vm_tag_t tag)
3244 {
3245 vm_page_insert_internal(mem, object, offset, tag, FALSE, TRUE, FALSE, FALSE, NULL);
3246 }
3247
/*
 *	vm_page_insert_internal:	[ internal use only ]
 *
 *	Workhorse behind vm_page_insert(), vm_page_insert_wired() and
 *	vm_page_replace(): records the (object, offset) pair in the page,
 *	optionally inserts it into the object/offset hash table, links it
 *	onto the object's list of resident pages, and updates resident and
 *	wired page counts, purgeable counters and ownership ledgers.
 *
 *	The object must be locked exclusively.
 *
 *	queues_lock_held:	caller already owns vm_page_queue_lock.
 *	insert_in_hash:		FALSE when the page is already hashed
 *				(e.g. called from vm_page_replace()).
 *	batch_pmap_op:		passed through to PMAP_SET_CACHE_ATTR so
 *				pmap updates can be batched by the caller.
 *	batch_accounting:	when TRUE, the caller updates the
 *				vm_page_internal/external_count itself.
 *	delayed_ledger_update:	when non-NULL, non-volatile ledger bytes
 *				are accumulated here instead of being
 *				credited immediately.
 */
void
vm_page_insert_internal(
	vm_page_t mem,
	vm_object_t object,
	vm_object_offset_t offset,
	vm_tag_t tag,
	boolean_t queues_lock_held,
	boolean_t insert_in_hash,
	boolean_t batch_pmap_op,
	boolean_t batch_accounting,
	uint64_t *delayed_ledger_update)
{
	vm_page_bucket_t *bucket;
	lck_ticket_t *bucket_lock;
	int hash_id;
	task_t owner;
	int ledger_idx_volatile;
	int ledger_idx_nonvolatile;
	int ledger_idx_volatile_compressed;
	int ledger_idx_nonvolatile_compressed;
	int ledger_idx_composite;
	int ledger_idx_external_wired;
	boolean_t do_footprint;

#if 0
	/*
	 * we may not hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif

	assertf(page_aligned(offset), "0x%llx\n", offset);

	/* wiring a canonical page requires a real allocation tag */
	assert(!VM_PAGE_WIRED(mem) || !vm_page_is_canonical(mem) ||
	    (tag != VM_KERN_MEMORY_NONE));

#if HAS_MTE
	assert_mte_vmo_matches_vmp(object, mem);
#endif /* HAS_MTE */
	vm_object_lock_assert_exclusive(object);
	LCK_MTX_ASSERT(&vm_page_queue_lock,
	    queues_lock_held ? LCK_MTX_ASSERT_OWNED
	    : LCK_MTX_ASSERT_NOTOWNED);

	if (queues_lock_held == FALSE) {
		assert(!VM_PAGE_PAGEABLE(mem));
	}

	if (insert_in_hash == TRUE) {
#if DEBUG || VM_PAGE_BUCKETS_CHECK
		if (mem->vmp_tabled || mem->vmp_object) {
			panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
			    "already in (obj=%p,off=0x%llx)",
			    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
		}
#endif
		if (object->internal && (offset >= object->vo_size)) {
			panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
			    mem, object, offset, object->vo_size);
		}

		assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);

		/*
		 * Record the object/offset pair in this page
		 */

		mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
		mem->vmp_offset = offset;

#if CONFIG_SECLUDED_MEMORY
		if (object->eligible_for_secluded) {
			vm_page_secluded.eligible_for_secluded++;
		}
#endif /* CONFIG_SECLUDED_MEMORY */

		/*
		 * Insert it into the object_object/offset hash table
		 */
		hash_id = vm_page_hash(object, offset);
		bucket = &vm_page_buckets[hash_id];
		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);

		/* push at the head of the bucket's singly-linked list */
		mem->vmp_next_m = bucket->page_list;
		bucket->page_list = VM_PAGE_PACK_PTR(mem);
		assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));

#if MACH_PAGE_HASH_STATS
		if (++bucket->cur_count > bucket->hi_count) {
			bucket->hi_count = bucket->cur_count;
		}
#endif /* MACH_PAGE_HASH_STATS */
		mem->vmp_hashed = TRUE;
		lck_ticket_unlock(bucket_lock);
	}

	{
		unsigned int cache_attr;

		cache_attr = object->wimg_bits & VM_WIMG_MASK;

#if HAS_MTE
		/*
		 * Set the cache attributes if it's neither the default attributes
		 * nor it's WIMG_MTE because we would have already set it before
		 * inserting the page into this object. There is no need to take
		 * the set hit.
		 */
		if (cache_attr == VM_WIMG_MTE) {
			if (vm_object_is_mte_mappable_with_page(object, mem)) {
				/*
				 * By now, we expect non-fictitious pages to have been made
				 * tagged. This should happen in mteinfo_page_list_fix_tagging()
				 * when the page is inserted onto the per-CPU free tagged queue.
				 */
				assert(mem->vmp_using_mte);
				assert(pmap_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem)) == VM_WIMG_MTE);
			} else {
				/*
				 * We don't want the object for fictitious pages to have its
				 * cache attributes set if the object is MTE.
				 */
			}
		} else {
#endif /* HAS_MTE */

		if (cache_attr != VM_WIMG_USE_DEFAULT) {
			PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
		}

#if HAS_MTE
	}
#endif
	}

	/*
	 * Now link into the object's list of backed pages.
	 */
	vm_page_queue_enter(&object->memq, mem, vmp_listq);
	object->memq_hint = mem;
	mem->vmp_tabled = TRUE;

	/*
	 * Show that the object has one more resident page.
	 */

	object->resident_page_count++;
	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);
		VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
		VM_OBJECT_WIRED_PAGE_ADD(object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	}
	assert(object->resident_page_count >= object->wired_page_count);

#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
	vm_object_set_chead_hint(object);
#endif

#if DEVELOPMENT || DEBUG
	if (object->object_is_shared_cache &&
	    object->pager != NULL &&
	    object->pager->mo_pager_ops == &shared_region_pager_ops) {
		int new, old;
		assert(!object->internal);
		new = OSAddAtomic(+1, &shared_region_pagers_resident_count);
		/* racy peak update: retry the CAS until the peak is current */
		do {
			old = shared_region_pagers_resident_peak;
		} while (old < new &&
		    !OSCompareAndSwap(old, new, &shared_region_pagers_resident_peak));
	}
#endif /* DEVELOPMENT || DEBUG */

	if (batch_accounting == FALSE) {
		if (object->internal) {
			OSAddAtomic(1, &vm_page_internal_count);
		} else {
			OSAddAtomic(1, &vm_page_external_count);
		}
	}

	/*
	 * It wouldn't make sense to insert a "reusable" page in
	 * an object (the page would have been marked "reusable" only
	 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
	 * in the object at that time).
	 * But a page could be inserted in a "all_reusable" object, if
	 * something faults it in (a vm_read() from another task or a
	 * "use-after-free" issue in user space, for example). It can
	 * also happen if we're relocating a page from that object to
	 * a different physical page during a physically-contiguous
	 * allocation.
	 */
	assert(!mem->vmp_reusable);
	if (object->all_reusable) {
		OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
	}

	/* determine the owning task and ledger indices for accounting */
	if (object->purgable == VM_PURGABLE_DENY &&
	    !object->vo_ledger_tag) {
		owner = TASK_NULL;
	} else {
		owner = VM_OBJECT_OWNER(object);
		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &ledger_idx_composite,
		    &ledger_idx_external_wired,
		    &do_footprint);
	}
	if (owner &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    object->purgable == VM_PURGABLE_DENY ||
	    VM_PAGE_WIRED(mem))) {
		if (delayed_ledger_update) {
			*delayed_ledger_update += PAGE_SIZE;
		} else {
			/* more non-volatile bytes */
			ledger_credit(owner->ledger,
			    ledger_idx_nonvolatile,
			    PAGE_SIZE);
			if (do_footprint) {
				/* more footprint */
				ledger_credit(owner->ledger,
				    task_ledgers.phys_footprint,
				    PAGE_SIZE);
			} else if (ledger_idx_composite != -1) {
				ledger_credit(owner->ledger,
				    ledger_idx_composite,
				    PAGE_SIZE);
			}
		}
	} else if (owner &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY)) {
		assert(!VM_PAGE_WIRED(mem));
		/* more volatile bytes */
		ledger_credit(owner->ledger,
		    ledger_idx_volatile,
		    PAGE_SIZE);
	}

	if (object->purgable == VM_PURGABLE_VOLATILE) {
		if (VM_PAGE_WIRED(mem)) {
			OSAddAtomic(+1, &vm_page_purgeable_wired_count);
		} else {
			OSAddAtomic(+1, &vm_page_purgeable_count);
		}
	} else if (object->purgable == VM_PURGABLE_EMPTY &&
	    mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
		/*
		 * This page belongs to a purged VM object but hasn't
		 * been purged (because it was "busy").
		 * It's in the "throttled" queue and hence not
		 * visible to vm_pageout_scan(). Move it to a pageable
		 * queue, so that it can eventually be reclaimed, instead
		 * of lingering in the "empty" object.
		 */
		if (queues_lock_held == FALSE) {
			vm_page_lockspin_queues();
		}
		vm_page_deactivate(mem);
		if (queues_lock_held == FALSE) {
			vm_page_unlock_queues();
		}
	}

#if HAS_MTE
	/*
	 * If adding pages to the compressor object, account for whether it's
	 * tag storage or not.
	 */
	if (object == compressor_object) {
		if (vm_page_is_tag_storage(mem)) {
			counter_inc(&compressor_tag_storage_pages_in_pool);
		} else {
			counter_inc(&compressor_non_tag_storage_pages_in_pool);
		}
	}
#endif /* HAS_MTE */

#if VM_OBJECT_TRACKING_OP_MODIFIED
	/*
	 * NOTE(review): resident_page_count was incremented above, so the
	 * "== 0" condition below looks unsatisfiable on this path —
	 * confirm the intended check.
	 */
	if (vm_object_tracking_btlog &&
	    object->internal &&
	    object->resident_page_count == 0 &&
	    object->pager == NULL &&
	    object->shadow != NULL &&
	    object->shadow->vo_copy == object) {
		btlog_record(vm_object_tracking_btlog, object,
		    VM_OBJECT_TRACKING_OP_MODIFIED,
		    btref_get(__builtin_frame_address(0), 0));
	}
#endif /* VM_OBJECT_TRACKING_OP_MODIFIED */
}
3551
3552 /*
3553 * vm_page_replace:
3554 *
3555 * Exactly like vm_page_insert, except that we first
3556 * remove any existing page at the given offset in object.
3557 *
3558 * The object must be locked.
3559 */
void
vm_page_replace(
	vm_page_t mem,
	vm_object_t object,
	vm_object_offset_t offset)
{
	vm_page_bucket_t *bucket;
	vm_page_t found_m = VM_PAGE_NULL;
	lck_ticket_t *bucket_lock;
	int hash_id;

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
#if HAS_MTE
	assert_mte_vmo_matches_vmp(object, mem);
#endif /* HAS_MTE */
	vm_object_lock_assert_exclusive(object);
#if DEBUG || VM_PAGE_BUCKETS_CHECK
	if (mem->vmp_tabled || mem->vmp_object) {
		panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
		    "already in (obj=%p,off=0x%llx)",
		    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
	}
#endif
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	assert(!VM_PAGE_PAGEABLE(mem));

	/*
	 * Record the object/offset pair in this page
	 */
	mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
	mem->vmp_offset = offset;

	/*
	 * Insert it into the object_object/offset hash table,
	 * replacing any page that might have been there.
	 */

	hash_id = vm_page_hash(object, offset);
	bucket = &vm_page_buckets[hash_id];
	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

	lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);

	if (bucket->page_list) {
		/* walk the bucket looking for a page at the same (obj, off) */
		vm_page_packed_t *mp = &bucket->page_list;
		vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp));

		do {
			/*
			 * compare packed object pointers
			 */
			if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
				/*
				 * Remove old page from hash list
				 */
				*mp = m->vmp_next_m;
				m->vmp_hashed = FALSE;
				m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);

				found_m = m;
				break;
			}
			mp = &m->vmp_next_m;
		} while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));

		mem->vmp_next_m = bucket->page_list;
	} else {
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
	}
	/*
	 * insert new page at head of hash list
	 */
	bucket->page_list = VM_PAGE_PACK_PTR(mem);
	mem->vmp_hashed = TRUE;

	lck_ticket_unlock(bucket_lock);

	if (found_m) {
		/*
		 * there was already a page at the specified
		 * offset for this object... remove it from
		 * the object and free it back to the free list
		 */
		vm_page_free_unlocked(found_m, FALSE);
	}
	/* insert_in_hash == FALSE: the page was hashed above */
	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, FALSE, FALSE, FALSE, NULL);
}
3654
3655 /*
3656 * vm_page_remove: [ internal use only ]
3657 *
3658 * Removes the given mem entry from the object/offset-page
3659 * table and the object page list.
3660 *
3661 * The object must be locked.
3662 */
3663
void
vm_page_remove(
	vm_page_t mem,
	boolean_t remove_from_hash)
{
	vm_page_bucket_t *bucket;
	vm_page_t this;
	lck_ticket_t *bucket_lock;
	int hash_id;
	task_t owner;
	vm_object_t m_object;
	int ledger_idx_volatile;
	int ledger_idx_nonvolatile;
	int ledger_idx_volatile_compressed;
	int ledger_idx_nonvolatile_compressed;
	int ledger_idx_composite;
	int ledger_idx_external_wired;
	int do_footprint;

	m_object = VM_PAGE_OBJECT(mem);

	vm_object_lock_assert_exclusive(m_object);
	assert(mem->vmp_tabled);
	assert(!mem->vmp_cleaning);
	assert(!mem->vmp_laundry);

	if (VM_PAGE_PAGEABLE(mem)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	}
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
	if (remove_from_hash == TRUE) {
		/*
		 * Remove from the object_object/offset hash table
		 */
		hash_id = vm_page_hash(m_object, mem->vmp_offset);
		bucket = &vm_page_buckets[hash_id];
		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);

		if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
			/* optimize for common case */

			bucket->page_list = mem->vmp_next_m;
		} else {
			vm_page_packed_t *prev;

			/* walk the singly-linked bucket until 'this' == mem */
			for (prev = &this->vmp_next_m;
			    (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
			    prev = &this->vmp_next_m) {
				continue;
			}
			*prev = this->vmp_next_m;
		}
#if MACH_PAGE_HASH_STATS
		bucket->cur_count--;
#endif /* MACH_PAGE_HASH_STATS */
		mem->vmp_hashed = FALSE;
		/* 'this' == mem in both branches: clear its stale hash link */
		this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
		lck_ticket_unlock(bucket_lock);
	}
	/*
	 * Now remove from the object's list of backed pages.
	 */

	vm_page_remove_internal(mem);

	/*
	 * And show that the object has one fewer resident
	 * page.
	 */

	assert(m_object->resident_page_count > 0);
	m_object->resident_page_count--;

#if DEVELOPMENT || DEBUG
	if (m_object->object_is_shared_cache &&
	    m_object->pager != NULL &&
	    m_object->pager->mo_pager_ops == &shared_region_pager_ops) {
		assert(!m_object->internal);
		OSAddAtomic(-1, &shared_region_pagers_resident_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (m_object->internal) {
#if DEBUG
		assert(vm_page_internal_count);
#endif /* DEBUG */

		OSAddAtomic(-1, &vm_page_internal_count);
	} else {
		assert(vm_page_external_count);
		OSAddAtomic(-1, &vm_page_external_count);

		if (mem->vmp_xpmapped) {
			assert(vm_page_xpmapped_external_count);
			OSAddAtomic(-1, &vm_page_xpmapped_external_count);
		}
	}
	/* drop an external object from the object cache when it empties */
	if (!m_object->internal &&
	    m_object->cached_list.next &&
	    m_object->cached_list.prev) {
		if (m_object->resident_page_count == 0) {
			vm_object_cache_remove(m_object);
		}
	}

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);
		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
	}
	assert(m_object->resident_page_count >=
	    m_object->wired_page_count);
	if (mem->vmp_reusable) {
		assert(m_object->reusable_page_count > 0);
		m_object->reusable_page_count--;
		assert(m_object->reusable_page_count <=
		    m_object->resident_page_count);
		mem->vmp_reusable = FALSE;
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	} else if (m_object->all_reusable) {
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	}

	/* determine the owning task and ledger indices for accounting */
	if (m_object->purgable == VM_PURGABLE_DENY &&
	    !m_object->vo_ledger_tag) {
		owner = TASK_NULL;
	} else {
		owner = VM_OBJECT_OWNER(m_object);
		vm_object_ledger_tag_ledgers(m_object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &ledger_idx_composite,
		    &ledger_idx_external_wired,
		    &do_footprint);
	}
	if (owner &&
	    m_object->internal &&
	    (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
	    m_object->purgable == VM_PURGABLE_DENY ||
	    VM_PAGE_WIRED(mem))) {
		/* less non-volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_nonvolatile,
		    PAGE_SIZE);
		if (do_footprint) {
			/* less footprint */
			ledger_debit(owner->ledger,
			    task_ledgers.phys_footprint,
			    PAGE_SIZE);
		} else if (ledger_idx_composite != -1) {
			ledger_debit(owner->ledger,
			    ledger_idx_composite,
			    PAGE_SIZE);
		}
	} else if (owner &&
	    m_object->internal &&
	    (m_object->purgable == VM_PURGABLE_VOLATILE ||
	    m_object->purgable == VM_PURGABLE_EMPTY)) {
		assert(!VM_PAGE_WIRED(mem));
		/* less volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_volatile,
		    PAGE_SIZE);
	}

	if (m_object->purgable == VM_PURGABLE_VOLATILE) {
		if (VM_PAGE_WIRED(mem)) {
			assert(vm_page_purgeable_wired_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
		} else {
			assert(vm_page_purgeable_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_count);
		}
	}

#if HAS_MTE
	/*
	 * If removing pages from the compressor object, account for whether it's
	 * tag storage or not.
	 */
	if (m_object == compressor_object) {
		if (vm_page_is_tag_storage(mem)) {
			counter_dec(&compressor_tag_storage_pages_in_pool);
		} else {
			counter_dec(&compressor_non_tag_storage_pages_in_pool);
		}
	}

	assert_mte_vmo_matches_vmp(m_object, mem);
	if (!vm_object_is_mte_mappable(m_object)) {
#endif /* HAS_MTE */
	/* reset per-page cache attributes back to the default */
	if (m_object->set_cache_attr == TRUE) {
		pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
	}
#if HAS_MTE
}
#endif /* HAS_MTE */

	/* disassociate the page from its object */
	mem->vmp_tabled = FALSE;
	mem->vmp_object = 0;
	mem->vmp_offset = (vm_object_offset_t) -1;
}
3879
3880
3881 /*
3882 * vm_page_lookup:
3883 *
3884 * Returns the page associated with the object/offset
3885 * pair specified; if none is found, VM_PAGE_NULL is returned.
3886 *
3887 * The object must be locked. No side effects.
3888 */
3889
3890 #define VM_PAGE_HASH_LOOKUP_THRESHOLD 10
3891
3892 #if DEBUG_VM_PAGE_LOOKUP
3893
/* counters for profiling the vm_page_lookup() fast/slow paths */
struct {
	uint64_t vpl_total;
	uint64_t vpl_empty_obj;
	uint64_t vpl_bucket_NULL;
	uint64_t vpl_hit_hint;
	uint64_t vpl_hit_hint_next;
	uint64_t vpl_hit_hint_prev;
	uint64_t vpl_fast;
	uint64_t vpl_slow;
	uint64_t vpl_hit;
	uint64_t vpl_miss;

	uint64_t vpl_fast_elapsed;
	uint64_t vpl_slow_elapsed;
} vm_page_lookup_stats __attribute__((aligned(8)));
3909
3910 #endif
3911
3912 #define KDP_VM_PAGE_WALK_MAX 1000
3913
3914 vm_page_t
kdp_vm_page_lookup(vm_object_t object,vm_object_offset_t offset)3915 kdp_vm_page_lookup(
3916 vm_object_t object,
3917 vm_object_offset_t offset)
3918 {
3919 vm_page_t cur_page;
3920 int num_traversed = 0;
3921
3922 if (not_in_kdp) {
3923 panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
3924 }
3925
3926 vm_page_queue_iterate(&object->memq, cur_page, vmp_listq) {
3927 if (cur_page->vmp_offset == offset) {
3928 return cur_page;
3929 }
3930 num_traversed++;
3931
3932 if (num_traversed >= KDP_VM_PAGE_WALK_MAX) {
3933 return VM_PAGE_NULL;
3934 }
3935 }
3936
3937 return VM_PAGE_NULL;
3938 }
3939
3940 vm_page_t
vm_page_lookup(vm_object_t object,vm_object_offset_t offset)3941 vm_page_lookup(
3942 vm_object_t object,
3943 vm_object_offset_t offset)
3944 {
3945 vm_page_t mem;
3946 vm_page_bucket_t *bucket;
3947 vm_page_queue_entry_t qe;
3948 lck_ticket_t *bucket_lock = NULL;
3949 int hash_id;
3950 #if DEBUG_VM_PAGE_LOOKUP
3951 uint64_t start, elapsed;
3952
3953 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total);
3954 #endif
3955
3956 #if KASAN_TBI
3957 if (is_kernel_object(object)) {
3958 offset = vm_memtag_canonicalize_kernel(offset);
3959 }
3960 #endif /* KASAN_TBI */
3961
3962 vm_object_lock_assert_held(object);
3963 assertf(page_aligned(offset), "offset 0x%llx\n", offset);
3964
3965 if (object->resident_page_count == 0) {
3966 #if DEBUG_VM_PAGE_LOOKUP
3967 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_empty_obj);
3968 #endif
3969 return VM_PAGE_NULL;
3970 }
3971
3972 mem = object->memq_hint;
3973
3974 if (mem != VM_PAGE_NULL) {
3975 assert(VM_PAGE_OBJECT(mem) == object);
3976
3977 if (mem->vmp_offset == offset) {
3978 #if DEBUG_VM_PAGE_LOOKUP
3979 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
3980 #endif
3981 return mem;
3982 }
3983 qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);
3984
3985 if (!vm_page_queue_end(&object->memq, qe)) {
3986 vm_page_t next_page;
3987
3988 next_page = (vm_page_t)((uintptr_t)qe);
3989 assert(VM_PAGE_OBJECT(next_page) == object);
3990
3991 if (next_page->vmp_offset == offset) {
3992 object->memq_hint = next_page; /* new hint */
3993 #if DEBUG_VM_PAGE_LOOKUP
3994 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
3995 #endif
3996 return next_page;
3997 }
3998 }
3999 qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);
4000
4001 if (!vm_page_queue_end(&object->memq, qe)) {
4002 vm_page_t prev_page;
4003
4004 prev_page = (vm_page_t)((uintptr_t)qe);
4005 assert(VM_PAGE_OBJECT(prev_page) == object);
4006
4007 if (prev_page->vmp_offset == offset) {
4008 object->memq_hint = prev_page; /* new hint */
4009 #if DEBUG_VM_PAGE_LOOKUP
4010 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
4011 #endif
4012 return prev_page;
4013 }
4014 }
4015 }
4016 /*
4017 * Search the hash table for this object/offset pair
4018 */
4019 hash_id = vm_page_hash(object, offset);
4020 bucket = &vm_page_buckets[hash_id];
4021
4022 /*
4023 * since we hold the object lock, we are guaranteed that no
4024 * new pages can be inserted into this object... this in turn
4025 * guarantess that the page we're looking for can't exist
4026 * if the bucket it hashes to is currently NULL even when looked
4027 * at outside the scope of the hash bucket lock... this is a
4028 * really cheap optimiztion to avoid taking the lock
4029 */
4030 if (!bucket->page_list) {
4031 #if DEBUG_VM_PAGE_LOOKUP
4032 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_bucket_NULL);
4033 #endif
4034 return VM_PAGE_NULL;
4035 }
4036
4037 #if DEBUG_VM_PAGE_LOOKUP
4038 start = mach_absolute_time();
4039 #endif
4040 if (object->resident_page_count <= VM_PAGE_HASH_LOOKUP_THRESHOLD) {
4041 /*
4042 * on average, it's roughly 3 times faster to run a short memq list
4043 * than to take the spin lock and go through the hash list
4044 */
4045 mem = (vm_page_t)vm_page_queue_first(&object->memq);
4046
4047 while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4048 if (mem->vmp_offset == offset) {
4049 break;
4050 }
4051
4052 mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
4053 }
4054 if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4055 mem = NULL;
4056 }
4057 } else {
4058 vm_page_object_t packed_object;
4059
4060 packed_object = VM_PAGE_PACK_OBJECT(object);
4061
4062 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
4063
4064 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
4065
4066 for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
4067 mem != VM_PAGE_NULL;
4068 mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
4069 #if 0
4070 /*
4071 * we don't hold the page queue lock
4072 * so this check isn't safe to make
4073 */
4074 VM_PAGE_CHECK(mem);
4075 #endif
4076 if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) {
4077 break;
4078 }
4079 }
4080 lck_ticket_unlock(bucket_lock);
4081 }
4082
4083 #if DEBUG_VM_PAGE_LOOKUP
4084 elapsed = mach_absolute_time() - start;
4085
4086 if (bucket_lock) {
4087 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_slow);
4088 OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_slow_elapsed);
4089 } else {
4090 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_fast);
4091 OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_fast_elapsed);
4092 }
4093 if (mem != VM_PAGE_NULL) {
4094 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit);
4095 } else {
4096 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss);
4097 }
4098 #endif
4099 if (mem != VM_PAGE_NULL) {
4100 assert(VM_PAGE_OBJECT(mem) == object);
4101
4102 object->memq_hint = mem;
4103 }
4104 return mem;
4105 }
4106
4107
4108 /*
4109 * vm_page_rename:
4110 *
4111 * Move the given memory entry from its
4112 * current object to the specified target object/offset.
4113 *
4114 * The object must be locked.
4115 */
void
vm_page_rename(
	vm_page_t mem,
	vm_object_t new_object,
	vm_object_offset_t new_offset)
{
	boolean_t internal_to_external, external_to_internal;
	vm_tag_t tag;
	vm_object_t m_object;

	/* the page must currently belong to some (different) object */
	m_object = VM_PAGE_OBJECT(mem);

	assert(m_object != new_object);
	assert(m_object);

	/*
	 * Changes to mem->vmp_object require the page lock because
	 * the pageout daemon uses that lock to get the object.
	 */
	vm_page_lockspin_queues();

	internal_to_external = FALSE;
	external_to_internal = FALSE;

	if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
		/*
		 * it's much easier to get the vm_page_pageable_xxx accounting correct
		 * if we first move the page to the active queue... it's going to end
		 * up there anyway, and we don't do vm_page_rename's frequently enough
		 * for this to matter.
		 */
		vm_page_queues_remove(mem, FALSE);
		vm_page_activate(mem);
	}
	if (VM_PAGE_PAGEABLE(mem)) {
		/*
		 * Note whether the page is crossing between an internal
		 * (anonymous) and external (file-backed) object so the
		 * pageable counters can be adjusted after the move.
		 */
		if (m_object->internal && !new_object->internal) {
			internal_to_external = TRUE;
		}
		if (!m_object->internal && new_object->internal) {
			external_to_internal = TRUE;
		}
	}

	/* carry the old object's wire tag over to the insertion */
	tag = m_object->wire_tag;
	vm_page_remove(mem, TRUE);
	vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL);

	if (internal_to_external) {
		vm_page_pageable_internal_count--;
		vm_page_pageable_external_count++;
	} else if (external_to_internal) {
		vm_page_pageable_external_count--;
		vm_page_pageable_internal_count++;
	}

	vm_page_unlock_queues();
}
4173
4174 /*
4175 * vm_page_init:
4176 *
4177 * Initialize the fields in a new page.
4178 * This takes a structure with random values and initializes it
4179 * so that it can be given to vm_page_release or vm_page_insert.
4180 */
void
vm_page_init(vm_page_t mem, ppnum_t phys_page)
{
	assert(phys_page);

#if DEBUG
	/* sanity: a real (non-fictitious, non-guard) page must be DRAM-backed */
	if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
		if (!(pmap_valid_page(phys_page))) {
			panic("vm_page_init: non-DRAM phys_page 0x%x", phys_page);
		}
	}
#endif /* DEBUG */

	/*
	 * Initialize the fields of the vm_page. If adding any new fields to vm_page,
	 * try to use initial values which match 0. This minimizes the number of writes
	 * needed for boot-time initialization.
	 */
	assert(VM_PAGE_NOT_ON_Q == 0);
	assert(sizeof(*mem) % sizeof(uintptr_t) == 0);
	*mem = (struct vm_page) {
		.vmp_offset = (vm_object_offset_t)-1,   /* not mapped into any object yet */
		.vmp_q_state = VM_PAGE_NOT_ON_Q,
		.vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
		.vmp_canonical = vm_page_in_array(mem), /* true iff part of the linear page array */
		.vmp_busy = true,                       /* page is handed back busy */
	};

	VM_PAGE_INIT_PHYS_PAGE(mem, phys_page);

#if 0
	/*
	 * we're leaving this turned off for now... currently pages
	 * come off the free list and are either immediately dirtied/referenced
	 * due to zero-fill or COW faults, or are used to read or write files...
	 * in the file I/O case, the UPL mechanism takes care of clearing
	 * the state of the HW ref/mod bits in a somewhat fragile way.
	 * Since we may change the way this works in the future (to toughen it up),
	 * I'm leaving this as a reminder of where these bits could get cleared
	 */

	/*
	 * make sure both the h/w referenced and modified bits are
	 * clear at this point... we are especially dependent on
	 * not finding a 'stale' h/w modified in a number of spots
	 * once this page goes back into use
	 */
	pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
#endif
}
4231
/*
 * Allocate a fictitious page (its phys page is the magic
 * vm_page_fictitious_addr sentinel), blocking until memory
 * is available (Z_WAITOK).
 */
vm_page_t
vm_page_create_fictitious(void)
{
	return vm_page_create(vm_page_fictitious_addr, false, Z_WAITOK);
}
4237
4238 vm_page_t
vm_page_create_guard(bool canwait)4239 vm_page_create_guard(bool canwait)
4240 {
4241 return vm_page_create(vm_page_guard_addr, false, canwait ? Z_WAITOK : Z_NOWAIT);
4242 }
4243
/*
 * Allocate a vm_page for a caller-supplied physical page number.
 * The base page must not be one of the magic fictitious/guard
 * sentinel addresses.
 */
vm_page_t
vm_page_create_private(ppnum_t base_page)
{
	assert(base_page != vm_page_fictitious_addr &&
	    base_page != vm_page_guard_addr);
	return vm_page_create(base_page, false, Z_WAITOK);
}
4251
bool
vm_page_is_canonical(const struct vm_page *m)
{
	/* set at vm_page_init() time iff the page lives in the linear page array */
	return m->vmp_canonical;
}
4257
4258 bool
vm_page_is_fictitious(const struct vm_page * m)4259 vm_page_is_fictitious(const struct vm_page *m)
4260 {
4261 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
4262 if (vm_page_in_array(m)) {
4263 return false;
4264 }
4265 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
4266 switch (VM_PAGE_GET_PHYS_PAGE(m)) {
4267 case vm_page_guard_addr:
4268 case vm_page_fictitious_addr:
4269 return true;
4270 default:
4271 return false;
4272 }
4273 }
4274
bool
vm_page_is_guard(const struct vm_page *m)
{
#if XNU_VM_HAS_LINEAR_PAGES_ARRAY
	/* pages in the linear page array are real memory, never guard markers */
	if (vm_page_in_array(m)) {
		return false;
	}
#endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
	return VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr;
}
4285
bool
vm_page_is_private(const struct vm_page *m)
{
	/* private == not in the canonical page array and not a fictitious/guard page */
	return !vm_page_is_canonical(m) && !vm_page_is_fictitious(m);
}
4291
/*
 * Turn a fictitious page into a private page by installing the
 * caller's physical page number. Requires the page queues lock,
 * and the page must currently carry the fictitious sentinel.
 */
void
vm_page_make_private(vm_page_t m, ppnum_t base_page)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr);

	VM_PAGE_SET_PHYS_PAGE(m, base_page);
}
4300
/*
 * Undo vm_page_make_private(): restore the fictitious sentinel
 * as the page's phys page.
 */
void
vm_page_reset_private(vm_page_t m)
{
	assert(vm_page_is_private(m));

	VM_PAGE_SET_PHYS_PAGE(m, vm_page_fictitious_addr);
}
4308
4309 /*
4310 * vm_page_release_fictitious:
4311 *
4312 * Release a fictitious page to the zone pool
4313 */
static void
vm_page_release_fictitious(vm_page_t m)
{
	/* the page must be off all page queues (or wired) before it can be freed */
	assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
	    (m->vmp_q_state == VM_PAGE_IS_WIRED));
	assert(vm_page_is_fictitious(m));
	assert(!m->vmp_realtime);

	/* guard pages are counted separately; drop the count before freeing */
	if (vm_page_is_guard(m)) {
		counter_dec(&vm_guard_count);
	}
	zfree(vm_page_zone, m);
}
4327
4328 /*
4329 * vm_pool_low():
4330 *
4331 * Return true if it is not likely that a non-vm_privileged thread
4332 * can get memory without blocking. Advisory only, since the
4333 * situation may change under us.
4334 */
bool
vm_pool_low(void)
{
	/* No locking, at worst we will fib. */
	/* at or below the reserved pool, non-privileged allocations are likely to block */
	return vm_page_free_count <= vm_page_free_reserved;
}
4341
4342 boolean_t vm_darkwake_mode = FALSE;
4343
4344 /*
4345 * vm_update_darkwake_mode():
4346 *
4347 * Tells the VM that the system is in / out of darkwake.
4348 *
4349 * Today, the VM only lowers/raises the background queue target
4350 * so as to favor consuming more/less background pages when
4351 * darwake is ON/OFF.
4352 *
4353 * We might need to do more things in the future.
4354 */
4355
4356 void
vm_update_darkwake_mode(boolean_t darkwake_mode)4357 vm_update_darkwake_mode(boolean_t darkwake_mode)
4358 {
4359 #if XNU_TARGET_OS_OSX && defined(__arm64__)
4360 #pragma unused(darkwake_mode)
4361 assert(vm_darkwake_mode == FALSE);
4362 /*
4363 * Darkwake mode isn't supported for AS macOS.
4364 */
4365 return;
4366 #else /* XNU_TARGET_OS_OSX && __arm64__ */
4367 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
4368
4369 vm_page_lockspin_queues();
4370
4371 if (vm_darkwake_mode == darkwake_mode) {
4372 /*
4373 * No change.
4374 */
4375 vm_page_unlock_queues();
4376 return;
4377 }
4378
4379 vm_darkwake_mode = darkwake_mode;
4380
4381 if (vm_darkwake_mode == TRUE) {
4382 /* save background target to restore later */
4383 vm_page_background_target_snapshot = vm_page_background_target;
4384
4385 /* target is set to 0...no protection for background pages */
4386 vm_page_background_target = 0;
4387 } else if (vm_darkwake_mode == FALSE) {
4388 if (vm_page_background_target_snapshot) {
4389 vm_page_background_target = vm_page_background_target_snapshot;
4390 }
4391 }
4392 vm_page_unlock_queues();
4393 #endif
4394 }
4395
4396 void
vm_page_update_special_state(vm_page_t mem)4397 vm_page_update_special_state(vm_page_t mem)
4398 {
4399 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR || mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
4400 return;
4401 }
4402
4403 switch (mem->vmp_on_specialq) {
4404 case VM_PAGE_SPECIAL_Q_BG:
4405 {
4406 task_t my_task = current_task_early();
4407
4408 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4409 return;
4410 }
4411
4412 if (my_task) {
4413 if (task_get_darkwake_mode(my_task)) {
4414 return;
4415 }
4416 }
4417
4418 if (my_task) {
4419 if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) {
4420 return;
4421 }
4422 }
4423 vm_page_lockspin_queues();
4424
4425 vm_page_background_promoted_count++;
4426
4427 vm_page_remove_from_specialq(mem);
4428 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4429
4430 vm_page_unlock_queues();
4431 break;
4432 }
4433
4434 case VM_PAGE_SPECIAL_Q_DONATE:
4435 {
4436 task_t my_task = current_task_early();
4437
4438 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4439 return;
4440 }
4441
4442 if (my_task->donates_own_pages == false) {
4443 vm_page_lockspin_queues();
4444
4445 vm_page_remove_from_specialq(mem);
4446 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4447
4448 vm_page_unlock_queues();
4449 }
4450 break;
4451 }
4452
4453 default:
4454 {
4455 assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
4456 VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
4457 break;
4458 }
4459 }
4460 }
4461
4462
4463 void
vm_page_assign_special_state(vm_page_t mem,vm_page_specialq_t mode)4464 vm_page_assign_special_state(vm_page_t mem, vm_page_specialq_t mode)
4465 {
4466 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
4467 return;
4468 }
4469
4470 switch (mode) {
4471 case VM_PAGE_SPECIAL_Q_BG:
4472 {
4473 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4474 return;
4475 }
4476
4477 task_t my_task = current_task_early();
4478
4479 if (my_task) {
4480 if (task_get_darkwake_mode(my_task)) {
4481 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
4482 return;
4483 }
4484 }
4485
4486 if (my_task) {
4487 mem->vmp_on_specialq = (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG) ? VM_PAGE_SPECIAL_Q_BG : VM_PAGE_SPECIAL_Q_EMPTY);
4488 }
4489 break;
4490 }
4491
4492 case VM_PAGE_SPECIAL_Q_DONATE:
4493 {
4494 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4495 return;
4496 }
4497 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
4498 break;
4499 }
4500
4501 default:
4502 break;
4503 }
4504 }
4505
4506
/*
 * Unlink a page from whichever special queue (background / donate)
 * it is on and fix up the associated counters. Caller must hold the
 * page queues lock. The vmp_on_specialq tag itself is left for the
 * caller to clear.
 */
void
vm_page_remove_from_specialq(vm_page_t mem)
{
	vm_object_t m_object;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	switch (mem->vmp_on_specialq) {
	case VM_PAGE_SPECIAL_Q_BG:
	{
		/* non-NULL links mean the page is actually enqueued */
		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
			vm_page_queue_remove(&vm_page_queue_background, mem, vmp_specialq);

			mem->vmp_specialq.next = 0;
			mem->vmp_specialq.prev = 0;

			vm_page_background_count--;

			m_object = VM_PAGE_OBJECT(mem);

			/* keep the internal/external split of the BG count in sync */
			if (m_object->internal) {
				vm_page_background_internal_count--;
			} else {
				vm_page_background_external_count--;
			}
		}
		break;
	}

	case VM_PAGE_SPECIAL_Q_DONATE:
	{
		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
			vm_page_queue_remove((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
			mem->vmp_specialq.next = 0;
			mem->vmp_specialq.prev = 0;
			vm_page_donate_count--;
			/*
			 * Dropping back under the target "un-ripens" the donate
			 * queue and raises the target back to its high watermark.
			 */
			if (vm_page_donate_queue_ripe && (vm_page_donate_count < vm_page_donate_target)) {
				assert(vm_page_donate_target == vm_page_donate_target_low);
				vm_page_donate_target = vm_page_donate_target_high;
				vm_page_donate_queue_ripe = false;
			}
		}

		break;
	}

	default:
	{
		/* not on a special queue: its links must already be clear */
		assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
		    VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
		break;
	}
	}
}
4561
4562
/*
 * Enqueue a page on the special queue selected by its vmp_on_specialq
 * tag (background / donate), at the head when "first" is TRUE.
 * No-op if the page is already enqueued. Caller must hold the page
 * queues lock.
 */
void
vm_page_add_to_specialq(vm_page_t mem, boolean_t first)
{
	vm_object_t m_object;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	/* non-NULL links mean the page is already on a special queue */
	if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
		return;
	}

	switch (mem->vmp_on_specialq) {
	case VM_PAGE_SPECIAL_Q_BG:
	{
		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
			return;
		}

		m_object = VM_PAGE_OBJECT(mem);

		/* optionally keep file-backed (external) pages off the BG queue */
		if (vm_page_background_exclude_external && !m_object->internal) {
			return;
		}

		if (first == TRUE) {
			vm_page_queue_enter_first(&vm_page_queue_background, mem, vmp_specialq);
		} else {
			vm_page_queue_enter(&vm_page_queue_background, mem, vmp_specialq);
		}
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;

		vm_page_background_count++;

		/* keep the internal/external split of the BG count in sync */
		if (m_object->internal) {
			vm_page_background_internal_count++;
		} else {
			vm_page_background_external_count++;
		}
		break;
	}

	case VM_PAGE_SPECIAL_Q_DONATE:
	{
		if (first == TRUE) {
			vm_page_queue_enter_first((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
		} else {
			vm_page_queue_enter((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
		}
		vm_page_donate_count++;
		/*
		 * Crossing above the target "ripens" the donate queue and
		 * lowers the target to its low watermark (hysteresis).
		 */
		if (!vm_page_donate_queue_ripe && (vm_page_donate_count > vm_page_donate_target)) {
			assert(vm_page_donate_target == vm_page_donate_target_high);
			vm_page_donate_target = vm_page_donate_target_low;
			vm_page_donate_queue_ripe = true;
		}
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
		break;
	}

	default:
		break;
	}
}
4625
4626 /*!
4627 * @brief
4628 * Prepares a page that has been successfully grabbed for the caller.
4629 *
4630 * @discussion
4631 * This function will update accounting, emit tracements, ...
4632 */
static vm_page_t
vm_page_grab_finalize(vm_grab_options_t grab_options __unused, vm_page_t mem)
{
	task_t task;

#if MACH_ASSERT
	/*
	 * For all free pages, no matter their provenance...
	 * ensure they are not referenced anywhere,
	 * and their state is clean.
	 */
	if (vm_check_refs_on_alloc) {
		pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
	}
	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
	    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
	    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
	    mem->vmp_next_m == 0 &&
	    mem->vmp_object == 0 &&
	    mem->vmp_wire_count == 0 &&
	    mem->vmp_busy &&
	    !mem->vmp_tabled &&
	    !mem->vmp_laundry &&
	    !mem->vmp_pmapped &&
	    !mem->vmp_wpmapped &&
	    !mem->vmp_realtime);
#endif /* MACH_ASSERT */

	/* the page leaves here off every queue with clear pageq links */
	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

#if HAS_MTE
	if (!(grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
		assert(!vm_page_is_tag_storage(mem));
	}
	if (grab_options & VM_PAGE_GRAB_MTE) {
		/* MTE grabs are traced under their own debug event code */
		assert(mem->vmp_using_mte);
		VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB_MTE,
		    DBG_FUNC_NONE, grab_options, 0, 0, 0);
	} else
#endif /* HAS_MTE */
	{
		VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB,
		    DBG_FUNC_NONE, grab_options, 0, 0, 0);
	}

	counter_inc(&vm_page_grab_count);

	/* charge the grab to the current task's ledger, when there is one */
	task = current_task_early();
	if (task != TASK_NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed, 1);
	}
	if (task != TASK_NULL && task != kernel_task) {
		/*
		 * tag:DONATE this is where the donate state of the page
		 * is decided according to what task grabs it
		 */
		if (task->donates_own_pages) {
			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
		} else {
			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
		}
	}

	return mem;
}
4700
#if __x86_64__
/*
 * This can be switched to FALSE to help debug drivers
 * that are having problems with memory > 4G.
 * NOTE(review): consulted by the page-grab path — confirm exact
 * consumers elsewhere in this file.
 */
boolean_t vm_himemory_mode = TRUE;
#endif /* __x86_64__ */
4708
4709 #if XNU_VM_HAS_LOPAGE
4710
/*
 * Grab a page suitable for devices that need low physical addresses:
 * first from the dedicated lopage free queue, then by falling back to
 * a KMA_LOMEM contiguous allocation. Returns VM_PAGE_NULL on failure.
 * When the system doesn't need a lopage pool, this degenerates to a
 * normal vm_page_grab_options().
 */
vm_page_t
vm_page_grablo(vm_grab_options_t grab_options)
{
	vm_page_t mem = VM_PAGE_NULL;

	if (!vm_lopage_needed) {
		return vm_page_grab_options(grab_options);
	}

	vm_free_page_lock_spin();
	if (vm_lopage_free_count) {
#if LCK_MTX_USE_ARCH
		/*
		 * Intel locks do not really always disable preemption
		 * for lck_mtx_lock_spin(), and vm_page_free_queue_grab()
		 * really want that.
		 */
		disable_preemption();
#endif
		mem = vm_page_free_queue_grab(grab_options,
		    VM_MEMORY_CLASS_LOPAGE, 1, VM_PAGE_NOT_ON_Q).vmpl_head;
#if LCK_MTX_USE_ARCH
		enable_preemption();
#endif
	}
	vm_free_page_unlock();

	if (mem == VM_PAGE_NULL) {
		/* lopage queue empty: fall back to a low-memory contiguous allocation */
		if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
			vm_free_page_lock_spin();
			vm_lopages_allocated_cpm_failed++;
			vm_free_page_unlock();

			return VM_PAGE_NULL;
		}
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);

		mem->vmp_busy = TRUE;

		vm_page_lockspin_queues();

		/* undo the gobbled/wired accounting cpm_allocate left the page with */
		mem->vmp_gobbled = FALSE;
		vm_page_gobble_count--;
		vm_page_wire_count--;

		vm_lopages_allocated_cpm_success++;
		vm_page_unlock_queues();
	}

	return vm_page_grab_finalize(grab_options, mem);
}
4762
4763 #endif /* XNU_VM_HAS_LOPAGE */
4764 #if CONFIG_SECLUDED_MEMORY
4765
4766 /*!
4767 * @brief
4768 * Attempt to allocate a page from the secluded queue
4769 *
4770 * @discussion
4771 * This function will check that the caller is eligible
4772 * for the secluded pool, and if not, return VM_PAGE_NULL.
4773 */
__attribute__((noinline))
static vm_page_t
vm_page_grab_secluded(vm_grab_options_t grab_options)
{
	vm_page_t mem;
	vm_object_t object;
	int refmod_state;

#if HAS_MTE
	/* MTE allocations are never served from the secluded pool */
	if (grab_options & VM_PAGE_GRAB_MTE) {
		return VM_PAGE_NULL;
	}
#endif /* HAS_MTE */
	if (vm_page_secluded_count == 0) {
		return VM_PAGE_NULL;
	}

	/* eligibility: explicit IOKit-style grab, or a task entitled to secluded memory */
	if (grab_options & VM_PAGE_GRAB_SECLUDED) {
		vm_page_secluded.grab_for_iokit++;
	} else if (!task_can_use_secluded_mem(current_task(), TRUE)) {
		return VM_PAGE_NULL;
	}


	/* secluded queue is protected by the VM page queue lock */
	vm_page_lock_queues();

	/* re-check under the lock: the count may have drained since the racy check above */
	if (vm_page_secluded_count == 0) {
		/* no secluded pages to grab... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}

#if 00
	/* can we grab from the secluded queue? */
	if (vm_page_secluded_count > vm_page_secluded_target ||
	    (vm_page_secluded_count > 0 &&
	    task_can_use_secluded_mem(current_task(), TRUE))) {
		/* OK */
	} else {
		/* can't grab from secluded queue... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
#endif

	/* we can grab a page from secluded queue! */
	assert((vm_page_secluded_count_free +
	    vm_page_secluded_count_inuse) ==
	    vm_page_secluded_count);
	if (current_task()->task_can_use_secluded_mem) {
		assert(num_tasks_can_use_secluded_mem > 0);
	}
	assert(!vm_page_queue_empty(&vm_page_queue_secluded));
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
	assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
	vm_page_queues_remove(mem, TRUE);

	object = VM_PAGE_OBJECT(mem);

	assert(!vm_page_is_fictitious(mem));
	assert(!VM_PAGE_WIRED(mem));
	if (object == VM_OBJECT_NULL) {
		/* free for grab! */
		vm_page_unlock_queues();
		vm_page_secluded.grab_success_free++;
		goto out_success;
	}

	/*
	 * The page is still owned by an object; it must be stolen.
	 * Secluded pages only come from external (file-backed) objects.
	 */
	assert(!object->internal);
	// vm_page_pageable_external_count--;

	if (!vm_object_lock_try(object)) {
		/* owner is locked by someone else: give up and reactivate the page */
		// printf("SECLUDED: page %p: object %p locked\n", mem, object);
		vm_page_secluded.grab_failure_locked++;
reactivate_secluded_page:
		vm_page_activate(mem);
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
	if (mem->vmp_busy ||
	    mem->vmp_cleaning ||
	    mem->vmp_laundry) {
		/* can't steal page in this state... */
		vm_object_unlock(object);
		vm_page_secluded.grab_failure_state++;
		goto reactivate_secluded_page;
	}
	if (mem->vmp_realtime) {
		/* don't steal pages used by realtime threads... */
		vm_object_unlock(object);
		vm_page_secluded.grab_failure_realtime++;
		goto reactivate_secluded_page;
	}

	/* disconnect all pmap mappings, folding h/w ref/mod state into the page */
	mem->vmp_busy = TRUE;
	refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
	if (refmod_state & VM_MEM_REFERENCED) {
		mem->vmp_reference = TRUE;
	}
	if (refmod_state & VM_MEM_MODIFIED) {
		SET_PAGE_DIRTY(mem, FALSE);
	}
	if (mem->vmp_dirty || mem->vmp_precious) {
		/* can't grab a dirty page; re-activate */
		// printf("SECLUDED: dirty page %p\n", mem);
		vm_page_wakeup_done(object, mem);
		vm_page_secluded.grab_failure_dirty++;
		vm_object_unlock(object);
		goto reactivate_secluded_page;
	}
	if (mem->vmp_reference) {
		/* it's been used but we do need to grab a page... */
	}

	vm_page_unlock_queues();

	/* finish what vm_page_free() would have done... */
	vm_page_free_prepare_object(mem, TRUE);
	vm_object_unlock(object);
	object = VM_OBJECT_NULL;

	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
	vm_page_secluded.grab_success_other++;

out_success:

	if (grab_options & VM_PAGE_GRAB_SECLUDED) {
		vm_page_secluded.grab_for_iokit_success++;
	}
	return mem;
}
4907
/*
 * Empty the secluded queue: object-less pages go to the free list,
 * owned pages move to the head of the active queue. The secluded
 * target is temporarily forced to 0 while draining and restored
 * afterwards. Returns the number of pages processed.
 */
uint64_t
vm_page_secluded_drain(void)
{
	vm_page_t local_freeq;
	int local_freed;
	uint64_t num_reclaimed;
	unsigned int saved_secluded_count, saved_secluded_target;

	num_reclaimed = 0;
	local_freeq = NULL;
	local_freed = 0;

	vm_page_lock_queues();

	saved_secluded_count = vm_page_secluded_count;
	saved_secluded_target = vm_page_secluded_target;
	vm_page_secluded_target = 0;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	while (vm_page_secluded_count) {
		vm_page_t secluded_page;

		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!vm_page_is_fictitious(secluded_page));
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = local_freeq;
			local_freeq = secluded_page;
			local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
		num_reclaimed++;
	}
	vm_page_secluded_target = saved_secluded_target;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();

	// printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);

	vm_page_unlock_queues();

	/* free the batched object-less pages outside the page queues lock */
	if (local_freed) {
		vm_page_free_list(local_freeq, TRUE);
		local_freeq = NULL;
		local_freed = 0;
	}

	return num_reclaimed;
}
4967
4968 #endif /* CONFIG_SECLUDED_MEMORY */
4969
4970 /*!
4971 * @brief
4972 * Attempts to allocate a page from the specified per-cpu page queue.
4973 */
4974 static vm_page_t
vm_page_grab_from_cpu(vm_page_t * cpu_list,scalable_counter_t * counter)4975 vm_page_grab_from_cpu(vm_page_t *cpu_list, scalable_counter_t *counter)
4976 {
4977 vm_page_t mem = _vm_page_list_pop(cpu_list);
4978
4979 if (mem != VM_PAGE_NULL) {
4980 #if HIBERNATION
4981 if (hibernate_rebuild_needed) {
4982 panic("should not modify cpu->free_pages while hibernating");
4983 }
4984 #endif /* HIBERNATION */
4985 counter_dec_preemption_disabled(counter);
4986 }
4987 return mem;
4988 }
4989
4990 #if HAS_MTE
4991 /*!
4992 * @brief
4993 * Attempts to allocate pages from free tag storage percpu queue.
4994 */
static vm_page_t
vm_page_grab_claimed_from_cpu(mte_pcpu_t pcpu, vm_grab_options_t options)
{
	vm_page_t mem = VM_PAGE_NULL;

	/* caller must explicitly allow tag-storage pages to be handed out */
	if (!(options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
		return VM_PAGE_NULL;
	}

	/* cheap unlocked empty check before paying for the ticket lock */
	if (vm_page_queue_empty(&pcpu->free_claimed_pages)) {
		return VM_PAGE_NULL;
	}

	lck_ticket_lock(&pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);

	/* re-check under the lock: the queue may have drained meanwhile */
	if (!vm_page_queue_empty(&pcpu->free_claimed_pages)) {
		vm_page_queue_remove_first(&pcpu->free_claimed_pages,
		    mem, vmp_pageq);
		counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
		counter_inc(&vm_cpu_claimed_count);
		/* must be done immediately to synchronize with stealing */
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_local_id = 0;
	}

	lck_ticket_unlock(&pcpu->free_claimed_lock);

	return mem;
}
5024 #endif /* HAS_MTE */
5025
5026 /*!
5027 * @brief
5028 * Attempts to allocate pages from free queues, and to populate the per-cpu
5029 * queue as a side effect.
5030 *
5031 * @discussion
5032 * This function will take the properties of the allocating thread into account
5033 * to decide how many pages it can allocate.
5034 *
5035 * If the free queues are depleted, then it will return VM_PAGE_NULL.
5036 */
5037 __attribute__((noinline))
5038 static vm_page_t
vm_page_grab_slow(vm_grab_options_t grab_options)5039 vm_page_grab_slow(vm_grab_options_t grab_options)
5040 {
5041 #if HAS_MTE
5042 unsigned int mte_draw = 0;
5043 unsigned int mte_slop = 0;
5044 #endif /* HAS_MTE */
5045 unsigned int target = vm_free_magazine_refill_limit;
5046 vm_memory_class_t class = VM_MEMORY_CLASS_REGULAR;
5047 vm_page_t mem = VM_PAGE_NULL;
5048 vm_page_list_t list = { };
5049 vm_page_t *cpu_list = NULL;
5050 scalable_counter_t *counter = NULL;
5051
5052 vm_free_page_lock_spin();
5053 #if LCK_MTX_USE_ARCH
5054 /* Intel does't disable preemption with vm_free_page_lock_spin() */
5055 disable_preemption();
5056 #endif /* LCK_MTX_USE_ARCH */
5057 cpu_list = PERCPU_GET(free_pages);
5058 counter = &vm_cpu_free_count;
5059 #if HAS_MTE
5060 if (grab_options & VM_PAGE_GRAB_MTE) {
5061 again:
5062 cpu_list = &PERCPU_GET(mte_pcpu)->free_tagged_pages;
5063 counter = &vm_cpu_free_tagged_count;
5064 target = vm_free_magazine_refill_limit / 2;
5065 class = VM_MEMORY_CLASS_TAGGED;
5066 mte_slop = 0;
5067 } else if (grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
5068 /*
5069 * Note that this is the last time we'll explicitly try to grab
5070 * free, claimable pages. If it comes down to it, we'll grab either
5071 * normal or dead tag storage pages in vm_page_free_queue_grab()
5072 * and hopefully refill the per-CPU free claimable queue.
5073 */
5074 mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
5075 mem = vm_page_grab_claimed_from_cpu(mte_pcpu, grab_options);
5076 }
5077 if (mem == VM_PAGE_NULL)
5078 #endif /* HAS_MTE */
5079 {
5080 mem = vm_page_grab_from_cpu(cpu_list, counter);
5081 }
5082 if (mem != VM_PAGE_NULL) {
5083 #if LCK_MTX_USE_ARCH
5084 enable_preemption();
5085 #endif /* LCK_MTX_USE_ARCH */
5086 vm_free_page_unlock();
5087 return mem;
5088 }
5089
5090 if (vm_page_free_count <= vm_page_free_reserved) {
5091 if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5092 target = 0;
5093 } else if (vm_page_free_count == 0) {
5094 target = 0;
5095 } else {
5096 target = 1;
5097 }
5098 } else {
5099 target = MIN(target, vm_page_free_count - vm_page_free_reserved);
5100 }
5101 #if HAS_MTE
5102 if (grab_options & VM_PAGE_GRAB_MTE) {
5103 mte_draw = target;
5104 target = 0;
5105 if (vm_page_free_taggable_count < mte_draw + vm_page_free_min &&
5106 vm_page_free_count >= mte_draw + vm_page_free_min &&
5107 !(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
5108 /*
5109 * If the mte draw is such that we deplete our reserves,
5110 * but there are enough free untaggable pages available,
5111 * attempt to activate pages in order to rebalance
5112 * toward the taggable pool.
5113 *
5114 * If the operation succeeds, the free page queue lock
5115 * was dropped and we need to re-take it from the top.
5116 */
5117 if (mteinfo_tag_storage_try_activate(mte_draw +
5118 vm_page_free_min - vm_page_free_taggable_count,
5119 /* lock_spin */ true)) {
5120 goto again;
5121 }
5122 }
5123 } else if (target > vm_page_free_count - vm_page_free_taggable_count) {
5124 mte_draw = target - (vm_page_free_count - vm_page_free_taggable_count);
5125 target = (vm_page_free_count - vm_page_free_taggable_count);
5126 } else {
5127 mte_draw = 0;
5128 }
5129
5130 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
5131 if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5132 mte_draw = 0;
5133 } else if (vm_page_free_taggable_count == 0) {
5134 mte_draw = 0;
5135 } else if (target) {
5136 mte_draw = 0;
5137 } else {
5138 mte_draw = 1;
5139 }
5140 } else {
5141 mte_draw = MIN(mte_draw,
5142 vm_page_free_taggable_count - vm_page_free_reserved);
5143 }
5144
5145 target += mte_draw;
5146 #endif /* HAS_MTE */
5147
5148 #if HIBERNATION
5149 if (target > 0 && hibernate_rebuild_needed) {
5150 panic("should not modify CPU free_pages while hibernating");
5151 }
5152 #endif /* HIBERNATION */
5153
5154 /*
5155 * Convert the lock hold into a mutex, to signal to waiters that the
5156 * lock may be held for longer.
5157 */
5158 #if !LCK_MTX_USE_ARCH
5159 disable_preemption();
5160 #endif /* !LCK_MTX_USE_ARCH */
5161 vm_free_page_lock_convert();
5162
5163 if (target != 0) {
5164 list = vm_page_free_queue_grab(grab_options, class, target,
5165 VM_PAGE_ON_FREE_LOCAL_Q);
5166 }
5167
5168 #if VM_PAGE_WIRE_COUNT_WARNING
5169 if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) {
5170 printf("mk: vm_page_grab(): high wired page count of %d\n",
5171 vm_page_wire_count);
5172 }
5173 #endif
5174 #if VM_PAGE_GOBBLE_COUNT_WARNING
5175 if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) {
5176 printf("mk: vm_page_grab(): high gobbled page count of %d\n",
5177 vm_page_gobble_count);
5178 }
5179 #endif
5180
5181 if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5182 thread_wakeup(&vm_page_free_wanted);
5183 }
5184
5185 vm_free_page_unlock();
5186
5187 VM_CHECK_MEMORYSTATUS;
5188
5189 if (list.vmpl_head) {
5190 #if HAS_MTE
5191 mteinfo_page_list_fix_tagging(class, &list);
5192 #endif /* HAS_MTE */
5193 /* Steal a page off the list for the caller. */
5194 mem = vm_page_list_pop(&list);
5195
5196 /* Add the remaining pages to the CPU's free list. */
5197 assert(*cpu_list == VM_PAGE_NULL);
5198 *cpu_list = list.vmpl_head;
5199 counter_add_preemption_disabled(counter, list.vmpl_count);
5200 }
5201
5202 enable_preemption();
5203
5204 return mem;
5205 }
5206
/*
 * vm_page_grab_options:
 *
 * Allocate a free page for the caller, honoring the given grab options.
 *
 * Allocation proceeds in three steps:
 *  1. try the per-CPU free page magazines (no free-queue lock needed),
 *  2. fall back to the global free queues via vm_page_grab_slow()
 *     (and the secluded queue, where configured),
 *  3. privileged threads without VM_PAGE_GRAB_NOPAGEWAIT block in
 *     VM_PAGE_WAIT() and retry from the top; everyone else fails.
 *
 * Returns a page on success, VM_PAGE_NULL on failure.
 */
vm_page_t
vm_page_grab_options(vm_grab_options_t options)
{
#if HAS_MTE
	mte_pcpu_t mte_pcpu;
	vm_page_t *cpu_list;
	scalable_counter_t *counter;
#endif
	vm_page_t mem;

restart:

	/*
	 * Step 1: look at the CPU magazines.
	 */

	/* Preemption must stay disabled while we touch per-CPU state. */
	disable_preemption();
#if HAS_MTE
	mte_pcpu = PERCPU_GET(mte_pcpu);
	if (options & VM_PAGE_GRAB_MTE) {
		/* Tagged allocations draw from the per-CPU tagged magazine. */
		cpu_list = &mte_pcpu->free_tagged_pages;
		counter = &vm_cpu_free_tagged_count;
		mem = VM_PAGE_NULL;
	} else {
		cpu_list = PERCPU_GET(free_pages);
		counter = &vm_cpu_free_count;
		mem = VM_PAGE_NULL;
	}

	if (options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
		/* Callers that may use claimable tag storage pages try those first. */
		mem = vm_page_grab_claimed_from_cpu(mte_pcpu, options);
	}
	if (mem == VM_PAGE_NULL) {
		mem = vm_page_grab_from_cpu(cpu_list, counter);
	}
#else
	mem = vm_page_grab_from_cpu(PERCPU_GET(free_pages), &vm_cpu_free_count);
#endif /* HAS_MTE */
	enable_preemption();

	if (mem != VM_PAGE_NULL) {
		return vm_page_grab_finalize(options, mem);
	}

#if XNU_VM_HAS_DELAYED_PAGES
	/*
	 * If free count is low and we have delayed pages from early boot,
	 * get one of those instead.
	 */
	if (__improbable(vm_delayed_count > 0 &&
	    vm_page_free_count <= vm_page_free_target)) {
		mem = vm_get_delayed_page(options);
		if (mem != VM_PAGE_NULL) {
			return vm_page_grab_finalize(options, mem);
		}
	}
#endif /* XNU_VM_HAS_DELAYED_PAGES */


	/*
	 * Step 2: Try to promote pages from the free queues,
	 * or the secluded queue if appropriate.
	 */

	mem = vm_page_grab_slow(options);
	if (mem != VM_PAGE_NULL) {
		return vm_page_grab_finalize(options, mem);
	}

#if CONFIG_SECLUDED_MEMORY
	mem = vm_page_grab_secluded(options);
	if (mem != VM_PAGE_NULL) {
		return vm_page_grab_finalize(options, mem);
	}
#endif /* CONFIG_SECLUDED_MEMORY */


	/*
	 * Step 3: Privileged threads block and retry, others fail.
	 */

#if HAS_MTE
	/* Record which memory class vm_page_wait() should wait for. */
	if (options & VM_PAGE_GRAB_MTE) {
		current_thread()->page_wait_class = VM_MEMORY_CLASS_TAGGED;
	} else {
		current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
	}
#endif /* HAS_MTE */
	if ((options & VM_PAGE_GRAB_NOPAGEWAIT) == 0 &&
	    (current_thread()->options & TH_OPT_VMPRIV) != 0) {
		VM_PAGE_WAIT();
		goto restart;
	}

	return VM_PAGE_NULL;
}
5303
5304 vm_grab_options_t
vm_page_grab_options_for_object(vm_object_t object __unused)5305 vm_page_grab_options_for_object(vm_object_t object __unused)
5306 {
5307 vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
5308
5309 #if CONFIG_SECLUDED_MEMORY
5310 if (object->can_grab_secluded) {
5311 options |= VM_PAGE_GRAB_SECLUDED;
5312 }
5313 #endif /* CONFIG_SECLUDED_MEMORY */
5314 #if HAS_MTE
5315 if (vm_object_is_mte_mappable(object)) {
5316 options |= VM_PAGE_GRAB_MTE;
5317 }
5318 #endif /* HAS_MTE */
5319
5320 return options;
5321 }
5322
5323 /*!
5324 * @function vm_page_free_queue_steal()
5325 *
5326 * @abstract
5327 * Steal a given page from the free queues.
5328 *
5329 * @discussion
5330 * The given page must be in the given free queue, or state may be corrupted.
5331 *
5332 * Internally, the free queue is not synchronized, so any locking must be done
5333 * outside of this function.
5334 *
5335 * This function, like vm_page_grab(), takes care of waking up
5336 * page out scan as needed.
5337 */
5338 static void
vm_page_free_queue_steal(vm_grab_options_t options,vm_page_t mem)5339 vm_page_free_queue_steal(vm_grab_options_t options, vm_page_t mem)
5340 {
5341 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
5342 vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
5343
5344 assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
5345 assert(!mem->vmp_lopage && mem->vmp_busy);
5346
5347 vm_page_free_queue_remove(class, mem, pnum, VM_PAGE_NOT_ON_Q);
5348 vm_page_grab_finalize(options, mem);
5349
5350 if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5351 thread_wakeup(&vm_page_free_wanted);
5352 }
5353 }
5354
#if HAS_MTE
/*!
 * @function _vm_page_wait_wakeup_fill_thread()
 *
 * @abstract
 * Given the number of waiters, return whether the MTE fill thread should
 * wake up.
 *
 * @discussion
 * The idea is to wake up the MTE fill thread without explicitly triggering
 * pageout_scan(), which means @c vm_page_free_count must be at least
 * @c vm_page_free_min. On top of that, it's possible that tag storage pages
 * may get relocated, which means that some free untagged pages will be needed
 * to activate a tag storage page. This function uses the naive, pessimistic
 * heuristic that a given tag storage page does not have many free covered
 * pages, and some number of those tag storage pages will need to be relocated.
 *
 * The free queue lock should be held during this function.
 *
 * @param n_waiters The number of waiters for tagged memory.
 *
 * @returns Whether the system has enough free pages to
 * wake up the MTE fill thread.
 */
static bool
_vm_page_wait_wakeup_fill_thread(uint32_t n_waiters)
{
	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);

	/* Pessimistically budget 1.5 free pages of headroom per tagged waiter. */
	const uint32_t waiter_headroom = (3 * n_waiters) / 2;
	return vm_page_free_count > vm_page_free_min + waiter_headroom;
}
#endif /* HAS_MTE */
5386
5387 /*
5388 * vm_page_wait:
5389 *
5390 * Wait for a page to become available.
5391 * If there are plenty of free pages, then we don't sleep.
5392 *
5393 * Returns:
5394 * TRUE: There may be another page, try again
5395 * FALSE: We were interrupted out of our wait, don't try again
5396 */
5397
5398 boolean_t
vm_page_wait(int interruptible)5399 vm_page_wait(int interruptible)
5400 {
5401 /*
5402 * We can't use vm_page_free_reserved to make this
5403 * determination. Consider: some thread might
5404 * need to allocate two pages. The first allocation
5405 * succeeds, the second fails. After the first page is freed,
5406 * a call to vm_page_wait must really block.
5407 */
5408 kern_return_t wait_result = THREAD_NOT_WAITING;
5409 thread_t cur_thread = current_thread();
5410 bool is_privileged = cur_thread->options & TH_OPT_VMPRIV;
5411 bool need_wakeup = false;
5412 event_t wait_event = NULL;
5413 #if HAS_MTE
5414 bool wakeup_refill_thread = false;
5415 #endif /* HAS_MTE */
5416
5417 vm_free_page_lock_spin();
5418
5419 #if HAS_MTE
5420 if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5421 if (is_privileged) {
5422 if (vm_page_free_taggable_count) {
5423 vm_free_page_unlock();
5424 goto out;
5425 }
5426
5427 if (vm_page_free_wanted_tagged_privileged++ == 0) {
5428 wakeup_refill_thread = true;
5429 }
5430
5431 wait_event = (event_t)&vm_page_free_wanted_tagged_privileged;
5432 } else if (vm_page_free_taggable_count >= vm_page_free_target) {
5433 vm_free_page_unlock();
5434 goto out;
5435 } else {
5436 if (vm_page_free_wanted_tagged++ == 0) {
5437 wakeup_refill_thread = true;
5438 }
5439
5440 wait_event = (event_t)&vm_page_free_wanted_tagged;
5441 }
5442 } else
5443 #endif /* !HAS_MTE */
5444 if (is_privileged) {
5445 if (vm_page_free_count) {
5446 vm_free_page_unlock();
5447 goto out;
5448 }
5449
5450 if (vm_page_free_wanted_privileged++ == 0) {
5451 need_wakeup = true;
5452 }
5453
5454 wait_event = (event_t)&vm_page_free_wanted_privileged;
5455 } else if (vm_page_free_count >= vm_page_free_target) {
5456 vm_free_page_unlock();
5457 goto out;
5458 #if CONFIG_SECLUDED_MEMORY
5459 } else if (secluded_for_apps &&
5460 task_can_use_secluded_mem(current_task(), FALSE)) {
5461 #if 00
5462 /* XXX FBDP: need pageq lock for this... */
5463 /* XXX FBDP: might wait even if pages available, */
5464 /* XXX FBDP: hopefully not for too long... */
5465 if (vm_page_secluded_count > 0) {
5466 vm_free_page_unlock();
5467 goto out;
5468 }
5469 #endif
5470 if (vm_page_free_wanted_secluded++ == 0) {
5471 need_wakeup = true;
5472 }
5473
5474 wait_event = (event_t)&vm_page_free_wanted_secluded;
5475 #endif /* CONFIG_SECLUDED_MEMORY */
5476 } else {
5477 if (vm_page_free_wanted++ == 0) {
5478 need_wakeup = true;
5479 }
5480
5481 wait_event = (event_t)&vm_page_free_count;
5482 }
5483
5484 #if HAS_MTE
5485 /*
5486 * If we're here, it means that the free taggable count is low.
5487 * If there are enough free pages in the system, we can ask the
5488 * fill thread to convert some free untagged pages to free tagged
5489 * pages. Otherwise, we will wake up pageout_scan(), which will
5490 * free pages, and on the free path, the fill thread will get woken up
5491 * (see vm_page_free_queue_handle_wakeups_and_unlock()).
5492 *
5493 * The fill thread will run or not run under a variety of conditions
5494 * (see mteinfo_tag_storage_active_should_refill() for more details),
5495 * but what's relevant here is that the fill thread will run so long
5496 * as there are tagged waiters. We should at least ensure that the
5497 * system has enough free untagged memory to service the existing
5498 * tagged waiters.
5499 */
5500 if (wakeup_refill_thread) {
5501 uint32_t total_tagged_waiters = vm_page_free_wanted_tagged_privileged +
5502 vm_page_free_wanted_tagged;
5503 if (_vm_page_wait_wakeup_fill_thread(total_tagged_waiters)) {
5504 /* If there are enough pages for tagged waiters. */
5505 } else {
5506 /*
5507 * Otherwise, wake up pageout_scan(), and the fill thread will
5508 * run later.
5509 */
5510 wakeup_refill_thread = false;
5511 need_wakeup = true;
5512 }
5513 }
5514
5515 #endif /* HAS_MTE */
5516 if (vm_pageout_running) {
5517 need_wakeup = false;
5518 }
5519
5520 /*
5521 * We don't do a vm_pageout_scan wakeup if we already have
5522 * some waiters because vm_pageout_scan checks for waiters
5523 * before it returns and does so behind the vm_page_queue_free_lock,
5524 * which we own when we bump the waiter counts.
5525 */
5526
5527 if (vps_dynamic_priority_enabled) {
5528 /*
5529 * We are waking up vm_pageout_scan here. If it needs
5530 * the vm_page_queue_free_lock before we unlock it
5531 * we'll end up just blocking and incur an extra
5532 * context switch. Could be a perf. issue.
5533 */
5534
5535 #if HAS_MTE
5536 if (cur_thread->page_wait_class != VM_MEMORY_CLASS_REGULAR) {
5537 panic("vm_page_wait does not support MTE+vps_dynamic_priority_enabled");
5538 }
5539 #endif /* HAS_MTE */
5540 if (need_wakeup) {
5541 thread_wakeup((event_t)&vm_page_free_wanted);
5542 }
5543
5544 /*
5545 * LD: This event is going to get recorded every time because
5546 * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
5547 * We just block in that routine.
5548 */
5549 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
5550 vm_page_free_wanted_privileged,
5551 vm_page_free_wanted,
5552 #if CONFIG_SECLUDED_MEMORY
5553 vm_page_free_wanted_secluded,
5554 #else /* CONFIG_SECLUDED_MEMORY */
5555 0,
5556 #endif /* CONFIG_SECLUDED_MEMORY */
5557 0);
5558 wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
5559 LCK_SLEEP_UNLOCK,
5560 wait_event,
5561 vm_pageout_scan_thread,
5562 interruptible,
5563 0);
5564 } else {
5565 wait_result = assert_wait(wait_event, interruptible);
5566
5567 vm_free_page_unlock();
5568
5569 if (need_wakeup) {
5570 thread_wakeup((event_t)&vm_page_free_wanted);
5571 }
5572 #if HAS_MTE
5573 if (wakeup_refill_thread) {
5574 assert(!need_wakeup);
5575 mteinfo_wake_fill_thread();
5576 }
5577 #endif /* HAS_MTE */
5578
5579 if (wait_result != THREAD_WAITING) {
5580 goto out;
5581 }
5582
5583 #if HAS_MTE
5584 if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5585 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5586 DBG_VM_PAGE_MTE_WAIT_BLOCK,
5587 DBG_FUNC_START,
5588 vm_page_free_wanted_tagged_privileged,
5589 vm_page_free_wanted_tagged,
5590 0,
5591 0);
5592 wait_result = thread_block(THREAD_CONTINUE_NULL);
5593 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5594 DBG_VM_PAGE_MTE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5595 goto out;
5596 }
5597 #endif /* HAS_MTE */
5598
5599 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5600 DBG_VM_PAGE_WAIT_BLOCK,
5601 DBG_FUNC_START,
5602 vm_page_free_wanted_privileged,
5603 vm_page_free_wanted,
5604 #if CONFIG_SECLUDED_MEMORY
5605 vm_page_free_wanted_secluded,
5606 #else /* CONFIG_SECLUDED_MEMORY */
5607 0,
5608 #endif /* CONFIG_SECLUDED_MEMORY */
5609 0);
5610 wait_result = thread_block(THREAD_CONTINUE_NULL);
5611 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5612 DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5613 }
5614
5615 out:
5616 #if HAS_MTE
5617 cur_thread->page_wait_class = VM_MEMORY_CLASS_REGULAR;
5618 #endif /* HAS_MTE */
5619 return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
5620 }
5621
5622 /*
5623 * vm_page_free_prepare:
5624 *
5625 * Removes page from any queue it may be on
5626 * and disassociates it from its VM object.
5627 *
5628 * Object and page queues must be locked prior to entry.
5629 */
5630 static void
vm_page_free_prepare(vm_page_t mem)5631 vm_page_free_prepare(
5632 vm_page_t mem)
5633 {
5634 vm_page_free_prepare_queues(mem);
5635 vm_page_free_prepare_object(mem, TRUE);
5636 #if CONFIG_SPTM
5637 /**
5638 * The pmap should retype frames as necessary when pmap_recycle_page()
5639 * is called. In order to catch potential cases where this does not
5640 * happen, add an appropriate assert here. This code should be
5641 * executed on every frame that is about to be released to the VM.
5642 */
5643 const sptm_paddr_t paddr = ((uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)) << PAGE_SHIFT;
5644 __unused const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
5645
5646 assert(frame_type == XNU_DEFAULT);
5647 #endif /* CONFIG_SPTM */
5648
5649 #if HAS_MTE
5650 /*
5651 * At this point, any busy bit on `mem` has been cleared. If the refill
5652 * thread wanted this page, update the cell state from PINNED to CLAIMED.
5653 *
5654 * We only expect to come through here when swap-ins/outs have erred.
5655 */
5656 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && mem->vmp_ts_wanted) {
5657 mteinfo_tag_storage_wakeup(mem, false);
5658 }
5659 #endif /* HAS_MTE */
5660 }
5661
5662
/*
 * vm_page_free_prepare_queues:
 *
 * First half of freeing a page: remove it from any paging queue it is
 * on and undo wire/gobble (and purgeable ledger) accounting.
 *
 * The page queues lock must be held; if the page belongs to an object,
 * that object must be locked exclusively.
 */
void
vm_page_free_prepare_queues(
	vm_page_t mem)
{
	vm_object_t m_object;

	VM_PAGE_CHECK(mem);

	assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
	assert(!mem->vmp_cleaning);
	m_object = VM_PAGE_OBJECT(mem);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	if (m_object) {
		vm_object_lock_assert_exclusive(m_object);
	}
	if (mem->vmp_laundry) {
		/*
		 * We may have to free a page while it's being laundered
		 * if we lost its pager (due to a forced unmount, for example).
		 * We need to call vm_pageout_steal_laundry() before removing
		 * the page from its VM object, so that we can remove it
		 * from its pageout queue and adjust the laundry accounting
		 */
		vm_pageout_steal_laundry(mem, TRUE);
	}

	vm_page_queues_remove(mem, TRUE);

	if (mem->vmp_realtime) {
		mem->vmp_realtime = false;
		VM_COUNTER_DEC(&vm_page_realtime_count);
	}

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);

		if (m_object) {
			task_t owner;
			int ledger_idx_volatile;
			int ledger_idx_nonvolatile;
			int ledger_idx_volatile_compressed;
			int ledger_idx_nonvolatile_compressed;
			int ledger_idx_composite;
			int ledger_idx_external_wired;
			boolean_t do_footprint;

			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
			VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);

			assert(m_object->resident_page_count >=
			    m_object->wired_page_count);

			/* The page stops being "wired volatile" and counts as volatile again. */
			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
				OSAddAtomic(+1, &vm_page_purgeable_count);
				assert(vm_page_purgeable_wired_count > 0);
				OSAddAtomic(-1, &vm_page_purgeable_wired_count);
			}
			if (m_object->internal &&
			    m_object->vo_owner != TASK_NULL &&
			    (m_object->purgable == VM_PURGABLE_VOLATILE ||
			    m_object->purgable == VM_PURGABLE_EMPTY)) {
				owner = VM_OBJECT_OWNER(m_object);
				vm_object_ledger_tag_ledgers(
					m_object,
					&ledger_idx_volatile,
					&ledger_idx_nonvolatile,
					&ledger_idx_volatile_compressed,
					&ledger_idx_nonvolatile_compressed,
					&ledger_idx_composite,
					&ledger_idx_external_wired,
					&do_footprint);
				/*
				 * While wired, this page was accounted
				 * as "non-volatile" but it should now
				 * be accounted as "volatile".
				 */
				/* one less "non-volatile"... */
				ledger_debit(owner->ledger,
				    ledger_idx_nonvolatile,
				    PAGE_SIZE);
				if (do_footprint) {
					/* ... and "phys_footprint" */
					ledger_debit(owner->ledger,
					    task_ledgers.phys_footprint,
					    PAGE_SIZE);
				} else if (ledger_idx_composite != -1) {
					ledger_debit(owner->ledger,
					    ledger_idx_composite,
					    PAGE_SIZE);
				}
				/* one more "volatile" */
				ledger_credit(owner->ledger,
				    ledger_idx_volatile,
				    PAGE_SIZE);
			}
		}
		if (vm_page_is_canonical(mem)) {
			vm_page_wire_count--;
		}

#if HAS_MTE
		mteinfo_decrement_wire_count(mem, true);
#endif /* HAS_MTE */

		/* Clear all wiring state; the page is now off every queue. */
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_iopl_wired = false;
		mem->vmp_wire_count = 0;
		assert(!mem->vmp_gobbled);
	} else if (mem->vmp_gobbled) {
		/* Gobbled pages are counted in vm_page_wire_count as well. */
		if (vm_page_is_canonical(mem)) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
	}
}
5780
5781 /*
5782 * like vm_page_init, but we have to preserve fields related to phys page
5783 */
5784 inline static void
vm_page_reset_canonical(vm_page_t mem)5785 vm_page_reset_canonical(vm_page_t mem)
5786 {
5787 *mem = (struct vm_page){
5788 .vmp_offset = (vm_object_offset_t)-1,
5789 .vmp_q_state = VM_PAGE_NOT_ON_Q,
5790 .vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
5791 #if XNU_VM_HAS_LOPAGE
5792 .vmp_lopage = mem->vmp_lopage,
5793 #endif /* XNU_VM_HAS_LOPAGE */
5794 .vmp_canonical = true,
5795 .vmp_busy = true,
5796 .vmp_realtime = mem->vmp_realtime,
5797 #if HAS_MTE
5798 .vmp_using_mte = mem->vmp_using_mte,
5799 #endif
5800 #if !XNU_VM_HAS_LINEAR_PAGES_ARRAY
5801 .vmp_phys_page = mem->vmp_phys_page,
5802 #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */
5803 };
5804 /* ECC information is out of `struct vm_page` and preserved */
5805 }
5806
/*
 * vm_page_free_prepare_object:
 *
 * Second half of freeing a page: disassociate it from its VM object
 * (and the page hash, if requested), wake up any waiters, and reset
 * canonical pages to a clean, busy state ready for the free queues.
 */
void
vm_page_free_prepare_object(vm_page_t mem, boolean_t remove_from_hash)
{
	if (mem->vmp_tabled) {
		vm_page_remove(mem, remove_from_hash);  /* clears tabled, object, offset */
	}
	vm_page_wakeup(VM_OBJECT_NULL, mem);            /* clears wanted */

	if (vm_page_is_private(mem)) {
		vm_page_reset_private(mem);
	}
	if (vm_page_is_canonical(mem)) {
		/* The page must already be unlinked from every queue. */
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
		    mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
		    mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
		    mem->vmp_next_m == 0);

		/* Give the pmap a chance to reclaim/retype the frame first. */
		pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));

		vm_page_reset_canonical(mem);
	}
}
5829
5830 /*
5831 * vm_page_release:
5832 *
5833 * Return a page to the free list.
5834 *
5835 * Keep in sync with vm_page_free_list().
5836 */
5837
5838 void
vm_page_release(vm_page_t mem,vmp_release_options_t options)5839 vm_page_release(vm_page_t mem, vmp_release_options_t options)
5840 {
5841 if (options & VMP_RELEASE_Q_LOCKED) {
5842 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5843 } else {
5844 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5845 }
5846
5847 assert(vm_page_is_canonical(mem));
5848 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
5849
5850 if ((options & VMP_RELEASE_SKIP_FREE_CHECK) == 0) {
5851 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5852 }
5853
5854 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5855
5856
5857 vm_page_free_queue_enter_list(vm_page_list_for_page(mem), options);
5858 }
5859
5860 /*
5861 * This version of vm_page_release() is used only at startup
5862 * when we are single-threaded and pages are being released
5863 * for the first time. Hence, no locking or unnecessary checks are made.
5864 * Note: VM_CHECK_MEMORYSTATUS invoked by the caller.
5865 */
5866 void
vm_page_release_startup(vm_page_t mem)5867 vm_page_release_startup(vm_page_t mem)
5868 {
5869 #if HAS_MTE
5870 if (pmap_in_tag_storage_range(VM_PAGE_GET_PHYS_PAGE(mem)) && is_mte_enabled) {
5871 /*
5872 * Add the MTE tag page to the FREE_MTE_TAG queue. These pages
5873 * can be used/claimed for other purposes (other than tag pages)
5874 * provided that they can be reclaimed quickly without waiting
5875 * on I/O, e.g. readonly/clean file pages.
5876 */
5877 mteinfo_tag_storage_release_startup(mem);
5878 return;
5879 }
5880 #endif /* HAS_MTE */
5881 vm_page_free_queue_enter_list(vm_page_list_for_page(mem),
5882 VMP_RELEASE_STARTUP);
5883 }
5884
5885 /*
5886 * vm_page_free:
5887 *
5888 * Returns the given page to the free list,
5889 * disassociating it with any VM object.
5890 *
5891 * Object and page queues must be locked prior to entry.
5892 */
5893 void
vm_page_free(vm_page_t mem)5894 vm_page_free(vm_page_t mem)
5895 {
5896 vm_page_free_prepare(mem);
5897
5898 if (vm_page_is_canonical(mem)) {
5899 /* page queues are locked */
5900 vm_page_release(mem, VMP_RELEASE_Q_LOCKED |
5901 VMP_RELEASE_SKIP_FREE_CHECK);
5902 } else {
5903 vm_page_release_fictitious(mem);
5904 }
5905 }
5906
5907
5908 void
vm_page_free_unlocked(vm_page_t mem,boolean_t remove_from_hash)5909 vm_page_free_unlocked(vm_page_t mem, boolean_t remove_from_hash)
5910 {
5911 vm_page_lockspin_queues();
5912 vm_page_free_prepare_queues(mem);
5913 vm_page_unlock_queues();
5914
5915 vm_page_free_prepare_object(mem, remove_from_hash);
5916
5917 if (vm_page_is_canonical(mem)) {
5918 /* page queues are not locked */
5919 vm_page_release(mem, VMP_RELEASE_SKIP_FREE_CHECK);
5920 } else {
5921 vm_page_release_fictitious(mem);
5922 }
5923 }
5924
5925
5926 /*
5927 * Free a list of pages. The list can be up to several hundred pages,
5928 * as blocked up by vm_pageout_scan().
5929 * The big win is not having to take the free list lock once
5930 * per page.
5931 *
5932 * The VM page queues lock (vm_page_queue_lock) should NOT be held.
5933 * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held.
5934 *
5935 * Keep in sync with vm_page_release().
5936 */
5937 void
vm_page_free_list(vm_page_t freeq,bool prepare_object)5938 vm_page_free_list(vm_page_t freeq, bool prepare_object)
5939 {
5940 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5941 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
5942
5943 while (freeq) {
5944 vm_page_list_t list = { };
5945
5946 while (list.vmpl_count < VMP_FREE_BATCH_SIZE && freeq) {
5947 vm_page_t mem = _vm_page_list_pop(&freeq);
5948
5949 assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
5950 (mem->vmp_q_state == VM_PAGE_IS_WIRED));
5951
5952 if (prepare_object) {
5953 vm_page_free_prepare_object(mem, TRUE);
5954 }
5955
5956 if (vm_page_is_fictitious(mem)) {
5957 vm_page_release_fictitious(mem);
5958 continue;
5959 }
5960
5961 if (!prepare_object) {
5962 /* vm_page_free_prepare_object() checked it */
5963 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5964 }
5965
5966 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5967
5968
5969 /*
5970 * IMPORTANT: we can't set the page "free" here
5971 * because that would make the page eligible for
5972 * a physically-contiguous allocation (see
5973 * vm_page_find_contiguous()) right away (we don't
5974 * hold the vm_page_queue_free lock). That would
5975 * cause trouble because the page is not actually
5976 * in the free queue yet...
5977 */
5978
5979 vm_page_list_push(&list, mem);
5980 }
5981
5982 if (list.vmpl_count) {
5983 vm_page_free_queue_enter_list(list, VMP_RELEASE_NONE);
5984 }
5985 }
5986 }
5987
5988
5989 /*
5990 * vm_page_wire:
5991 *
5992 * Mark this page as wired down by yet
5993 * another map, removing it from paging queues
5994 * as necessary.
5995 *
5996 * The page's object and the page queues must be locked.
5997 */
5998
5999
6000 void
vm_page_wire(vm_page_t mem,vm_tag_t tag,boolean_t check_memorystatus)6001 vm_page_wire(
6002 vm_page_t mem,
6003 vm_tag_t tag,
6004 boolean_t check_memorystatus)
6005 {
6006 vm_object_t m_object;
6007
6008 m_object = VM_PAGE_OBJECT(mem);
6009
6010 // dbgLog(current_thread(), mem->vmp_offset, m_object, 1); /* (TEST/DEBUG) */
6011
6012 VM_PAGE_CHECK(mem);
6013 if (m_object) {
6014 vm_object_lock_assert_exclusive(m_object);
6015 } else {
6016 /*
6017 * In theory, the page should be in an object before it
6018 * gets wired, since we need to hold the object lock
6019 * to update some fields in the page structure.
6020 * However, some code (i386 pmap, for example) might want
6021 * to wire a page before it gets inserted into an object.
6022 * That's somewhat OK, as long as nobody else can get to
6023 * that page and update it at the same time.
6024 */
6025 }
6026 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6027 if (!VM_PAGE_WIRED(mem)) {
6028 if (mem->vmp_laundry) {
6029 vm_pageout_steal_laundry(mem, TRUE);
6030 }
6031
6032 vm_page_queues_remove(mem, TRUE);
6033
6034 assert(mem->vmp_wire_count == 0);
6035 mem->vmp_q_state = VM_PAGE_IS_WIRED;
6036
6037 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
6038 if (mem->vmp_unmodified_ro == true) {
6039 /* Object and PageQ locks are held*/
6040 mem->vmp_unmodified_ro = false;
6041 os_atomic_dec(&compressor_ro_uncompressed, relaxed);
6042 vm_object_compressor_pager_state_clr(VM_PAGE_OBJECT(mem), mem->vmp_offset);
6043 }
6044 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
6045
6046 if (m_object) {
6047 task_t owner;
6048 int ledger_idx_volatile;
6049 int ledger_idx_nonvolatile;
6050 int ledger_idx_volatile_compressed;
6051 int ledger_idx_nonvolatile_compressed;
6052 int ledger_idx_composite;
6053 int ledger_idx_external_wired;
6054 boolean_t do_footprint;
6055
6056 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6057 VM_OBJECT_WIRED_PAGE_ADD(m_object, mem);
6058 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, tag);
6059
6060 assert(m_object->resident_page_count >=
6061 m_object->wired_page_count);
6062 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6063 assert(vm_page_purgeable_count > 0);
6064 OSAddAtomic(-1, &vm_page_purgeable_count);
6065 OSAddAtomic(1, &vm_page_purgeable_wired_count);
6066 }
6067 if (m_object->internal &&
6068 m_object->vo_owner != TASK_NULL &&
6069 (m_object->purgable == VM_PURGABLE_VOLATILE ||
6070 m_object->purgable == VM_PURGABLE_EMPTY)) {
6071 owner = VM_OBJECT_OWNER(m_object);
6072 vm_object_ledger_tag_ledgers(
6073 m_object,
6074 &ledger_idx_volatile,
6075 &ledger_idx_nonvolatile,
6076 &ledger_idx_volatile_compressed,
6077 &ledger_idx_nonvolatile_compressed,
6078 &ledger_idx_composite,
6079 &ledger_idx_external_wired,
6080 &do_footprint);
6081 /* less volatile bytes */
6082 ledger_debit(owner->ledger,
6083 ledger_idx_volatile,
6084 PAGE_SIZE);
6085 /* more not-quite-volatile bytes */
6086 ledger_credit(owner->ledger,
6087 ledger_idx_nonvolatile,
6088 PAGE_SIZE);
6089 if (do_footprint) {
6090 /* more footprint */
6091 ledger_credit(owner->ledger,
6092 task_ledgers.phys_footprint,
6093 PAGE_SIZE);
6094 } else if (ledger_idx_composite != -1) {
6095 ledger_credit(owner->ledger,
6096 ledger_idx_composite,
6097 PAGE_SIZE);
6098 }
6099 }
6100
6101 if (m_object->all_reusable) {
6102 /*
6103 * Wired pages are not counted as "re-usable"
6104 * in "all_reusable" VM objects, so nothing
6105 * to do here.
6106 */
6107 } else if (mem->vmp_reusable) {
6108 /*
6109 * This page is not "re-usable" when it's
6110 * wired, so adjust its state and the
6111 * accounting.
6112 */
6113 vm_page_lockconvert_queues();
6114 vm_object_reuse_pages(m_object,
6115 mem->vmp_offset,
6116 mem->vmp_offset + PAGE_SIZE_64,
6117 FALSE);
6118 }
6119 }
6120 assert(!mem->vmp_reusable);
6121
6122 if (vm_page_is_canonical(mem) && !mem->vmp_gobbled) {
6123 vm_page_wire_count++;
6124 }
6125 if (mem->vmp_gobbled) {
6126 vm_page_gobble_count--;
6127 }
6128 mem->vmp_gobbled = FALSE;
6129
6130 if (check_memorystatus == TRUE) {
6131 VM_CHECK_MEMORYSTATUS;
6132 }
6133 }
6134 assert(!mem->vmp_gobbled);
6135 assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
6136 mem->vmp_wire_count++;
6137
6138 #if HAS_MTE
6139 if (mem->vmp_wire_count == 1 && tag != VM_KERN_MEMORY_MTAG) {
6140 /*
6141 * Only notify Mte Info if the caller isn't
6142 * mteinfo_tag_storage_wire_locked().
6143 */
6144 mteinfo_increment_wire_count(mem);
6145 }
6146 #endif /* HAS_MTE */
6147
6148 if (__improbable(mem->vmp_wire_count == 0)) {
6149 panic("vm_page_wire(%p): wire_count overflow", mem);
6150 }
6151 VM_PAGE_CHECK(mem);
6152 }
6153
6154 /*
6155 * vm_page_unwire:
6156 *
6157 * Release one wiring of this page, potentially
6158 * enabling it to be paged again.
6159 *
6160 * The page's object and the page queues must be locked.
6161 */
6162 void
vm_page_unwire(vm_page_t mem,boolean_t queueit)6163 vm_page_unwire(
6164 vm_page_t mem,
6165 boolean_t queueit)
6166 {
6167 vm_object_t m_object;
6168
6169 m_object = VM_PAGE_OBJECT(mem);
6170
6171 // dbgLog(current_thread(), mem->vmp_offset, m_object, 0); /* (TEST/DEBUG) */
6172
6173 VM_PAGE_CHECK(mem);
6174 assert(VM_PAGE_WIRED(mem));
6175 assert(mem->vmp_wire_count > 0);
6176 assert(!mem->vmp_gobbled);
6177 assert(m_object != VM_OBJECT_NULL);
6178 vm_object_lock_assert_exclusive(m_object);
6179 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6180 if (--mem->vmp_wire_count == 0) {
6181 task_t owner;
6182 int ledger_idx_volatile;
6183 int ledger_idx_nonvolatile;
6184 int ledger_idx_volatile_compressed;
6185 int ledger_idx_nonvolatile_compressed;
6186 int ledger_idx_composite;
6187 int ledger_idx_external_wired;
6188 boolean_t do_footprint;
6189
6190 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
6191 mem->vmp_iopl_wired = false;
6192
6193 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6194 VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
6195 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
6196 if (vm_page_is_canonical(mem)) {
6197 vm_page_wire_count--;
6198 }
6199
6200 #if HAS_MTE
6201 mteinfo_decrement_wire_count(mem, true);
6202 #endif /* HAS_MTE */
6203
6204 assert(m_object->resident_page_count >=
6205 m_object->wired_page_count);
6206 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6207 OSAddAtomic(+1, &vm_page_purgeable_count);
6208 assert(vm_page_purgeable_wired_count > 0);
6209 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
6210 }
6211 if (m_object->internal &&
6212 m_object->vo_owner != TASK_NULL &&
6213 (m_object->purgable == VM_PURGABLE_VOLATILE ||
6214 m_object->purgable == VM_PURGABLE_EMPTY)) {
6215 owner = VM_OBJECT_OWNER(m_object);
6216 vm_object_ledger_tag_ledgers(
6217 m_object,
6218 &ledger_idx_volatile,
6219 &ledger_idx_nonvolatile,
6220 &ledger_idx_volatile_compressed,
6221 &ledger_idx_nonvolatile_compressed,
6222 &ledger_idx_composite,
6223 &ledger_idx_external_wired,
6224 &do_footprint);
6225 /* more volatile bytes */
6226 ledger_credit(owner->ledger,
6227 ledger_idx_volatile,
6228 PAGE_SIZE);
6229 /* less not-quite-volatile bytes */
6230 ledger_debit(owner->ledger,
6231 ledger_idx_nonvolatile,
6232 PAGE_SIZE);
6233 if (do_footprint) {
6234 /* less footprint */
6235 ledger_debit(owner->ledger,
6236 task_ledgers.phys_footprint,
6237 PAGE_SIZE);
6238 } else if (ledger_idx_composite != -1) {
6239 ledger_debit(owner->ledger,
6240 ledger_idx_composite,
6241 PAGE_SIZE);
6242 }
6243 }
6244 assert(!is_kernel_object(m_object));
6245 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
6246
6247 if (queueit == TRUE) {
6248 if (m_object->purgable == VM_PURGABLE_EMPTY) {
6249 vm_page_deactivate(mem);
6250 } else {
6251 vm_page_activate(mem);
6252 }
6253 }
6254
6255 VM_CHECK_MEMORYSTATUS;
6256 }
6257 VM_PAGE_CHECK(mem);
6258 }
6259
6260 /*
6261 * vm_page_deactivate:
6262 *
6263 * Returns the given page to the inactive list,
6264 * indicating that no physical maps have access
6265 * to this page. [Used by the physical mapping system.]
6266 *
6267 * The page queues must be locked.
6268 */
6269 void
vm_page_deactivate(vm_page_t m)6270 vm_page_deactivate(
6271 vm_page_t m)
6272 {
6273 vm_page_deactivate_internal(m, TRUE);
6274 }
6275
6276
/*
 * vm_page_deactivate_internal:
 *
 * Guts of vm_page_deactivate(): move the page toward the inactive
 * queue.  "clear_hw_reference" controls whether the pmap reference
 * bit is also cleared.
 *
 * The page queues must be locked.
 */
void
vm_page_deactivate_internal(
	vm_page_t m,
	boolean_t clear_hw_reference)
{
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	assert(!is_kernel_object(m_object));
	assert(!vm_page_is_guard(m));

	// dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * This page is no longer very interesting.  If it was
	 * interesting (active or inactive/referenced), then we
	 * clear the reference bit and (re)enter it in the
	 * inactive queue.  Note wired pages should not have
	 * their reference bit cleared.
	 */
	assert( !(m->vmp_absent && !m->vmp_unusual));

	if (m->vmp_gobbled) {        /* can this happen? */
		assert( !VM_PAGE_WIRED(m));

		if (vm_page_is_canonical(m)) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
		m->vmp_gobbled = FALSE;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || !vm_page_is_canonical(m) ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
	    VM_PAGE_WIRED(m)) {
		return;
	}
	if (!m->vmp_absent && clear_hw_reference == TRUE) {
		/* clearing the pmap reference bit requires the full queues lock */
		vm_page_lockconvert_queues();
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
	}

	m->vmp_reference = FALSE;
	m->vmp_no_cache = FALSE;

	if (!VM_PAGE_INACTIVE(m)) {
		vm_page_queues_remove(m, FALSE);

		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    m->vmp_dirty && m_object->internal &&
		    (m_object->purgable == VM_PURGABLE_DENY ||
		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
			/* no pager to clean it: park dirty internal pages on the throttled queue */
			vm_page_check_pageable_safe(m);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;
		} else {
			if (m_object->named &&
			    os_ref_get_count_raw(&m_object->ref_count) == 1) {
				vm_page_speculate(m, FALSE);
#if DEVELOPMENT || DEBUG
				vm_page_speculative_recreated++;
#endif
			} else {
				vm_page_enqueue_inactive(m, FALSE);
			}
		}
	}
}
6357
6358 /*
6359 * vm_page_enqueue_cleaned
6360 *
6361 * Put the page on the cleaned queue, mark it cleaned, etc.
6362 * Being on the cleaned queue (and having m->clean_queue set)
6363 * does ** NOT ** guarantee that the page is clean!
6364 *
6365 * Call with the queues lock held.
6366 */
6367
6368 void
vm_page_enqueue_cleaned(vm_page_t m)6369 vm_page_enqueue_cleaned(vm_page_t m)
6370 {
6371 vm_object_t m_object;
6372
6373 m_object = VM_PAGE_OBJECT(m);
6374
6375 assert(!vm_page_is_guard(m));
6376 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6377 assert(!(m->vmp_absent && !m->vmp_unusual));
6378
6379 if (VM_PAGE_WIRED(m)) {
6380 return;
6381 }
6382
6383 if (m->vmp_gobbled) {
6384 if (vm_page_is_canonical(m)) {
6385 vm_page_wire_count--;
6386 }
6387 vm_page_gobble_count--;
6388 m->vmp_gobbled = FALSE;
6389 }
6390 /*
6391 * if this page is currently on the pageout queue, we can't do the
6392 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6393 * and we can't remove it manually since we would need the object lock
6394 * (which is not required here) to decrement the activity_in_progress
6395 * reference which is held on the object while the page is in the pageout queue...
6396 * just let the normal laundry processing proceed
6397 */
6398 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6399 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
6400 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6401 return;
6402 }
6403 vm_page_queues_remove(m, FALSE);
6404
6405 vm_page_check_pageable_safe(m);
6406 vm_page_queue_enter(&vm_page_queue_cleaned, m, vmp_pageq);
6407 m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
6408 vm_page_cleaned_count++;
6409
6410 vm_page_inactive_count++;
6411 if (m_object->internal) {
6412 vm_page_pageable_internal_count++;
6413 } else {
6414 vm_page_pageable_external_count++;
6415 }
6416 vm_page_add_to_specialq(m, TRUE);
6417 VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
6418 }
6419
6420 /*
6421 * vm_page_activate:
6422 *
6423 * Put the specified page on the active list (if appropriate).
6424 *
6425 * The page queues must be locked.
6426 */
6427
6428 void
vm_page_activate(vm_page_t m)6429 vm_page_activate(
6430 vm_page_t m)
6431 {
6432 vm_object_t m_object;
6433
6434 m_object = VM_PAGE_OBJECT(m);
6435
6436 VM_PAGE_CHECK(m);
6437 #ifdef FIXME_4778297
6438 assert(!is_kernel_object(m_object));
6439 #endif
6440 assert(!vm_page_is_guard(m));
6441 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6442 assert( !(m->vmp_absent && !m->vmp_unusual));
6443
6444 if (m->vmp_gobbled) {
6445 assert( !VM_PAGE_WIRED(m));
6446 if (vm_page_is_canonical(m)) {
6447 vm_page_wire_count--;
6448 }
6449 vm_page_gobble_count--;
6450 m->vmp_gobbled = FALSE;
6451 }
6452 /*
6453 * if this page is currently on the pageout queue, we can't do the
6454 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6455 * and we can't remove it manually since we would need the object lock
6456 * (which is not required here) to decrement the activity_in_progress
6457 * reference which is held on the object while the page is in the pageout queue...
6458 * just let the normal laundry processing proceed
6459 */
6460 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6461 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6462 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6463 return;
6464 }
6465
6466 #if DEBUG
6467 if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) {
6468 panic("vm_page_activate: already active");
6469 }
6470 #endif
6471
6472 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
6473 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
6474 DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
6475 }
6476
6477 /*
6478 * A freshly activated page should be promoted in the donation queue.
6479 * So we remove it here while preserving its hint and we will enqueue
6480 * it again in vm_page_enqueue_active.
6481 */
6482 vm_page_queues_remove(m, ((m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) ? TRUE : FALSE));
6483
6484 if (!VM_PAGE_WIRED(m)) {
6485 vm_page_check_pageable_safe(m);
6486 if (!VM_DYNAMIC_PAGING_ENABLED() &&
6487 m->vmp_dirty && m_object->internal &&
6488 (m_object->purgable == VM_PURGABLE_DENY ||
6489 m_object->purgable == VM_PURGABLE_NONVOLATILE ||
6490 m_object->purgable == VM_PURGABLE_VOLATILE)) {
6491 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
6492 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
6493 vm_page_throttled_count++;
6494 } else {
6495 #if CONFIG_SECLUDED_MEMORY
6496 if (secluded_for_filecache &&
6497 vm_page_secluded_target != 0 &&
6498 num_tasks_can_use_secluded_mem == 0 &&
6499 m_object->eligible_for_secluded &&
6500 !m->vmp_realtime) {
6501 vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
6502 m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
6503 vm_page_secluded_count++;
6504 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
6505 vm_page_secluded_count_inuse++;
6506 assert(!m_object->internal);
6507 // vm_page_pageable_external_count++;
6508 } else
6509 #endif /* CONFIG_SECLUDED_MEMORY */
6510 vm_page_enqueue_active(m, FALSE);
6511 }
6512 m->vmp_reference = TRUE;
6513 m->vmp_no_cache = FALSE;
6514 }
6515 VM_PAGE_CHECK(m);
6516 }
6517
6518
6519 /*
6520 * vm_page_speculate:
6521 *
6522 * Put the specified page on the speculative list (if appropriate).
6523 *
6524 * The page queues must be locked.
6525 */
6526 void
vm_page_speculate(vm_page_t m,boolean_t new)6527 vm_page_speculate(
6528 vm_page_t m,
6529 boolean_t new)
6530 {
6531 struct vm_speculative_age_q *aq;
6532 vm_object_t m_object;
6533
6534 m_object = VM_PAGE_OBJECT(m);
6535
6536 VM_PAGE_CHECK(m);
6537 vm_page_check_pageable_safe(m);
6538
6539 assert(!vm_page_is_guard(m));
6540 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6541 assert(!(m->vmp_absent && !m->vmp_unusual));
6542 assert(m_object->internal == FALSE);
6543
6544 /*
6545 * if this page is currently on the pageout queue, we can't do the
6546 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6547 * and we can't remove it manually since we would need the object lock
6548 * (which is not required here) to decrement the activity_in_progress
6549 * reference which is held on the object while the page is in the pageout queue...
6550 * just let the normal laundry processing proceed
6551 */
6552 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6553 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6554 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6555 return;
6556 }
6557
6558 vm_page_queues_remove(m, FALSE);
6559
6560 if (!VM_PAGE_WIRED(m)) {
6561 mach_timespec_t ts;
6562 clock_sec_t sec;
6563 clock_nsec_t nsec;
6564
6565 clock_get_system_nanotime(&sec, &nsec);
6566 ts.tv_sec = (unsigned int) sec;
6567 ts.tv_nsec = nsec;
6568
6569 if (vm_page_speculative_count == 0) {
6570 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6571 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6572
6573 aq = &vm_page_queue_speculative[speculative_age_index];
6574
6575 /*
6576 * set the timer to begin a new group
6577 */
6578 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6579 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6580
6581 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6582 } else {
6583 aq = &vm_page_queue_speculative[speculative_age_index];
6584
6585 if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
6586 speculative_age_index++;
6587
6588 if (speculative_age_index > vm_page_max_speculative_age_q) {
6589 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6590 }
6591 if (speculative_age_index == speculative_steal_index) {
6592 speculative_steal_index = speculative_age_index + 1;
6593
6594 if (speculative_steal_index > vm_page_max_speculative_age_q) {
6595 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6596 }
6597 }
6598 aq = &vm_page_queue_speculative[speculative_age_index];
6599
6600 if (!vm_page_queue_empty(&aq->age_q)) {
6601 vm_page_speculate_ageit(aq);
6602 }
6603
6604 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6605 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6606
6607 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6608 }
6609 }
6610 vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
6611 m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
6612 vm_page_speculative_count++;
6613 vm_page_pageable_external_count++;
6614
6615 if (new == TRUE) {
6616 vm_object_lock_assert_exclusive(m_object);
6617
6618 m_object->pages_created++;
6619 #if DEVELOPMENT || DEBUG
6620 vm_page_speculative_created++;
6621 #endif
6622 }
6623 }
6624 VM_PAGE_CHECK(m);
6625 }
6626
6627
6628 /*
6629 * move pages from the specified aging bin to
6630 * the speculative bin that pageout_scan claims from
6631 *
6632 * The page queues must be locked.
6633 */
6634 void
vm_page_speculate_ageit(struct vm_speculative_age_q * aq)6635 vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
6636 {
6637 struct vm_speculative_age_q *sq;
6638 vm_page_t t;
6639
6640 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
6641
6642 if (vm_page_queue_empty(&sq->age_q)) {
6643 sq->age_q.next = aq->age_q.next;
6644 sq->age_q.prev = aq->age_q.prev;
6645
6646 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
6647 t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);
6648
6649 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6650 t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6651 } else {
6652 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6653 t->vmp_pageq.next = aq->age_q.next;
6654
6655 t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
6656 t->vmp_pageq.prev = sq->age_q.prev;
6657
6658 t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
6659 t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6660
6661 sq->age_q.prev = aq->age_q.prev;
6662 }
6663 vm_page_queue_init(&aq->age_q);
6664 }
6665
6666
/*
 * vm_page_lru:
 *
 * Move the page to the tail of its inactive queue so it is the
 * least likely candidate for reclaim.
 *
 * The page queues must be locked.
 */
void
vm_page_lru(
	vm_page_t m)
{
	VM_PAGE_CHECK(m);
	assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
	assert(!vm_page_is_guard(m));

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
		/*
		 * we don't need to do all the other work that
		 * vm_page_queues_remove and vm_page_enqueue_inactive
		 * bring along for the ride
		 */
		assert(!m->vmp_laundry);
		assert(!vm_page_is_private(m));

		m->vmp_no_cache = FALSE;

		/* fast path: unlink and relink at the tail of the same queue */
		vm_page_queue_remove(&vm_page_queue_inactive, m, vmp_pageq);
		vm_page_queue_enter(&vm_page_queue_inactive, m, vmp_pageq);

		return;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || vm_page_is_private(m) ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
	    VM_PAGE_WIRED(m)) {
		return;
	}

	m->vmp_no_cache = FALSE;

	vm_page_queues_remove(m, FALSE);

	vm_page_enqueue_inactive(m, FALSE);
}
6714
6715
/*
 * vm_page_reactivate_all_throttled:
 *
 * Move every page from the throttled queue onto the head of the
 * active queue in one splice.  Only meaningful once dynamic paging
 * is enabled (i.e. a pager exists to clean dirty internal pages).
 *
 * Takes and releases the page queues lock internally.
 */
void
vm_page_reactivate_all_throttled(void)
{
	vm_page_t first_throttled, last_throttled;
	vm_page_t first_active;
	vm_page_t m;
	int extra_active_count;
	int extra_internal_count, extra_external_count;
	vm_object_t m_object;

	if (!VM_DYNAMIC_PAGING_ENABLED()) {
		return;
	}

	extra_active_count = 0;
	extra_internal_count = 0;
	extra_external_count = 0;
	vm_page_lock_queues();
	if (!vm_page_queue_empty(&vm_page_queue_throttled)) {
		/*
		 * Switch "throttled" pages to "active".
		 */
		vm_page_queue_iterate(&vm_page_queue_throttled, m, vmp_pageq) {
			VM_PAGE_CHECK(m);
			assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

			m_object = VM_PAGE_OBJECT(m);

			extra_active_count++;
			if (m_object->internal) {
				extra_internal_count++;
			} else {
				extra_external_count++;
			}

			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
			VM_PAGE_CHECK(m);
			vm_page_add_to_specialq(m, FALSE);
		}

		/*
		 * Transfer the entire throttled queue to a regular LRU page queues.
		 * We insert it at the head of the active queue, so that these pages
		 * get re-evaluated by the LRU algorithm first, since they've been
		 * completely out of it until now.
		 */
		first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
		last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled);
		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
		if (vm_page_queue_empty(&vm_page_queue_active)) {
			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		} else {
			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		}
		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
		first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
		last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);

#if DEBUG
		printf("reactivated %d throttled pages\n", vm_page_throttled_count);
#endif
		vm_page_queue_init(&vm_page_queue_throttled);
		/*
		 * Adjust the global page counts.
		 */
		vm_page_active_count += extra_active_count;
		vm_page_pageable_internal_count += extra_internal_count;
		vm_page_pageable_external_count += extra_external_count;
		vm_page_throttled_count = 0;
	}
	assert(vm_page_throttled_count == 0);
	assert(vm_page_queue_empty(&vm_page_queue_throttled));
	vm_page_unlock_queues();
}
6790
6791
6792 /*
6793 * move pages from the indicated local queue to the global active queue
6794 * its ok to fail if we're below the hard limit and force == FALSE
6795 * the nolocks == TRUE case is to allow this function to be run on
6796 * the hibernate path
6797 */
6798
6799 void
vm_page_reactivate_local(uint32_t lid,boolean_t force,boolean_t nolocks)6800 vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
6801 {
6802 struct vpl *lq;
6803 vm_page_t first_local, last_local;
6804 vm_page_t first_active;
6805 vm_page_t m;
6806 uint32_t count = 0;
6807
6808 if (vm_page_local_q == NULL) {
6809 return;
6810 }
6811
6812 lq = zpercpu_get_cpu(vm_page_local_q, lid);
6813
6814 if (nolocks == FALSE) {
6815 if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
6816 if (!vm_page_trylockspin_queues()) {
6817 return;
6818 }
6819 } else {
6820 vm_page_lockspin_queues();
6821 }
6822
6823 VPL_LOCK(&lq->vpl_lock);
6824 }
6825 if (lq->vpl_count) {
6826 /*
6827 * Switch "local" pages to "active".
6828 */
6829 assert(!vm_page_queue_empty(&lq->vpl_queue));
6830
6831 vm_page_queue_iterate(&lq->vpl_queue, m, vmp_pageq) {
6832 VM_PAGE_CHECK(m);
6833 vm_page_check_pageable_safe(m);
6834 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
6835 assert(!vm_page_is_fictitious(m));
6836
6837 if (m->vmp_local_id != lid) {
6838 panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
6839 }
6840
6841 m->vmp_local_id = 0;
6842 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
6843 VM_PAGE_CHECK(m);
6844 vm_page_add_to_specialq(m, FALSE);
6845 count++;
6846 }
6847 if (count != lq->vpl_count) {
6848 panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d", count, lq->vpl_count);
6849 }
6850
6851 /*
6852 * Transfer the entire local queue to a regular LRU page queues.
6853 */
6854 first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue);
6855 last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue);
6856 first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
6857
6858 if (vm_page_queue_empty(&vm_page_queue_active)) {
6859 vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6860 } else {
6861 first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6862 }
6863 vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
6864 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
6865 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
6866
6867 vm_page_queue_init(&lq->vpl_queue);
6868 /*
6869 * Adjust the global page counts.
6870 */
6871 vm_page_active_count += lq->vpl_count;
6872 vm_page_pageable_internal_count += lq->vpl_internal_count;
6873 vm_page_pageable_external_count += lq->vpl_external_count;
6874 lq->vpl_count = 0;
6875 lq->vpl_internal_count = 0;
6876 lq->vpl_external_count = 0;
6877 }
6878 assert(vm_page_queue_empty(&lq->vpl_queue));
6879
6880 if (nolocks == FALSE) {
6881 VPL_UNLOCK(&lq->vpl_lock);
6882
6883 vm_page_balance_inactive(count / 4);
6884 vm_page_unlock_queues();
6885 }
6886 }
6887
6888 /*
6889 * vm_page_part_zero_fill:
6890 *
6891 * Zero-fill a part of the page.
6892 */
6893 #define PMAP_ZERO_PART_PAGE_IMPLEMENTED
6894 void
vm_page_part_zero_fill(vm_page_t m,vm_offset_t m_pa,vm_size_t len)6895 vm_page_part_zero_fill(
6896 vm_page_t m,
6897 vm_offset_t m_pa,
6898 vm_size_t len)
6899 {
6900 #if 0
6901 /*
6902 * we don't hold the page queue lock
6903 * so this check isn't safe to make
6904 */
6905 VM_PAGE_CHECK(m);
6906 #endif
6907
6908 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
6909 pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len);
6910 #else
6911 vm_page_t tmp;
6912 while (1) {
6913 tmp = vm_page_grab();
6914 if (tmp == VM_PAGE_NULL) {
6915 vm_page_wait(THREAD_UNINT);
6916 continue;
6917 }
6918 break;
6919 }
6920 vm_page_zero_fill(
6921 tmp
6922 #if HAS_MTE
6923 , false /* zero_tags */
6924 #endif /* HAS_MTE */
6925 );
6926 if (m_pa != 0) {
6927 vm_page_part_copy(m, 0, tmp, 0, m_pa);
6928 }
6929 if ((m_pa + len) < PAGE_SIZE) {
6930 vm_page_part_copy(m, m_pa + len, tmp,
6931 m_pa + len, PAGE_SIZE - (m_pa + len));
6932 }
6933 vm_page_copy(tmp, m);
6934 VM_PAGE_FREE(tmp);
6935 #endif
6936 }
6937
6938 /*!
6939 * @function vm_page_zero_fill
6940 *
6941 * @abstract
6942 * Zero-fill the specified page.
6943 *
6944 * @param m the page to be zero-filled.
6945 */
6946 #if HAS_MTE && !defined(KASAN)
6947 /*!
6948 * @param zero_tags if true, and the page is MTE-tagged, its corresponding tags will be zeroed.
6949 */
6950 #endif /* HAS_MTE && !defined(KASAN) */
6951 void
vm_page_zero_fill(vm_page_t m,bool zero_tags)6952 vm_page_zero_fill(
6953 vm_page_t m
6954 #if HAS_MTE
6955 , bool zero_tags
6956 #endif /* HAS_MTE */
6957 )
6958 {
6959 int options = 0;
6960 #if 0
6961 /*
6962 * we don't hold the page queue lock
6963 * so this check isn't safe to make
6964 */
6965 VM_PAGE_CHECK(m);
6966 #endif
6967
6968 // dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0); /* (BRINGUP) */
6969 #if HAS_MTE
6970 assert(!zero_tags || VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6971
6972 /*
6973 * TODO: this can be checked more easily using m->vmp_using_mte once
6974 * page reclamation work is complete
6975 */
6976 if (zero_tags && vm_object_is_mte_mappable(VM_PAGE_OBJECT(m))) {
6977 options = cppvZeroPageTags;
6978 KDBG(VMDBG_CODE(DBG_VM_PAGE_MTE_ZFOD) | DBG_FUNC_NONE,
6979 VM_KERNEL_ADDRHIDE(m), VM_KERNEL_ADDRHIDE(VM_PAGE_OBJECT(m)),
6980 m->vmp_offset);
6981 }
6982 #endif /* HAS_MTE */
6983 pmap_zero_page_with_options(VM_PAGE_GET_PHYS_PAGE(m), options);
6984 }
6985
6986 /*
6987 * vm_page_part_copy:
6988 *
6989 * copy part of one page to another
6990 *
6991 * This function is currently only consumed downstream of a
6992 * vm_map_copy_overwrite(). The implementation has a simpler contract
6993 * than vm_page_copy() as there's a restricted set of cases that
6994 * are allowed to be overwriteable. If vm_map_entry_is_overwriteable()
6995 * is expanded, this function may have to be adjusted.
6996 */
6997 void
vm_page_part_copy(vm_page_t src_m,vm_offset_t src_pa,vm_page_t dst_m,vm_offset_t dst_pa,vm_size_t len)6998 vm_page_part_copy(
6999 vm_page_t src_m,
7000 vm_offset_t src_pa,
7001 vm_page_t dst_m,
7002 vm_offset_t dst_pa,
7003 vm_size_t len)
7004 {
7005 #if 0
7006 /*
7007 * we don't hold the page queue lock
7008 * so this check isn't safe to make
7009 */
7010 VM_PAGE_CHECK(src_m);
7011 VM_PAGE_CHECK(dst_m);
7012 #endif
7013
7014 /*
7015 * Copying from/into restricted pages is a security issue,
7016 * as it allows for restricted pages' policies bypass.
7017 */
7018 if (vm_page_is_restricted(src_m)) {
7019 panic("%s: cannot copy from a restricted page", __func__);
7020 }
7021
7022 if (vm_page_is_restricted(dst_m)) {
7023 panic("%s: cannot copy into a restricted page", __func__);
7024 }
7025
7026 #if HAS_MTE
7027 /*
7028 * As an example of a necessary expansion for vm_page_part_copy(),
7029 * MTE objects are currently not overwriteable, but whenever
7030 * rdar://134375521 ([VM MTE] Handle overwriting of MTE objects)
7031 * gets dealt with, we'll have to update the call down here to pass
7032 * the right flags to bcopy_phys().
7033 */
7034 #endif /* HAS_MTE */
7035
7036 pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa,
7037 VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len);
7038 }
7039
7040 /*
7041 * vm_page_copy:
7042 *
7043 * Copy one page to another
7044 */
7045
7046 int vm_page_copy_cs_validations = 0;
7047 int vm_page_copy_cs_tainted = 0;
7048
/*
 * vm_page_copy:
 *
 * Copy the contents of "src_m" into "dest_m", propagating the
 * code-signing taint state.  The source page's object lock must
 * be held.
 */
void
vm_page_copy(
	vm_page_t src_m,
	vm_page_t dest_m)
{
	vm_object_t src_m_object;
	int options = 0;

	src_m_object = VM_PAGE_OBJECT(src_m);

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(src_m);
	VM_PAGE_CHECK(dest_m);
#endif
	vm_object_lock_assert_held(src_m_object);

	/*
	 * Copying from/into restricted pages is a security issue,
	 * as it allows for restricted pages' policies bypass.
	 */
	if (vm_page_is_restricted(src_m)) {
		panic("%s: cannot copy from a restricted page", __func__);
	}

	if (vm_page_is_restricted(dest_m)) {
		panic("%s: cannot copy into a restricted page", __func__);
	}

	if (src_m_object != VM_OBJECT_NULL &&
	    src_m_object->code_signed) {
		/*
		 * We're copying a page from a code-signed object.
		 * Whoever ends up mapping the copy page might care about
		 * the original page's integrity, so let's validate the
		 * source page now.
		 */
		vm_page_copy_cs_validations++;
		vm_page_validate_cs(src_m, PAGE_SIZE, 0);
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(codesigned_copy,
		    vm_object_t, src_m_object,
		    vm_object_offset_t, src_m->vmp_offset,
		    int, src_m->vmp_cs_validated,
		    int, src_m->vmp_cs_tainted);
#endif /* DEVELOPMENT || DEBUG */
	}

	/*
	 * Propagate the cs_tainted bit to the copy page. Do not propagate
	 * the cs_validated bit.
	 */
	dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
	dest_m->vmp_cs_nx = src_m->vmp_cs_nx;
	if (dest_m->vmp_cs_tainted) {
		vm_page_copy_cs_tainted++;
	}

#if HAS_MTE
	/*
	 * vm_page_copy-ing from an untagged page into a tagged page
	 * would happen with tag checking disabled and actually potentially be
	 * an MTE violation.
	 */
	if (!src_m->vmp_using_mte && dest_m->vmp_using_mte) {
		panic("Attempt to write to an MTE tagged page through the physical aperture");
	}

	if (src_m->vmp_using_mte) {
		/* If we are copying from an MTE-enabled page, disable tag checking */
		options |= cppvDisableTagCheck;

		if (dest_m->vmp_using_mte) {
			/*
			 * If both source and destination are tagged, this means that we are
			 * either CoWing or relocating a page. Tags need to follow along.
			 */
			options |= cppvCopyTags;
		}
	}
#endif /* HAS_MTE */

	dest_m->vmp_error = VMP_ERROR_GET(src_m); /* sliding src_m might have failed... */
	pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m), options);
}
7137
7138 #if MACH_ASSERT
/* Dump the interesting fields of a vm_page_t for debugging (MACH_ASSERT only). */
static void
_vm_page_print(
	vm_page_t p)
{
	printf("vm_page %p: \n", p);
	printf("  pageq: next=%p prev=%p\n",
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
	printf("  listq: next=%p prev=%p\n",
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
	printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
	printf("  object=%p offset=0x%llx\n", VM_PAGE_OBJECT(p), p->vmp_offset);
	printf("  wire_count=%u\n", p->vmp_wire_count);
	printf("  q_state=%u\n", p->vmp_q_state);

	/* a "!" prefix below means the flag is NOT set */
	printf("  %slaundry, %sref, %sgobbled, %sprivate\n",
	    (p->vmp_laundry ? "" : "!"),
	    (p->vmp_reference ? "" : "!"),
	    (p->vmp_gobbled ? "" : "!"),
	    (vm_page_is_private(p) ? "" : "!"));
	printf("  %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
	    (p->vmp_busy ? "" : "!"),
	    (p->vmp_wanted ? "" : "!"),
	    (p->vmp_tabled ? "" : "!"),
	    (vm_page_is_fictitious(p) ? "" : "!"),
	    (p->vmp_pmapped ? "" : "!"),
	    (p->vmp_wpmapped ? "" : "!"));
	printf("  %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
	    (p->vmp_free_when_done ? "" : "!"),
	    (p->vmp_absent ? "" : "!"),
	    (VMP_ERROR_GET(p) ? "" : "!"),
	    (p->vmp_dirty ? "" : "!"),
	    (p->vmp_cleaning ? "" : "!"),
	    (p->vmp_precious ? "" : "!"),
	    (p->vmp_clustered ? "" : "!"));
	printf("  %soverwriting, %srestart, %sunusual\n",
	    (p->vmp_overwriting ? "" : "!"),
	    (p->vmp_restart ? "" : "!"),
	    (p->vmp_unusual ? "" : "!"));
	printf("  cs_validated=%d, cs_tainted=%d, cs_nx=%d, %sno_cache\n",
	    p->vmp_cs_validated,
	    p->vmp_cs_tainted,
	    p->vmp_cs_nx,
	    (p->vmp_no_cache ? "" : "!"));

	printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
}
7187
7188 /*
7189 * Check that the list of pages is ordered by
7190 * ascending physical address and has no holes.
7191 */
7192 static int
vm_page_verify_contiguous(vm_page_t pages,unsigned int npages)7193 vm_page_verify_contiguous(
7194 vm_page_t pages,
7195 unsigned int npages)
7196 {
7197 vm_page_t m;
7198 unsigned int page_count;
7199 vm_offset_t prev_addr;
7200
7201 prev_addr = VM_PAGE_GET_PHYS_PAGE(pages);
7202 page_count = 1;
7203 for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
7204 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
7205 printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
7206 m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m));
7207 printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
7208 panic("vm_page_verify_contiguous: not contiguous!");
7209 }
7210 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
7211 ++page_count;
7212 }
7213 if (page_count != npages) {
7214 printf("pages %p actual count 0x%x but requested 0x%x\n",
7215 pages, page_count, npages);
7216 panic("vm_page_verify_contiguous: count error");
7217 }
7218 return 1;
7219 }
7220
7221
7222 /*
7223 * Check the free lists for proper length etc.
7224 */
7225 static boolean_t vm_page_verify_this_free_list_enabled = FALSE;
7226 static unsigned int
vm_page_verify_free_list(vm_page_queue_head_t * vm_page_queue,unsigned int color,vm_page_t look_for_page,boolean_t expect_page)7227 vm_page_verify_free_list(
7228 vm_page_queue_head_t *vm_page_queue,
7229 unsigned int color,
7230 vm_page_t look_for_page,
7231 boolean_t expect_page)
7232 {
7233 unsigned int npages;
7234 vm_page_t m;
7235 vm_page_t prev_m;
7236 boolean_t found_page;
7237
7238 if (!vm_page_verify_this_free_list_enabled) {
7239 return 0;
7240 }
7241
7242 found_page = FALSE;
7243 npages = 0;
7244 prev_m = (vm_page_t)((uintptr_t)vm_page_queue);
7245
7246 vm_page_queue_iterate(vm_page_queue, m, vmp_pageq) {
7247 if (m == look_for_page) {
7248 found_page = TRUE;
7249 }
7250 if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) {
7251 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p",
7252 color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
7253 }
7254 if (!m->vmp_busy) {
7255 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy",
7256 color, npages, m);
7257 }
7258 if (color != (unsigned int) -1) {
7259 if (VM_PAGE_GET_COLOR(m) != color) {
7260 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u",
7261 color, npages, m, VM_PAGE_GET_COLOR(m), color);
7262 }
7263 if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) {
7264 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d",
7265 color, npages, m, m->vmp_q_state);
7266 }
7267 } else {
7268 if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) {
7269 panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d",
7270 npages, m, m->vmp_q_state);
7271 }
7272 }
7273 ++npages;
7274 prev_m = m;
7275 }
7276 if (look_for_page != VM_PAGE_NULL) {
7277 unsigned int other_color;
7278
7279 if (expect_page && !found_page) {
7280 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
7281 color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7282 _vm_page_print(look_for_page);
7283 for (other_color = 0;
7284 other_color < vm_colors;
7285 other_color++) {
7286 if (other_color == color) {
7287 continue;
7288 }
7289 vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[other_color].qhead,
7290 other_color, look_for_page, FALSE);
7291 }
7292 #if XNU_VM_HAS_LOPAGE
7293 if (color == (unsigned int) -1) {
7294 vm_page_verify_free_list(&vm_lopage_queue_free,
7295 (unsigned int) -1, look_for_page, FALSE);
7296 }
7297 #endif /* XNU_VM_HAS_LOPAGE */
7298 panic("vm_page_verify_free_list(color=%u)", color);
7299 }
7300 if (!expect_page && found_page) {
7301 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
7302 color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7303 }
7304 }
7305 return npages;
7306 }
7307
7308 static boolean_t vm_page_verify_all_free_lists_enabled = FALSE;
7309 static void
vm_page_verify_free_lists(void)7310 vm_page_verify_free_lists( void )
7311 {
7312 unsigned int color, npages, nlopages;
7313 boolean_t toggle = TRUE;
7314
7315 if (!vm_page_verify_all_free_lists_enabled) {
7316 return;
7317 }
7318
7319 npages = 0;
7320 nlopages = 0;
7321
7322 vm_free_page_lock();
7323
7324 if (vm_page_verify_this_free_list_enabled == TRUE) {
7325 /*
7326 * This variable has been set globally for extra checking of
7327 * each free list Q. Since we didn't set it, we don't own it
7328 * and we shouldn't toggle it.
7329 */
7330 toggle = FALSE;
7331 }
7332
7333 if (toggle == TRUE) {
7334 vm_page_verify_this_free_list_enabled = TRUE;
7335 }
7336
7337 for (color = 0; color < vm_colors; color++) {
7338 npages += vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[color].qhead,
7339 color, VM_PAGE_NULL, FALSE);
7340 }
7341 #if XNU_VM_HAS_LOPAGE
7342 nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
7343 (unsigned int) -1,
7344 VM_PAGE_NULL, FALSE);
7345 #endif /* XNU_VM_HAS_LOPAGE */
7346 if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) {
7347 panic("vm_page_verify_free_lists: "
7348 "npages %u free_count %d nlopages %u lo_free_count %u",
7349 npages, vm_page_free_count, nlopages, vm_lopage_free_count);
7350 }
7351
7352 if (toggle == TRUE) {
7353 vm_page_verify_this_free_list_enabled = FALSE;
7354 }
7355
7356 vm_free_page_unlock();
7357 }
7358
7359 #endif /* MACH_ASSERT */
7360
7361 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
7362
7363 /*
7364 * CONTIGUOUS PAGE ALLOCATION AND HELPER FUNCTIONS
7365 */
7366
7367 /*
7368 * Helper function used to determine if a page can be relocated
7369 * A page is relocatable if it is in a stable non-transient state
7370 * and if the page being relocated is compatible with the reason for reloc
7371 * The page queue lock must be held, and the object lock too, if the page
7372 * is in an object.
7373 */
7374 boolean_t
vm_page_is_relocatable(vm_page_t m,vm_relocate_reason_t reloc_reason)7375 vm_page_is_relocatable(vm_page_t m, vm_relocate_reason_t reloc_reason)
7376 {
7377
7378 if (VM_PAGE_WIRED(m) || m->vmp_gobbled || m->vmp_laundry || m->vmp_wanted ||
7379 m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
7380 /*
7381 * Page is in a transient state
7382 * or a state we don't want to deal with.
7383 */
7384 return FALSE;
7385 } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
7386 (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
7387 #if XNU_VM_HAS_LOPAGE
7388 (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
7389 #endif /* XNU_VM_HAS_LOPAGE */
7390 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
7391 /*
7392 * Page needs to be on one of our queues (other then the pageout or special
7393 * free queues) or it needs to belong to the compressor pool (which is now
7394 * indicated by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out from
7395 * the check for VM_PAGE_NOT_ON_Q) in order for it to be stable behind the
7396 * locks we hold at this point...
7397 */
7398 return FALSE;
7399 } else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) &&
7400 (!m->vmp_tabled || m->vmp_busy)) {
7401 /*
7402 * pages on the free list are always 'busy'
7403 * so we couldn't test for 'busy' in the check
7404 * for the transient states... pages that are
7405 * 'free' are never 'tabled', so we also couldn't
7406 * test for 'tabled'. So we check here to make
7407 * sure that a non-free page is not busy and is
7408 * tabled on an object...
7409 */
7410 return FALSE;
7411 }
7412
7413 /*
7414 * Lastly, check the page against the relocation reason; the page may
7415 * be in a relocatable state, but not be a page we WANT to relocate for
7416 * the caller's use case.
7417 */
7418 switch (reloc_reason) {
7419 #if HAS_MTE
7420 case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
7421 {
7422 /*
7423 * Relocating the content of tag storage pages so the
7424 * fill thread can reclaim a page is perfectly valid,
7425 * unless the page is busy.
7426 */
7427 if (m->vmp_busy) {
7428 return FALSE;
7429 }
7430 break;
7431 }
7432 case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
7433 #endif /* HAS_MTE */
7434 case VM_RELOCATE_REASON_CONTIGUOUS:
7435 {
7436 #if HAS_MTE
7437 /*
7438 * Tag storage pages may be needed for tag storage. Because
7439 * the contiguous allocator is likely being used for wired
7440 * allocations, this page is not eligible to be relocated in
7441 * this case.
7442 */
7443 if (vm_page_is_tag_storage(m)) {
7444 return FALSE;
7445 }
7446 #endif /* HAS_MTE */
7447 break;
7448 }
7449
7450 default:
7451 {
7452 panic("Invalid relocation reason %u", reloc_reason);
7453 __builtin_unreachable();
7454 }
7455 }
7456
7457 return TRUE;
7458 }
7459
7460 /*
7461 * Free up the given page by possibily relocating its contents to a new page
7462 * If the page is on an object the object lock must be held.
7463 *
7464 * Whether or not the page is considered relocatable is contingent on the
7465 * reason it is being relocated.
7466 *
7467 * Return the new page back to the caller if requested, as done in
7468 * vm_object_iopl_wire_full().
7469 *
7470 * The VM page queues lock must also be held.
7471 *
7472 * @returns
7473 * - KERN_SUCCESS if the relocation was successful.
7474 * - KERN_INVALID_OBJECT if @c m1's object is VM_OBJECT_NULL.
7475 * - KERN_FAILURE if the reolcation failed due to @c m1's state.
7476 * - KERN_RESOURCE_SHORTAGE if no page could be allocated to relocate @c m1.
7477 */
kern_return_t
vm_page_relocate(
	vm_page_t m1,
	int *compressed_pages,
	vm_relocate_reason_t reloc_reason,
	vm_page_t* new_page)
{
	int refmod = 0;
	vm_object_t object = VM_PAGE_OBJECT(m1);
	kern_return_t kr;

	/*
	 * Validate the page against the stated relocation reason and
	 * maintain the per-reason relocation statistics.
	 */
	switch (reloc_reason) {
	case VM_RELOCATE_REASON_CONTIGUOUS:
	{
#if HAS_MTE
		/*
		 * The contiguous allocator should not be considering tag
		 * storage pages.
		 */
		assert(!vm_page_is_tag_storage(m1));
#endif /* HAS_MTE */
		break;
	}
#if HAS_MTE
	case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
	{
		/*
		 * If we are trying to reclaim tag storage, we should be
		 * relocating a tag storage page.
		 */
		assert(vm_page_is_tag_storage(m1));
		if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			vm_page_tag_storage_compressor_relocation_count++;
		}
		break;
	}
	case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
	{
		assert(vm_page_is_tag_storage(m1) && m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
		vm_page_tag_storage_wire_relocation_count++;
		break;
	}
#endif /* HAS_MTE */
	default:
	{
		panic("Unrecognized relocation reason %u\n", reloc_reason);
		break;
	}
	}

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	vm_object_lock_assert_held(object);

	/*
	 * Bail out if the page is wired or in any transient state that
	 * makes it unsafe to steal right now.
	 */
	if (VM_PAGE_WIRED(m1) ||
	    m1->vmp_gobbled ||
	    m1->vmp_laundry ||
	    m1->vmp_wanted ||
	    m1->vmp_cleaning ||
	    m1->vmp_overwriting ||
	    m1->vmp_free_when_done ||
	    m1->vmp_busy ||
	    m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		return KERN_FAILURE;
	}

	boolean_t disconnected = FALSE;
	boolean_t reusable = FALSE;

	/*
	 * Pages from reusable objects can be reclaimed directly.
	 */
	if ((m1->vmp_reusable || object->all_reusable) &&
	    m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q && !m1->vmp_dirty &&
	    !m1->vmp_reference) {
		/*
		 * reusable page...
		 */

		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
		disconnected = TRUE;
		if (refmod == 0) {
			/*
			 * ... not reused: can steal without relocating contents.
			 */
			reusable = TRUE;
		}
	}

	if ((m1->vmp_pmapped && !reusable) || m1->vmp_dirty || m1->vmp_precious) {
		vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;
		vm_object_offset_t offset;
		int copy_page_options = 0;

#if HAS_MTE
		if (m1->vmp_using_mte) {
			grab_options |= VM_PAGE_GRAB_MTE;
			copy_page_options |= cppvCopyTags;
		}
#endif /* HAS_MTE */
		/* page is not reusable, we need to allocate a new page
		 * and move its contents there.
		 */
		vm_page_t m2 = vm_page_grab_options(grab_options);

		if (m2 == VM_PAGE_NULL) {
			return KERN_RESOURCE_SHORTAGE;
		}

		/* disconnect all mappings (if not already done above) */
		if (!disconnected) {
			if (m1->vmp_pmapped) {
				refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
			} else {
				refmod = 0;
			}
		}

#if HAS_MTE
		assert(m1->vmp_using_mte == m2->vmp_using_mte);
		if (m1->vmp_using_mte) {
			assert(pmap_is_tagged_page(VM_PAGE_GET_PHYS_PAGE(m2)));
			copy_page_options |= (cppvCopyTags | cppvDisableTagCheck);
		}
#endif /* HAS_MTE */
		/* copy the page's contents */
		pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2), copy_page_options);

		/* copy the page's state */
		assert(!VM_PAGE_WIRED(m1));
		assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
		assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
		assert(!m1->vmp_laundry);
		m2->vmp_reference = m1->vmp_reference;
		assert(!m1->vmp_gobbled);
		m2->vmp_no_cache = m1->vmp_no_cache;
		m2->vmp_xpmapped = 0;
		assert(!m1->vmp_busy);
		assert(!m1->vmp_wanted);
		assert(vm_page_is_canonical(m1));
		m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
		m2->vmp_wpmapped = m1->vmp_wpmapped;
		assert(!m1->vmp_free_when_done);
		m2->vmp_absent = m1->vmp_absent;
		m2->vmp_error = VMP_ERROR_GET(m1);
		m2->vmp_dirty = m1->vmp_dirty;
		assert(!m1->vmp_cleaning);
		m2->vmp_precious = m1->vmp_precious;
		m2->vmp_clustered = m1->vmp_clustered;
		assert(!m1->vmp_overwriting);
		m2->vmp_restart = m1->vmp_restart;
		m2->vmp_unusual = m1->vmp_unusual;
		m2->vmp_cs_validated = m1->vmp_cs_validated;
		m2->vmp_cs_tainted = m1->vmp_cs_tainted;
		m2->vmp_cs_nx = m1->vmp_cs_nx;

		m2->vmp_realtime = m1->vmp_realtime;
		m1->vmp_realtime = false;

		/*
		 * If m1 had really been reusable,
		 * we would have just stolen it, so
		 * let's not propagate its "reusable"
		 * bit and assert that m2 is not
		 * marked as "reusable".
		 */
		// m2->vmp_reusable = m1->vmp_reusable;
		assert(!m2->vmp_reusable);

		// assert(!m1->vmp_lopage);

		if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
			/*
			 * We just grabbed m2 up above and so it isn't
			 * going to be on any special Q as yet and so
			 * we don't need to 'remove' it from the special
			 * queues. Just resetting the state should be enough.
			 */
			m2->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
		}

		/*
		 * page may need to be flushed if
		 * it is marshalled into a UPL
		 * that is going to be used by a device
		 * that doesn't support coherency
		 */
		m2->vmp_written_by_kernel = TRUE;

		/*
		 * make sure we clear the ref/mod state
		 * from the pmap layer... else we risk
		 * inheriting state from the last time
		 * this page was used...
		 */
		pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
		    VM_MEM_MODIFIED | VM_MEM_REFERENCED);

		/* fold the state captured by pmap_disconnect() back into m2 */
		if (refmod & VM_MEM_REFERENCED) {
			m2->vmp_reference = TRUE;
		}
		if (refmod & VM_MEM_MODIFIED) {
			SET_PAGE_DIRTY(m2, TRUE);
		}
		/* capture the offset before vm_page_free_prepare() tears m1 down */
		offset = m1->vmp_offset;

		/*
		 * completely cleans up the state
		 * of the page so that it is ready
		 * to be put onto the free list, or
		 * for this purpose it looks like it
		 * just came off of the free list
		 */
		vm_page_free_prepare(m1);

		/*
		 * now put the substitute page on the object
		 */
		vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
		    TRUE, FALSE, FALSE, NULL);

		/*
		 * Return the relocated vm_page_t if the caller wants to know.
		 */
		if (new_page) {
			*new_page = m2;
		}

		if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
			m2->vmp_pmapped = TRUE;
			m2->vmp_wpmapped = TRUE;

			/* re-establish the kernel mapping for the compressor's copy */
			kr = pmap_enter_check(kernel_pmap, (vm_map_offset_t)m2->vmp_offset, m2,
			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, TRUE);

			assert(kr == KERN_SUCCESS);

			if (compressed_pages) {
				++*compressed_pages;
			}
		} else {
			/* relocated page was not used by the compressor
			 * put it on either the active or inactive lists */
			if (m2->vmp_reference) {
				vm_page_activate(m2);
			} else {
				vm_page_deactivate(m2);
			}
		}

		/* unset the busy flag (pages on the free queue are busy) and notify if wanted */
		vm_page_wakeup_done(object, m2);
	} else {
		assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);

		/*
		 * completely cleans up the state
		 * of the page so that it is ready
		 * to be put onto the free list, or
		 * for this purpose it looks like it
		 * just came off of the free list
		 */
		vm_page_free_prepare(m1);

		if (new_page) {
			vm_page_t m2;
			vm_object_offset_t offset;
			vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;

			/* The caller still wanted a page, so let's give them a new one. */
			/*
			 * NOTE(review): m1->vmp_offset is read here (and again
			 * below, redundantly) AFTER vm_page_free_prepare(m1),
			 * whereas the branch above captures the offset BEFORE
			 * freeing.  If vm_page_free_prepare() resets
			 * vmp_offset, this reads a sentinel value -- TODO
			 * confirm against vm_page_free_prepare_object().
			 */
			offset = m1->vmp_offset;
#if HAS_MTE
			if (m1->vmp_using_mte) {
				grab_options |= VM_PAGE_GRAB_MTE;
			}
#endif /* HAS_MTE */
			m2 = vm_page_grab_options(grab_options);

			if (m2 == VM_PAGE_NULL) {
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * make sure we clear the ref/mod state
			 * from the pmap layer... else we risk
			 * inheriting state from the last time
			 * this page was used...
			 */
			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
			    VM_MEM_MODIFIED | VM_MEM_REFERENCED);

			/* NOTE(review): redundant reassignment -- see note above */
			offset = m1->vmp_offset;

			/*
			 * now put the substitute page on the object
			 */
			vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
			    TRUE, FALSE, FALSE, NULL);

			*new_page = m2;
		}
	}

	/* we're done here */
	return KERN_SUCCESS;
}
7786
7787 /*
7788 * CONTIGUOUS PAGE ALLOCATION
7789 *
7790 * Find a region large enough to contain at least n pages
7791 * of contiguous physical memory.
7792 *
7793 * This is done by traversing the vm_page_t array in a linear fashion
7794 * we assume that the vm_page_t array has the avaiable physical pages in an
7795 * ordered, ascending list... this is currently true of all our implementations
7796 * and must remain so... there can be 'holes' in the array... we also can
7797 * no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
7798 * which use to happen via 'vm_page_convert'... that function was no longer
7799 * being called and was removed...
7800 *
7801 * The basic flow consists of stabilizing some of the interesting state of
7802 * a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
7803 * sweep at the beginning of the array looking for pages that meet our criterea
7804 * for a 'stealable' page... currently we are pretty conservative... if the page
7805 * meets this criterea and is physically contiguous to the previous page in the 'run'
7806 * we keep developing it. If we hit a page that doesn't fit, we reset our state
7807 * and start to develop a new run... if at this point we've already considered
7808 * at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
7809 * and mutex_pause (which will yield the processor), to keep the latency low w/r
7810 * to other threads trying to acquire free pages (or move pages from q to q),
7811 * and then continue from the spot we left off... we only make 1 pass through the
7812 * array. Once we have a 'run' that is long enough, we'll go into the loop which
7813 * which steals the pages from the queues they're currently on... pages on the free
7814 * queue can be stolen directly... pages that are on any of the other queues
7815 * must be removed from the object they are tabled on... this requires taking the
7816 * object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
7817 * or if the state of the page behind the vm_object lock is no longer viable, we'll
7818 * dump the pages we've currently stolen back to the free list, and pick up our
7819 * scan from the point where we aborted the 'current' run.
7820 *
7821 *
7822 * Requirements:
7823 * - neither vm_page_queue nor vm_free_list lock can be held on entry
7824 *
7825 * Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
7826 *
7827 * Algorithm:
7828 */
7829
/*
 * Max pages examined while holding the page-queue and free-page locks
 * before yielding (via mutex_pause) to bound lock hold latency.
 */
#define MAX_CONSIDERED_BEFORE_YIELD     1000


/*
 * Abandon the current candidate run and start hunting for a new one.
 * prevcontaddr is set to -2 so that no physical page number can appear
 * contiguous to it (pnum + 1 can never equal -2 + 1).
 */
#define RESET_STATE_OF_RUN()    \
	MACRO_BEGIN             \
	prevcontaddr = -2;      \
	start_pnum = -1;        \
	free_considered = 0;    \
	substitute_needed = 0;  \
	npages = 0;             \
	MACRO_END

/*
 * Can we steal in-use (i.e. not free) pages when searching for
 * physically-contiguous pages ?
 */
#define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL       1

/*
 * Resume points for the linear scan, so successive calls don't always
 * rescan from index 0 (separate cursor for KMA_LOMEM searches).
 */
static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
#if DEBUG
int vm_page_find_contig_debug = 0;
#endif
7852
7853 static vm_page_t
vm_page_find_contiguous(unsigned int contig_pages,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)7854 vm_page_find_contiguous(
7855 unsigned int contig_pages,
7856 ppnum_t max_pnum,
7857 ppnum_t pnum_mask,
7858 boolean_t wire,
7859 int flags)
7860 {
7861 vm_page_list_t list = { };
7862 ppnum_t prevcontaddr = 0;
7863 ppnum_t start_pnum = 0;
7864 unsigned int npages = 0, considered = 0, scanned = 0;
7865 unsigned int page_idx = 0, start_idx = 0, last_idx = 0, orig_last_idx = 0;
7866 unsigned int idx_last_contig_page_found = 0;
7867 int free_considered = 0, free_available = 0;
7868 int substitute_needed = 0;
7869 int zone_gc_called = 0;
7870 boolean_t wrapped;
7871 kern_return_t kr;
7872 #if DEBUG
7873 clock_sec_t tv_start_sec = 0, tv_end_sec = 0;
7874 clock_usec_t tv_start_usec = 0, tv_end_usec = 0;
7875 #endif
7876
7877 int yielded = 0;
7878 int dumped_run = 0;
7879 int stolen_pages = 0;
7880 int compressed_pages = 0;
7881
7882
7883 if (contig_pages == 0) {
7884 return VM_PAGE_NULL;
7885 }
7886
7887 full_scan_again:
7888
7889 #if MACH_ASSERT
7890 vm_page_verify_free_lists();
7891 #endif
7892 #if DEBUG
7893 clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
7894 #endif
7895 PAGE_REPLACEMENT_ALLOWED(TRUE);
7896
7897 #if XNU_VM_HAS_DELAYED_PAGES
7898 /*
7899 * If there are still delayed pages, try to free up some that match.
7900 */
7901 if (__improbable(vm_delayed_count != 0 && contig_pages != 0)) {
7902 vm_free_delayed_pages_contig(contig_pages, max_pnum, pnum_mask);
7903 }
7904 #endif /* XNU_VM_HAS_DELAYED_PAGES */
7905
7906 vm_page_lock_queues();
7907 vm_free_page_lock();
7908
7909 RESET_STATE_OF_RUN();
7910
7911 scanned = 0;
7912 considered = 0;
7913 free_available = vm_page_free_count - vm_page_free_reserved;
7914
7915 wrapped = FALSE;
7916
7917 if (flags & KMA_LOMEM) {
7918 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
7919 } else {
7920 idx_last_contig_page_found = vm_page_find_contiguous_last_idx;
7921 }
7922
7923 orig_last_idx = idx_last_contig_page_found;
7924 last_idx = orig_last_idx;
7925
7926 for (page_idx = last_idx, start_idx = last_idx;
7927 npages < contig_pages && page_idx < vm_pages_count;
7928 page_idx++) {
7929 vm_page_t m = NULL;
7930
7931 retry:
7932 if (wrapped &&
7933 npages == 0 &&
7934 page_idx >= orig_last_idx) {
7935 /*
7936 * We're back where we started and we haven't
7937 * found any suitable contiguous range. Let's
7938 * give up.
7939 */
7940 break;
7941 }
7942 scanned++;
7943 m = vm_page_get(page_idx);
7944
7945 assert(vm_page_is_canonical(m));
7946
7947 if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
7948 /* no more low pages... */
7949 break;
7950 }
7951 if (!npages & ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) {
7952 /*
7953 * not aligned
7954 */
7955 RESET_STATE_OF_RUN();
7956 } else if (!vm_page_is_relocatable(m,
7957 VM_RELOCATE_REASON_CONTIGUOUS)) {
7958 /*
7959 * page is not relocatable */
7960 RESET_STATE_OF_RUN();
7961 } else {
7962 if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) {
7963 if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) {
7964 RESET_STATE_OF_RUN();
7965 goto did_consider;
7966 } else {
7967 npages = 1;
7968 start_idx = page_idx;
7969 start_pnum = VM_PAGE_GET_PHYS_PAGE(m);
7970 }
7971 } else {
7972 npages++;
7973 }
7974 prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
7975
7976 VM_PAGE_CHECK(m);
7977 if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
7978 free_considered++;
7979 } else {
7980 /*
7981 * This page is not free.
7982 * If we can't steal used pages,
7983 * we have to give up this run
7984 * and keep looking.
7985 * Otherwise, we might need to
7986 * move the contents of this page
7987 * into a substitute page.
7988 */
7989 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
7990 if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
7991 substitute_needed++;
7992 }
7993 #else
7994 RESET_STATE_OF_RUN();
7995 #endif
7996 }
7997
7998 if ((free_considered + substitute_needed) > free_available) {
7999 /*
8000 * if we let this run continue
8001 * we will end up dropping the vm_page_free_count
8002 * below the reserve limit... we need to abort
8003 * this run, but we can at least re-consider this
8004 * page... thus the jump back to 'retry'
8005 */
8006 RESET_STATE_OF_RUN();
8007
8008 if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
8009 considered++;
8010 goto retry;
8011 }
8012 /*
8013 * free_available == 0
8014 * so can't consider any free pages... if
8015 * we went to retry in this case, we'd
8016 * get stuck looking at the same page
8017 * w/o making any forward progress
8018 * we also want to take this path if we've already
8019 * reached our limit that controls the lock latency
8020 */
8021 }
8022 }
8023 did_consider:
8024 if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
8025 PAGE_REPLACEMENT_ALLOWED(FALSE);
8026
8027 vm_free_page_unlock();
8028 vm_page_unlock_queues();
8029
8030 mutex_pause(0);
8031
8032 PAGE_REPLACEMENT_ALLOWED(TRUE);
8033
8034 vm_page_lock_queues();
8035 vm_free_page_lock();
8036
8037 RESET_STATE_OF_RUN();
8038 /*
8039 * reset our free page limit since we
8040 * dropped the lock protecting the vm_page_free_queue
8041 */
8042 free_available = vm_page_free_count - vm_page_free_reserved;
8043 considered = 0;
8044
8045 yielded++;
8046
8047 goto retry;
8048 }
8049 considered++;
8050 } /* main for-loop end */
8051
8052 if (npages != contig_pages) {
8053 if (!wrapped) {
8054 /*
8055 * We didn't find a contiguous range but we didn't
8056 * start from the very first page.
8057 * Start again from the very first page.
8058 */
8059 RESET_STATE_OF_RUN();
8060 if (flags & KMA_LOMEM) {
8061 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = 0;
8062 } else {
8063 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
8064 }
8065 last_idx = 0;
8066 page_idx = last_idx;
8067 wrapped = TRUE;
8068 goto retry;
8069 }
8070 vm_free_page_unlock();
8071 } else {
8072 vm_page_t m1;
8073 unsigned int cur_idx;
8074 unsigned int tmp_start_idx;
8075 vm_object_t locked_object = VM_OBJECT_NULL;
8076 bool abort_run = false;
8077
8078 assert(page_idx - start_idx == contig_pages);
8079
8080 tmp_start_idx = start_idx;
8081
8082 /*
8083 * first pass through to pull the free pages
8084 * off of the free queue so that in case we
8085 * need substitute pages, we won't grab any
8086 * of the free pages in the run... we'll clear
8087 * the 'free' bit in the 2nd pass, and even in
8088 * an abort_run case, we'll collect all of the
8089 * free pages in this run and return them to the free list
8090 */
8091 while (start_idx < page_idx) {
8092 vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
8093
8094 m1 = vm_page_get(start_idx++);
8095
8096 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
8097 assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
8098 #endif
8099 if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
8100 vm_page_free_queue_steal(options, m1);
8101 }
8102 }
8103 if (flags & KMA_LOMEM) {
8104 vm_page_lomem_find_contiguous_last_idx = page_idx;
8105 } else {
8106 vm_page_find_contiguous_last_idx = page_idx;
8107 }
8108
8109 /*
8110 * we can drop the free queue lock at this point since
8111 * we've pulled any 'free' candidates off of the list
8112 * we need it dropped so that we can do a vm_page_grab
8113 * when substituing for pmapped/dirty pages
8114 */
8115 vm_free_page_unlock();
8116
8117 start_idx = tmp_start_idx;
8118 cur_idx = page_idx - 1;
8119
8120 while (start_idx++ < page_idx) {
8121 /*
8122 * must go through the list from back to front
8123 * so that the page list is created in the
8124 * correct order - low -> high phys addresses
8125 */
8126 m1 = vm_page_get(cur_idx--);
8127
8128 if (m1->vmp_object == 0) {
8129 /*
8130 * page has already been removed from
8131 * the free list in the 1st pass
8132 */
8133 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8134 assert(m1->vmp_offset == (vm_object_offset_t) -1);
8135 assert(m1->vmp_busy);
8136 assert(!m1->vmp_wanted);
8137 assert(!m1->vmp_laundry);
8138 } else {
8139 /*
8140 * try to relocate/steal the page
8141 */
8142 if (abort_run) {
8143 continue;
8144 }
8145
8146 assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
8147
8148 vm_object_t object = VM_PAGE_OBJECT(m1);
8149
8150 if (object != locked_object) {
8151 if (locked_object) {
8152 vm_object_unlock(locked_object);
8153 locked_object = VM_OBJECT_NULL;
8154 }
8155 if (vm_object_lock_try(object)) {
8156 locked_object = object;
8157 } else {
8158 /* object must be locked to relocate its pages */
8159 tmp_start_idx = cur_idx;
8160 abort_run = true;
8161 continue;
8162 }
8163 }
8164
8165 kr = vm_page_relocate(m1, &compressed_pages, VM_RELOCATE_REASON_CONTIGUOUS, NULL);
8166 if (kr != KERN_SUCCESS) {
8167 if (locked_object) {
8168 vm_object_unlock(locked_object);
8169 locked_object = VM_OBJECT_NULL;
8170 }
8171 tmp_start_idx = cur_idx;
8172 abort_run = true;
8173 continue;
8174 }
8175
8176 stolen_pages++;
8177 }
8178
8179 /* m1 is ours at this point ... */
8180
8181 if (m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
8182 /*
8183 * The Q state is preserved on m1 because vm_page_queues_remove doesn't
8184 * change it for pages marked as used-by-compressor.
8185 */
8186 vm_page_assign_special_state(m1, VM_PAGE_SPECIAL_Q_BG);
8187 }
8188 VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
8189 vm_page_list_push(&list, m1);
8190 }
8191
8192 if (locked_object) {
8193 vm_object_unlock(locked_object);
8194 locked_object = VM_OBJECT_NULL;
8195 }
8196
8197 if (abort_run) {
8198 /*
8199 * want the index of the last
8200 * page in this run that was
8201 * successfully 'stolen', so back
8202 * it up 1 for the auto-decrement on use
8203 * and 1 more to bump back over this page
8204 */
8205 page_idx = tmp_start_idx + 2;
8206 if (page_idx >= vm_pages_count) {
8207 if (wrapped) {
8208 if (list.vmpl_count) {
8209 vm_page_unlock_queues();
8210 vm_page_free_list(list.vmpl_head, FALSE);
8211 vm_page_lock_queues();
8212 list = (vm_page_list_t){ };
8213 }
8214 dumped_run++;
8215 goto done_scanning;
8216 }
8217 page_idx = last_idx = 0;
8218 wrapped = TRUE;
8219 }
8220 abort_run = false;
8221
8222 /*
8223 * We didn't find a contiguous range but we didn't
8224 * start from the very first page.
8225 * Start again from the very first page.
8226 */
8227 RESET_STATE_OF_RUN();
8228
8229 if (flags & KMA_LOMEM) {
8230 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = page_idx;
8231 } else {
8232 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
8233 }
8234
8235 last_idx = page_idx;
8236
8237 if (list.vmpl_count) {
8238 vm_page_unlock_queues();
8239 vm_page_free_list(list.vmpl_head, FALSE);
8240 vm_page_lock_queues();
8241 list = (vm_page_list_t){ };
8242 }
8243 dumped_run++;
8244
8245 vm_free_page_lock();
8246 /*
8247 * reset our free page limit since we
8248 * dropped the lock protecting the vm_page_free_queue
8249 */
8250 free_available = vm_page_free_count - vm_page_free_reserved;
8251 goto retry;
8252 }
8253 #if HAS_MTE
8254 else if (list.vmpl_has_tagged) {
8255 const unified_page_list_t pmap_batch_list = {
8256 .page_slist = list.vmpl_head,
8257 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
8258 };
8259
8260 /*
8261 * We successfully found a contiguous range we could
8262 * steal all the pages from. As a last step, make
8263 * certain all pages are regular pages, or convert
8264 * any non-regular pages to regular pages.
8265 */
8266 vm_page_unlock_queues();
8267
8268 /* Make any tagged pages we stole non-tagged. */
8269 pmap_unmake_tagged_pages(&pmap_batch_list);
8270
8271 vm_free_page_lock();
8272
8273 /* Mark any tagged pages we stole as non-tagged. */
8274 vm_page_list_foreach(m1, list) {
8275 if (m1->vmp_using_mte) {
8276 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(m1);
8277
8278 m1->vmp_using_mte = false;
8279 mteinfo_covered_page_clear_tagged(pnum);
8280 }
8281 }
8282 list.vmpl_has_tagged = false;
8283 list.vmpl_has_untagged = true;
8284
8285 vm_free_page_unlock();
8286 vm_page_lock_queues();
8287 }
8288 #endif /* HAS_MTE */
8289
8290 vm_page_list_foreach(m1, list) {
8291 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8292 assert(m1->vmp_wire_count == 0);
8293
8294 if (wire == TRUE) {
8295 m1->vmp_wire_count++;
8296 m1->vmp_q_state = VM_PAGE_IS_WIRED;
8297
8298 #if HAS_MTE
8299 if (m1->vmp_wire_count == 1) {
8300 mteinfo_increment_wire_count(m1);
8301 }
8302 #endif /* HAS_MTE */
8303 } else {
8304 m1->vmp_gobbled = TRUE;
8305 }
8306 }
8307 if (wire == FALSE) {
8308 vm_page_gobble_count += npages;
8309 }
8310
8311 /*
8312 * gobbled pages are also counted as wired pages
8313 */
8314 vm_page_wire_count += npages;
8315
8316 assert(vm_page_verify_contiguous(list.vmpl_head, npages));
8317 }
8318 done_scanning:
8319 PAGE_REPLACEMENT_ALLOWED(FALSE);
8320
8321 vm_page_unlock_queues();
8322
8323 #if DEBUG
8324 clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
8325
8326 tv_end_sec -= tv_start_sec;
8327 if (tv_end_usec < tv_start_usec) {
8328 tv_end_sec--;
8329 tv_end_usec += 1000000;
8330 }
8331 tv_end_usec -= tv_start_usec;
8332 if (tv_end_usec >= 1000000) {
8333 tv_end_sec++;
8334 tv_end_sec -= 1000000;
8335 }
8336 if (vm_page_find_contig_debug) {
8337 printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds... started at %d... scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages\n",
8338 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8339 (long)tv_end_sec, tv_end_usec, orig_last_idx,
8340 scanned, yielded, dumped_run, stolen_pages, compressed_pages);
8341 }
8342
8343 #endif
8344 #if MACH_ASSERT
8345 vm_page_verify_free_lists();
8346 #endif
8347 if (list.vmpl_count == 0 && zone_gc_called < 2) {
8348 printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
8349 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8350 scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
8351
8352 if (consider_buffer_cache_collect != NULL) {
8353 (void)(*consider_buffer_cache_collect)(1);
8354 }
8355
8356 zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
8357
8358 zone_gc_called++;
8359
8360 printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
8361 goto full_scan_again;
8362 }
8363
8364 return list.vmpl_head;
8365 }
8366
8367 /*
8368 * Allocate a list of contiguous, wired pages.
8369 */
8370 kern_return_t
cpm_allocate(vm_size_t size,vm_page_t * list,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)8371 cpm_allocate(
8372 vm_size_t size,
8373 vm_page_t *list,
8374 ppnum_t max_pnum,
8375 ppnum_t pnum_mask,
8376 boolean_t wire,
8377 int flags)
8378 {
8379 vm_page_t pages;
8380 unsigned int npages;
8381
8382 if (size % PAGE_SIZE != 0) {
8383 return KERN_INVALID_ARGUMENT;
8384 }
8385
8386 npages = (unsigned int) (size / PAGE_SIZE);
8387 if (npages != size / PAGE_SIZE) {
8388 /* 32-bit overflow */
8389 return KERN_INVALID_ARGUMENT;
8390 }
8391
8392 /*
8393 * Obtain a pointer to a subset of the free
8394 * list large enough to satisfy the request;
8395 * the region will be physically contiguous.
8396 */
8397 pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
8398
8399 if (pages == VM_PAGE_NULL) {
8400 return KERN_NO_SPACE;
8401 }
8402 /*
8403 * determine need for wakeups
8404 */
8405 if (vm_page_free_count < vm_page_free_min) {
8406 vm_free_page_lock();
8407 if (vm_pageout_running == FALSE) {
8408 vm_free_page_unlock();
8409 thread_wakeup((event_t) &vm_page_free_wanted);
8410 } else {
8411 vm_free_page_unlock();
8412 }
8413 }
8414
8415 VM_CHECK_MEMORYSTATUS;
8416
8417 /*
8418 * The CPM pages should now be available and
8419 * ordered by ascending physical address.
8420 */
8421 assert(vm_page_verify_contiguous(pages, npages));
8422
8423 if (flags & KMA_ZERO) {
8424 for (vm_page_t m = pages; m; m = NEXT_PAGE(m)) {
8425 vm_page_zero_fill(
8426 m
8427 #if HAS_MTE
8428 , false
8429 #endif /* HAS_MTE */
8430 );
8431 }
8432 }
8433
8434 *list = pages;
8435 return KERN_SUCCESS;
8436 }
8437
8438
/*
 * NOTE(review): presumably the per-run cap on how many pages callers batch
 * before calling vm_page_do_delayed_work() — the consuming code is outside
 * this chunk; confirm against DELAYED_WORK_LIMIT users.
 */
unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
8440
/*
 * when working on a 'run' of pages, it is necessary to hold
 * the vm_page_queue_lock (a hot global lock) for certain operations
 * on the page... however, the majority of the work can be done
 * while merely holding the object lock... in fact there are certain
 * collections of pages that don't require any work brokered by the
 * vm_page_queue_lock... to mitigate the time spent behind the global
 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
 * while doing all of the work that doesn't require the vm_page_queue_lock...
 * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
 * necessary work for each page... we will grab the busy bit on the page
 * if it's not already held so that vm_page_do_delayed_work can drop the object lock
 * if it can't immediately take the vm_page_queue_lock in order to compete
 * for the locks in the same order that vm_pageout_scan takes them.
 * the operation names are modeled after the names of the routines that
 * need to be called in order to make the changes very obvious in the
 * original loop
 *
 * On certain configurations, this function may return failure if any of
 * the pages in the run has a mapping state that doesn't allow the specified
 * operation. In that case, it will still fully process the run of pages
 * in order to avoid requiring the caller to partially undo the work done
 * here.
 *
 * object   - the VM object owning every page in the run; locked on entry
 *            and on return (the lock may be dropped/retaken internally).
 * tag      - wire accounting tag used for DW_vm_page_wire.
 * dwp      - array of dw_count (page, DW_* mask) work items.
 *
 * Returns KERN_SUCCESS, or (CONFIG_SPTM only) KERN_PROTECTION_FAILURE
 * when a writable IOPL wiring was requested on a user-executable page.
 */

kern_return_t
vm_page_do_delayed_work(
	vm_object_t object,
	vm_tag_t tag,
	struct vm_page_delayed_work *dwp,
	int dw_count)
{
	kern_return_t kr = KERN_SUCCESS;
	int j;
	vm_page_t m;
	vm_page_t local_free_q = VM_PAGE_NULL;	/* pages reclaimed via DW_vm_page_free, freed in bulk below */

	/*
	 * pageout_scan takes the vm_page_lock_queues first
	 * then tries for the object lock... to avoid what
	 * is effectively a lock inversion, we'll go to the
	 * trouble of taking them in that same order... otherwise
	 * if this object contains the majority of the pages resident
	 * in the UBC (or a small set of large objects actively being
	 * worked on contain the majority of the pages), we could
	 * cause the pageout_scan thread to 'starve' in its attempt
	 * to find pages to move to the free queue, since it has to
	 * successfully acquire the object lock of any candidate page
	 * before it can steal/clean it.
	 */
	if (!vm_page_trylock_queues()) {
		vm_object_unlock(object);

		/*
		 * "Turnstile enabled vm_pageout_scan" can be runnable
		 * for a very long time without getting on a core.
		 * If this is a higher priority thread it could be
		 * waiting here for a very long time respecting the fact
		 * that pageout_scan would like its object after VPS does
		 * a mutex_pause(0).
		 * So we cap the number of yields in the vm_object_lock_avoid()
		 * case to a single mutex_pause(0) which will give vm_pageout_scan
		 * 10us to run and grab the object if needed.
		 */
		vm_page_lock_queues();

		for (j = 0;; j++) {
			if ((!vm_object_lock_avoid(object) ||
			    (vps_dynamic_priority_enabled && (j > 0))) &&
			    _vm_object_lock_try(object)) {
				break;
			}
			/* couldn't retake the object lock: drop the queues lock, back off, retry */
			vm_page_unlock_queues();
			mutex_pause(j);
			vm_page_lock_queues();
		}
	}
	/* both locks held: apply each page's queued DW_* operations in order */
	for (j = 0; j < dw_count; j++, dwp++) {
		m = dwp->dw_m;

		if (dwp->dw_mask & DW_vm_pageout_throttle_up) {
			vm_pageout_throttle_up(m);
		}
#if CONFIG_PHANTOM_CACHE
		if (dwp->dw_mask & DW_vm_phantom_cache_update) {
			vm_phantom_cache_update(m);
		}
#endif
		if (dwp->dw_mask & DW_vm_page_wire) {
			vm_page_wire(m, tag, FALSE);
			if (dwp->dw_mask & DW_vm_page_iopl_wire) {
#if CONFIG_SPTM
				/*
				 * The SPTM's security model prevents us from allowing writable I/O
				 * mappings of executable pages. We need to check that here,
				 * in the same place that we set vmp_iopl_wired, because this
				 * function may have transiently dropped the VM object lock
				 * before reaching this point, which means that frontloading
				 * this check in the caller may not work in all cases.
				 */
				if ((dwp->dw_mask & DW_vm_page_iopl_wire_write) && PMAP_PAGE_IS_USER_EXECUTABLE(m)) {
					/* record only the first offending page; keep processing the run */
					if (kr == KERN_SUCCESS) {
						kr = KERN_PROTECTION_FAILURE;
						vm_map_guard_exception(VM_PAGE_GET_PHYS_PAGE(m), kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE);
						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
						    KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE),
						    (uintptr_t)(VM_PAGE_GET_PHYS_PAGE(m)));
					}
				} else {
					m->vmp_iopl_wired = true;
				}
#else
				m->vmp_iopl_wired = true;
#endif /* CONFIG_SPTM */
			}
		} else if (dwp->dw_mask & DW_vm_page_unwire) {
			boolean_t queueit;

			/* don't requeue a page that is about to be freed or deactivated anyway */
			queueit = (dwp->dw_mask & (DW_vm_page_free | DW_vm_page_deactivate_internal)) ? FALSE : TRUE;

			vm_page_unwire(m, queueit);
		}
		if (dwp->dw_mask & DW_vm_page_free) {
			vm_page_free_prepare_queues(m);

			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			/*
			 * Add this page to our list of reclaimed pages,
			 * to be freed later.
			 */
			m->vmp_snext = local_free_q;
			local_free_q = m;
		} else {
			/* queue-placement operations; at most one applies per page */
			if (dwp->dw_mask & DW_vm_page_deactivate_internal) {
				vm_page_deactivate_internal(m, FALSE);
			} else if (dwp->dw_mask & DW_vm_page_activate) {
				if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
					vm_page_activate(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_speculate) {
				vm_page_speculate(m, TRUE);
			} else if (dwp->dw_mask & DW_enqueue_cleaned) {
				/*
				 * if we didn't hold the object lock and did this,
				 * we might disconnect the page, then someone might
				 * soft fault it back in, then we would put it on the
				 * cleaned queue, and so we would have a referenced (maybe even dirty)
				 * page on that queue, which we don't want
				 */
				int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

				if ((refmod_state & VM_MEM_REFERENCED)) {
					/*
					 * this page has been touched since it got cleaned; let's activate it
					 * if it hasn't already been
					 */
					VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);

					if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
						vm_page_activate(m);
					}
				} else {
					m->vmp_reference = FALSE;
					vm_page_enqueue_cleaned(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_lru) {
				vm_page_lru(m);
			} else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, TRUE);
				}
			}
			if (dwp->dw_mask & DW_set_reference) {
				m->vmp_reference = TRUE;
			} else if (dwp->dw_mask & DW_clear_reference) {
				m->vmp_reference = FALSE;
			}

			if (dwp->dw_mask & DW_move_page) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, FALSE);

					assert(!is_kernel_object(VM_PAGE_OBJECT(m)));

					vm_page_enqueue_inactive(m, FALSE);
				}
			}
			if (dwp->dw_mask & DW_clear_busy) {
				m->vmp_busy = FALSE;
			}

			if (dwp->dw_mask & DW_PAGE_WAKEUP) {
				vm_page_wakeup(object, m);
			}
#if HAS_MTE
			if (dwp->dw_mask & DW_vm_page_wakeup_tag_storage) {
				assert(m->vmp_ts_wanted);
				mteinfo_tag_storage_wakeup(m, false);
			}
#endif /* HAS_MTE */
		}
	}
	vm_page_unlock_queues();

	/* free the reclaimed pages now that the queues lock is dropped */
	if (local_free_q) {
		vm_page_free_list(local_free_q, TRUE);
	}

	VM_CHECK_MEMORYSTATUS;

	return kr;
}
8654
8655 __abortlike
8656 static void
__vm_page_alloc_list_failed_panic(vm_size_t page_count,kma_flags_t flags,kern_return_t kr)8657 __vm_page_alloc_list_failed_panic(
8658 vm_size_t page_count,
8659 kma_flags_t flags,
8660 kern_return_t kr)
8661 {
8662 panic("vm_page_alloc_list(%zd, 0x%x) failed unexpectedly with %d",
8663 (size_t)page_count, flags, kr);
8664 }
8665
/*
 * vm_page_alloc_list:
 *
 * Grab 'page_count' pages (not necessarily contiguous) and return them
 * in *list, chained through vmp_snext.  Honors KMA_* flags:
 * KMA_LOMEM (allocate from the low-memory grab path), KMA_NOPAGEWAIT
 * (fail rather than block), KMA_ZERO/KMA_NOENCRYPT (zero-fill the
 * result), KMA_NOFAIL (panic instead of returning an error), plus the
 * MTE tagging options on HAS_MTE configurations.
 *
 * Returns KERN_SUCCESS with the list in *list, or
 * KERN_RESOURCE_SHORTAGE (any partially built list is freed back).
 */
kern_return_t
vm_page_alloc_list(vm_size_t page_count, kma_flags_t flags, vm_page_t *list)
{
	vm_page_t page_list = VM_PAGE_NULL;
	vm_page_t mem;
	kern_return_t kr = KERN_SUCCESS;
	int page_grab_count = 0;
	task_t task;

	for (vm_size_t i = 0; i < page_count; i++) {
		/* retry loop: keep trying to grab one page, blocking when allowed */
		for (;;) {
			vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;

#if HAS_MTE
			if (flags & KMA_TAG) {
				options |= VM_PAGE_GRAB_MTE;
			}
			if (vm_mte_tag_storage_for_compressor && (flags & KMA_COMPRESSOR)) {
				/*
				 * These pages will be used in the compressor pool.
				 * Prefer tag storage pages for these allocations.
				 */
				options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
			}
#endif /* HAS_MTE */
			if (flags & KMA_NOPAGEWAIT) {
				options |= VM_PAGE_GRAB_NOPAGEWAIT;
			}
			if (flags & KMA_LOMEM) {
				mem = vm_page_grablo(options);
			} else {
				mem = vm_page_grab_options(options);
			}

			if (mem != VM_PAGE_NULL) {
				break;
			}

			/* grab failed: decide whether to give up or wait for free pages */
			if (flags & KMA_NOPAGEWAIT) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			if ((flags & KMA_LOMEM) && vm_lopage_needed) {
				/* low-memory pool is under pressure; don't wait for it */
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}

			/* VM privileged threads should have waited in vm_page_grab() and not get here. */
			assert(!(current_thread()->options & TH_OPT_VMPRIV));

			if ((flags & KMA_NOFAIL) == 0 && ptoa_64(page_count) > max_mem / 4) {
				/*
				 * Huge request (over a quarter of physical memory):
				 * fail up front if it can't plausibly be satisfied
				 * instead of waiting indefinitely.
				 */
				uint64_t unavailable = ptoa_64(vm_page_wire_count + vm_page_free_target);
				if (unavailable > max_mem || ptoa_64(page_count) > (max_mem - unavailable)) {
					kr = KERN_RESOURCE_SHORTAGE;
					goto out;
				}
			}
			VM_PAGE_WAIT();
		}

		/* push the new page onto the head of the result list */
		page_grab_count++;
		mem->vmp_snext = page_list;
		page_list = mem;
	}

	if ((KMA_ZERO | KMA_NOENCRYPT) & flags) {
		for (mem = page_list; mem; mem = mem->vmp_snext) {
			vm_page_zero_fill(
				mem
#if HAS_MTE
				, false /* zero_tags */
#endif /* HAS_MTE */
				);
		}
	}

out:
	/* account the grabs against the current task's ledger and the global counter */
	task = current_task_early();
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
	}
	counter_add(&vm_page_grab_count_kern, page_grab_count);

	if (kr == KERN_SUCCESS) {
		*list = page_list;
	} else if (flags & KMA_NOFAIL) {
		/* KMA_NOFAIL callers cannot handle failure */
		__vm_page_alloc_list_failed_panic(page_count, flags, kr);
	} else {
		/* give back whatever was grabbed before the failure */
		vm_page_free_list(page_list, FALSE);
	}

	return kr;
}
8759
8760 void
vm_page_set_offset(vm_page_t page,vm_object_offset_t offset)8761 vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
8762 {
8763 page->vmp_offset = offset;
8764 }
8765
8766 vm_page_t
vm_page_get_next(vm_page_t page)8767 vm_page_get_next(vm_page_t page)
8768 {
8769 return page->vmp_snext;
8770 }
8771
8772 vm_object_offset_t
vm_page_get_offset(vm_page_t page)8773 vm_page_get_offset(vm_page_t page)
8774 {
8775 return page->vmp_offset;
8776 }
8777
8778 ppnum_t
vm_page_get_phys_page(vm_page_t page)8779 vm_page_get_phys_page(vm_page_t page)
8780 {
8781 return VM_PAGE_GET_PHYS_PAGE(page);
8782 }
8783
8784
8785 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8786
8787 #if HIBERNATION
8788
/* forward declaration -- defined later in this file (not visible in this chunk) */
static uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
8790
/*
 * Counters describing the hibernation flush and discard passes; zeroed
 * by hibernate_reset_stats() and reported via HIBPRINT from
 * hibernate_flush_memory().
 */
struct hibernate_statistics {
	/* hibernate_flush_queue() / hibernate_drain_pageout_queue() counters */
	int hibernate_considered;		/* pages examined during queue flushes */
	int hibernate_reentered_on_q;		/* pages requeued without being flushed */
	int hibernate_found_dirty;		/* dirty pages handed to vm_pageout_cluster() */
	int hibernate_skipped_cleaning;		/* skipped: page already being cleaned */
	int hibernate_skipped_transient;	/* skipped: busy/absent/error/dead object */
	int hibernate_skipped_precious;		/* skipped: clean precious page */
	int hibernate_skipped_external;		/* skipped: external page while hibernate_skip_external is set */
	int hibernate_queue_nolock;		/* skipped: object lock repeatedly unavailable */
	int hibernate_queue_paused;		/* times we paused waiting for an object lock */
	int hibernate_throttled;		/* times we waited on a throttled pageout queue */
	int hibernate_throttle_timeout;		/* throttle waits that exhausted their retries */
	int hibernate_drained;			/* successful pageout-queue drain waits */
	int hibernate_drain_timeout;		/* drain waits that timed out */
	/*
	 * hibernate_consider_discard() / discard accounting counters
	 * (some are maintained by code outside this chunk).
	 */
	int cd_lock_failed;			/* couldn't try-lock the page's object */
	int cd_found_precious;			/* not discardable: precious */
	int cd_found_wired;			/* not discardable: wired */
	int cd_found_busy;			/* not discardable: busy or dead object */
	int cd_found_unusual;			/* not discardable: absent/unusual/error */
	int cd_found_cleaning;			/* not discardable: being cleaned */
	int cd_found_laundry;			/* not discardable: in the laundry */
	int cd_found_dirty;			/* not discardable: dirty, non-purgeable */
	int cd_found_xpmapped;
	int cd_skipped_xpmapped;
	int cd_local_free;
	int cd_total_free;
	int cd_vm_page_wire_count;
	int cd_vm_struct_pages_unneeded;
	int cd_pages;
	int cd_discarded;
	int cd_count_wire;
} hibernate_stats;
8823
8824 #if CONFIG_SPTM
8825 /**
8826 * On SPTM-based systems don't save any executable pages into the hibernation
8827 * image. The SPTM has stronger guarantees around not allowing write access to
8828 * the executable pages than on older systems, which prevents XNU from being
8829 * able to restore any pages mapped as executable.
8830 */
8831 #define HIBERNATE_XPMAPPED_LIMIT 0ULL
8832 #else /* CONFIG_SPTM */
8833 /*
8834 * clamp the number of 'xpmapped' pages we'll sweep into the hibernation image
8835 * so that we don't overrun the estimated image size, which would
8836 * result in a hibernation failure.
8837 *
8838 * We use a size value instead of pages because we don't want to take up more space
8839 * on disk if the system has a 16K page size vs 4K. Also, we are not guaranteed
8840 * to have that additional space available.
8841 *
8842 * Since this was set at 40000 pages on X86 we are going to use 160MB as our
8843 * xpmapped size.
8844 */
8845 #define HIBERNATE_XPMAPPED_LIMIT ((160 * 1024 * 1024ULL) / PAGE_SIZE)
8846 #endif /* CONFIG_SPTM */
8847
/*
 * hibernate_drain_pageout_queue:
 *
 * Wait for pageout queue 'q' to empty, sleeping up to ~5 seconds per
 * wakeup on the queue's drain event.
 *
 * Returns 0 once the queue is empty (or on a timeout of the external
 * queue, which is tolerated), 1 if any other queue failed to drain
 * before the timeout.
 */
static int
hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
{
	wait_result_t wait_result;

	vm_page_lock_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_draining = TRUE;

		/* 5000ms timeout, 1ms leeway, interruptible wait on the drain event */
		assert_wait_timeout((event_t) (&q->pgo_laundry + 1), THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);

		vm_page_unlock_queues();

		wait_result = thread_block(THREAD_CONTINUE_NULL);

		if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) {
			hibernate_stats.hibernate_drain_timeout++;

			/* a stalled external queue is tolerated; anything else aborts */
			if (q == &vm_pageout_queue_external) {
				return 0;
			}

			return 1;
		}
		vm_page_lock_queues();

		hibernate_stats.hibernate_drained++;
	}
	vm_page_unlock_queues();

	return 0;
}
8881
8882
/*
 * When TRUE, hibernate_flush_queue() leaves external (non-internal)
 * pages alone; set after the external pageout queue's throttle wait
 * repeatedly times out.  Reset at the start of hibernate_flush_memory().
 */
boolean_t hibernate_skip_external = FALSE;
8884
/*
 * hibernate_flush_queue:
 *
 * Examine up to 'qcount' pages from the head of page queue 'q' and
 * push every eligible dirty page to its pageout path (compressor for
 * internal objects, pager for external ones) so the page won't need
 * to be preserved in the hibernation image.  Ineligible pages are
 * requeued.  Throttles against the pageout queues and periodically
 * checks hibernate_should_abort().
 *
 * Returns 0 on success, 1 if the flush aborted.
 */
static int
hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t l_object = NULL;	/* object we currently hold locked, if any */
	vm_object_t m_object = NULL;
	int refmod_state = 0;
	int try_failed_count = 0;
	int retval = 0;
	int current_run = 0;
	struct vm_pageout_queue *iq;
	struct vm_pageout_queue *eq;
	struct vm_pageout_queue *tq;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(q), qcount);

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		/* every 1000 pages, check whether hibernation was aborted */
		if (current_run++ == 1000) {
			if (hibernate_should_abort()) {
				retval = 1;
				break;
			}
			current_run = 0;
		}

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					hibernate_stats.hibernate_queue_nolock++;

					goto reenter_pg_on_q;
				}

				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();

				hibernate_stats.hibernate_queue_paused++;
				continue;
			} else {
				l_object = m_object;
			}
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m)) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_cleaning) {
				hibernate_stats.hibernate_skipped_cleaning++;
			} else {
				hibernate_stats.hibernate_skipped_transient++;
			}

			goto reenter_pg_on_q;
		}
		if (m_object->vo_copy == VM_OBJECT_NULL) {
			if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
				/*
				 * let the normal hibernate image path
				 * deal with these
				 */
				goto reenter_pg_on_q;
			}
		}
		if (!m->vmp_dirty && m->vmp_pmapped) {
			/* pick up any modified-bit state still held by the pmap */
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if ((refmod_state & VM_MEM_MODIFIED)) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		} else {
			refmod_state = 0;
		}

		if (!m->vmp_dirty) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_precious) {
				hibernate_stats.hibernate_skipped_precious++;
			}

			goto reenter_pg_on_q;
		}

		if (hibernate_skip_external == TRUE && !m_object->internal) {
			hibernate_stats.hibernate_skipped_external++;

			goto reenter_pg_on_q;
		}
		tq = NULL;

		/* pick the throttled target queue, if any, to wait on */
		if (m_object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				tq = iq;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			tq = eq;
		}

		if (tq != NULL) {
			wait_result_t wait_result;
			int wait_count = 5;

			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}

			/* wait (up to 5 x 1000ms) for the queue to unthrottle */
			while (retval == 0) {
				tq->pgo_throttled = TRUE;

				assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);

				vm_page_unlock_queues();

				wait_result = thread_block(THREAD_CONTINUE_NULL);

				vm_page_lock_queues();

				if (wait_result != THREAD_TIMED_OUT) {
					break;
				}
				if (!VM_PAGE_Q_THROTTLED(tq)) {
					break;
				}

				if (hibernate_should_abort()) {
					retval = 1;
				}

				if (--wait_count == 0) {
					hibernate_stats.hibernate_throttle_timeout++;

					/* external queue stuck: stop flushing external pages instead of failing */
					if (tq == eq) {
						hibernate_skip_external = TRUE;
						break;
					}
					retval = 1;
				}
			}
			if (retval) {
				break;
			}

			hibernate_stats.hibernate_throttled++;

			continue;
		}
		/*
		 * we've already factored out pages in the laundry which
		 * means this page can't be on the pageout queue so it's
		 * safe to do the vm_page_queues_remove
		 */
		vm_page_queues_remove(m, TRUE);

		if (m_object->internal == TRUE) {
			pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL);
		}

		vm_pageout_cluster(m);

		hibernate_stats.hibernate_found_dirty++;

		goto next_pg;

reenter_pg_on_q:
		/* rotate the page within its queue so we don't revisit it this pass */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		hibernate_stats.hibernate_reentered_on_q++;
next_pg:
		hibernate_stats.hibernate_considered++;

		qcount--;
		try_failed_count = 0;
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);

	return retval;
}
9105
9106
/*
 * hibernate_flush_dirty_pages:
 *
 * Flush dirty pages from every page queue (speculative, inactive,
 * anonymous, cleaned, active) and drain the pageout queues, so only
 * clean pages need to be preserved in the hibernation image.
 *
 * 'pass' is the flush pass number; on pass 1 the compressor's warm-up
 * window is recorded around the active-queue flush.
 *
 * Returns 0 on success, 1 if any flush or drain aborted.
 */
static int
hibernate_flush_dirty_pages(int pass)
{
	struct vm_speculative_age_q *aq;
	uint32_t i;

	/* fold any per-cpu local pages back onto the global queues first */
	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, FALSE);
		}
	}

	for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
		int qcount;
		vm_page_t m;

		aq = &vm_page_queue_speculative[i];

		if (vm_page_queue_empty(&aq->age_q)) {
			continue;
		}
		qcount = 0;

		/* count the queue under the lock; the flush drops/retakes it */
		vm_page_lockspin_queues();

		vm_page_queue_iterate(&aq->age_q, m, vmp_pageq) {
			qcount++;
		}
		vm_page_unlock_queues();

		if (qcount) {
			if (hibernate_flush_queue(&aq->age_q, qcount)) {
				return 1;
			}
		}
	}
	if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) {
		return 1;
	}
	/* XXX FBDP TODO: flush secluded queue */
	if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) {
		return 1;
	}
	if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) {
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		return 1;
	}

	if (pass == 1) {
		vm_compressor_record_warmup_start();
	}

	if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (pass == 1) {
		vm_compressor_record_warmup_end();
	}

	/* only drain the external queue if we're still flushing external pages */
	if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) {
		return 1;
	}

	return 0;
}
9183
9184
9185 void
hibernate_reset_stats(void)9186 hibernate_reset_stats(void)
9187 {
9188 bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
9189 }
9190
9191
/*
 * hibernate_flush_memory:
 *
 * Prepare memory for image capture: flush dirty pages into the
 * compressor, flush the compressor itself, then try to shed wired
 * memory via the buffer cache and a zone GC drain.  Requires the
 * compressor to be configured.
 *
 * Returns 0 on success, non-zero if the dirty-page flush aborted.
 */
int
hibernate_flush_memory(void)
{
	int retval;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);

	hibernate_cleaning_in_progress = TRUE;
	hibernate_skip_external = FALSE;

	if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		vm_compressor_flush();

		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		/* shrink wired memory: buffer cache collect + zone GC drain */
		if (consider_buffer_cache_collect != NULL) {
			unsigned int orig_wire_count;

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
			orig_wire_count = vm_page_wire_count;

			(void)(*consider_buffer_cache_collect)(1);
			zone_gc(ZONE_GC_DRAIN);

			HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
		}
	}
	hibernate_cleaning_in_progress = FALSE;

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);

	if (retval) {
		HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
	}


	HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
	    hibernate_stats.hibernate_considered,
	    hibernate_stats.hibernate_reentered_on_q,
	    hibernate_stats.hibernate_found_dirty);
	HIBPRINT("   skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
	    hibernate_stats.hibernate_skipped_cleaning,
	    hibernate_stats.hibernate_skipped_transient,
	    hibernate_stats.hibernate_skipped_precious,
	    hibernate_stats.hibernate_skipped_external,
	    hibernate_stats.hibernate_queue_nolock);
	HIBPRINT("   queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
	    hibernate_stats.hibernate_queue_paused,
	    hibernate_stats.hibernate_throttled,
	    hibernate_stats.hibernate_throttle_timeout,
	    hibernate_stats.hibernate_drained,
	    hibernate_stats.hibernate_drain_timeout);

	return retval;
}
9253
9254
/*
 * hibernate_page_list_zero:
 *
 * Clear every bank bitmap in 'list' and set the trailing pad bits in
 * each bank's last word.  Banks are laid out back to back: each
 * hibernate_bitmap_t header is followed immediately by its
 * variable-length bitmap[] words, so the bank pointer is advanced
 * past 'bitmapwords' entries on each iteration.
 */
static void
hibernate_page_list_zero(hibernate_page_list_t *list)
{
	uint32_t bank;
	hibernate_bitmap_t * bitmap;

	bitmap = &list->bank_bitmap[0];
	for (bank = 0; bank < list->bank_count; bank++) {
		uint32_t last_bit;

		/* bitmapwords 32-bit words -> (bitmapwords << 2) bytes */
		bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
		// set out-of-bound bits at end of bitmap.
		last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
		if (last_bit) {
			bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
		}

		/* next bank header starts right after this bank's bitmap words */
		bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
	}
}
9275
/*
 * Decide whether page "m" may be discarded (i.e. omitted from the
 * hibernation image) instead of saved.
 *
 * A page is discardable when it is clean, or belongs to a volatile/empty
 * purgeable object.  Anything in flux (busy, wired, precious, being
 * cleaned, in the laundry, unusual) is kept.  Clean external pages that
 * were executed from (xpmapped) and recently referenced are also kept,
 * up to HIBERNATE_XPMAPPED_LIMIT, to avoid re-faulting hot text on wake.
 *
 * When "preflight" is TRUE this is a dry run used only for sizing: no
 * hibernate_stats counters are updated (note cd_found_xpmapped is still
 * consulted for the limit check even in preflight).
 *
 * Returns TRUE if the page can be discarded.
 */
static boolean_t
hibernate_consider_discard(vm_page_t m, boolean_t preflight)
{
	vm_object_t object = NULL;
	int refmod_state;
	boolean_t discard = FALSE;

	do{
		if (vm_page_is_private(m)) {
			panic("hibernate_consider_discard: private");
		}

		object = VM_PAGE_OBJECT(m);

		/* try-lock only: we must never block while walking page queues */
		if (!vm_object_lock_try(object)) {
			object = NULL;
			if (!preflight) {
				hibernate_stats.cd_lock_failed++;
			}
			break;
		}
		if (VM_PAGE_WIRED(m)) {
			if (!preflight) {
				hibernate_stats.cd_found_wired++;
			}
			break;
		}
		if (m->vmp_precious) {
			if (!preflight) {
				hibernate_stats.cd_found_precious++;
			}
			break;
		}
		if (m->vmp_busy || !object->alive) {
			/*
			 * Somebody is playing with this page.
			 */
			if (!preflight) {
				hibernate_stats.cd_found_busy++;
			}
			break;
		}
		if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
			/*
			 * If it's unusual in anyway, ignore it
			 */
			if (!preflight) {
				hibernate_stats.cd_found_unusual++;
			}
			break;
		}
		if (m->vmp_cleaning) {
			if (!preflight) {
				hibernate_stats.cd_found_cleaning++;
			}
			break;
		}
		if (m->vmp_laundry) {
			if (!preflight) {
				hibernate_stats.cd_found_laundry++;
			}
			break;
		}
		if (!m->vmp_dirty) {
			/*
			 * Pull current ref/mod state from the pmap so a page the
			 * hardware dirtied doesn't get misclassified as clean.
			 */
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * If it's clean or purgeable we can discard the page on wakeup.
		 */
		discard = (!m->vmp_dirty)
		    || (VM_PURGABLE_VOLATILE == object->purgable)
		    || (VM_PURGABLE_EMPTY == object->purgable);


		if (discard == FALSE) {
			if (!preflight) {
				hibernate_stats.cd_found_dirty++;
			}
		} else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
			/*
			 * Keep a bounded number of clean, referenced, executable-mapped
			 * file-backed pages in the image rather than discarding them.
			 */
			if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
				if (!preflight) {
					hibernate_stats.cd_found_xpmapped++;
				}
				discard = FALSE;
			} else {
				if (!preflight) {
					hibernate_stats.cd_skipped_xpmapped++;
				}
			}
		}
	}while (FALSE);

	if (object) {
		vm_object_unlock(object);
	}

	return discard;
}
9382
9383
/*
 * Actually discard a page previously judged discardable by
 * hibernate_consider_discard(): disconnect its pmap mappings and free it.
 * If the owning object is volatile purgeable, the object is transitioned
 * to EMPTY and vm_page_purgeable_count is adjusted accordingly.
 *
 * Caller context: page queues are effectively uncontended (hibernation);
 * under MACH_ASSERT/DEBUG the object lock is try-acquired to verify that.
 */
static void
hibernate_discard_page(vm_page_t m)
{
	vm_object_t m_object;

	if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
		/*
		 * If it's unusual in anyway, ignore
		 */
		return;
	}

	m_object = VM_PAGE_OBJECT(m);

#if MACH_ASSERT || DEBUG
	if (!vm_object_lock_try(m_object)) {
		panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
	}
#else
	/* No need to lock page queue for token delete, hibernate_vm_unlock()
	 * makes sure these locks are uncontended before sleep */
#endif  /* MACH_ASSERT || DEBUG */

	if (m->vmp_pmapped == TRUE) {
		/* break all virtual mappings so nothing can touch the page after free */
		__unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	}

	if (m->vmp_laundry) {
		panic("hibernate_discard_page(%p) laundry", m);
	}
	if (vm_page_is_private(m)) {
		panic("hibernate_discard_page(%p) private", m);
	}
	if (vm_page_is_fictitious(m)) {
		panic("hibernate_discard_page(%p) fictitious", m);
	}

	if (VM_PURGABLE_VOLATILE == m_object->purgable) {
		/* object should be on a queue */
		assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL));
		purgeable_q_t old_queue = vm_purgeable_object_remove(m_object);
		assert(old_queue);
		if (m_object->purgeable_when_ripe) {
			vm_purgeable_token_delete_first(old_queue);
		}
		vm_object_lock_assert_exclusive(m_object);
		VM_OBJECT_SET_PURGABLE(m_object, VM_PURGABLE_EMPTY);

		/*
		 * Purgeable ledgers:  pages of VOLATILE and EMPTY objects are
		 * accounted in the "volatile" ledger, so no change here.
		 * We have to update vm_page_purgeable_count, though, since we're
		 * effectively purging this object.
		 */
		unsigned int delta;
		assert(m_object->resident_page_count >= m_object->wired_page_count);
		delta = (m_object->resident_page_count - m_object->wired_page_count);
		assert(vm_page_purgeable_count >= delta);
		assert(delta > 0);
		OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count);
	}

	vm_page_free(m);

#if MACH_ASSERT || DEBUG
	vm_object_unlock(m_object);
#endif  /* MACH_ASSERT || DEBUG */
}
9452
9453 /*
9454 * Grab locks for hibernate_page_list_setall()
9455 */
void
hibernate_vm_lock_queues(void)
{
	/*
	 * Lock order matters and must be the exact reverse of
	 * hibernate_vm_unlock_queues(): compressor object, page queues,
	 * free-page lock, purgeable queue lock, then every per-CPU local
	 * page queue lock.
	 */
	vm_object_lock(compressor_object);
	vm_page_lock_queues();
	vm_free_page_lock();
	lck_mtx_lock(&vm_purgeable_queue_lock);

	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
}
9470
/*
 * Release the locks taken by hibernate_vm_lock_queues(), in the exact
 * reverse order of acquisition.
 */
void
hibernate_vm_unlock_queues(void)
{
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	lck_mtx_unlock(&vm_purgeable_queue_lock);
	vm_free_page_unlock();
	vm_page_unlock_queues();
	vm_object_unlock(compressor_object);
}
9484
#if CONFIG_SPTM
/*
 * Returns true when the page's SPTM frame type requires it to be carried
 * in the wired page list: user JIT and user-debug frames always, and
 * user-executable frames when backed by an internal VM object.
 */
static bool
hibernate_sptm_should_force_page_to_wired_pagelist(vm_page_t vmp)
{
	const sptm_paddr_t page_paddr = ptoa_64(VM_PAGE_GET_PHYS_PAGE(vmp));
	const sptm_frame_type_t ftype = sptm_get_frame_type(page_paddr);
	const vm_object_t page_obj = VM_PAGE_OBJECT(vmp);

	if (ftype == XNU_USER_JIT || ftype == XNU_USER_DEBUG) {
		return true;
	}
	return (ftype == XNU_USER_EXEC) && (page_obj->internal == TRUE);
}
#endif
9497
9498 /*
9499 * Bits zero in the bitmaps => page needs to be saved. All pages default to be saved,
9500 * pages known to VM to not need saving are subtracted.
9501 * Wired pages to be saved are present in page_list_wired, pageable in page_list.
9502 */
9503
/*
 * Populate the hibernation page bitmaps.
 *
 * Every queue of pages known to VM is walked and each page is classified:
 * free pages and discardable pages get their bit SET (not saved); pages
 * that must be preserved keep their bit clear.  Wired pages to be saved
 * end up represented in page_list_wired, pageable ones in page_list.
 *
 * preflight == TRUE: sizing-only dry run; the bitmaps are not written
 *   (the list pointers are nulled below) and no pages are discarded.
 * will_discard == TRUE (non-preflight): discardable pages are actually
 *   freed via hibernate_discard_page() and VM structs are torn down.
 *
 * *pagesOut receives the number of pages the image must contain.
 */
void
hibernate_page_list_setall(hibernate_page_list_t * page_list,
    hibernate_page_list_t * page_list_wired,
    hibernate_page_list_t * page_list_pal,
    boolean_t preflight,
    boolean_t will_discard,
    uint32_t * pagesOut)
{
	uint64_t start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	/* __block: both counters are decremented inside hib_free_boilerplate */
	__block uint32_t pages = page_list->page_count;
	__block uint32_t count_wire = pages;
	uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
	uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
	uint32_t count_discard_active = 0;
	uint32_t count_discard_inactive = 0;
	uint32_t count_retired = 0;
	uint32_t count_discard_cleaned = 0;
	uint32_t count_discard_purgeable = 0;
	uint32_t count_discard_speculative = 0;
	uint32_t count_discard_vm_struct_pages = 0;
	uint32_t bank;
	hibernate_bitmap_t * bitmap;
	hibernate_bitmap_t * bitmap_wired;
	boolean_t discard_all;
	boolean_t discard = FALSE;

	HIBLOG("hibernate_page_list_setall(preflight %d) start\n", preflight);

	if (preflight) {
		/* dry run: never touch the bitmaps and never discard */
		page_list = NULL;
		page_list_wired = NULL;
		page_list_pal = NULL;
		discard_all = FALSE;
	} else {
		discard_all = will_discard;
	}

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		assert(hibernate_vm_locks_are_safe());
		vm_page_lock_queues();
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_LOCK(&lq->vpl_lock);
			}
		}
	}
#endif  /* MACH_ASSERT || DEBUG */


	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);

	clock_get_uptime(&start);

	if (!preflight) {
		hibernate_page_list_zero(page_list);
		hibernate_page_list_zero(page_list_wired);
		hibernate_page_list_zero(page_list_pal);

		hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
		hibernate_stats.cd_pages = pages;
	}

	/* drain per-CPU local queues onto the global queues before walking them */
	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, !preflight);
		}
	}

	/* in preflight the caller hasn't taken the locks for us */
	if (preflight) {
		vm_object_lock(compressor_object);
		vm_page_lock_queues();
		vm_free_page_lock();
	}

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	hibernation_vmqueues_inspection = TRUE;

	/*
	 * Common accounting for a free page: it need not be saved, so mark
	 * it in both bitmaps and drop it from the page/wire counts.
	 */
	__auto_type hib_free_boilerplate = ^(vm_page_t page) {
		assert((page->vmp_q_state == VM_PAGE_ON_FREE_Q) ||
#if XNU_VM_HAS_LOPAGE
		    (page->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
#endif /* XNU_VM_HAS_LOPAGE */
		    (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q));

		pages--;
		count_wire--;

		if (!preflight) {
			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(page));
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(page));

			hibernate_stats.cd_total_free++;

			if (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) {
				hibernate_stats.cd_local_free++;
			}
		}
	};

	/* per-CPU free lists (only walked in the real, non-preflight pass) */
	if (!preflight) {
		percpu_foreach(free_pages_head, free_pages) {
			_vm_page_list_foreach(m, *free_pages_head) {
				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
				hib_free_boilerplate(m);
			}
		}
#if HAS_MTE
		percpu_foreach(mte_pcpu, mte_pcpu) {
			_vm_page_list_foreach(m, mte_pcpu->free_tagged_pages) {
				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
				hib_free_boilerplate(m);
			}
			vm_page_queue_iterate(&mte_pcpu->free_claimed_pages,
			    m, vmp_pageq) {
				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
				hib_free_boilerplate(m);
			}
		}
#endif /* HAS_MTE */
	}

	/* global free queues */
#if CONFIG_SPTM
	if (vm_pages_free_masks()) {
		uint32_t bits = vm_pages_free_mask_len() * MAX_COLORS;
		bitmap_t *map = vm_pages_free_masks_as_bitmap(0);

		for (int bit = bitmap_first(map, bits);
		    bit >= 0; bit = bitmap_next(map, bit)) {
			ppnum_t pnum = pmap_first_pnum + bit;
			vm_page_t mem = vm_page_find_canonical(pnum);

			hib_free_boilerplate(mem);
		}
	} else
#endif /* CONFIG_SPTM */
	{
		vm_page_free_queue_foreach(&vm_page_queue_free, hib_free_boilerplate);
	}
#if HAS_MTE
	mteinfo_free_queue_foreach(hib_free_boilerplate);
#endif /* HAS_MTE */
#if XNU_VM_HAS_LOPAGE
	vm_page_free_queue_foreach(&vm_lopage_queue_free, hib_free_boilerplate);
#endif /* XNU_VM_HAS_LOPAGE */

	/* throttled queue: clean-inactive pages here may be discarded */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

		/* capture next before this page can be freed by discard */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
		    && hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			count_discard_inactive++;
			discard = discard_all;
		} else {
			count_throttled++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}

		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* anonymous (internal inactive) queue */
	m = (vm_page_t)vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
		bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
#if CONFIG_SPTM
		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
#endif
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			/*
			 * If the page must be force-added to the wired page list, prevent it from appearing
			 * in the unwired page list.
			 */
			if (force_to_wired_list) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
			} else {
				count_anonymous++;
			}
		}
		/*
		 * If the page is NOT being forced into the wired page list, remove it from the
		 * wired page list here.
		 */
		if (!force_to_wired_list) {
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* cleaned queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			discard = discard_all;
		} else {
			count_cleaned++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* active queue (discard only with kIOHibernateModeDiscardCleanActive) */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
		bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
#if CONFIG_SPTM
		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
#endif
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			discard = discard_all;
		} else {
			/*
			 * If the page must be force-added to the wired page list, prevent it from appearing
			 * in the unwired page list.
			 */
			if (force_to_wired_list) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
			} else {
				count_active++;
			}
		}
		/*
		 * If the page is NOT being forced into the wired page list, remove it from the
		 * wired page list here.
		 */
		if (!force_to_wired_list) {
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* external inactive queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
		bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
#if CONFIG_SPTM
		force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
#endif
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			/*
			 * If the page must be force-added to the wired page list, prevent it from appearing
			 * in the unwired page list.
			 */
			if (force_to_wired_list) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
			} else {
				count_inactive++;
			}
		}
		/*
		 * If the page is NOT being forced into the wired page list, remove it from the
		 * wired page list here.
		 */
		if (!force_to_wired_list) {
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	/* speculative queues, one per age bucket */
	for (uint32_t i = 0; i <= vm_page_max_speculative_age_q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
			    "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
			    m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);

			next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			discard = FALSE;
			if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
			    hibernate_consider_discard(m, preflight)) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
				count_discard_speculative++;
				discard = discard_all;
			} else {
				count_speculative++;
			}
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (discard) {
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	/* compressor pages are always saved (via the wired list) */
	vm_page_queue_iterate(&compressor_object->memq, m, vmp_listq) {
		assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);

		count_compressor++;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
	}


	/* real discard pass: tear down VM structs that won't be needed in the image */
	if (preflight == FALSE && discard_all == TRUE) {
		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START);

		HIBLOG("hibernate_teardown started\n");
		count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
		HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);

		pages -= count_discard_vm_struct_pages;
		count_wire -= count_discard_vm_struct_pages;

		hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;

		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_END);
	}

	if (!preflight) {
		// pull wired from hibernate_bitmap
		/* page_list keeps a clear bit only where BOTH lists would save the page */
		bitmap = &page_list->bank_bitmap[0];
		bitmap_wired = &page_list_wired->bank_bitmap[0];
		for (bank = 0; bank < page_list->bank_count; bank++) {
			for (uint32_t i = 0; i < bitmap->bitmapwords; i++) {
				bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
			}
			bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords];
			bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
		}
	}

	// machine dependent adjustments
	hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);

	if (!preflight) {
		hibernate_stats.cd_count_wire = count_wire;
		hibernate_stats.cd_discarded = count_discard_active +
		    count_discard_inactive + count_discard_purgeable +
		    count_discard_speculative + count_discard_cleaned +
		    count_discard_vm_struct_pages;
	}

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);

	HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, "
	    "zf %d, throt %d, compr %d, xpmapped %d\n"
	    "   %s discard act %d inact %d purgeable %d "
	    "spec %d cleaned %d retired %d\n",
	    pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative,
	    count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
	    discard_all ? "did" : "could",
	    count_discard_active, count_discard_inactive, count_discard_purgeable,
	    count_discard_speculative, count_discard_cleaned, count_retired);

	if (hibernate_stats.cd_skipped_xpmapped) {
		HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n",
		    hibernate_stats.cd_skipped_xpmapped);
	}

	*pagesOut = pages - count_discard_active - count_discard_inactive -
	    count_discard_purgeable - count_discard_speculative -
	    count_discard_cleaned - count_retired;

	if (preflight && will_discard) {
		/* account for pages the real (discarding) pass will remove */
		*pagesOut -= count_compressor + count_throttled +
		    count_anonymous + count_inactive + count_cleaned +
		    count_speculative + count_active;

		/*
		 * We try to keep max HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image
		 * even if these are clean and so we need to size the hibernation image accordingly.
		 *
		 * NB: We have to assume all HIBERNATE_XPMAPPED_LIMIT pages might show up because 'dirty'
		 * xpmapped pages aren't distinguishable from other 'dirty' pages in preflight. So we might
		 * only see part of the xpmapped pages if we look at 'cd_found_xpmapped' which solely tracks
		 * clean xpmapped pages.
		 *
		 * Since these pages are all cleaned by the time we are in the post-preflight phase, we might
		 * see a much larger number in 'cd_found_xpmapped' now than we did in the preflight phase
		 */
		*pagesOut += HIBERNATE_XPMAPPED_LIMIT;
	}

	hibernation_vmqueues_inspection = FALSE;

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_UNLOCK(&lq->vpl_lock);
			}
		}
		vm_page_unlock_queues();
	}
#endif  /* MACH_ASSERT || DEBUG */

	/* drop the locks we took ourselves in the preflight case */
	if (preflight) {
		vm_free_page_unlock();
		vm_page_unlock_queues();
		vm_object_unlock(compressor_object);
	}

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
}
10002
/*
 * Free every page whose bit is SET in "page_list" (i.e. pages that
 * hibernate_page_list_setall() determined do not need saving), walking
 * the anonymous, speculative, inactive, active and cleaned queues.
 * Tallies are logged per category; dirty purgeable pages count as
 * "purgeable" rather than in their queue's bucket.
 */
void
hibernate_page_list_discard(hibernate_page_list_t * page_list)
{
	uint64_t  start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	uint32_t  i;
	uint32_t  count_discard_active    = 0;
	uint32_t  count_discard_inactive  = 0;
	uint32_t  count_discard_purgeable = 0;
	uint32_t  count_discard_cleaned   = 0;
	uint32_t  count_discard_speculative = 0;


#if MACH_ASSERT || DEBUG
	vm_page_lock_queues();
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&start);

	/* anonymous (internal inactive) queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);

		/* capture next before the page is freed out from under us */
		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* speculative queues, one per age bucket */
	for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

			next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
				count_discard_speculative++;
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	/* external inactive queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	/* active queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* cleaned queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

#if MACH_ASSERT || DEBUG
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	vm_page_unlock_queues();
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
	    nsec / 1000000ULL,
	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
}
10122
/* TRUE once hibernate_create_paddr_map() has built the run table below */
boolean_t       hibernate_paddr_map_inited = FALSE;
/* highest vm_pages[] index still valid after teardown compaction
 * (unsigned, so the -1 initializer deliberately wraps to "none") */
unsigned int    hibernate_teardown_last_valid_compact_indx = -1;
/* NOTE(review): appears to hold hashed pages saved across teardown for
 * rebuild — set/consumed outside this view; confirm before relying on it */
vm_page_t       hibernate_rebuild_hash_list = NULL;

/* teardown bookkeeping counters */
unsigned int    hibernate_teardown_found_tabled_pages = 0;
unsigned int    hibernate_teardown_found_created_pages = 0;
unsigned int    hibernate_teardown_found_free_pages = 0;
unsigned int    hibernate_teardown_vm_page_free_count;


/*
 * One run of physically contiguous pages in vm_pages[]: indices
 * [ppnm_sindx, ppnm_eindx) map to physical pages starting at
 * ppnm_base_paddr (see hibernate_create_paddr_map()).
 */
struct ppnum_mapping {
	struct ppnum_mapping    *ppnm_next;
	ppnum_t                 ppnm_base_paddr;
	unsigned int            ppnm_sindx;
	unsigned int            ppnm_eindx;
};

/* head of the (prepend-ordered) run list */
struct ppnum_mapping    *ppnm_head;
/* cache of the run matched by the previous hibernate_lookup_paddr() call */
struct ppnum_mapping    *ppnm_last_found = NULL;
10142
10143
10144 void
hibernate_create_paddr_map(void)10145 hibernate_create_paddr_map(void)
10146 {
10147 unsigned int i;
10148 ppnum_t next_ppnum_in_run = 0;
10149 struct ppnum_mapping *ppnm = NULL;
10150
10151 if (hibernate_paddr_map_inited == FALSE) {
10152 for (i = 0; i < vm_pages_count; i++) {
10153 if (ppnm) {
10154 ppnm->ppnm_eindx = i;
10155 }
10156
10157 if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) != next_ppnum_in_run) {
10158 ppnm = zalloc_permanent_type(struct ppnum_mapping);
10159
10160 ppnm->ppnm_next = ppnm_head;
10161 ppnm_head = ppnm;
10162
10163 ppnm->ppnm_sindx = i;
10164 ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i));
10165 }
10166 next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) + 1;
10167 }
10168 ppnm->ppnm_eindx = vm_pages_count;
10169
10170 hibernate_paddr_map_inited = TRUE;
10171 }
10172 }
10173
10174 static ppnum_t
hibernate_lookup_paddr(unsigned int indx)10175 hibernate_lookup_paddr(unsigned int indx)
10176 {
10177 struct ppnum_mapping *ppnm = NULL;
10178
10179 ppnm = ppnm_last_found;
10180
10181 if (ppnm) {
10182 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10183 goto done;
10184 }
10185 }
10186 for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
10187 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10188 ppnm_last_found = ppnm;
10189 break;
10190 }
10191 }
10192 if (ppnm == NULL) {
10193 panic("hibernate_lookup_paddr of %d failed", indx);
10194 }
10195 done:
10196 return ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx);
10197 }
10198
10199
10200 static uint32_t
hibernate_mark_as_unneeded(addr64_t saddr,addr64_t eaddr,hibernate_page_list_t * page_list,hibernate_page_list_t * page_list_wired)10201 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
10202 {
10203 addr64_t saddr_aligned;
10204 addr64_t eaddr_aligned;
10205 addr64_t addr;
10206 ppnum_t paddr;
10207 unsigned int mark_as_unneeded_pages = 0;
10208
10209 saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
10210 eaddr_aligned = eaddr & ~PAGE_MASK_64;
10211
10212 for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
10213 paddr = pmap_find_phys(kernel_pmap, addr);
10214
10215 assert(paddr);
10216
10217 hibernate_page_bitset(page_list, TRUE, paddr);
10218 hibernate_page_bitset(page_list_wired, TRUE, paddr);
10219
10220 mark_as_unneeded_pages++;
10221 }
10222 return mark_as_unneeded_pages;
10223 }
10224
10225
10226 static void
hibernate_hash_insert_page(vm_page_t mem)10227 hibernate_hash_insert_page(vm_page_t mem)
10228 {
10229 vm_page_bucket_t *bucket;
10230 int hash_id;
10231 vm_object_t m_object;
10232
10233 m_object = VM_PAGE_OBJECT(mem);
10234
10235 assert(mem->vmp_hashed);
10236 assert(m_object);
10237 assert(mem->vmp_offset != (vm_object_offset_t) -1);
10238
10239 /*
10240 * Insert it into the object_object/offset hash table
10241 */
10242 hash_id = vm_page_hash(m_object, mem->vmp_offset);
10243 bucket = &vm_page_buckets[hash_id];
10244
10245 mem->vmp_next_m = bucket->page_list;
10246 bucket->page_list = VM_PAGE_PACK_PTR(mem);
10247 }
10248
10249
/*
 * Release the accumulated batch of pages in "list" to the free queues,
 * then reset the list to empty so the caller can keep accumulating.
 */
static void
hibernate_free_range_flush(vm_page_list_t *list)
{
	vm_page_free_queue_enter_list(*list, VMP_RELEASE_HIBERNATE);
	*list = (vm_page_list_t){ };
}
10256
10257 static void
hibernate_free_range(vm_page_list_t * list,int sindx,int eindx)10258 hibernate_free_range(vm_page_list_t *list, int sindx, int eindx)
10259 {
10260 for (; sindx < eindx; sindx++) {
10261 vm_page_t mem = vm_page_get(sindx);
10262 ppnum_t pnum = hibernate_lookup_paddr(sindx);
10263
10264 vm_page_init(mem, pnum);
10265 #if HAS_MTE
10266 mem->vmp_using_mte = pmap_is_tagged_page(pnum);
10267 #endif /* HAS_MTE */
10268 vm_page_list_push(list, mem);
10269
10270 /* Max batch size of these lists is 255 due to vmp_free_list_result_t */
10271 if (list->vmpl_count >= UINT8_MAX) {
10272 hibernate_free_range_flush(list);
10273 }
10274 }
10275 }
10276
/*
 * Inverse of hibernate_teardown_vm_structs(): runs after resume to restore
 * vm_pages[] and the page hash to their pre-hibernation state. Moves each
 * compacted vm_page_t back to its original slot, re-inserts hashed pages,
 * and re-creates the free list for every hole in between.
 */
void
hibernate_rebuild_vm_structs(void)
{
	int cindx, sindx, eindx;
	vm_page_list_t list = { };
	vm_page_t mem, tmem, mem_next;
	AbsoluteTime startTime, endTime;
	uint64_t nsec;

	if (!hibernate_rebuild_needed) {
		return;
	}

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START);
	HIBLOG("hibernate_rebuild started\n");

	clock_get_uptime(&startTime);

	pal_hib_rebuild_pmap_structs();

	/* the hash buckets were marked unneeded at teardown; rebuild from scratch */
	bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
	eindx = vm_pages_count;

	/*
	 * Mark all the vm_pages[] that have not been initialized yet as being
	 * transient. This is needed to ensure that buddy page search is correct.
	 * Without this, random data in these vm_pages[] can trip the buddy search.
	 */
	for (int i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) {
		vm_page_get(i)->vmp_q_state = VM_PAGE_NOT_ON_Q;
	}

	/* walk the compacted entries back-to-front, moving each page home */
	for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		mem = vm_page_get(cindx);
		assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
		/*
		 * hibernate_teardown_vm_structs leaves the location where
		 * this vm_page_t must be located in "next".
		 */
		tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
		assert(tmem >= mem);

		sindx = (int)(tmem - vm_page_get(0));

		if (mem != tmem) {
			/*
			 * this vm_page_t was moved by hibernate_teardown_vm_structs,
			 * so move it back to its real location
			 */
			*tmem = *mem;
			mem = tmem;
		}
		if (mem->vmp_hashed) {
			hibernate_hash_insert_page(mem);
		}
		/*
		 * the 'hole' between this vm_page_t and the previous
		 * vm_page_t we moved needs to be initialized as
		 * a range of free vm_page_t's
		 */
		hibernate_free_range(&list, sindx + 1, eindx);

		eindx = sindx;
	}
	/* everything below the lowest restored slot is free as well */
	hibernate_free_range(&list, 0, sindx);
	hibernate_free_range_flush(&list);

	VM_CHECK_MEMORYSTATUS;

	assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);

	/*
	 * process the list of vm_page_t's that were entered in the hash,
	 * but were not located in the vm_pages array... these are
	 * vm_page_t's that were created on the fly (i.e. fictitious)
	 */
	for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
		mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

		mem->vmp_next_m = 0;
		hibernate_hash_insert_page(mem);
	}
	hibernate_rebuild_hash_list = NULL;

	clock_get_uptime(&endTime);
	SUB_ABSOLUTETIME(&endTime, &startTime);
	absolutetime_to_nanoseconds(endTime, &nsec);

	HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);

	hibernate_rebuild_needed = false;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END);
}
10372
/*
 * Prepare VM bookkeeping for hibernation image capture: detach out-of-array
 * (fictitious) pages from the hash, compact vm_pages[] so all in-use entries
 * sit at or below hibernate_teardown_last_valid_compact_indx, then mark the
 * reconstructible storage (hash buckets, the tail of vm_pages[], pmap
 * scratch) as unneeded so it is excluded from the image. Undone on resume
 * by hibernate_rebuild_vm_structs(). Returns the count of pages marked
 * unneeded (0 if hibernation is aborting).
 */
static uint32_t
hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
{
	unsigned int compact_target_indx;
	unsigned int mark_as_unneeded_pages = 0;
	unsigned int unneeded_vm_page_bucket_pages = 0;
	unsigned int unneeded_vm_pages_pages = 0;
	unsigned int unneeded_pmap_pages = 0;
	addr64_t start_of_unneeded = 0;
	addr64_t end_of_unneeded = 0;


	if (hibernate_should_abort()) {
		return 0;
	}

	hibernate_rebuild_needed = true;

	HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, "
	    "active_pages %d, inactive_pages %d, speculative_pages %d, "
	    "cleaned_pages %d, compressor_pages %d\n",
	    vm_page_wire_count, vm_page_free_count,
	    vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
	    vm_page_cleaned_count, compressor_object->resident_page_count);

	/*
	 * Pages not backed by vm_pages[] can't survive the compaction below;
	 * chain them on hibernate_rebuild_hash_list so the rebuild pass can
	 * re-insert them into the hash.
	 */
	for (uint32_t i = 0; i < vm_page_bucket_count; i++) {
		vm_page_bucket_t *bucket = &vm_page_buckets[i];
		vm_page_t mem, mem_next;

		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
			assert(mem->vmp_hashed);

			mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

			if (!vm_page_in_array(mem)) {
				mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
				hibernate_rebuild_hash_list = mem;
			}
		}
	}
	/* the hash buckets themselves are rebuilt from scratch on resume */
	unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0],
	    (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;

	hibernate_teardown_vm_page_free_count = vm_page_free_count;

	compact_target_indx = 0;

	vm_free_page_lock();

	/*
	 * Compact vm_pages[]: pull free pages off the free queues and copy
	 * in-use entries down into the holes they leave, tracking the highest
	 * index that still holds valid data.
	 */
	for (uint32_t i = 0; i < vm_pages_count; i++) {
		vm_page_t mem = vm_page_get(i);
		ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
		vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);

		if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
			vm_page_free_queue_remove(class, mem, pnum,
			    VM_PAGE_ON_FREE_Q);
			hibernate_teardown_found_free_pages++;

			/* remember the first slot freed up that can be refilled */
			if (vm_page_get(compact_target_indx)->vmp_q_state != VM_PAGE_ON_FREE_Q) {
				compact_target_indx = i;
			}
		} else {
			/*
			 * record this vm_page_t's original location
			 * we need this even if it doesn't get moved
			 * as an indicator to the rebuild function that
			 * we don't have to move it
			 */
			mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);

			if (vm_page_get(compact_target_indx)->vmp_q_state == VM_PAGE_ON_FREE_Q) {
				/*
				 * we've got a hole to fill, so
				 * move this vm_page_t to it's new home
				 */
				*vm_page_get(compact_target_indx) = *mem;
				mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

				hibernate_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				hibernate_teardown_last_valid_compact_indx = i;
			}
		}
	}

	vm_free_page_unlock();

	/* the tail of vm_pages[] above the last valid entry is reconstructible */
	unneeded_vm_pages_pages = hibernate_mark_as_unneeded(
		(addr64_t)vm_page_get(hibernate_teardown_last_valid_compact_indx + 1),
		(addr64_t)vm_page_get(vm_pages_count - 1),
		page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_pages_pages;

	pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);

	if (start_of_unneeded) {
		unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded,
		    end_of_unneeded, page_list, page_list_wired);
		mark_as_unneeded_pages += unneeded_pmap_pages;
	}
	HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n",
	    unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);

	return mark_as_unneeded_pages;
}
10481
10482 #endif /* HIBERNATION */
10483
10484 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10485
10486 #include <mach_vm_debug.h>
10487 #if MACH_VM_DEBUG
10488
10489 #include <mach_debug/hash_info.h>
10490 #include <vm/vm_debug_internal.h>
10491
10492 /*
10493 * Routine: vm_page_info
10494 * Purpose:
10495 * Return information about the global VP table.
10496 * Fills the buffer with as much information as possible
10497 * and returns the desired size of the buffer.
10498 * Conditions:
10499 * Nothing locked. The caller should provide
10500 * possibly-pageable memory.
10501 */
10502
10503 unsigned int
vm_page_info(hash_info_bucket_t * info,unsigned int count)10504 vm_page_info(
10505 hash_info_bucket_t *info,
10506 unsigned int count)
10507 {
10508 unsigned int i;
10509 lck_ticket_t *bucket_lock;
10510
10511 if (vm_page_bucket_count < count) {
10512 count = vm_page_bucket_count;
10513 }
10514
10515 for (i = 0; i < count; i++) {
10516 vm_page_bucket_t *bucket = &vm_page_buckets[i];
10517 unsigned int bucket_count = 0;
10518 vm_page_t m;
10519
10520 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10521 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10522
10523 for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10524 m != VM_PAGE_NULL;
10525 m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) {
10526 bucket_count++;
10527 }
10528
10529 lck_ticket_unlock(bucket_lock);
10530
10531 /* don't touch pageable memory while holding locks */
10532 info[i].hib_count = bucket_count;
10533 }
10534
10535 return vm_page_bucket_count;
10536 }
10537 #endif /* MACH_VM_DEBUG */
10538
10539 #if VM_PAGE_BUCKETS_CHECK
10540 void
vm_page_buckets_check(void)10541 vm_page_buckets_check(void)
10542 {
10543 unsigned int i;
10544 vm_page_t p;
10545 unsigned int p_hash;
10546 vm_page_bucket_t *bucket;
10547 lck_ticket_t *bucket_lock;
10548
10549 if (!vm_page_buckets_check_ready) {
10550 return;
10551 }
10552
10553 #if HIBERNATION
10554 if (hibernate_rebuild_needed ||
10555 hibernate_rebuild_hash_list) {
10556 panic("BUCKET_CHECK: hibernation in progress: "
10557 "rebuild_needed=%d rebuild_hash_list=%p\n",
10558 hibernate_rebuild_needed,
10559 hibernate_rebuild_hash_list);
10560 }
10561 #endif /* HIBERNATION */
10562
10563 #if VM_PAGE_FAKE_BUCKETS
10564 char *cp;
10565 for (cp = (char *) vm_page_fake_buckets_start;
10566 cp < (char *) vm_page_fake_buckets_end;
10567 cp++) {
10568 if (*cp != 0x5a) {
10569 panic("BUCKET_CHECK: corruption at %p in fake buckets "
10570 "[0x%llx:0x%llx]\n",
10571 cp,
10572 (uint64_t) vm_page_fake_buckets_start,
10573 (uint64_t) vm_page_fake_buckets_end);
10574 }
10575 }
10576 #endif /* VM_PAGE_FAKE_BUCKETS */
10577
10578 for (i = 0; i < vm_page_bucket_count; i++) {
10579 vm_object_t p_object;
10580
10581 bucket = &vm_page_buckets[i];
10582 if (!bucket->page_list) {
10583 continue;
10584 }
10585
10586 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10587 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10588 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10589
10590 while (p != VM_PAGE_NULL) {
10591 p_object = VM_PAGE_OBJECT(p);
10592
10593 if (!p->vmp_hashed) {
10594 panic("BUCKET_CHECK: page %p (%p,0x%llx) "
10595 "hash %d in bucket %d at %p "
10596 "is not hashed\n",
10597 p, p_object, p->vmp_offset,
10598 p_hash, i, bucket);
10599 }
10600 p_hash = vm_page_hash(p_object, p->vmp_offset);
10601 if (p_hash != i) {
10602 panic("BUCKET_CHECK: corruption in bucket %d "
10603 "at %p: page %p object %p offset 0x%llx "
10604 "hash %d\n",
10605 i, bucket, p, p_object, p->vmp_offset,
10606 p_hash);
10607 }
10608 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
10609 }
10610 lck_ticket_unlock(bucket_lock);
10611 }
10612
10613 // printf("BUCKET_CHECK: checked buckets\n");
10614 }
10615 #endif /* VM_PAGE_BUCKETS_CHECK */
10616
10617 /*
10618 * 'vm_fault_enter' will place newly created pages (zero-fill and COW) onto the
 * local queues if they exist... it's the only spot in the system where we add pages
10620 * to those queues... once on those queues, those pages can only move to one of the
10621 * global page queues or the free queues... they NEVER move from local q to local q.
10622 * the 'local' state is stable when vm_page_queues_remove is called since we're behind
10623 * the global vm_page_queue_lock at this point... we still need to take the local lock
 * in case this operation is being run on a different CPU than the local queue's identity,
10625 * but we don't have to worry about the page moving to a global queue or becoming wired
10626 * while we're grabbing the local lock since those operations would require the global
10627 * vm_page_queue_lock to be held, and we already own it.
10628 *
 * this is why it's safe to utilize the wire_count field in the vm_page_t as the local_id...
10630 * 'wired' and local are ALWAYS mutually exclusive conditions.
10631 */
10632
/*
 * Remove "mem" from whichever paging queue it currently occupies, leaving it
 * in VM_PAGE_NOT_ON_Q state, and fix up the corresponding counters.
 * Caller must hold the global vm_page_queue_lock. When remove_from_specialq
 * is TRUE the page is also pulled off the special queue; otherwise a pending
 * special-queue request is preserved (see the wired case below).
 * Pages on the pageout queue are NOT handled here — see the note in the
 * default case.
 */
void
vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_specialq)
{
	boolean_t was_pageable = TRUE;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(mem);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	/* not on any paging queue: only a special-queue entry may need unlinking */
	if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) {
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
		if (remove_from_specialq == TRUE) {
			vm_page_remove_from_specialq(mem);
		}
		/*if (mem->vmp_on_specialq != VM_PAGE_SPECIAL_Q_EMPTY) {
		 * assert(mem->vmp_specialq.next != 0);
		 * assert(mem->vmp_specialq.prev != 0);
		 * } else {*/
		if (mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
			assert(mem->vmp_specialq.next == 0);
			assert(mem->vmp_specialq.prev == 0);
		}
		return;
	}

	/* compressor-owned pages are never on paging or special queues */
	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
		assert(mem->vmp_specialq.next == 0 &&
		    mem->vmp_specialq.prev == 0 &&
		    mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
		return;
	}
	if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
		/*
		 * might put these guys on a list for debugging purposes
		 * if we do, we'll need to remove this assert
		 */
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
		assert(mem->vmp_specialq.next == 0 &&
		    mem->vmp_specialq.prev == 0);
		/*
		 * Recall that vmp_on_specialq also means a request to put
		 * it on the special Q. So we don't want to reset that bit
		 * just because a wiring request came in. We might want to
		 * put it on the special queue post-unwiring.
		 *
		 * &&
		 * mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
		 */
		return;
	}

	assert(m_object != compressor_object);
	assert(!is_kernel_object(m_object));
	assert(!vm_page_is_fictitious(mem));

	switch (mem->vmp_q_state) {
	case VM_PAGE_ON_ACTIVE_LOCAL_Q:
	{
		struct vpl *lq;

		/* per-CPU local queue: take that CPU's local lock (see comment above fn) */
		lq = zpercpu_get_cpu(vm_page_local_q, mem->vmp_local_id);
		VPL_LOCK(&lq->vpl_lock);
		vm_page_queue_remove(&lq->vpl_queue, mem, vmp_pageq);
		mem->vmp_local_id = 0;
		lq->vpl_count--;
		if (m_object->internal) {
			lq->vpl_internal_count--;
		} else {
			lq->vpl_external_count--;
		}
		VPL_UNLOCK(&lq->vpl_lock);
		was_pageable = FALSE;
		break;
	}
	case VM_PAGE_ON_ACTIVE_Q:
	{
		vm_page_queue_remove(&vm_page_queue_active, mem, vmp_pageq);
		vm_page_active_count--;
		break;
	}

	case VM_PAGE_ON_INACTIVE_INTERNAL_Q:
	{
		assert(m_object->internal == TRUE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_anonymous, mem, vmp_pageq);
		vm_page_anonymous_count--;

		vm_purgeable_q_advance_all();
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_INACTIVE_EXTERNAL_Q:
	{
		assert(m_object->internal == FALSE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_inactive, mem, vmp_pageq);
		vm_purgeable_q_advance_all();
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_INACTIVE_CLEANED_Q:
	{
		assert(m_object->internal == FALSE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_cleaned, mem, vmp_pageq);
		vm_page_cleaned_count--;
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_THROTTLED_Q:
	{
		assert(m_object->internal == TRUE);

		vm_page_queue_remove(&vm_page_queue_throttled, mem, vmp_pageq);
		vm_page_throttled_count--;
		was_pageable = FALSE;
		break;
	}

	case VM_PAGE_ON_SPECULATIVE_Q:
	{
		assert(m_object->internal == FALSE);

		vm_page_remque(&mem->vmp_pageq);
		vm_page_speculative_count--;
		vm_page_balance_inactive(3);
		break;
	}

#if CONFIG_SECLUDED_MEMORY
	case VM_PAGE_ON_SECLUDED_Q:
	{
		vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
		vm_page_secluded_count--;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		if (m_object == VM_OBJECT_NULL) {
			vm_page_secluded_count_free--;
			was_pageable = FALSE;
		} else {
			assert(!m_object->internal);
			vm_page_secluded_count_inuse--;
			was_pageable = FALSE;
			// was_pageable = TRUE;
		}
		break;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	default:
	{
		/*
		 * if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
		 * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
		 * the caller is responsible for determing if the page is on that queue, and if so, must
		 * either first remove it (it needs both the page queues lock and the object lock to do
		 * this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove
		 *
		 * we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
		 * or any of the undefined states
		 */
		panic("vm_page_queues_remove - bad page q_state (%p, %d)", mem, mem->vmp_q_state);
		break;
	}
	}
	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

	if (remove_from_specialq == TRUE) {
		vm_page_remove_from_specialq(mem);
	}
	/* pages that came off a global pageable queue adjust the pageable counts */
	if (was_pageable) {
		if (m_object->internal) {
			vm_page_pageable_internal_count--;
		} else {
			vm_page_pageable_external_count--;
		}
	}
}
10820
10821 void
vm_page_remove_internal(vm_page_t page)10822 vm_page_remove_internal(vm_page_t page)
10823 {
10824 vm_object_t __object = VM_PAGE_OBJECT(page);
10825 if (page == __object->memq_hint) {
10826 vm_page_t __new_hint;
10827 vm_page_queue_entry_t __qe;
10828 __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
10829 if (vm_page_queue_end(&__object->memq, __qe)) {
10830 __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
10831 if (vm_page_queue_end(&__object->memq, __qe)) {
10832 __qe = NULL;
10833 }
10834 }
10835 __new_hint = (vm_page_t)((uintptr_t) __qe);
10836 __object->memq_hint = __new_hint;
10837 }
10838 vm_page_queue_remove(&__object->memq, page, vmp_listq);
10839 #if CONFIG_SECLUDED_MEMORY
10840 if (__object->eligible_for_secluded) {
10841 vm_page_secluded.eligible_for_secluded--;
10842 }
10843 #endif /* CONFIG_SECLUDED_MEMORY */
10844 #if HAS_MTE
10845 assert_mte_vmo_matches_vmp(__object, page);
10846 #endif /* HAS_MTE */
10847 }
10848
10849 void
vm_page_enqueue_inactive(vm_page_t mem,boolean_t first)10850 vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
10851 {
10852 vm_object_t m_object;
10853
10854 m_object = VM_PAGE_OBJECT(mem);
10855
10856 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10857 assert(!vm_page_is_fictitious(mem));
10858 assert(!mem->vmp_laundry);
10859 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10860 vm_page_check_pageable_safe(mem);
10861
10862 if (m_object->internal) {
10863 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
10864
10865 if (first == TRUE) {
10866 vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vmp_pageq);
10867 } else {
10868 vm_page_queue_enter(&vm_page_queue_anonymous, mem, vmp_pageq);
10869 }
10870
10871 vm_page_anonymous_count++;
10872 vm_page_pageable_internal_count++;
10873 } else {
10874 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
10875
10876 if (first == TRUE) {
10877 vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vmp_pageq);
10878 } else {
10879 vm_page_queue_enter(&vm_page_queue_inactive, mem, vmp_pageq);
10880 }
10881
10882 vm_page_pageable_external_count++;
10883 }
10884 vm_page_inactive_count++;
10885 token_new_pagecount++;
10886
10887 vm_page_add_to_specialq(mem, FALSE);
10888 }
10889
10890 void
vm_page_enqueue_active(vm_page_t mem,boolean_t first)10891 vm_page_enqueue_active(vm_page_t mem, boolean_t first)
10892 {
10893 vm_object_t m_object;
10894
10895 m_object = VM_PAGE_OBJECT(mem);
10896
10897 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10898 assert(!vm_page_is_fictitious(mem));
10899 assert(!mem->vmp_laundry);
10900 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10901 vm_page_check_pageable_safe(mem);
10902
10903 mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
10904 if (first == TRUE) {
10905 vm_page_queue_enter_first(&vm_page_queue_active, mem, vmp_pageq);
10906 } else {
10907 vm_page_queue_enter(&vm_page_queue_active, mem, vmp_pageq);
10908 }
10909 vm_page_active_count++;
10910
10911 if (m_object->internal) {
10912 vm_page_pageable_internal_count++;
10913 } else {
10914 vm_page_pageable_external_count++;
10915 }
10916
10917 vm_page_add_to_specialq(mem, FALSE);
10918 vm_page_balance_inactive(3);
10919 }
10920
10921 /*
10922 * Pages from special kernel objects shouldn't
10923 * be placed on pageable queues.
10924 */
10925 void
vm_page_check_pageable_safe(vm_page_t page)10926 vm_page_check_pageable_safe(vm_page_t page)
10927 {
10928 vm_object_t page_object;
10929
10930 page_object = VM_PAGE_OBJECT(page);
10931
10932 if (is_kernel_object(page_object)) {
10933 panic("vm_page_check_pageable_safe: trying to add page"
10934 "from a kernel object to pageable queue");
10935 }
10936
10937 if (page_object == compressor_object) {
10938 panic("vm_page_check_pageable_safe: trying to add page"
10939 "from compressor object (%p) to pageable queue", compressor_object);
10940 }
10941 }
10942
10943 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
10944 * wired page diagnose
10945 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10946
10947 #include <libkern/OSKextLibPrivate.h>
10948
/*
 * Total allocation size for a vm_allocation_site carrying an inline name of
 * "namelen" bytes (plus NUL) and "subtotalscount" trailing subtotal slots.
 */
#define KA_SIZE(namelen, subtotalscount) \
    (sizeof(struct vm_allocation_site) + (namelen) + 1 + ((subtotalscount) * sizeof(struct vm_allocation_total)))

/* The site's name is stored immediately after its subtotals array. */
#define KA_NAME(alloc) \
    ((char *)(&(alloc)->subtotals[(alloc->subtotalscount)]))

/* The name length is packed into the high bits of the site's flags field. */
#define KA_NAME_LEN(alloc) \
    (VM_TAG_NAME_LEN_MAX & (alloc->flags >> VM_TAG_NAME_LEN_SHIFT))
10957
/*
 * Resolve the VM tag for the current allocation by walking the kernel stack
 * backtrace until a return address outside core kernel text is found, then
 * asking OSKext for that caller's allocation site. An explicitly-set
 * per-thread allocation name takes precedence over the backtrace walk.
 * Returns the resolved tag, or VM_KERN_MEMORY_NONE when nothing matched.
 */
vm_tag_t
vm_tag_bt(void)
{
	uintptr_t* frameptr;
	uintptr_t* frameptr_next;
	uintptr_t retaddr;
	uintptr_t kstackb, kstackt;
	const vm_allocation_site_t * site;
	thread_t cthread;
	kern_allocation_name_t name;

	cthread = current_thread();
	if (__improbable(cthread == NULL)) {
		return VM_KERN_MEMORY_OSFMK;
	}

	/* a thread-level allocation name overrides the backtrace heuristic */
	if ((name = thread_get_kernel_state(cthread)->allocation_name)) {
		if (!name->tag) {
			vm_tag_alloc(name);
		}
		return name->tag;
	}

	kstackb = cthread->kernel_stack;
	kstackt = kstackb + kernel_stack_size;

	/* Load stack frame pointer (EBP on x86) into frameptr */
	frameptr = __builtin_frame_address(0);
	site = NULL;
	while (frameptr != NULL) {
		/* Verify thread stack bounds */
		if (((uintptr_t)(frameptr + 2) > kstackt) || ((uintptr_t)frameptr < kstackb)) {
			break;
		}

		/* Next frame pointer is pointed to by the previous one */
		frameptr_next = (uintptr_t*) *frameptr;
#if defined(HAS_APPLE_PAC)
		/* strip pointer authentication bits before dereferencing */
		frameptr_next = ptrauth_strip(frameptr_next, ptrauth_key_frame_pointer);
#endif

		/* Pull return address from one spot above the frame pointer */
		retaddr = *(frameptr + 1);

#if defined(HAS_APPLE_PAC)
		retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
#endif

		/* stop at the first return address outside core kernel text */
		if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
		    || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
			site = OSKextGetAllocationSiteForCaller(retaddr);
			break;
		}
		frameptr = frameptr_next;
	}

	if (site) {
		return site->tag;
	}

#if MACH_ASSERT
	/*
	 * Kernel tests appear here as unrecognized call sites and would get
	 * no memory tag. Give them a default tag to prevent panics later.
	 */
	if (thread_get_test_option(test_option_vm_prevent_wire_tag_panic)) {
		return VM_KERN_MEMORY_OSFMK;
	}
#endif

	return VM_KERN_MEMORY_NONE;
}
11030
/* Bitmap of unallocated dynamic tag values (bit set => tag free), 64 per word. */
static uint64_t free_tag_bits[VM_MAX_TAG_VALUE / 64];
11032
/*
 * Assign a dynamic tag to "site"; called with vm_allocation_sites_lock held.
 * Prefers a tag from the free bitmap. When none is free, tries to recycle a
 * named site that is idle (zero total, single reference), returning the
 * evicted site through *releasesiteP so the caller can drop its reference
 * after unlocking. If nothing can be reclaimed, the shared
 * VM_KERN_MEMORY_ANY tag is used.
 */
void
vm_tag_alloc_locked(vm_allocation_site_t * site, vm_allocation_site_t ** releasesiteP)
{
	vm_tag_t tag;
	uint64_t avail;
	uint32_t idx;
	vm_allocation_site_t * prev;

	if (site->tag) {
		return;
	}

	idx = 0;
	while (TRUE) {
		avail = free_tag_bits[idx];
		if (avail) {
			/* bitmap is MSB-first: clz yields the lowest free tag in this word */
			tag = (vm_tag_t)__builtin_clzll(avail);
			avail &= ~(1ULL << (63 - tag));
			free_tag_bits[idx] = avail;
			tag += (idx << 6);
			break;
		}
		idx++;
		if (idx >= ARRAY_COUNT(free_tag_bits)) {
			/* no free tags: scan for a named, idle site to recycle */
			for (idx = 0; idx < ARRAY_COUNT(vm_allocation_sites); idx++) {
				prev = vm_allocation_sites[idx];
				if (!prev) {
					continue;
				}
				if (!KA_NAME_LEN(prev)) {
					continue;
				}
				if (!prev->tag) {
					continue;
				}
				if (prev->total) {
					continue;
				}
				if (1 != prev->refcount) {
					continue;
				}

				assert(idx == prev->tag);
				tag = (vm_tag_t)idx;
				prev->tag = VM_KERN_MEMORY_NONE;
				*releasesiteP = prev;
				break;
			}
			/* nothing reclaimable: fall back to the shared tag */
			if (idx >= ARRAY_COUNT(vm_allocation_sites)) {
				tag = VM_KERN_MEMORY_ANY;
			}
			break;
		}
	}
	site->tag = tag;

	OSAddAtomic16(1, &site->refcount);

	if (VM_KERN_MEMORY_ANY != tag) {
		vm_allocation_sites[tag] = site;
	}

	if (tag > vm_allocation_tag_highest) {
		vm_allocation_tag_highest = tag;
	}
}
11099
11100 static void
vm_tag_free_locked(vm_tag_t tag)11101 vm_tag_free_locked(vm_tag_t tag)
11102 {
11103 uint64_t avail;
11104 uint32_t idx;
11105 uint64_t bit;
11106
11107 if (VM_KERN_MEMORY_ANY == tag) {
11108 return;
11109 }
11110
11111 idx = (tag >> 6);
11112 avail = free_tag_bits[idx];
11113 tag &= 63;
11114 bit = (1ULL << (63 - tag));
11115 assert(!(avail & bit));
11116 free_tag_bits[idx] = (avail | bit);
11117 }
11118
11119 static void
vm_tag_init(void)11120 vm_tag_init(void)
11121 {
11122 vm_tag_t tag;
11123 for (tag = VM_KERN_MEMORY_FIRST_DYNAMIC; tag < VM_KERN_MEMORY_ANY; tag++) {
11124 vm_tag_free_locked(tag);
11125 }
11126
11127 for (tag = VM_KERN_MEMORY_ANY + 1; tag < VM_MAX_TAG_VALUE; tag++) {
11128 vm_tag_free_locked(tag);
11129 }
11130 }
11131
11132 vm_tag_t
vm_tag_alloc(vm_allocation_site_t * site)11133 vm_tag_alloc(vm_allocation_site_t * site)
11134 {
11135 vm_allocation_site_t * releasesite;
11136
11137 if (!site->tag) {
11138 releasesite = NULL;
11139 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11140 vm_tag_alloc_locked(site, &releasesite);
11141 lck_ticket_unlock(&vm_allocation_sites_lock);
11142 if (releasesite) {
11143 kern_allocation_name_release(releasesite);
11144 }
11145 }
11146
11147 return site->tag;
11148 }
11149
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
#endif /* ARRAY_SIZE */
/* Maps each well-known (statically assigned) VM tag value to its symbolic name. */
#define VM_KERN_MEMORY_ELEM(name) [VM_KERN_MEMORY_##name] = "VM_KERN_MEMORY_" #name
const char *vm_kern_memory_names[] = {
	VM_KERN_MEMORY_ELEM(NONE),
	VM_KERN_MEMORY_ELEM(OSFMK),
	VM_KERN_MEMORY_ELEM(BSD),
	VM_KERN_MEMORY_ELEM(IOKIT),
	VM_KERN_MEMORY_ELEM(LIBKERN),
	VM_KERN_MEMORY_ELEM(OSKEXT),
	VM_KERN_MEMORY_ELEM(KEXT),
	VM_KERN_MEMORY_ELEM(IPC),
	VM_KERN_MEMORY_ELEM(STACK),
	VM_KERN_MEMORY_ELEM(CPU),
	VM_KERN_MEMORY_ELEM(PMAP),
	VM_KERN_MEMORY_ELEM(PTE),
	VM_KERN_MEMORY_ELEM(ZONE),
	VM_KERN_MEMORY_ELEM(KALLOC),
	VM_KERN_MEMORY_ELEM(COMPRESSOR),
	VM_KERN_MEMORY_ELEM(COMPRESSED_DATA),
	VM_KERN_MEMORY_ELEM(PHANTOM_CACHE),
	VM_KERN_MEMORY_ELEM(WAITQ),
	VM_KERN_MEMORY_ELEM(DIAG),
	VM_KERN_MEMORY_ELEM(LOG),
	VM_KERN_MEMORY_ELEM(FILE),
	VM_KERN_MEMORY_ELEM(MBUF),
	VM_KERN_MEMORY_ELEM(UBC),
	VM_KERN_MEMORY_ELEM(SECURITY),
	VM_KERN_MEMORY_ELEM(MLOCK),
	VM_KERN_MEMORY_ELEM(REASON),
	VM_KERN_MEMORY_ELEM(SKYWALK),
	VM_KERN_MEMORY_ELEM(LTABLE),
	VM_KERN_MEMORY_ELEM(HV),
	VM_KERN_MEMORY_ELEM(KALLOC_DATA),
	VM_KERN_MEMORY_ELEM(RETIRED),
	VM_KERN_MEMORY_ELEM(KALLOC_TYPE),
	VM_KERN_MEMORY_ELEM(TRIAGE),
	VM_KERN_MEMORY_ELEM(RECOUNT),
	VM_KERN_MEMORY_ELEM(MTAG),
	VM_KERN_MEMORY_ELEM(EXCLAVES),
	VM_KERN_MEMORY_ELEM(EXCLAVES_SHARED),
	VM_KERN_MEMORY_ELEM(KALLOC_SHARED),
	VM_KERN_MEMORY_ELEM(CPUTRACE),
};

_Static_assert(ARRAY_SIZE(vm_kern_memory_names) == VM_KERN_MEMORY_FIRST_DYNAMIC,
    "vm_kern_memory_names must map all counter tags");
11198
/* Maps each synthetic VM counter to its symbolic name (for diagnostics). */
#define VM_KERN_COUNT_ELEM(name) [VM_KERN_COUNT_##name] = "VM_KERN_COUNT_" #name
const char *vm_kern_count_names[] = {
	VM_KERN_COUNT_ELEM(MANAGED),
	VM_KERN_COUNT_ELEM(RESERVED),
	VM_KERN_COUNT_ELEM(WIRED),
	VM_KERN_COUNT_ELEM(WIRED_MANAGED),
	VM_KERN_COUNT_ELEM(STOLEN),
	VM_KERN_COUNT_ELEM(LOPAGE),
	VM_KERN_COUNT_ELEM(MAP_KERNEL),
	VM_KERN_COUNT_ELEM(MAP_ZONE),
	VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE),
	VM_KERN_COUNT_ELEM(WIRED_BOOT),
	VM_KERN_COUNT_ELEM(BOOT_STOLEN),
	VM_KERN_COUNT_ELEM(WIRED_STATIC_KERNELCACHE),
	VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE_DATA),
	VM_KERN_COUNT_ELEM(MAP_KERNEL_DATA),
	VM_KERN_COUNT_ELEM(EXCLAVES_CARVEOUT),
};
11217
#if VM_BTLOG_TAGS
/* Boot-arg: symbolic name of the one VM tag whose wired pages get backtrace-logged. */
#define VM_KERN_MEMORY_STR_MAX_LEN (32)
TUNABLE_STR(vmtaglog, VM_KERN_MEMORY_STR_MAX_LEN, "vmtaglog", "");
/* Capacity of the backtrace log created for the selected tag. */
#define VM_TAG_BTLOG_SIZE (16u << 10)

btlog_t vmtaglog_btlog; /* hash btlog keyed by vm_object */
vm_tag_t vmtaglog_tag; /* tag index resolved from the vmtaglog boot-arg */
11225
11226 static void
vm_tag_log(vm_object_t object,int64_t delta,void * fp)11227 vm_tag_log(vm_object_t object, int64_t delta, void *fp)
11228 {
11229 if (is_kernel_object(object)) {
11230 /* kernel object backtraces are tracked in vm entries */
11231 return;
11232 }
11233 if (delta > 0) {
11234 btref_t ref = btref_get(fp, BTREF_GET_NOWAIT);
11235 btlog_record(vmtaglog_btlog, object, 0, ref);
11236 } else if (object->wired_page_count == 0) {
11237 btlog_erase(vmtaglog_btlog, object);
11238 }
11239 }
11240
/*
 * NOTE(review): this assert sits inside the VM_BTLOG_TAGS conditional, so it
 * is only evaluated on builds with VM_BTLOG_TAGS defined — consider hoisting
 * it next to the vm_kern_count_names definition. TODO confirm intent.
 */
_Static_assert(ARRAY_SIZE(vm_kern_count_names) == VM_KERN_COUNTER_COUNT,
    "vm_kern_count_names must map all counter tags");
11243
11244 static vm_tag_t
vm_tag_str_to_idx(char tagstr[VM_KERN_MEMORY_STR_MAX_LEN])11245 vm_tag_str_to_idx(char tagstr[VM_KERN_MEMORY_STR_MAX_LEN])
11246 {
11247 for (vm_tag_t i = VM_KERN_MEMORY_OSFMK; i < ARRAY_SIZE(vm_kern_memory_names); i++) {
11248 if (!strncmp(vm_kern_memory_names[i], tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11249 return i;
11250 }
11251 }
11252
11253 if (!strncmp("dynamic", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11254 return VM_KERN_MEMORY_FIRST_DYNAMIC;
11255 }
11256
11257 if (!strncmp("any", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11258 return VM_KERN_MEMORY_ANY;
11259 }
11260
11261 printf("Unable to find vm tag %s for btlog\n", tagstr);
11262 return VM_KERN_MEMORY_NONE;
11263 }
11264
/*
 * Startup hook: parse the "vmtaglog" tunable and, when it names a valid
 * tag, create the backtrace log consumed by vm_tag_log().
 */
__startup_func
static void
vm_btlog_init(void)
{
	vmtaglog_tag = vm_tag_str_to_idx(vmtaglog);

	if (vmtaglog_tag != VM_KERN_MEMORY_NONE) {
		vmtaglog_btlog = btlog_create(BTLOG_HASH, VM_TAG_BTLOG_SIZE, 0);
	}
}
STARTUP(ZALLOC, STARTUP_RANK_FIRST, vm_btlog_init);
11276 #endif /* VM_BTLOG_TAGS */
11277
/*
 * Adjust the size accounted to @tag by @delta bytes, forwarding to the
 * tag's allocation site.  @object (may be NULL) is only consumed by the
 * optional per-tag backtrace log inside kern_allocation_update_size().
 */
void
vm_tag_update_size(vm_tag_t tag, int64_t delta, vm_object_t object)
{
	assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);

	kern_allocation_update_size(vm_allocation_sites[tag], delta, object);
}
11285
11286 uint64_t
vm_tag_get_size(vm_tag_t tag)11287 vm_tag_get_size(vm_tag_t tag)
11288 {
11289 vm_allocation_site_t *allocation;
11290
11291 assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
11292
11293 allocation = vm_allocation_sites[tag];
11294 return allocation ? os_atomic_load(&allocation->total, relaxed) : 0;
11295 }
11296
/*
 * Add @delta (may be negative) to @allocation's running byte total.
 * Maintains the peak watermark on DEBUG/DEVELOPMENT kernels, lazily
 * assigns a dynamic tag the first time a site's total becomes non-zero,
 * and feeds the optional per-tag backtrace log when @object is given.
 */
void
kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta, __unused vm_object_t object)
{
	uint64_t value;

	value = os_atomic_add(&allocation->total, delta, relaxed);
	if (delta < 0) {
		/* underflow check: the pre-subtraction total must exceed the new one */
		assertf(value + (uint64_t)-delta > value,
		    "tag %d, site %p", allocation->tag, allocation);
	}

#if DEBUG || DEVELOPMENT
	/* release to publish the new total */
	os_atomic_max(&allocation->peak, value, release);
#endif /* DEBUG || DEVELOPMENT */

	/* first accumulation on an untagged site: register it for a tag */
	if (value == (uint64_t)delta && !allocation->tag) {
		vm_tag_alloc(allocation);
	}

#if VM_BTLOG_TAGS
	if (vmtaglog_matches(allocation->tag) && object) {
		vm_tag_log(object, delta, __builtin_frame_address(0));
	}
#endif /* VM_BTLOG_TAGS */
}
11323
11324 #if DEBUG || DEVELOPMENT
11325
11326 void
vm_tag_reset_all_peaks(void)11327 vm_tag_reset_all_peaks(void)
11328 {
11329 vm_log("resetting peak size for all kernel tags\n");
11330 for (vm_tag_t tag = 0; tag <= vm_allocation_tag_highest; tag++) {
11331 vm_tag_reset_peak(tag);
11332 }
11333 }
11334
11335 kern_return_t
vm_tag_reset_peak(vm_tag_t tag)11336 vm_tag_reset_peak(vm_tag_t tag)
11337 {
11338 if (tag > vm_allocation_tag_highest) {
11339 return KERN_INVALID_ARGUMENT;
11340 }
11341
11342 vm_allocation_site_t *site = vm_allocation_sites[tag];
11343 vm_log_info("resetting peak size for kernel tag %s\n",
11344 KA_NAME(site));
11345
11346 uint64_t new_peak = os_atomic_load(&site->total, relaxed);
11347 /* acquire updates to the total */
11348 os_atomic_min(&site->peak, new_peak, acquire);
11349
11350 return KERN_SUCCESS;
11351 }
11352
11353 #endif /* DEBUG || DEVELOPMENT */
11354
11355 #if VM_TAG_SIZECLASSES
11356
/*
 * One-time setup of the per-tag, per-sizeclass zone accounting.
 * Allocates a single permanent region holding the VM_MAX_TAG_VALUE
 * pointer table followed by pre-carved stats arrays for the tags that
 * are exercised during early boot.
 */
void
vm_allocation_zones_init(void)
{
	vm_offset_t addr;
	vm_size_t size;

	/* tags whose stats arrays must exist before kalloc is usable */
	const vm_tag_t early_tags[] = {
		VM_KERN_MEMORY_DIAG,
		VM_KERN_MEMORY_KALLOC,
		VM_KERN_MEMORY_KALLOC_DATA,
		VM_KERN_MEMORY_KALLOC_TYPE,
		VM_KERN_MEMORY_LIBKERN,
		VM_KERN_MEMORY_OSFMK,
		VM_KERN_MEMORY_RECOUNT,
	};

	/* pointer table plus one stats array per early tag */
	size = VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *)
	    + ARRAY_COUNT(early_tags) * VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);

	kmem_alloc(kernel_map, &addr, round_page(size),
	    KMA_NOFAIL | KMA_KOBJECT | KMA_ZERO | KMA_PERMANENT,
	    VM_KERN_MEMORY_DIAG);

	vm_allocation_zone_totals = (vm_allocation_zone_total_t **) addr;
	addr += VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *);

	// prepopulate early tag ranges so allocations
	// in vm_tag_update_zone_size() and early boot won't recurse
	for (size_t i = 0; i < ARRAY_COUNT(early_tags); i++) {
		vm_allocation_zone_totals[early_tags[i]] = (vm_allocation_zone_total_t *)addr;
		addr += VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
	}
}
11390
/*
 * Slow path for vm_tag_will_update_zone(): allocate and install the
 * per-sizeclass stats array for @tag.  A cmpxchg publishes the array;
 * if another CPU raced us and installed its own, ours is freed and the
 * tag is still usable.  Returns VM_KERN_MEMORY_NONE only when the
 * allocation itself fails.
 */
__attribute__((noinline))
static vm_tag_t
vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
{
	vm_allocation_zone_total_t *stats;
	vm_size_t size = sizeof(*stats) * VM_TAG_SIZECLASSES;

	/* charge the diagnostics tag so this doesn't recurse into @tag */
	flags = Z_VM_TAG(Z_ZERO | flags, VM_KERN_MEMORY_DIAG);
	stats = kalloc_data(size, flags);
	if (!stats) {
		return VM_KERN_MEMORY_NONE;
	}
	if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
		/* lost the install race: keep the winner's array */
		kfree_data(stats, size);
	}
	return tag;
}
11408
/*
 * Ensure @tag has a per-sizeclass stats array before zone accounting
 * updates it.  Returns @tag, or VM_KERN_MEMORY_NONE if the lazy
 * allocation on the slow path fails.
 */
vm_tag_t
vm_tag_will_update_zone(vm_tag_t tag, uint32_t zflags)
{
	assert(VM_KERN_MEMORY_NONE != tag);
	assert(tag < VM_MAX_TAG_VALUE);

	if (__probable(vm_allocation_zone_totals[tag])) {
		return tag;
	}
	return vm_tag_zone_stats_alloc(tag, zflags);
}
11420
/*
 * Add @delta bytes to the sizeclass @zidx accounting of @tag and
 * maintain its peak watermark.  The stats array must already exist
 * (see vm_tag_will_update_zone()).
 */
void
vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
{
	vm_allocation_zone_total_t *stats;
	vm_size_t value;

	assert(VM_KERN_MEMORY_NONE != tag);
	assert(tag < VM_MAX_TAG_VALUE);

	/* out-of-range sizeclasses are silently ignored */
	if (zidx >= VM_TAG_SIZECLASSES) {
		return;
	}

	stats = vm_allocation_zone_totals[tag];
	assert(stats);
	stats += zidx;

	value = os_atomic_add(&stats->vazt_total, delta, relaxed);
	if (delta < 0) {
		/* frees can never take the total negative */
		assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
		return;
	} else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
		/* cheap pre-check before the atomic max */
		os_atomic_max(&stats->vazt_peak, value, relaxed);
	}
}
11446
11447 #endif /* VM_TAG_SIZECLASSES */
11448
/*
 * Charge @delta bytes against @allocation's per-subtag subtotal for
 * @subtag, and mirror the change into the subtag site's "mapped" total.
 * Subtotal slots are found by tag match first; failing that, an empty
 * slot (unset tag or zero total) is claimed for @subtag.  If the table
 * is full the update is dropped (after asserting on DEBUG kernels).
 * The sites lock serializes slot claiming.
 */
void
kern_allocation_update_subtotal(kern_allocation_name_t allocation, vm_tag_t subtag, int64_t delta)
{
	kern_allocation_name_t other;
	struct vm_allocation_total * total;
	uint32_t subidx;

	assert(VM_KERN_MEMORY_NONE != subtag);
	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
	/* first pass: find an existing slot for this subtag */
	for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
		total = &allocation->subtotals[subidx];
		if (subtag == total->tag) {
			break;
		}
	}
	if (subidx >= allocation->subtotalscount) {
		/* second pass: claim a free or drained slot */
		for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
			total = &allocation->subtotals[subidx];
			if ((VM_KERN_MEMORY_NONE == total->tag)
			    || !total->total) {
				total->tag = (vm_tag_t)subtag;
				break;
			}
		}
	}
	assert(subidx < allocation->subtotalscount);
	if (subidx >= allocation->subtotalscount) {
		/* table full on release kernels: drop the update */
		lck_ticket_unlock(&vm_allocation_sites_lock);
		return;
	}
	if (delta < 0) {
		assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
	}
	OSAddAtomic64(delta, &total->total);
	lck_ticket_unlock(&vm_allocation_sites_lock);

	/* mirror into the subtag's own site so process_account() can net it out */
	other = vm_allocation_sites[subtag];
	assert(other);
	if (delta < 0) {
		assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
	}
	OSAddAtomic64(delta, &other->mapped);
}
11492
/*
 * Return the human-readable name embedded in @allocation.
 */
const char *
kern_allocation_get_name(kern_allocation_name_t allocation)
{
	return KA_NAME(allocation);
}
11498
/*
 * Allocate a named accounting site with room for @subtotalscount
 * per-subtag subtotals, register it for a tag, and return it with a
 * refcount of 1 (released via kern_allocation_name_release()).
 * The name is truncated to MACH_MEMORY_INFO_NAME_MAX_LEN - 1 chars.
 *
 * NOTE(review): the Z_WAITOK kalloc_data() result is dereferenced
 * without a NULL check — presumably callers rely on waiting allocation
 * succeeding here; confirm against kalloc_data()'s failure contract.
 */
kern_allocation_name_t
kern_allocation_name_allocate(const char * name, uint16_t subtotalscount)
{
	kern_allocation_name_t allocation;
	uint16_t namelen;

	namelen = (uint16_t)strnlen(name, MACH_MEMORY_INFO_NAME_MAX_LEN - 1);

	allocation = kalloc_data(KA_SIZE(namelen, subtotalscount), Z_WAITOK | Z_ZERO);
	allocation->refcount = 1;
	allocation->subtotalscount = subtotalscount;
	/* the name length is packed into the flags field */
	allocation->flags = (uint16_t)(namelen << VM_TAG_NAME_LEN_SHIFT);
	strlcpy(KA_NAME(allocation), name, namelen + 1);

	vm_tag_alloc(allocation);
	return allocation;
}
11516
/*
 * Drop a reference on @allocation; frees it when the last reference
 * goes away (OSAddAtomic16 returns the pre-decrement value).
 */
void
kern_allocation_name_release(kern_allocation_name_t allocation)
{
	assert(allocation->refcount > 0);
	if (1 == OSAddAtomic16(-1, &allocation->refcount)) {
		kfree_data(allocation,
		    KA_SIZE(KA_NAME_LEN(allocation), allocation->subtotalscount));
	}
}
11526
11527 #if !VM_TAG_ACTIVE_UPDATE
11528 static void
vm_page_count_object(mach_memory_info_t * info,unsigned int __unused num_info,vm_object_t object)11529 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
11530 {
11531 if (!object->wired_page_count) {
11532 return;
11533 }
11534 if (!is_kernel_object(object)) {
11535 assert(object->wire_tag < num_info);
11536 info[object->wire_tag].size += ptoa_64(object->wired_page_count);
11537 }
11538 }
11539
/* Callback signature for the wired-object iterators below. */
typedef void (*vm_page_iterate_proc)(mach_memory_info_t * info,
    unsigned int num_info, vm_object_t object);
11542
/*
 * Invoke @proc on every object in one group of a purgeable queue.
 * NOTE(review): caller is expected to hold whatever lock protects
 * @queue — no locking is taken here; confirm against call sites.
 */
static void
vm_page_iterate_purgeable_objects(mach_memory_info_t * info, unsigned int num_info,
    vm_page_iterate_proc proc, purgeable_q_t queue,
    int group)
{
	vm_object_t object;

	for (object = (vm_object_t) queue_first(&queue->objq[group]);
	    !queue_end(&queue->objq[group], (queue_entry_t) object);
	    object = (vm_object_t) queue_next(&object->objq)) {
		proc(info, num_info, object);
	}
}
11556
/*
 * Invoke @proc on every object on the global wired-objects queue,
 * holding the wired-objects spin lock for the whole walk.
 */
static void
vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
    vm_page_iterate_proc proc)
{
	vm_object_t object;

	lck_spin_lock_grp(&vm_objects_wired_lock, &vm_page_lck_grp_bucket);
	queue_iterate(&vm_objects_wired,
	    object,
	    vm_object_t,
	    wired_objq)
	{
		proc(info, num_info, object);
	}
	lck_spin_unlock(&vm_objects_wired_lock);
}
11573 #endif /* ! VM_TAG_ACTIVE_UPDATE */
11574
/*
 * Second phase of vm_page_diagnose(): fold the allocation-site table
 * into the caller's @info array.
 *
 * Pass 1 copies each site's totals/flags/identity into info[idx].  When
 * @iterated is set, sizes gathered by the map/object walk are kept and
 * only cross-checked against the site totals (mismatches are logged and
 * overwritten).  Pass 2 appends per-sizeclass zone entries (unless
 * @redact_info) and, for sites with subtotals, nets each subtag's
 * contribution out of the subtag's own row so memory is not counted
 * twice.  The sites lock is held throughout.  Always returns 0.
 */
static uint64_t
process_account(mach_memory_info_t * info, unsigned int num_info,
    uint64_t zones_collectable_bytes, boolean_t iterated, bool redact_info __unused)
{
	size_t namelen;
	unsigned int idx, count, nextinfo;
	vm_allocation_site_t * site;
	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);

	for (idx = 0; idx <= vm_allocation_tag_highest; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
		info[idx].mapped = site->mapped;
		info[idx].tag = site->tag;
		if (!iterated) {
			info[idx].size = site->total;
#if DEBUG || DEVELOPMENT
			info[idx].peak = site->peak;
#endif /* DEBUG || DEVELOPMENT */
		} else {
			/* iterated sizes win unless the site disagrees and has no subtotals */
			if (!site->subtotalscount && (site->total != info[idx].size)) {
				printf("tag mismatch[%d] 0x%qx, iter 0x%qx\n", idx, site->total, info[idx].size);
				info[idx].size = site->total;
			}
		}
		info[idx].flags |= VM_KERN_SITE_WIRED;
		if (idx < VM_KERN_MEMORY_FIRST_DYNAMIC) {
			/* fixed, well-known tag: named from the static table */
			info[idx].site = idx;
			info[idx].flags |= VM_KERN_SITE_TAG;
			if (VM_KERN_MEMORY_ZONE == idx) {
				info[idx].flags |= VM_KERN_SITE_HIDE;
				info[idx].flags &= ~VM_KERN_SITE_WIRED;
				info[idx].collectable_bytes = zones_collectable_bytes;
			}
			info[idx].flags |= VM_KERN_SITE_NAMED;
			strlcpy(info[idx].name, vm_kern_memory_names[idx], MACH_MEMORY_INFO_NAME_MAX_LEN);
		} else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) {
			/* dynamically named site (kern_allocation_name_allocate) */
			info[idx].site = 0;
			info[idx].flags |= VM_KERN_SITE_NAMED;
			if (namelen > sizeof(info[idx].name)) {
				namelen = sizeof(info[idx].name);
			}
			strncpy(&info[idx].name[0], KA_NAME(site), namelen);
		} else if (VM_TAG_KMOD & site->flags) {
			info[idx].site = OSKextGetKmodIDForSite(site, NULL, 0);
			info[idx].flags |= VM_KERN_SITE_KMOD;
		} else {
			info[idx].site = VM_KERNEL_UNSLIDE(site);
			info[idx].flags |= VM_KERN_SITE_KERNEL;
		}
	}

	/* zone-view entries are appended after the per-tag rows */
	nextinfo = (vm_allocation_tag_highest + 1);
	count = nextinfo;
	if (count >= num_info) {
		count = num_info;
	}

	for (idx = 0; idx < count; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
#if VM_TAG_SIZECLASSES
		vm_allocation_zone_total_t * zone;
		unsigned int zidx;

		if (!redact_info
		    && vm_allocation_zone_totals
		    && (zone = vm_allocation_zone_totals[idx])
		    && (nextinfo < num_info)) {
			/* one extra row per sizeclass that ever saw an allocation */
			for (zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
				if (!zone[zidx].vazt_peak) {
					continue;
				}
				info[nextinfo] = info[idx];
				info[nextinfo].zone = zone_index_from_tag_index(zidx);
				info[nextinfo].flags &= ~VM_KERN_SITE_WIRED;
				info[nextinfo].flags |= VM_KERN_SITE_ZONE;
				info[nextinfo].flags |= VM_KERN_SITE_KALLOC;
				info[nextinfo].size = zone[zidx].vazt_total;
				info[nextinfo].peak = zone[zidx].vazt_peak;
				info[nextinfo].mapped = 0;
				nextinfo++;
			}
		}
#endif /* VM_TAG_SIZECLASSES */
		if (site->subtotalscount) {
			uint64_t mapped, mapcost, take;
			uint32_t sub;
			vm_tag_t alloctag;

			info[idx].size = site->total;
			mapped = info[idx].size;
			info[idx].mapped = mapped;
			mapcost = 0;
			/* shift each subtag's share out of its own row into this one */
			for (sub = 0; sub < site->subtotalscount; sub++) {
				alloctag = site->subtotals[sub].tag;
				assert(alloctag < num_info);
				if (info[alloctag].name[0] && alloctag >= VM_KERN_MEMORY_FIRST_DYNAMIC) {
					continue;
				}
				take = site->subtotals[sub].total;
				if (take > info[alloctag].size) {
					take = info[alloctag].size;
				}
				if (take > mapped) {
					take = mapped;
				}
				info[alloctag].mapped -= take;
				info[alloctag].size -= take;
				mapped -= take;
				mapcost += take;
			}
			info[idx].size = mapcost;
		}
	}
	lck_ticket_unlock(&vm_allocation_sites_lock);

	return 0;
}
11698
/*
 * Upper-bound the number of mach_memory_info_t entries that
 * vm_page_diagnose() may produce: one per live allocation site, one per
 * non-empty sizeclass row, the zone views, the fixed counters, plus
 * slop for tags created between estimate and fill.
 */
uint32_t
vm_page_diagnose_estimate(void)
{
	vm_allocation_site_t * site;
	uint32_t count = zone_view_count;
	uint32_t idx;

	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
	for (idx = 0; idx < VM_MAX_TAG_VALUE; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
		count++;
#if VM_TAG_SIZECLASSES
		if (vm_allocation_zone_totals) {
			vm_allocation_zone_total_t * zone;
			zone = vm_allocation_zone_totals[idx];
			if (!zone) {
				continue;
			}
			/* a row is emitted only for sizeclasses with a non-zero peak */
			for (uint32_t zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
				count += (zone[zidx].vazt_peak != 0);
			}
		}
#endif
	}
	lck_ticket_unlock(&vm_allocation_sites_lock);

	/* some slop for new tags created */
	count += 8;
	count += VM_KERN_COUNTER_COUNT;

	return count;
}
11734
11735 static void
vm_page_diagnose_zone_stats(mach_memory_info_t * info,zone_stats_t zstats,bool percpu)11736 vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
11737 bool percpu)
11738 {
11739 zpercpu_foreach(zs, zstats) {
11740 info->size += zs->zs_mem_allocated - zs->zs_mem_freed;
11741 }
11742 if (percpu) {
11743 info->size *= zpercpu_count();
11744 }
11745 info->flags |= VM_KERN_SITE_NAMED | VM_KERN_SITE_ZONE_VIEW;
11746 }
11747
/*
 * Fill one diagnostics entry from @stats and name it
 * "<heap><zone>[<view>]".
 */
static void
vm_page_add_info(
	mach_memory_info_t *info,
	zone_stats_t stats,
	bool per_cpu,
	const char *parent_heap_name,
	const char *parent_zone_name,
	const char *view_name)
{
	vm_page_diagnose_zone_stats(info, stats, per_cpu);
	snprintf(info->name, sizeof(info->name),
	    "%s%s[%s]", parent_heap_name, parent_zone_name, view_name);
}
11761
/*
 * Emit the "raw" (unviewed) entry for zone @z.
 */
static void
vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
{
	vm_page_add_info(info, z->z_stats, z->z_percpu, zone_heap_name(z),
	    z->z_name, "raw");
}
11768
/*
 * Convenience wrapper: emit a non-per-CPU view entry.
 */
static void
vm_page_add_view(
	mach_memory_info_t *info,
	zone_stats_t stats,
	const char *parent_heap_name,
	const char *parent_zone_name,
	const char *view_name)
{
	vm_page_add_info(info, stats, false, parent_heap_name, parent_zone_name,
	    view_name);
}
11780
11781 static uint32_t
vm_page_diagnose_heap_views(mach_memory_info_t * info,kalloc_heap_t kh,const char * parent_heap_name,const char * parent_zone_name)11782 vm_page_diagnose_heap_views(
11783 mach_memory_info_t *info,
11784 kalloc_heap_t kh,
11785 const char *parent_heap_name,
11786 const char *parent_zone_name)
11787 {
11788 uint32_t i = 0;
11789
11790 while (kh) {
11791 vm_page_add_view(info + i, kh->kh_stats, parent_heap_name,
11792 parent_zone_name, kh->kh_name);
11793 kh = kh->kh_views;
11794 i++;
11795 }
11796 return i;
11797 }
11798
11799 static uint32_t
vm_page_diagnose_heap(mach_memory_info_t * info,kalloc_heap_t kheap)11800 vm_page_diagnose_heap(mach_memory_info_t *info, kalloc_heap_t kheap)
11801 {
11802 uint32_t i = 0;
11803
11804 for (; i < KHEAP_NUM_ZONES; i++) {
11805 vm_page_diagnose_zone(info + i, zone_by_id(kheap->kh_zstart + i));
11806 }
11807
11808 i += vm_page_diagnose_heap_views(info + i, kheap->kh_views, kheap->kh_name,
11809 NULL);
11810 return i;
11811 }
11812
/*
 * Emit entries for the kalloc_type variable heaps: the shared raw
 * entry, then per-heap each typed view that owns its own stats, then
 * the heap's plain views.  Returns the number of entries written.
 */
static int
vm_page_diagnose_kt_heaps(mach_memory_info_t *info)
{
	uint32_t idx = 0;
	vm_page_add_view(info + idx, KHEAP_KT_VAR->kh_stats, KHEAP_KT_VAR->kh_name,
	    "", "raw");
	idx++;

	for (uint32_t i = 0; i < KT_VAR_MAX_HEAPS; i++) {
		struct kheap_info heap = kalloc_type_heap_array[i];
		char heap_num_tmp[MAX_ZONE_NAME] = "";
		const char *heap_num;

		/* views are labelled with the heap's index */
		snprintf(&heap_num_tmp[0], MAX_ZONE_NAME, "%u", i);
		heap_num = &heap_num_tmp[0];

		for (kalloc_type_var_view_t ktv = heap.kt_views; ktv;
		    ktv = (kalloc_type_var_view_t) ktv->kt_next) {
			/* skip views that just alias the shared stats */
			if (ktv->kt_stats && ktv->kt_stats != KHEAP_KT_VAR->kh_stats) {
				vm_page_add_view(info + idx, ktv->kt_stats, KHEAP_KT_VAR->kh_name,
				    heap_num, ktv->kt_name);
				idx++;
			}
		}

		idx += vm_page_diagnose_heap_views(info + idx, heap.kh_views,
		    KHEAP_KT_VAR->kh_name, heap_num);
	}

	return idx;
}
11844
/*
 * Populate @info (sized per vm_page_diagnose_estimate()) with the
 * kernel's wired-memory accounting: global counters, map sizes, zone
 * and heap views (unless @redact_info), and — when active tag updates
 * are compiled out — sizes recomputed by walking wired objects and the
 * kernel map.  Finishes via process_account(), which folds in the
 * allocation-site table.  Returns KERN_ABORTED before the VM is
 * initialized, KERN_SUCCESS otherwise.
 */
kern_return_t
vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, bool redact_info)
{
	uint64_t wired_size;
	uint64_t wired_managed_size;
	uint64_t wired_reserved_size;
	boolean_t iterate;
	mach_memory_info_t * counts;
	uint32_t i;

	vmlp_api_start(VM_PAGE_DIAGNOSE);

	bzero(info, num_info * sizeof(mach_memory_info_t));

	if (!vm_page_wire_count_initial) {
		vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_ABORTED);
		return KERN_ABORTED;
	}

	wired_size = ptoa_64(vm_page_wire_count);
	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
#if XNU_TARGET_OS_OSX
	wired_size += ptoa_64(vm_lopage_free_count + vm_page_throttled_count);
	wired_reserved_size += ptoa_64(vm_page_throttled_count);
#endif /* XNU_TARGET_OS_OSX */
	wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);

	wired_size += booter_size;

	/* the fixed counters live in the tail of the caller's buffer */
	assert(num_info >= VM_KERN_COUNTER_COUNT);
	num_info -= VM_KERN_COUNTER_COUNT;
	counts = &info[num_info];

#define SET_COUNT(xcount, xsize, xflags) MACRO_BEGIN \
	counts[xcount].tag = VM_MAX_TAG_VALUE + xcount; \
	counts[xcount].site = (xcount); \
	counts[xcount].size = (xsize); \
	counts[xcount].mapped = (xsize); \
	counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED | xflags; \
	strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
	MACRO_END;

	SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0);
	SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0);
	SET_COUNT(VM_KERN_COUNT_WIRED_MANAGED, wired_managed_size, 0);
	SET_COUNT(VM_KERN_COUNT_RESERVED, wired_reserved_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_STOLEN, ptoa_64(vm_page_stolen_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_LOPAGE, ptoa_64(vm_lopage_free_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_BOOT, ptoa_64(vm_page_wire_count_on_boot), 0);
	SET_COUNT(VM_KERN_COUNT_BOOT_STOLEN, booter_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_STATIC_KERNELCACHE, ptoa_64(vm_page_kernelcache_count), 0);
#if CONFIG_SPTM
	SET_COUNT(VM_KERN_COUNT_EXCLAVES_CARVEOUT, SPTMArgs->sk_carveout_size, 0);
#endif

#define SET_MAP(xcount, xsize, xfree, xlargest) MACRO_BEGIN \
	counts[xcount].site = (xcount); \
	counts[xcount].size = (xsize); \
	counts[xcount].mapped = (xsize); \
	counts[xcount].free = (xfree); \
	counts[xcount].largest = (xlargest); \
	counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED; \
	strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
	MACRO_END;

	vm_map_size_t map_size, map_free, map_largest;

	vm_map_sizes(kernel_map, &map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_KERNEL, map_size, map_free, map_largest);

	zone_map_sizes(&map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_ZONE, map_size, map_free, map_largest);

	/* zone-view entries precede the counters in the buffer tail */
	assert(num_info >= zone_view_count);
	num_info -= zone_view_count;
	counts = &info[num_info];
	i = 0;

	if (!redact_info) {
		if (zone_is_data_buffers_kheap(KHEAP_DATA_BUFFERS->kh_heap_id)) {
			i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS);
		}
		if (zone_is_data_shared_kheap(KHEAP_DATA_SHARED->kh_heap_id)) {
			i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_SHARED);
		}

		if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) {
			i += vm_page_diagnose_kt_heaps(counts + i);
		}
		assert(i <= zone_view_count);

		zone_index_foreach(zidx) {
			zone_t z = &zone_array[zidx];
			zone_security_flags_t zsflags = zone_security_array[zidx];
			zone_view_t zv = z->z_views;

			if (zv == NULL) {
				continue;
			}

			zone_stats_t zv_stats_head = z->z_stats;
			bool has_raw_view = false;

			for (; zv; zv = zv->zv_next) {
				/*
				 * kalloc_types that allocate from the same zone are linked
				 * as views. Only print the ones that have their own stats.
				 */
				if (zv->zv_stats == zv_stats_head) {
					continue;
				}
				has_raw_view = true;
				vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
				    z->z_percpu);
				snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
				    zone_heap_name(z), z->z_name, zv->zv_name);
				i++;
				assert(i <= zone_view_count);
			}

			/*
			 * Print raw views for non kalloc or kalloc_type zones
			 */
			bool kalloc_type = zsflags.z_kalloc_type;
			if ((zsflags.z_kheap_id == KHEAP_ID_NONE && !kalloc_type) ||
			    (kalloc_type && has_raw_view)) {
				vm_page_diagnose_zone(counts + i, z);
				i++;
				assert(i <= zone_view_count);
			}
		}
	}

	/* without active tag updates, recompute per-tag sizes by walking */
	iterate = !VM_TAG_ACTIVE_UPDATE;
	if (iterate) {
		enum { kMaxKernelDepth = 1 };
		vm_map_t maps[kMaxKernelDepth];
		vm_map_entry_t entries[kMaxKernelDepth];
		vm_map_t map;
		vm_map_entry_t entry;
		vm_object_offset_t offset;
		vm_page_t page;
		int stackIdx, count;

#if !VM_TAG_ACTIVE_UPDATE
		vm_page_iterate_objects(info, num_info, &vm_page_count_object);
#endif /* ! VM_TAG_ACTIVE_UPDATE */

		/* depth-first walk of the kernel map and its submaps */
		map = kernel_map;
		stackIdx = 0;
		while (map) {
			vm_map_lock(map);
			for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
				if (entry->is_sub_map) {
					/* push the current map; descend into the submap */
					assert(stackIdx < kMaxKernelDepth);
					maps[stackIdx] = map;
					entries[stackIdx] = entry;
					stackIdx++;
					map = VME_SUBMAP(entry);
					entry = NULL;
					break;
				}

				vmlp_range_event_entry(map, entry);

				if (is_kernel_object(VME_OBJECT(entry))) {
					/* count the wired pages backing this entry by hand */
					count = 0;
					vm_object_lock(VME_OBJECT(entry));
					for (offset = entry->vme_start; offset < entry->vme_end; offset += page_size) {
						page = vm_page_lookup(VME_OBJECT(entry), offset);
						if (page && VM_PAGE_WIRED(page)) {
							count++;
						}
					}
					vm_object_unlock(VME_OBJECT(entry));

					if (count) {
						assert(VME_ALIAS(entry) != VM_KERN_MEMORY_NONE);
						assert(VME_ALIAS(entry) < num_info);
						info[VME_ALIAS(entry)].size += ptoa_64(count);
					}
				}
				/* end of this map: pop back to the parent (or stop) */
				while (map && (entry == vm_map_last_entry(map))) {
					vm_map_unlock(map);
					if (!stackIdx) {
						map = NULL;
					} else {
						--stackIdx;
						map = maps[stackIdx];
						entry = entries[stackIdx];
					}
				}
			}
		}
	}

	process_account(info, num_info, zones_collectable_bytes, iterate, redact_info);

	vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_SUCCESS);
	return KERN_SUCCESS;
}
12046
12047 #if DEBUG || DEVELOPMENT
12048
/*
 * Look up the size and tag of the kernel allocation starting exactly at
 * @addr.  Zone elements are answered from zone metadata (*zone_size set);
 * otherwise the kernel map (descending through at most one submap) is
 * searched for an entry whose start matches @addr.  Returns
 * KERN_INVALID_ADDRESS when nothing matches.
 */
kern_return_t
vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_size_t * zone_size)
{
	kern_return_t ret;
	vm_size_t zsize;
	vm_map_t map;
	vm_map_entry_t entry;

	vmlp_api_start(VM_KERN_ALLOCATION_INFO);

	zsize = zone_element_info((void *) addr, tag);
	if (zsize) {
		*zone_size = *size = zsize;
		vmlp_api_end(VM_KERN_ALLOCATION_INFO, KERN_SUCCESS);
		return KERN_SUCCESS;
	}

	*zone_size = 0;
	ret = KERN_INVALID_ADDRESS;
	for (map = kernel_map; map;) {
		vm_map_lock(map);
		if (!vm_map_lookup_entry(map, addr, &entry)) {
			break;
		}
		if (entry->is_sub_map) {
			/* descend one level; the kernel_map lock stays held */
			if (map != kernel_map) {
				break;
			}
			map = VME_SUBMAP(entry);
			continue;
		}
		if (entry->vme_start != addr) {
			break;
		}

		vmlp_range_event_entry(map, entry);

		*tag = (vm_tag_t)VME_ALIAS(entry);
		*size = (entry->vme_end - addr);
		ret = KERN_SUCCESS;
		break;
	}
	/* drop the submap lock (if we descended) and then the kernel map lock */
	if (map != kernel_map) {
		vm_map_unlock(map);
	}
	vm_map_unlock(kernel_map);

	vmlp_api_end(VM_KERN_ALLOCATION_INFO, ret);
	return ret;
}
12099
// some DEBUG/DEVELOPMENT code to get a process to page out its shared cache TEXT pages,
// only used for DK driver in LPW testing
/*
 * Walk @task's map (through submaps) looking for the first read/execute
 * entry inside a submap, follow its shadow chain to the bottom object,
 * and sync that range to push the TEXT pages out.  Returns the task's
 * pagein count sampled before the eviction, so callers can compare
 * against vm_task_pageins() afterwards.
 */
uint64_t
vm_task_evict_shared_cache(task_t task)
{
	enum { kMaxKernelDepth = 3 };
	vm_map_t maps[kMaxKernelDepth];
	vm_map_entry_t entries[kMaxKernelDepth];
	vm_map_t map;
	vm_object_t textObject, shadow;
	vm_map_entry_t entry;
	vm_object_offset_t textOffset, textSize;
	int stackIdx;
	uint64_t count;

	count = counter_load(&task->pageins);
	map = get_task_map(task);
	textObject = NULL;
	stackIdx = 0;
	while (map) {
		vm_map_lock_read(map);
		for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
			if (entry->is_sub_map) {
				/* push this map and descend */
				assert(stackIdx < kMaxKernelDepth);
				maps[stackIdx] = map;
				entries[stackIdx] = entry;
				stackIdx++;
				map = VME_SUBMAP(entry);
				entry = NULL;
				break;
			}
			/* only r-x entries inside a submap are considered */
			if (stackIdx && (VM_PROT_EXECUTE | VM_PROT_READ) == entry->protection) {
				textObject = VME_OBJECT(entry);
				/* chase the shadow chain to the backing object */
				vm_object_lock(textObject);
				while ((shadow = textObject->shadow)) {
					vm_object_lock(shadow);
					vm_object_unlock(textObject);
					textObject = shadow;
				}
				vm_object_reference_locked(textObject);
				vm_object_unlock(textObject);
				textOffset = VME_OFFSET(entry);
				textSize = entry->vme_end - entry->vme_start;
				/* found it: fast-forward to the end of this map */
				entry = vm_map_last_entry(map);
			}
			while (map && (entry == vm_map_last_entry(map))) {
				vm_map_unlock_read(map);
				if (!stackIdx) {
					map = NULL;
				} else {
					--stackIdx;
					map = maps[stackIdx];
					entry = entries[stackIdx];
					if (textObject) {
						/* already found the text object: unwind fully */
						entry = vm_map_last_entry(map);
					}
				}
			}
		}
	}

	if (textObject) {
		vm_object_sync(textObject, textOffset, textSize, true, false, false);
		vm_object_deallocate(textObject);
	}
	return count;
}
12167
/*
 * Return @task's current pagein counter.
 */
uint64_t
vm_task_pageins(task_t task)
{
	return counter_load(&task->pageins);
}
12173
12174 #endif /* DEBUG || DEVELOPMENT */
12175
12176 uint32_t
vm_tag_get_kext(vm_tag_t tag,char * name,vm_size_t namelen)12177 vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
12178 {
12179 vm_allocation_site_t * site;
12180 uint32_t kmodId;
12181
12182 kmodId = 0;
12183 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
12184 if ((site = vm_allocation_sites[tag])) {
12185 if (VM_TAG_KMOD & site->flags) {
12186 kmodId = OSKextGetKmodIDForSite(site, name, namelen);
12187 }
12188 }
12189 lck_ticket_unlock(&vm_allocation_sites_lock);
12190
12191 return kmodId;
12192 }
12193
12194
12195 #if CONFIG_SECLUDED_MEMORY
12196 /*
12197 * Note that there's no locking around other accesses to vm_page_secluded_target.
12198 * That should be OK, since these are the only place where it can be changed after
12199 * initialization. Other users (like vm_pageout) may see the wrong value briefly,
12200 * but will eventually get the correct value. This brief mismatch is OK as pageout
12201 * and page freeing will auto-adjust the vm_page_secluded_count to match the target
12202 * over time.
12203 */
12204 unsigned int vm_page_secluded_suppress_cnt = 0;
12205 unsigned int vm_page_secluded_save_target;
12206
12207 LCK_GRP_DECLARE(secluded_suppress_slock_grp, "secluded_suppress_slock");
12208 LCK_SPIN_DECLARE(secluded_suppress_slock, &secluded_suppress_slock_grp);
12209
/*
 * Suppress the secluded-page target while @task needs it.  The first
 * suppressor saves the current target and forces it to 0.  The unlocked
 * pre-check is a fast path; the flag is re-tested under the spin lock
 * before any state changes.
 */
void
start_secluded_suppression(task_t task)
{
	if (task->task_suppressed_secluded) {
		return;
	}
	lck_spin_lock(&secluded_suppress_slock);
	if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
		task->task_suppressed_secluded = TRUE;
		vm_page_secluded_save_target = vm_page_secluded_target;
		vm_page_secluded_target = 0;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	}
	lck_spin_unlock(&secluded_suppress_slock);
}
12225
/*
 * Undo start_secluded_suppression() for @task; the last suppressor
 * restores the saved secluded-page target.
 */
void
stop_secluded_suppression(task_t task)
{
	lck_spin_lock(&secluded_suppress_slock);
	if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
		task->task_suppressed_secluded = FALSE;
		vm_page_secluded_target = vm_page_secluded_save_target;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	}
	lck_spin_unlock(&secluded_suppress_slock);
}
12237
12238 #endif /* CONFIG_SECLUDED_MEMORY */
12239
12240 /*
12241 * Move the list of retired pages on the vm_page_queue_retired to
12242 * their final resting place on retired_pages_object.
12243 */
12244 void
vm_retire_boot_pages(void)12245 vm_retire_boot_pages(void)
12246 {
12247 }
12248
12249 /*
12250 * This holds the reported physical address if an ECC error leads to a panic.
12251 * SMC will store it in PMU SRAM under the 'sECC' key.
12252 */
12253 uint64_t ecc_panic_physical_address = 0;
12254
12255