1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_page.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Resident memory management module.
63 */
64 #include <debug.h>
65 #include <libkern/OSDebug.h>
66
67 #include <mach/clock_types.h>
68 #include <mach/vm_prot.h>
69 #include <mach/vm_statistics.h>
70 #include <mach/sdt.h>
71 #include <kern/counter.h>
72 #include <kern/exclaves_memory.h>
73 #include <kern/host_statistics.h>
74 #include <kern/sched_prim.h>
75 #include <kern/policy_internal.h>
76 #include <kern/task.h>
77 #include <kern/thread.h>
78 #include <kern/kalloc.h>
79 #include <kern/zalloc_internal.h>
80 #include <kern/ledger.h>
81 #include <kern/ecc.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_init_xnu.h>
84 #include <vm/vm_map_internal.h>
85 #include <vm/vm_page_internal.h>
86 #include <vm/vm_pageout_internal.h>
87 #include <vm/vm_kern_xnu.h> /* kmem_alloc() */
88 #include <vm/vm_compressor_pager_internal.h>
89 #include <kern/misc_protos.h>
90 #include <mach_debug/zone_info.h>
91 #include <vm/cpm_internal.h>
92 #include <pexpert/pexpert.h>
93 #include <pexpert/device_tree.h>
94 #include <san/kasan.h>
95 #include <vm/vm_log.h>
96
97 #include <libkern/coreanalytics/coreanalytics.h>
98 #include <kern/backtrace.h>
99 #include <kern/telemetry.h>
100
101 #include <vm/vm_protos_internal.h>
102 #include <vm/memory_object.h>
103 #include <vm/vm_purgeable_internal.h>
104 #include <vm/vm_compressor_internal.h>
105 #include <vm/vm_iokit.h>
106 #include <vm/vm_object_internal.h>
107
108 #if HAS_MTE
109 #include <vm/vm_mteinfo_internal.h>
110 #endif /* HAS_MTE */
111
112 #if defined (__x86_64__)
113 #include <i386/misc_protos.h>
114 #endif
115
116 #if CONFIG_SPTM
117 #include <arm64/sptm/sptm.h>
118 #endif
119
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123
124 #if HIBERNATION
125 #include <IOKit/IOHibernatePrivate.h>
126 #include <machine/pal_hibernate.h>
127 #endif /* HIBERNATION */
128
129 #if CONFIG_SECLUDED_MEMORY
130 static_assert(!XNU_VM_HAS_LOPAGE,
131 "VM_PAGE_ON_SECLUDED_Q and VM_PAGE_ON_FREE_LOPAGE_Q alias");
132 #endif
133
134 #include <sys/kdebug.h>
135
136 #if defined(HAS_APPLE_PAC)
137 #include <ptrauth.h>
138 #endif
139 #if defined(__arm64__)
140 #include <arm/cpu_internal.h>
141 #endif /* defined(__arm64__) */
142
143 /*
 * During single-threaded early boot we don't initialize all pages.
 * This avoids some delay during boot.  They'll be initialized and
 * added to the free list as needed, or after we are multithreaded,
 * by what becomes the pageout thread.
 *
 * This slows down booting the DEBUG kernel, particularly on
 * large-memory systems, but is worthwhile because it deterministically
 * traps uses of uninitialized memory.
152 */
153 #if DEBUG
154 static TUNABLE(uint32_t, fillval, "fill", 0xDEB8F177);
155 #else
156 static TUNABLE(uint32_t, fillval, "fill", 0);
157 #endif
158
159 #if MACH_ASSERT
160
161 TUNABLE(bool, vm_check_refs_on_alloc, "vm_check_refs_on_alloc", false);
162
163 #endif /* MACH_ASSERT */
164
165 extern boolean_t vm_pageout_running;
166 extern thread_t vm_pageout_scan_thread;
167 extern bool vps_dynamic_priority_enabled;
168
169 const uint16_t vm_page_inactive_states =
170 BIT(VM_PAGE_ON_INACTIVE_INTERNAL_Q) |
171 BIT(VM_PAGE_ON_INACTIVE_EXTERNAL_Q) |
172 BIT(VM_PAGE_ON_INACTIVE_CLEANED_Q);
173
174 const uint16_t vm_page_active_or_inactive_states =
175 vm_page_inactive_states |
176 #if CONFIG_SECLUDED_MEMORY
177 BIT(VM_PAGE_ON_SECLUDED_Q) |
178 #endif /* CONFIG_SECLUDED_MEMORY */
179 BIT(VM_PAGE_ON_ACTIVE_Q);
180
181 const uint16_t vm_page_non_speculative_pageable_states =
182 vm_page_active_or_inactive_states |
183 BIT(VM_PAGE_ON_THROTTLED_Q);
184
185 const uint16_t vm_page_pageable_states =
186 vm_page_non_speculative_pageable_states |
187 BIT(VM_PAGE_ON_SPECULATIVE_Q);
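
/*
 * Illustrative sketch (not part of the build): these masks are meant to be
 * indexed by a page's vmp_q_state, so a page can be classified with a single
 * bit test instead of a switch over every queue value, e.g.:
 *
 *	if (vm_page_pageable_states & BIT(m->vmp_q_state)) {
 *		... the page is on one of the pageable queues ...
 *	}
 *
 * The snippet above only illustrates how the constants compose.
 */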
188
189 #if CONFIG_SECLUDED_MEMORY
190 struct vm_page_secluded_data vm_page_secluded;
191 #endif /* CONFIG_SECLUDED_MEMORY */
192 #if HIBERNATION
193 static bool hibernate_rebuild_needed = false;
194 #endif /* HIBERNATION */
195
196 #if DEVELOPMENT || DEBUG
197 extern struct memory_object_pager_ops shared_region_pager_ops;
198 unsigned int shared_region_pagers_resident_count = 0;
199 unsigned int shared_region_pagers_resident_peak = 0;
200 #endif /* DEVELOPMENT || DEBUG */
201
202
203
204 unsigned int PERCPU_DATA(start_color);
205 vm_page_t PERCPU_DATA(free_pages);
206 SCALABLE_COUNTER_DEFINE(vm_cpu_free_count);
207 boolean_t hibernate_cleaning_in_progress = FALSE;
208
209 atomic_counter_t vm_guard_count;
210
211 #if XNU_VM_HAS_LOPAGE
212 /*
 * This interface exists to support hardware controllers that cannot
 * generate DMA addresses wider than 32 bits, on platforms with
 * more than 4GB of physical memory...
216 */
217 vm_page_queue_head_t vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED;
218 uint32_t vm_lopage_free_count = 0;
219 uint32_t vm_lopage_free_limit = 0;
220 uint32_t vm_lopage_lowater = 0;
221 bool vm_lopage_refill = false;
222 bool vm_lopage_needed = false;
223 unsigned int vm_lopages_allocated_q = 0;
224 unsigned int vm_lopages_allocated_cpm_success = 0;
225 unsigned int vm_lopages_allocated_cpm_failed = 0;
226 #endif /* XNU_VM_HAS_LOPAGE */
227
228
229 int speculative_age_index = 0;
230 int speculative_steal_index = 0;
231 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_RESERVED_SPECULATIVE_AGE_Q + 1];
232
233 boolean_t hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibernation code is looking at the VM queues.
234 * Updated and checked behind the vm_page_queues_lock. */
235
236 static void vm_page_free_prepare(vm_page_t page);
237
238 #if HAS_MTE
239 void vm_page_wire_boot_tags(void);
240 #endif /* HAS_MTE */
241
242 static void vm_tag_init(void);
243
244 /* for debugging purposes */
245 SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
246 VM_PAGE_PACKED_FROM_ARRAY;
247 #ifndef __BUILDING_XNU_LIB_UNITTEST__ /* This is not a compile-time constant when building unit-test */
248 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
249 VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
250 #endif /* __BUILDING_XNU_LIB_UNITTEST__ */
251
252 /*
 * Associated with each page of user-allocatable memory is a
 * page structure.
255 */
256
257 /*
258 * These variables record the values returned by vm_page_bootstrap,
259 * for debugging purposes. The implementation of pmap_steal_memory
260 * and pmap_startup here also uses them internally.
261 */
262
263 vm_offset_t virtual_space_start;
264 vm_offset_t virtual_space_end;
265 uint32_t vm_page_pages;
266
267 /*
268 * The vm_page_lookup() routine, which provides for fast
269 * (virtual memory object, offset) to page lookup, employs
270 * the following hash table. The vm_page_{insert,remove}
271 * routines install and remove associations in the table.
272 * [This table is often called the virtual-to-physical,
273 * or VP, table.]
274 */
275 typedef struct {
276 vm_page_packed_t page_list;
277 #if MACH_PAGE_HASH_STATS
278 int cur_count; /* current count */
279 int hi_count; /* high water mark */
280 #endif /* MACH_PAGE_HASH_STATS */
281 } vm_page_bucket_t;
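
/*
 * Illustrative sketch (not part of the build): a lookup hashes the
 * (object, offset) pair to pick a bucket, then walks the packed page_list
 * under that bucket's lock.  The hash shown here is hypothetical; the real
 * hash function mixes vm_page_bucket_hash, vm_page_hash_shift and
 * vm_page_hash_mask:
 *
 *	unsigned int idx = ((unsigned int)(uintptr_t)object +
 *	    (unsigned int)atop(offset)) & vm_page_hash_mask;
 *	vm_page_bucket_t *bucket = &vm_page_buckets[idx];
 */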
282
283
284 #define BUCKETS_PER_LOCK 16
285
286 SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets; /* Array of buckets */
287 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_count = 0; /* How big is array? */
288 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_mask; /* Mask for hash function */
289 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_shift; /* Shift for hash function */
290 SECURITY_READ_ONLY_LATE(uint32_t) vm_page_bucket_hash; /* Basic bucket hash */
291 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_lock_count = 0; /* How big is array of locks? */
292
293 #ifndef VM_TAG_ACTIVE_UPDATE
294 #error VM_TAG_ACTIVE_UPDATE
295 #endif
296 #ifndef VM_TAG_SIZECLASSES
297 #error VM_TAG_SIZECLASSES
298 #endif
299
300 /* for debugging */
301 SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
302 SECURITY_READ_ONLY_LATE(lck_ticket_t *) vm_page_bucket_locks;
303
304 vm_allocation_site_t vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
305 vm_allocation_site_t * vm_allocation_sites[VM_MAX_TAG_VALUE];
306 #if VM_TAG_SIZECLASSES
307 static vm_allocation_zone_total_t **vm_allocation_zone_totals;
308 #endif /* VM_TAG_SIZECLASSES */
309
310 vm_tag_t vm_allocation_tag_highest;
311
312 #if VM_PAGE_BUCKETS_CHECK
313 boolean_t vm_page_buckets_check_ready = FALSE;
314 #if VM_PAGE_FAKE_BUCKETS
315 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
316 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
317 #endif /* VM_PAGE_FAKE_BUCKETS */
318 #endif /* VM_PAGE_BUCKETS_CHECK */
319
320 #if MACH_PAGE_HASH_STATS
321 /* This routine is only for debug. It is intended to be called by
322 * hand by a developer using a kernel debugger. This routine prints
323 * out vm_page_hash table statistics to the kernel debug console.
324 */
325 void
hash_debug(void)
327 {
328 int i;
329 int numbuckets = 0;
330 int highsum = 0;
331 int maxdepth = 0;
332
333 for (i = 0; i < vm_page_bucket_count; i++) {
334 if (vm_page_buckets[i].hi_count) {
335 numbuckets++;
336 highsum += vm_page_buckets[i].hi_count;
337 if (vm_page_buckets[i].hi_count > maxdepth) {
338 maxdepth = vm_page_buckets[i].hi_count;
339 }
340 }
341 }
342 printf("Total number of buckets: %d\n", vm_page_bucket_count);
343 printf("Number used buckets: %d = %d%%\n",
344 numbuckets, 100 * numbuckets / vm_page_bucket_count);
345 printf("Number unused buckets: %d = %d%%\n",
346 vm_page_bucket_count - numbuckets,
347 100 * (vm_page_bucket_count - numbuckets) / vm_page_bucket_count);
348 printf("Sum of bucket max depth: %d\n", highsum);
349 printf("Average bucket depth: %d.%2d\n",
350 highsum / vm_page_bucket_count,
351 highsum % vm_page_bucket_count);
352 printf("Maximum bucket depth: %d\n", maxdepth);
353 }
354 #endif /* MACH_PAGE_HASH_STATS */
355
356 /*
357 * The virtual page size is currently implemented as a runtime
358 * variable, but is constant once initialized using vm_set_page_size.
359 * This initialization must be done in the machine-dependent
360 * bootstrap sequence, before calling other machine-independent
361 * initializations.
362 *
363 * All references to the virtual page size outside this
364 * module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
365 * constants.
366 */
367 #if defined(__arm64__)
368 vm_size_t page_size;
369 vm_size_t page_mask;
370 int page_shift;
371 #else
372 vm_size_t page_size = PAGE_SIZE;
373 vm_size_t page_mask = PAGE_MASK;
374 int page_shift = PAGE_SHIFT;
375 #endif
376
377 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages;
378 #if XNU_VM_HAS_DELAYED_PAGES
379 vm_page_t vm_pages_end;
380 uint32_t vm_pages_count;
381 #else
382 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_end;
383 SECURITY_READ_ONLY_LATE(uint32_t) vm_pages_count;
384 #endif /* XNU_VM_HAS_DELAYED_PAGES */
385 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
386 SECURITY_READ_ONLY_LATE(ppnum_t) vm_pages_first_pnum;
387 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
388 #if HAS_MTE
389 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage;
390 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages_tag_storage_end;
391 #endif /* HAS_MTE */
392 #if CONFIG_SPTM
393 /*
394 * When used, these 128bit (MAX_COLORS bits) masks represent a "cluster"
395 * of contiguous free physical pages.
396 *
397 * For each cluster, there is an enqueue "index", which is -1 when there is no
398 * free page in the cluster, or the index in [0, 128) of the page that is
399 * enqueued on the vm_page_free_queue to represent the entire cluster.
400 *
 * Grouping pages this way has two nice effects: it reduces doubly linked
 * list manipulations (the doubly linked list being the worst data structure
 * known to man when considering cache misses), and it mechanically makes
 * the VM serve more "contiguous" pages naturally.
405 */
406 static_assert(XNU_VM_HAS_LINEAR_PAGES_ARRAY);
407 SECURITY_READ_ONLY_LATE(__uint128_t *) _vm_pages_free_masks;
408 SECURITY_READ_ONLY_LATE(int8_t *) _vm_pages_free_enqueue_idx;
409 #endif /* CONFIG_SPTM */
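
/*
 * Worked example (assuming MAX_COLORS == 128, which the static_assert in
 * vm_pages_free_mask_len() enforces): with pmap_first_pnum == 0x1000,
 * physical page 0x10c5 belongs to cluster (0x10c5 - 0x1000) / 128 == 1,
 * at bit 0x10c5 & 127 == 69.  Freeing it sets bit 69 of
 * _vm_pages_free_masks[1]; if that mask was previously zero, the page is
 * also enqueued on the free queue and _vm_pages_free_enqueue_idx[1]
 * records 69 as the cluster's representative.
 */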
410
411
412 /*
413 * Resident pages that represent real memory
414 * are allocated from a set of free lists,
415 * one per color.
416 */
417 SECURITY_READ_ONLY_LATE(unsigned int) vm_colors;
418 SECURITY_READ_ONLY_LATE(unsigned int) vm_color_mask; /* mask is == (vm_colors-1) */
419 unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */
420 unsigned int vm_free_magazine_refill_limit = 0;
421
422 struct vm_page_free_queue vm_page_queue_free;
423
424 unsigned int vm_page_free_wanted;
425 unsigned int vm_page_free_wanted_privileged;
426 #if CONFIG_SECLUDED_MEMORY
427 unsigned int vm_page_free_wanted_secluded;
428 #endif /* CONFIG_SECLUDED_MEMORY */
429 unsigned int vm_page_free_count;
430
431 unsigned int vm_page_realtime_count;
432
433 /*
434 * Occasionally, the virtual memory system uses
435 * resident page structures that do not refer to
436 * real pages, for example to leave a page with
437 * important state information in the VP table.
438 *
439 * These page structures are allocated the way
440 * most other kernel structures are.
441 */
442 SECURITY_READ_ONLY_LATE(zone_t) vm_page_zone;
443 vm_locks_array_t vm_page_locks;
444
445 LCK_ATTR_DECLARE(vm_page_lck_attr, 0, 0);
446 LCK_GRP_DECLARE(vm_page_lck_grp_free, "vm_page_free");
447 LCK_GRP_DECLARE(vm_page_lck_grp_queue, "vm_page_queue");
448 LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
449 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
450 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
451 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
452 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
453 LCK_TICKET_DECLARE(vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
454
455 unsigned int vm_page_local_q_soft_limit = 250;
456 unsigned int vm_page_local_q_hard_limit = 500;
457 struct vpl *__zpercpu vm_page_local_q;
458
459 /* N.B. Guard and fictitious pages must not
460 * be assigned a zero phys_page value.
461 */
462 /*
463 * Fictitious pages don't have a physical address,
464 * but we must initialize phys_page to something.
465 * For debugging, this should be a strange value
466 * that the pmap module can recognize in assertions.
467 */
468 const ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
469
470 /*
471 * Guard pages are not accessible so they don't
472 * need a physical address, but we need to enter
473 * one in the pmap.
474 * Let's make it recognizable and make sure that
475 * we don't use a real physical page with that
476 * physical address.
477 */
478 const ppnum_t vm_page_guard_addr = (ppnum_t) -2;
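
/*
 * Illustrative sketch (not part of the build): code handling a ppnum_t can
 * recognize these sentinels before assuming the page is backed by real
 * memory, e.g.:
 *
 *	assert(pn != vm_page_fictitious_addr && pn != vm_page_guard_addr);
 */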
479
480 /*
481 * Resident page structures are also chained on
482 * queues that are used by the page replacement
483 * system (pageout daemon). These queues are
484 * defined here, but are shared by the pageout
 * module.  The inactive queue is broken into
 * file-backed and anonymous queues for convenience, as the
 * pageout daemon often assigns a higher
 * importance to anonymous pages (it is less likely to pick them).
489 */
490 vm_page_queue_head_t vm_page_queue_active VM_PAGE_PACKED_ALIGNED;
491 vm_page_queue_head_t vm_page_queue_inactive VM_PAGE_PACKED_ALIGNED;
492 #if CONFIG_SECLUDED_MEMORY
493 vm_page_queue_head_t vm_page_queue_secluded VM_PAGE_PACKED_ALIGNED;
494 #endif /* CONFIG_SECLUDED_MEMORY */
495 vm_page_queue_head_t vm_page_queue_anonymous VM_PAGE_PACKED_ALIGNED; /* inactive memory queue for anonymous pages */
496 vm_page_queue_head_t vm_page_queue_throttled VM_PAGE_PACKED_ALIGNED;
497
498 queue_head_t vm_objects_wired;
499
500 vm_page_queue_head_t vm_page_queue_donate VM_PAGE_PACKED_ALIGNED;
501 uint32_t vm_page_donate_mode;
502 uint32_t vm_page_donate_target, vm_page_donate_target_high, vm_page_donate_target_low;
503 uint32_t vm_page_donate_count;
504 bool vm_page_donate_queue_ripe;
505
506
507 vm_page_queue_head_t vm_page_queue_background VM_PAGE_PACKED_ALIGNED;
508 uint32_t vm_page_background_target;
509 uint32_t vm_page_background_target_snapshot;
510 uint32_t vm_page_background_count;
511 uint64_t vm_page_background_promoted_count;
512
513 uint32_t vm_page_background_internal_count;
514 uint32_t vm_page_background_external_count;
515
516 uint32_t vm_page_background_mode;
517 uint32_t vm_page_background_exclude_external;
518
519 unsigned int vm_page_active_count;
520 unsigned int vm_page_inactive_count;
521 unsigned int vm_page_kernelcache_count;
522 #if CONFIG_SECLUDED_MEMORY
523 unsigned int vm_page_secluded_count;
524 unsigned int vm_page_secluded_count_free;
525 unsigned int vm_page_secluded_count_inuse;
526 unsigned int vm_page_secluded_count_over_target;
527 #endif /* CONFIG_SECLUDED_MEMORY */
528 unsigned int vm_page_anonymous_count;
529 unsigned int vm_page_throttled_count;
530 unsigned int vm_page_speculative_count;
531
532 unsigned int vm_page_wire_count;
533 unsigned int vm_page_wire_count_on_boot = 0;
534 unsigned int vm_page_stolen_count = 0;
535 unsigned int vm_page_wire_count_initial;
536 unsigned int vm_page_gobble_count = 0;
537 unsigned int vm_page_kern_lpage_count = 0;
538
539 uint64_t booter_size; /* external so it can be found in core dumps */
540
541 #define VM_PAGE_WIRE_COUNT_WARNING 0
542 #define VM_PAGE_GOBBLE_COUNT_WARNING 0
543
544 unsigned int vm_page_purgeable_count = 0; /* # of pages purgeable now */
545 unsigned int vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
546 uint64_t vm_page_purged_count = 0; /* total count of purged pages */
547
548 unsigned int vm_page_xpmapped_external_count = 0;
549 unsigned int vm_page_external_count = 0;
550 unsigned int vm_page_internal_count = 0;
551 unsigned int vm_page_pageable_external_count = 0;
552 unsigned int vm_page_pageable_internal_count = 0;
553
554 #if DEVELOPMENT || DEBUG
555 unsigned int vm_page_speculative_recreated = 0;
556 unsigned int vm_page_speculative_created = 0;
557 unsigned int vm_page_speculative_used = 0;
558 #endif
559
560 _Atomic unsigned int vm_page_swapped_count = 0;
561
562 vm_page_queue_head_t vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED;
563
564 unsigned int vm_page_cleaned_count = 0;
565
566 uint64_t max_valid_dma_address = 0xffffffffffffffffULL;
567 ppnum_t max_valid_low_ppnum = PPNUM_MAX;
568
569
570 /*
571 * Several page replacement parameters are also
572 * shared with this module, so that page allocation
573 * (done here in vm_page_alloc) can trigger the
574 * pageout daemon.
575 */
576 unsigned int vm_page_free_target = 0;
577 unsigned int vm_page_free_min = 0;
578 unsigned int vm_page_throttle_limit = 0;
579 unsigned int vm_page_inactive_target = 0;
580 #if CONFIG_SECLUDED_MEMORY
581 unsigned int vm_page_secluded_target = 0;
582 #endif /* CONFIG_SECLUDED_MEMORY */
583 unsigned int vm_page_anonymous_min = 0;
584 unsigned int vm_page_free_reserved = 0;
585
586
587 /*
588 * The VM system has a couple of heuristics for deciding
589 * that pages are "uninteresting" and should be placed
590 * on the inactive queue as likely candidates for replacement.
591 * These variables let the heuristics be controlled at run-time
592 * to make experimentation easier.
593 */
594
595 boolean_t vm_page_deactivate_hint = TRUE;
596
597 struct vm_page_stats_reusable vm_page_stats_reusable;
598
599 /*
600 * vm_set_page_size:
601 *
602 * Sets the page size, perhaps based upon the memory
603 * size. Must be called before any use of page-size
604 * dependent functions.
605 *
606 * Sets page_shift and page_mask from page_size.
607 */
608 void
vm_set_page_size(void)
610 {
611 page_size = PAGE_SIZE;
612 page_mask = PAGE_MASK;
613 page_shift = PAGE_SHIFT;
614
615 if ((page_mask & page_size) != 0) {
616 panic("vm_set_page_size: page size not a power of two");
617 }
618
619 for (page_shift = 0;; page_shift++) {
620 if ((1U << page_shift) == page_size) {
621 break;
622 }
623 }
624 }
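
/*
 * Worked example: with a 16KB PAGE_SIZE, page_mask is 0x3fff and the loop
 * above exits with page_shift == 14, since (1U << 14) == 16384.
 * Equivalently (illustrative only): page_shift = __builtin_ctz(page_size).
 */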
625
626 #if HAS_MTE
627
628 bool
vm_page_is_tag_storage_pnum(vm_page_t mem, ppnum_t pnum)
630 {
631 return pmap_in_tag_storage_range(pnum) &&
632 !mteinfo_tag_storage_disabled(mem);
633 }
634
635 #endif
636
637 /*
638 * @abstract
639 * Given a page, returns the memory class of that page.
640 */
641 static vm_memory_class_t
vm_page_get_memory_class(vm_page_t mem __unused, ppnum_t pnum __unused)
643 {
644 assert(!vm_page_is_fictitious(mem));
645
646 #if XNU_VM_HAS_LOPAGE
647 if (mem->vmp_lopage) {
648 return VM_MEMORY_CLASS_LOPAGE;
649 }
650 #endif /* XNU_VM_HAS_LOPAGE */
651 #if HAS_MTE
652 if (mem->vmp_using_mte) {
653 return VM_MEMORY_CLASS_TAGGED;
654 } else if (!is_mte_enabled || !pmap_in_tag_storage_range(pnum)) {
655 return VM_MEMORY_CLASS_REGULAR;
656 } else if (mteinfo_tag_storage_disabled(mem)) {
657 return VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
658 } else {
659 return VM_MEMORY_CLASS_TAG_STORAGE;
660 }
661 #else /* !HAS_MTE */
662 return VM_MEMORY_CLASS_REGULAR;
663 #endif /* !HAS_MTE */
664 }
665
666 /*
667 * vm_page_is_restricted:
668 *
669 * Checks if a given vm_page_t is a restricted page.
670 */
671 inline bool
vm_page_is_restricted(vm_page_t mem)
673 {
674 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(mem);
675 return pmap_is_page_restricted(pn);
676 }
677
678 #ifdef __x86_64__
679
680 #define MAX_CLUMP_SIZE 16
681 #define DEFAULT_CLUMP_SIZE 4
682
683 unsigned int vm_clump_size, vm_clump_mask, vm_clump_shift, vm_clump_promote_threshold;
684
685 #if DEVELOPMENT || DEBUG
686 unsigned long vm_clump_stats[MAX_CLUMP_SIZE + 1];
687 unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
688
689 static inline void
vm_clump_update_stats(unsigned int c)
691 {
692 assert(c <= vm_clump_size);
693 if (c > 0 && c <= vm_clump_size) {
694 vm_clump_stats[c] += c;
695 }
696 vm_clump_allocs += c;
697 }
698 #endif /* if DEVELOPMENT || DEBUG */
699
700 /* Called once to setup the VM clump knobs */
701 static void
vm_page_setup_clump( void )
703 {
704 unsigned int override, n;
705
706 vm_clump_size = DEFAULT_CLUMP_SIZE;
707 if (PE_parse_boot_argn("clump_size", &override, sizeof(override))) {
708 vm_clump_size = override;
709 }
710
711 if (vm_clump_size > MAX_CLUMP_SIZE) {
712 panic("vm_page_setup_clump:: clump_size is too large!");
713 }
714 if (vm_clump_size < 1) {
715 panic("vm_page_setup_clump:: clump_size must be >= 1");
716 }
717 if ((vm_clump_size & (vm_clump_size - 1)) != 0) {
718 panic("vm_page_setup_clump:: clump_size must be a power of 2");
719 }
720
721 vm_clump_promote_threshold = vm_clump_size;
722 vm_clump_mask = vm_clump_size - 1;
723 for (vm_clump_shift = 0, n = vm_clump_size; n > 1; n >>= 1, vm_clump_shift++) {
724 ;
725 }
726
727 #if DEVELOPMENT || DEBUG
728 bzero(vm_clump_stats, sizeof(vm_clump_stats));
729 vm_clump_allocs = vm_clump_inserts = vm_clump_inrange = vm_clump_promotes = 0;
730 #endif /* if DEVELOPMENT || DEBUG */
731 }
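
/*
 * Worked example: with the default clump_size of 4, the setup above leaves
 * vm_clump_mask == 3, vm_clump_shift == 2 and vm_clump_promote_threshold == 4.
 * Booting with clump_size=16 (the maximum) yields a mask of 15 and a shift of 4.
 */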
732
733 #endif /* __x86_64__ */
734
735 void
vm_page_free_queue_init(vm_page_free_queue_t free_queue)
737 {
738 for (unsigned int color = 0; color < MAX_COLORS; color++) {
739 vm_page_queue_init(&free_queue->vmpfq_queues[color].qhead);
740 }
741 }
742
743 /*!
744 * @function vm_page_free_queue_for_class()
745 *
746 * @abstract
747 * Returns the appropriate free queue for the given class and page color.
748 */
749 __pure2
750 static vm_page_queue_t
vm_page_free_queue_for_class(vm_memory_class_t mem_class, unsigned int color)
752 {
753 switch (mem_class) {
754 case VM_MEMORY_CLASS_REGULAR:
755 #if HAS_MTE
756 case VM_MEMORY_CLASS_TAGGED:
757 case VM_MEMORY_CLASS_TAG_STORAGE:
758 if (is_mte_enabled) {
759 return NULL;
760 }
761 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
762 #endif
763 return &vm_page_queue_free.vmpfq_queues[color].qhead;
764 #if XNU_VM_HAS_LOPAGE
765 case VM_MEMORY_CLASS_LOPAGE:
766 return &vm_lopage_queue_free;
767 #endif /* XNU_VM_HAS_LOPAGE */
768 #if CONFIG_SECLUDED_MEMORY
769 case VM_MEMORY_CLASS_SECLUDED:
770 return &vm_page_queue_secluded;
771 #endif
772 }
773 }
774
775 __pure2
776 static bool
vm_page_free_queue_has_colors(vm_memory_class_t mem_class)
778 {
779 switch (mem_class) {
780 case VM_MEMORY_CLASS_REGULAR:
781 #if HAS_MTE
782 case VM_MEMORY_CLASS_TAGGED:
783 case VM_MEMORY_CLASS_TAG_STORAGE:
784 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
785 #endif
786 return true;
787 #if XNU_VM_HAS_LOPAGE
788 case VM_MEMORY_CLASS_LOPAGE:
789 return false;
790 #endif /* XNU_VM_HAS_LOPAGE */
791 #if CONFIG_SECLUDED_MEMORY
792 case VM_MEMORY_CLASS_SECLUDED:
793 return false;
794 #endif
795 }
796 }
797
798
799 #if CONFIG_SECLUDED_MEMORY
800
801 static bool
vm_page_secluded_pool_eligible(vm_memory_class_t class)
803 {
804 switch (class) {
805 #if XNU_VM_HAS_LOPAGE
806 case VM_MEMORY_CLASS_LOPAGE:
807 return false;
808 #endif /* XNU_VM_HAS_LOPAGE */
809 #if HAS_MTE
810 case VM_MEMORY_CLASS_TAG_STORAGE:
811 case VM_MEMORY_CLASS_TAGGED:
812 return false;
813 #endif /* HAS_MTE */
814 default:
815 return true;
816 }
817 }
818
819 static bool
vm_page_secluded_pool_depleted(void)
821 {
822 if (vm_page_free_count <= vm_page_free_reserved) {
823 return false;
824 }
825 if (num_tasks_can_use_secluded_mem) {
826 return false;
827 }
828 return vm_page_secluded_count < vm_page_secluded_target;
829 }
830
831 #endif /* CONFIG_SECLUDED_MEMORY */
832 #if HIBERNATION
833
834 __attribute__((overloadable))
835 static void
836 vm_page_free_queue_foreach(vm_page_queue_t queue, void (^block)(vm_page_t))
837 {
838 vm_page_t page;
839
	vm_page_queue_iterate(queue, page, vmp_pageq) {
841 block(page);
842 }
843 }
844
845 __attribute__((overloadable))
846 static void
847 vm_page_free_queue_foreach(vm_page_free_queue_t queue, void (^block)(vm_page_t))
848 {
849 for (unsigned int color = 0; color < vm_colors; color++) {
850 vm_page_free_queue_foreach(&queue->vmpfq_queues[color].qhead, block);
851 }
852 }
853
854 #endif /* HIBERNATION */
855 #if CONFIG_SPTM
856
857 static inline uint32_t
vm_pages_free_mask_len(void)
859 {
860 extern pmap_paddr_t real_avail_end;
861
862 uint64_t pnums = atop(real_avail_end) - pmap_first_pnum;
863 static_assert(8 * sizeof(__uint128_t) == MAX_COLORS);
864 return (uint32_t)((pnums + MAX_COLORS - 1) / MAX_COLORS);
865 }
866
867 static inline int8_t
vm_pages_free_mask_bit(ppnum_t pnum)
869 {
870 return (int8_t)(pnum & (MAX_COLORS - 1));
871 }
872
873 static inline uint32_t
vm_pages_free_mask_index(ppnum_t pnum)
875 {
876 return (pnum - pmap_first_pnum) / MAX_COLORS;
877 }
878
879 __pure2
880 static inline __uint128_t *
vm_pages_free_masks(void)
882 {
883 return _vm_pages_free_masks;
884 }
885
886 __pure2
887 static inline bitmap_t *
vm_pages_free_masks_as_bitmap(uint32_t index)
889 {
890 /*
	 * this conversion is gross but helps with codegen for bit-wise
	 * accesses, where the __uint128_t type yields really poor code.
893 *
894 * This conversion is only legal on little endian architectures.
895 */
896 #ifndef __LITTLE_ENDIAN__
897 #error unsupported configuration
898 #endif
899 return (bitmap_t *)(_vm_pages_free_masks + index);
900 }
901
902 __pure2
903 static inline int8_t *
vm_pages_free_enqueue_idx(uint32_t index)
905 {
906 return &_vm_pages_free_enqueue_idx[index];
907 }
908
909 /*!
910 * @brief
911 * Return the position of the next bit in "circular" order for a given cluster
 * of pages, starting at and including @c bit, or -1 if the cluster is empty.
913 */
914 static inline int8_t
vm_pages_free_mask_next_bit(uint32_t index, int8_t bit)
916 {
917 __uint128_t value = vm_pages_free_masks()[index];
918 __uint128_t mask = ((__uint128_t)1 << bit) - 1;
919
920 if (value == 0) {
921 return -1;
922 }
923
924 if (value & ~mask) {
925 value &= ~mask;
926 }
927 if ((uint64_t)value) {
928 return (int8_t)__builtin_ctzll((uint64_t)value);
929 }
930 return 64 + (int8_t)__builtin_ctzll((uint64_t)(value >> 64));
931 }
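
/*
 * Worked example: for a cluster whose mask has only bits 3 and 70 set,
 * vm_pages_free_mask_next_bit(index, 5) skips past the low bits and
 * returns 70, while vm_pages_free_mask_next_bit(index, 80) wraps around
 * and returns 3.  With an empty mask it returns -1.
 */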
932
933 static inline bool
vm_pages_free_mask_test(uint32_t index, int8_t bit)
935 {
936 return bitmap_test(vm_pages_free_masks_as_bitmap(index), bit);
937 }
938
939 static inline void
vm_pages_free_mask_set(uint32_t index, int8_t bit)
941 {
942 assert(!vm_pages_free_mask_test(index, bit));
943 bitmap_set(vm_pages_free_masks_as_bitmap(index), bit);
944 }
945
946 static inline void
vm_pages_free_mask_clear(uint32_t index, int8_t bit)
948 {
949 assert(vm_pages_free_mask_test(index, bit));
950 bitmap_clear(vm_pages_free_masks_as_bitmap(index), bit);
951 }
952
953 #endif /* CONFIG_SPTM */
954
955 __attribute__((always_inline))
956 void
vm_page_free_queue_enter(vm_memory_class_t class, vm_page_t mem, ppnum_t pnum)
958 {
959 bool enter_first;
960 unsigned int color;
961 vm_page_queue_t queue;
962
963 if (startup_phase >= STARTUP_SUB_KMEM) {
964 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
965 }
966
967 assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
968 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
969 mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
970 mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
971 mem->vmp_next_m == 0 &&
972 mem->vmp_object == 0 &&
973 mem->vmp_wire_count == 0 &&
974 mem->vmp_busy &&
975 !mem->vmp_tabled &&
976 !mem->vmp_laundry &&
977 !mem->vmp_pmapped &&
978 !mem->vmp_wpmapped &&
979 !mem->vmp_realtime);
980
981 switch (class) {
982 #if XNU_VM_HAS_LOPAGE
983 case VM_MEMORY_CLASS_LOPAGE:
984 mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
985 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
986 mem->vmp_lopage = true;
987 mem->vmp_canonical = true;
988 enter_first = true;
989 break;
990 #endif /* XNU_VM_HAS_LOPAGE */
991 #if CONFIG_SECLUDED_MEMORY
992 case VM_MEMORY_CLASS_SECLUDED:
993 if (startup_phase >= STARTUP_SUB_KMEM) {
994 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
995 }
996 mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
997 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
998 mem->vmp_lopage = false;
999 mem->vmp_canonical = true;
1000 enter_first = true;
1001 break;
1002 #endif
1003 default:
1004 mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
1005 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1006 mem->vmp_lopage = false;
1007 mem->vmp_canonical = true;
1008 enter_first = false;
1009 break;
1010 }
1011
1012 #if HAS_MTE
1013 if (is_mte_enabled) {
1014 switch (class) {
1015 case VM_MEMORY_CLASS_REGULAR:
1016 return mteinfo_covered_page_set_free(pnum, false);
1017 case VM_MEMORY_CLASS_TAGGED:
1018 return mteinfo_covered_page_set_free(pnum, true);
1019 case VM_MEMORY_CLASS_TAG_STORAGE:
1020 return mteinfo_tag_storage_set_inactive(mem, false);
1021 default:
1022 break;
1023 }
1024 }
1025 #endif /* HAS_MTE */
1026
1027 color = VM_PAGE_GET_COLOR_PNUM(pnum);
1028 queue = vm_page_free_queue_for_class(class, color);
1029 #if CONFIG_SPTM
1030 if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
1031 uint32_t index = vm_pages_free_mask_index(pnum);
1032 int8_t bit = vm_pages_free_mask_bit(pnum);
1033
1034 if (vm_pages_free_masks()[index] == 0) {
1035 vm_page_queue_enter(queue, mem, vmp_pageq);
1036 *vm_pages_free_enqueue_idx(index) = bit;
1037 }
1038 vm_pages_free_mask_set(index, bit);
1039 } else
1040 #endif /* CONFIG_SPTM */
1041 if (enter_first) {
1042 vm_page_queue_enter_first(queue, mem, vmp_pageq);
1043 } else {
1044 #if defined(__x86_64__)
1045 vm_page_queue_enter_clump(queue, mem);
1046 #else
1047 vm_page_queue_enter(queue, mem, vmp_pageq);
1048 #endif
1049 }
1050
1051 switch (class) {
1052 case VM_MEMORY_CLASS_REGULAR:
1053 VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
1054 VM_COUNTER_INC(&vm_page_free_count);
1055 break;
1056 #if HAS_MTE
1057 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1058 VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count);
1059 VM_COUNTER_INC(&vm_page_free_unmanaged_tag_storage_count);
		/* these do not participate in the vm page free count */
1061 break;
1062 #endif
1063 #if XNU_VM_HAS_LOPAGE
1064 case VM_MEMORY_CLASS_LOPAGE:
1065 VM_COUNTER_INC(&vm_lopage_free_count);
1066 if (vm_lopage_free_count >= vm_lopage_free_limit) {
1067 vm_lopage_refill = false;
1068 }
1069 break;
1070 #endif /* XNU_VM_HAS_LOPAGE */
1071 #if CONFIG_SECLUDED_MEMORY
1072 case VM_MEMORY_CLASS_SECLUDED:
1073 vm_page_secluded_count++;
1074 vm_page_secluded_count_free++;
1075 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
1076 break;
1077 #endif /* CONFIG_SECLUDED_MEMORY */
1078 default:
1079 __builtin_unreachable();
1080 }
1081 }
1082
1083 /*!
1084 * @typedef vmp_free_list_result_t
1085 *
1086 * @discussion
1087 * This data structure is used by vm_page_free_queue_add_list to track
1088 * how many pages were freed to which free lists, so that it can then drive
1089 * which waiters we are going to wake up.
1090 *
1091 * uint8_t counters are enough because we never free more than 64 pages at
1092 * a time, and this allows for the data structure to be passed by register.
1093 */
1094 typedef struct {
1095 uint8_t vmpr_regular;
1096 #if HAS_MTE
1097 uint8_t vmpr_taggable;
1098 uint8_t vmpr_tag_storage;
1099 #endif /* HAS_MTE */
1100 uint8_t vmpr_lopage;
1101 #if CONFIG_SECLUDED_MEMORY
1102 uint8_t vmpr_secluded;
1103 #endif /* CONFIG_SECLUDED_MEMORY */
1104 } vmp_free_list_result_t;
1105
1106 /*!
1107 * @abstract
1108 * Returns whether there are any threads blocked in VM_PAGE_WAIT().
1109 *
1110 * @discussion
1111 * The page free queue lock must be held.
1112 */
1113 static bool
vm_page_free_queue_has_any_waiters(void)
1115 {
1116 uint32_t result = 0;
1117
1118 result |= vm_page_free_wanted;
1119 result |= vm_page_free_wanted_privileged;
1120 #if HAS_MTE
1121 result |= vm_page_free_wanted_tagged;
1122 result |= vm_page_free_wanted_tagged_privileged;
1123 #endif /* HAS_MTE */
1124 #if CONFIG_SECLUDED_MEMORY
1125 result |= vm_page_free_wanted_secluded;
1126 #endif /* CONFIG_SECLUDED_MEMORY */
1127
1128 return result != 0;
1129 }
1130
1131 void
vm_page_free_wakeup(event_t event, uint32_t n)
1133 {
1134 if (vps_dynamic_priority_enabled) {
1135 if (n == UINT32_MAX) {
1136 wakeup_all_with_inheritor(event, THREAD_AWAKENED);
1137 } else {
1138 while (n-- > 0) {
1139 wakeup_one_with_inheritor(event, THREAD_AWAKENED,
1140 LCK_WAKE_DO_NOT_TRANSFER_PUSH, NULL);
1141 }
1142 }
1143 } else {
1144 thread_wakeup_nthreads(event, n);
1145 }
1146 }
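
/*
 * Illustrative usage (mirroring the wakeup path below): passing UINT32_MAX
 * wakes every thread sleeping on the event, e.g.
 *
 *	vm_page_free_wakeup(&vm_page_free_wanted_privileged, UINT32_MAX);
 *
 * while a finite n wakes at most n waiters, one at a time when the
 * dynamic-priority pageout scan is enabled.
 */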
1147
1148 /*!
1149 * @abstract
1150 * Helper to wakeup threads in VM_PAGE_WAIT() given
1151 * a vm_page_free_queue_enter_list() result.
1152 *
1153 * @discussion
1154 * The page free queue lock must be held, and is unlocked on return.
1155 *
1156 * @param vmpr The result of a vm_page_free_queue_enter_list() call.
1157 */
1158 __attribute__((noinline))
1159 static void
vm_page_free_queue_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr)
1161 {
1162 unsigned int need_wakeup = 0;
1163 unsigned int need_priv_wakeup = 0;
1164 #if CONFIG_SECLUDED_MEMORY
1165 unsigned int need_wakeup_secluded = 0;
1166 #endif /* CONFIG_SECLUDED_MEMORY */
1167 unsigned int unpriv_limit;
1168 #if HAS_MTE
1169 unsigned int need_tagged_wakeup = 0;
1170 unsigned int need_priv_tagged_wakeup = 0;
1171 unsigned int unpriv_tagged_limit;
1172 unsigned int n;
1173 bool wakeup_refill_thread = false;
1174 #endif /* HAS_MTE */
1175
1176 #define DONATE_TO_WAITERS(wake, count, waiters_count, limit) ({ \
1177 uint32_t __n = MIN(MIN(waiters_count, vmpr.count), limit); \
1178 waiters_count -= __n; \
1179 vmpr.count -= __n; \
1180 wake += __n; \
1181 __n; \
1182 })
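
	/*
	 * Worked example: if 10 regular pages were just freed
	 * (vmpr.vmpr_regular == 10), 4 threads are blocked in
	 * vm_page_free_wanted and the unprivileged limit is 2, then
	 * DONATE_TO_WAITERS(need_wakeup, vmpr_regular, vm_page_free_wanted,
	 * unpriv_limit) moves min(4, 10, 2) == 2 pages: vm_page_free_wanted
	 * drops to 2, vmpr.vmpr_regular drops to 8, need_wakeup becomes 2,
	 * and the expression evaluates to 2.
	 */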
1183
1184 /*
1185 * Step 1: privileged waiters get to be satisfied first
1186 */
1187 #if HAS_MTE
1188 if (vm_page_free_wanted_tagged_privileged) {
1189 DONATE_TO_WAITERS(need_priv_tagged_wakeup,
1190 vmpr_taggable, vm_page_free_wanted_tagged_privileged,
1191 UINT32_MAX);
1192
1193 /*
1194 * If we will not wake up privileged threads, and there are
1195 * tagged privileged waiters, we need the refill thread to do
1196 * an emergency activation or reclaim to fulfill this need.
1197 *
		 * We need at least 2 extra free pages because the
		 * reclaim path might need to relocate a page to give us one.
1200 */
1201 if (!need_priv_tagged_wakeup &&
1202 vm_page_free_count >= vm_page_free_taggable_count + 2) {
1203 wakeup_refill_thread = true;
1204 }
1205 }
1206 #endif /* HAS_MTE */
1207 if (vm_page_free_wanted_privileged) {
1208 DONATE_TO_WAITERS(need_priv_wakeup,
1209 vmpr_regular, vm_page_free_wanted_privileged,
1210 UINT32_MAX);
1211 #if HAS_MTE
1212 DONATE_TO_WAITERS(need_priv_wakeup,
1213 vmpr_taggable, vm_page_free_wanted_privileged,
1214 UINT32_MAX);
1215 #endif /* HAS_MTE */
1216 }
1217
1218
1219 /*
1220 * Step 2: the privileged reserve needs to be replenished
1221 *
1222 * Let's make sure that we only wake up regular threads
1223 * for free pages above the reserve threshold.
1224 */
1225 if (vm_page_free_count <= vm_page_free_reserved) {
1226 unpriv_limit = 0;
1227 } else {
1228 unpriv_limit = vm_page_free_count - vm_page_free_reserved;
1229 }
1230 #if HAS_MTE
1231 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
1232 unpriv_tagged_limit = 0;
1233 } else {
1234 unpriv_tagged_limit = vm_page_free_taggable_count -
1235 vm_page_free_reserved;
1236 }
1237 #endif /* HAS_MTE */
1238
1239 /*
1240 * Step 3: satisfy secluded waiters, using the secluded pool first,
1241 * regular pages second.
1242 */
1243 #if CONFIG_SECLUDED_MEMORY
1244 if (vm_page_free_wanted_secluded) {
1245 DONATE_TO_WAITERS(need_wakeup_secluded,
1246 vmpr_secluded, vm_page_free_wanted_secluded,
1247 UINT32_MAX);
1248 unpriv_limit -= DONATE_TO_WAITERS(need_wakeup_secluded,
1249 vmpr_regular, vm_page_free_wanted_secluded,
1250 unpriv_limit);
1251
1252 if (vm_page_free_wanted_secluded == 0) {
1253 need_wakeup_secluded = UINT32_MAX;
1254 }
1255 }
1256 #endif /* CONFIG_SECLUDED_MEMORY */
1257
1258 /*
1259 * Step 4: satisfy regular demand last.
1260 */
1261 #if HAS_MTE
1262 if (vm_page_free_wanted_tagged) {
1263 n = DONATE_TO_WAITERS(need_tagged_wakeup,
1264 vmpr_taggable, vm_page_free_wanted_tagged,
1265 MIN(unpriv_limit, unpriv_tagged_limit));
1266
1267 unpriv_limit -= n;
1268 unpriv_tagged_limit -= n;
1269
1270 if (vm_page_free_wanted_tagged == 0) {
1271 need_tagged_wakeup = UINT32_MAX;
1272 } else if (vm_page_free_count >=
1273 MAX(vm_page_free_taggable_count + 2, vm_page_free_min)) {
1274 /*
1275 * If we still have tagged waiters, and that rebalancing
1276 * pages would get us above vm_page_free_min, then wake
1277 * up the refill thread to help do that rebalance.
1278 */
1279 wakeup_refill_thread = true;
1280 }
1281 }
1282 #endif /* HAS_MTE */
1283 if (vm_page_free_wanted) {
1284 unpriv_limit -= DONATE_TO_WAITERS(need_wakeup,
1285 vmpr_regular, vm_page_free_wanted,
1286 unpriv_limit);
1287 #if HAS_MTE
1288 n = DONATE_TO_WAITERS(need_wakeup,
1289 vmpr_taggable, vm_page_free_wanted,
1290 MIN(unpriv_limit, unpriv_tagged_limit));
1291
1292 unpriv_limit -= n;
1293 unpriv_tagged_limit -= n;
1294 #endif /* HAS_MTE */
1295 if (vm_page_free_wanted == 0) {
1296 need_wakeup = UINT32_MAX;
1297 }
1298 }
1299
1300 /*
	 * We have updated waiter counts, and if this page release happens
	 * from the context of a thread that's super low priority, we might
	 * be too starved to ever get to wake up the privileged threads.
	 *
	 * While we hold the free page lock, such threads would boost us via
	 * the mutex priority inheritance mechanism, but as soon as we drop the
1307 * lock all bets are off.
1308 *
1309 * To avoid this priority inversion that could really hurt the VM,
1310 * disable preemption until we've woken up everyone.
1311 */
1312 disable_preemption();
1313 vm_free_page_unlock();
1314
1315 /*
1316 * Dispatch privileged wakeups
1317 *
1318 * There shouldn't be that many VM-privileged threads,
1319 * so let's wake them all up, even if we don't quite
1320 * have enough pages to satisfy them all.
1321 */
1322 if (need_priv_wakeup) {
1323 vm_page_free_wakeup(&vm_page_free_wanted_privileged,
1324 UINT32_MAX);
1325 }
1326 if (need_wakeup) {
1327 vm_page_free_wakeup(&vm_page_free_count, need_wakeup);
1328 }
1329 #if HAS_MTE
1330 if (need_priv_tagged_wakeup) {
1331 vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1332 UINT32_MAX);
1333 }
1334 if (need_tagged_wakeup) {
1335 vm_page_free_wakeup(&vm_page_free_wanted_tagged,
1336 need_tagged_wakeup);
1337 }
1338 if (wakeup_refill_thread) {
1339 mteinfo_wake_fill_thread();
1340 }
1341 #endif /* HAS_MTE */
1342 #if CONFIG_SECLUDED_MEMORY
1343 if (need_wakeup_secluded) {
1344 vm_page_free_wakeup(&vm_page_free_wanted_secluded,
1345 need_wakeup_secluded);
1346 }
1347 #endif /* CONFIG_SECLUDED_MEMORY */
1348
1349 enable_preemption();
1350
1351 #undef DONATE_TO_WAITERS
1352 }
1353
1354 /*
1355 * @abstract
1356 * Given a list of pages, put each page on whichever global free queue is
1357 * appropriate.
1358 *
1359 * @discussion
 * Must be called with the VM free page lock unlocked (unless
 * VMP_RELEASE_STARTUP is passed, in which case the caller already holds it).
 *
 * The list must contain fewer than 255 elements.
1363 */
1364 #if HAS_MTE
1365 /*
1366 * To put it more bluntly: this will demux pages onto the free tag storage
1367 * queue or the global free queue, as appropriate. If we start freeing tagged
1368 * pages onto the free tagged queue, this function should be updated to deal
1369 * with that too.
1370 */
1371 #endif /* HAS_MTE */
1372 static void
vm_page_free_queue_enter_list(vm_page_list_t list, vmp_release_options_t opts)
1374 {
1375 bool page_queues_unlock = false;
1376 bool page_queues_locked = false;
1377 bool do_secluded = false;
1378 vmp_free_list_result_t result = { };
1379 vm_page_t mem;
1380
1381 LCK_MTX_ASSERT(&vm_page_queue_lock,
1382 (opts & VMP_RELEASE_Q_LOCKED)
1383 ? LCK_MTX_ASSERT_OWNED
1384 : LCK_MTX_ASSERT_NOTOWNED);
1385
1386 /*
1387 * Hibernation and startup do not really need the lock because
	 * these are single-threaded paths, so from the PoV of this function,
1389 * it's as if VMP_RELEASE_Q_LOCKED was passed.
1390 */
1391 page_queues_locked = (opts & (VMP_RELEASE_STARTUP |
1392 VMP_RELEASE_HIBERNATE |
1393 VMP_RELEASE_Q_LOCKED));
1394
1395 #if CONFIG_SECLUDED_MEMORY
1396 do_secluded = vm_page_secluded_pool_depleted();
1397 #if HAS_MTE
1398 if (do_secluded && list.vmpl_has_tagged &&
1399 (opts & VMP_RELEASE_Q_LOCKED) == 0) {
1400 /*
1401 * Try to do the untagging so that pages become eligible
		 * for the secluded pool while holding as few locks
		 * as possible.
1404 *
1405 * This does mean we shouldn't do this retyping if the page
1406 * queue lock is held for real. The only path doing this
1407 * right now is vm_page_free() which is one page at a time,
1408 * so it's probably "fine" to not contribute these to the
1409 * secluded pool.
1410 */
1411 const unified_page_list_t pmap_batch_list = {
1412 .page_slist = list.vmpl_head,
1413 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
1414 };
1415
1416 pmap_unmake_tagged_pages(&pmap_batch_list);
1417 vm_page_list_foreach(mem, list) {
1418 mem->vmp_using_mte = false;
1419 }
1420 list.vmpl_has_tagged = false;
1421 list.vmpl_has_untagged = true;
1422 }
1423 #endif /* HAS_MTE */
1424 #endif /* CONFIG_SECLUDED_MEMORY */
1425
1426 if (!page_queues_locked && (list.vmpl_has_realtime || do_secluded)) {
1427 vm_page_lock_queues();
1428 page_queues_locked = true;
1429 page_queues_unlock = true;
1430 }
1431
1432 if (opts & VMP_RELEASE_STARTUP) {
1433 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1434 } else {
1435 vm_free_page_lock_spin();
1436 }
1437
1438 vm_page_list_foreach_consume(mem, &list) {
1439 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
1440 vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
1441
1442 if (mem->vmp_realtime) {
1443 mem->vmp_realtime = false;
1444 VM_COUNTER_DEC(&vm_page_realtime_count);
1445 }
1446
1447 #if XNU_VM_HAS_LOPAGE
1448 if ((class == VM_MEMORY_CLASS_REGULAR ||
1449 class == VM_MEMORY_CLASS_LOPAGE) &&
1450 vm_lopage_refill &&
1451 vm_lopage_free_count < vm_lopage_free_limit &&
1452 pnum < max_valid_low_ppnum) {
1453 class = VM_MEMORY_CLASS_LOPAGE;
1454 } else {
1455 class = VM_MEMORY_CLASS_REGULAR;
1456 }
1457 #endif /* XNU_VM_HAS_LOPAGE */
1458
1459 #if CONFIG_SECLUDED_MEMORY
1460 /*
1461 * XXX FBDP TODO: also avoid refilling secluded queue
1462 * when some IOKit objects are already grabbing from it...
1463 */
1464 if (page_queues_locked &&
1465 vm_page_secluded_pool_eligible(class) &&
1466 vm_page_secluded_pool_depleted()) {
1467 class = VM_MEMORY_CLASS_SECLUDED;
1468 }
1469 #endif /* CONFIG_SECLUDED_MEMORY */
1470
1471 vm_page_free_queue_enter(class, mem, pnum);
1472
1473 switch (class) {
1474 case VM_MEMORY_CLASS_REGULAR:
1475 #if HAS_MTE
1476 if (is_mte_enabled && mteinfo_covered_page_taggable(pnum)) {
1477 result.vmpr_taggable++;
1478 break;
1479 }
1480 OS_FALLTHROUGH;
1481 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1482 #endif /* HAS_MTE */
1483 result.vmpr_regular++;
1484 break;
1485 #if HAS_MTE
1486 case VM_MEMORY_CLASS_TAGGED:
1487 result.vmpr_taggable++;
1488 break;
1489 case VM_MEMORY_CLASS_TAG_STORAGE:
1490 result.vmpr_tag_storage++;
1491 break;
1492 #endif /* HAS_MTE */
1493 #if XNU_VM_HAS_LOPAGE
1494 case VM_MEMORY_CLASS_LOPAGE:
1495 result.vmpr_lopage++;
1496 break;
1497 #endif /* XNU_VM_HAS_LOPAGE */
1498 #if CONFIG_SECLUDED_MEMORY
1499 case VM_MEMORY_CLASS_SECLUDED:
1500 result.vmpr_secluded++;
1501 continue;
1502 #endif /* CONFIG_SECLUDED_MEMORY */
1503 }
1504 }
1505
1506 if (page_queues_unlock) {
1507 vm_page_unlock_queues();
1508 }
1509
1510 vm_pageout_vminfo.vm_page_pages_freed += list.vmpl_count;
1511 VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE,
1512 DBG_FUNC_NONE, list.vmpl_count, 0, 0, 0);
1513
1514 if (opts & VMP_RELEASE_STARTUP) {
1515 /*
		 * Skip the VM_CHECK_MEMORYSTATUS on purpose:
		 * pmap_startup() will do it,
1518 * and the caller holds the free queue lock the whole time.
1519 */
1520 return;
1521 }
1522
1523 if (vm_page_free_queue_has_any_waiters()) {
1524 vm_page_free_queue_handle_wakeups_and_unlock(result);
1525 } else {
1526 vm_free_page_unlock();
1527 }
1528
1529 if ((opts & VMP_RELEASE_HIBERNATE) == 0) {
1530 /*
		 * When hibernating, skip VM_CHECK_MEMORYSTATUS here:
		 * hibernate_rebuild_vm_structs() will run it after the last flush.
1533 */
1534 VM_CHECK_MEMORYSTATUS;
1535 }
1536 }
1537
1538 __attribute__((always_inline))
1539 void
vm_page_free_queue_remove(
1541 vm_memory_class_t class,
1542 vm_page_t mem,
1543 ppnum_t pnum,
1544 vm_page_q_state_t q_state)
1545 {
1546 unsigned int color;
1547 vm_page_queue_t queue;
1548
1549 if (startup_phase >= STARTUP_SUB_KMEM) {
1550 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1551 }
1552
1553 mem->vmp_q_state = q_state;
1554
1555 #if HAS_MTE
1556 if (is_mte_enabled) {
1557 switch (class) {
1558 case VM_MEMORY_CLASS_REGULAR:
1559 return mteinfo_covered_page_set_used(pnum, false);
1560 case VM_MEMORY_CLASS_TAGGED:
1561 return mteinfo_covered_page_set_used(pnum, true);
1562 case VM_MEMORY_CLASS_TAG_STORAGE:
1563 return mteinfo_tag_storage_set_claimed(mem);
1564 default:
1565 break;
1566 }
1567 }
1568 #endif /* HAS_MTE */
1569
1570 color = VM_PAGE_GET_COLOR_PNUM(pnum);
1571 queue = vm_page_free_queue_for_class(class, color);
1572 #if CONFIG_SPTM
1573 if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) {
1574 uint32_t index = vm_pages_free_mask_index(pnum);
1575 int8_t bit = vm_pages_free_mask_bit(pnum);
1576
1577 vm_pages_free_mask_clear(index, bit);
1578 if (*vm_pages_free_enqueue_idx(index) == bit) {
1579 vm_page_queue_remove(queue, mem, vmp_pageq);
1580 bit = vm_pages_free_mask_next_bit(index, bit);
1581 *vm_pages_free_enqueue_idx(index) = bit;
1582
1583 if (bit != -1) {
1584 assert(vm_pages_free_mask_test(index, bit));
1585 pnum = (pnum & -MAX_COLORS) + bit;
1586 mem = vm_page_find_canonical(pnum);
1587 color = VM_PAGE_GET_COLOR_PNUM(pnum);
1588 queue = vm_page_free_queue_for_class(class, color);
1589 vm_page_queue_enter(queue, mem, vmp_pageq);
1590 }
1591 }
1592 } else
1593 #endif /* CONFIG_SPTM */
1594 {
1595 vm_page_queue_remove(queue, mem, vmp_pageq);
1596 }
1597
1598 switch (class) {
1599 case VM_MEMORY_CLASS_REGULAR:
1600 VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
1601 VM_COUNTER_DEC(&vm_page_free_count);
1602 break;
1603 #if HAS_MTE
1604 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1605 VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count);
1606 VM_COUNTER_DEC(&vm_page_free_unmanaged_tag_storage_count);
		/* these do not participate in the vm page free count */
1608 break;
1609 #endif /* HAS_MTE */
1610 #if XNU_VM_HAS_LOPAGE
1611 case VM_MEMORY_CLASS_LOPAGE:
1612 VM_COUNTER_DEC(&vm_lopage_free_count);
1613 vm_lopages_allocated_q += 1;
1614 if (vm_lopage_free_count < vm_lopage_lowater) {
1615 vm_lopage_refill = true;
1616 }
1617 break;
1618 #endif /* XNU_VM_HAS_LOPAGE */
1619 default:
1620 __builtin_unreachable();
1621 }
1622 }
1623
1624 vm_page_list_t
vm_page_free_queue_grab(
1626 vm_grab_options_t options __unused,
1627 vm_memory_class_t class,
1628 unsigned int num_pages,
1629 vm_page_q_state_t q_state)
1630 {
1631 unsigned int *colorp;
1632 unsigned int color;
1633 #if defined(__x86_64__)
1634 unsigned int clump_end = 1;
1635 unsigned int sub_count = 0;
1636 #endif /* __x86_64__ */
1637 vm_page_list_t list = { };
1638
1639 if (startup_phase >= STARTUP_SUB_KMEM) {
1640 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
1641 }
1642 assert(get_preemption_level() != 0);
1643 assert(q_state <= VM_PAGE_Q_STATE_LAST_VALID_VALUE);
1644
1645 #if HAS_MTE
1646 if (is_mte_enabled && class != VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
1647 return mteinfo_free_queue_grab(options, class, num_pages, q_state);
1648 }
1649 #endif /* HAS_MTE */
1650
1651 colorp = PERCPU_GET(start_color);
1652 color = *colorp;
1653
1654 /* Get the pages. */
1655 while (list.vmpl_count < num_pages) {
1656 uint32_t color_offset = 1;
1657 vm_page_queue_t queue;
1658 vm_page_t mem;
1659
1660 queue = vm_page_free_queue_for_class(class, color);
1661 if (!vm_page_free_queue_has_colors(class)) {
1662 assert(!vm_page_queue_empty(queue));
1663 color_offset = 0;
1664 }
1665 while (vm_page_queue_empty(queue)) {
1666 color = (color + 1) & vm_color_mask;
1667 queue = vm_page_free_queue_for_class(class, color);
1668 }
1669
1670 #if defined(__x86_64__)
1671 if (class == VM_MEMORY_CLASS_REGULAR) {
1672 /*
1673 * x86_64 uses a bespoke free queue scheme, where the free path
1674 * tries to cluster clumps of contiguous pages together on
1675 * the free queue to optimize for the platform's memory
1676 * controller.
1677 */
1678 vm_page_queue_remove_first_with_clump(queue, mem, clump_end);
1679 sub_count++;
1680 if (clump_end) {
1681 #if DEVELOPMENT || DEBUG
1682 vm_clump_update_stats(sub_count);
1683 #endif /* !DEVELOPMENT && !DEBUG */
1684 sub_count = 0;
1685 } else {
1686 /* Only change colors at the end of a clump. */
1687 color_offset = 0;
1688 }
1689 } else
1690 #endif /* !defined(__x86_64__) */
1691 {
1692 /* Other targets default to rotating colors after each pop. */
1693 vm_page_queue_remove_first(queue, mem, vmp_pageq);
1694 }
1695
1696 #if CONFIG_SPTM
1697 if (vm_pages_free_masks()) {
1698 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
1699 ppnum_t first_pnum = pnum & -MAX_COLORS;
1700 uint32_t index = vm_pages_free_mask_index(pnum);
1701 int8_t bit = vm_pages_free_mask_bit(pnum);
1702
1703 for (;;) {
1704 vm_pages_free_mask_clear(index, bit);
1705 mem->vmp_q_state = q_state;
1706 vm_page_list_push(&list, mem);
1707
1708 bit = (bit + 1) & (MAX_COLORS - 1);
1709
1710 if (!vm_pages_free_mask_test(index, bit) ||
1711 num_pages <= list.vmpl_count) {
1712 break;
1713 }
1714 mem = vm_page_find_canonical(first_pnum + bit);
1715 }
1716
1717 color = bit & vm_color_mask;
1718
1719 bit = vm_pages_free_mask_next_bit(index, bit);
1720 *vm_pages_free_enqueue_idx(index) = bit;
1721
1722 if (bit != -1) {
1723 assert(vm_pages_free_mask_test(index, bit));
1724 mem = vm_page_find_canonical(first_pnum + bit);
1725 queue = vm_page_free_queue_for_class(class,
1726 bit & vm_color_mask);
1727 vm_page_queue_enter_first(queue, mem, vmp_pageq);
1728 }
1729 } else
1730 #endif /* CONFIG_SPTM */
1731 {
1732 /* Set the page to the client's desired queue state. */
1733 mem->vmp_q_state = q_state;
1734 vm_page_list_push(&list, mem);
1735
1736 color = (color + color_offset) & vm_color_mask;
1737 }
1738 }
1739
1740 switch (class) {
1741 case VM_MEMORY_CLASS_REGULAR:
1742 VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
1743 VM_COUNTER_SUB(&vm_page_free_count, list.vmpl_count);
1744 break;
1745 #if HAS_MTE
1746 case VM_MEMORY_CLASS_DEAD_TAG_STORAGE:
1747 VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count);
1748 VM_COUNTER_SUB(&vm_page_free_unmanaged_tag_storage_count, list.vmpl_count);
		/* these do not participate in the vm page free count */
1750 break;
1751 #endif /* HAS_MTE */
1752 #if XNU_VM_HAS_LOPAGE
1753 case VM_MEMORY_CLASS_LOPAGE:
1754 VM_COUNTER_SUB(&vm_lopage_free_count, list.vmpl_count);
1755 vm_lopages_allocated_q += list.vmpl_count;
1756 if (vm_lopage_free_count < vm_lopage_lowater) {
1757 vm_lopage_refill = true;
1758 }
1759 break;
1760 #endif /* XNU_VM_HAS_LOPAGE */
1761 default:
1762 __builtin_unreachable();
1763 }
1764
1765 /* Record the next page color the CPU should try to get. */
1766 *colorp = color;
1767 #if defined(__x86_64__) && (DEVELOPMENT || DEBUG)
1768 vm_clump_update_stats(sub_count);
1769 #endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */
1770
1771 return list;
1772 }
1773
1774
1775 #define COLOR_GROUPS_TO_STEAL 4
1776
1777 /* Called once during startup, once the cache geometry is known. */
1779 static void
1780 vm_page_set_colors( void )
1781 {
1782 unsigned int n, override;
1783
1784 #if defined (__x86_64__)
1785 /* adjust #colors because we need to color outside the clump boundary */
1786 vm_cache_geometry_colors >>= vm_clump_shift;
1787 #endif
1788 if (PE_parse_boot_argn("colors", &override, sizeof(override))) { /* colors specified as a boot-arg? */
1789 n = override;
1790 } else if (vm_cache_geometry_colors) { /* do we know what the cache geometry is? */
1791 n = vm_cache_geometry_colors;
1792 } else {
1793 n = DEFAULT_COLORS; /* use default if all else fails */
1794 }
1795 if (n == 0) {
1796 n = 1;
1797 }
1798 if (n > MAX_COLORS) {
1799 n = MAX_COLORS;
1800 }
1801
1802 /* the count must be a power of 2 */
1803 if ((n & (n - 1)) != 0) {
1804 n = DEFAULT_COLORS; /* use default if all else fails */
1805 }
1806 vm_colors = n;
1807 vm_color_mask = n - 1;
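/*
 * Illustrative example (value assumed): with n == 8 colors, vm_color_mask is
 * 0x7, so the grab path's rotation "(color + 1) & vm_color_mask" cycles
 * through colors 0..7 before wrapping.
 */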
1808
1809 vm_free_magazine_refill_limit = vm_colors * COLOR_GROUPS_TO_STEAL;
1810
1811 #if defined (__x86_64__)
1812 /* adjust for reduction in colors due to clumping and multiple cores */
1813 if (real_ncpus) {
1814 vm_free_magazine_refill_limit *= (vm_clump_size * real_ncpus);
1815 }
1816 #endif
1817 }
1818
1819 #if XNU_VM_HAS_DELAYED_PAGES
1820
1821 static uint32_t vm_delayed_count = 0; /* when non-zero, indicates we may have more pages to init */
1822 static ppnum_t delay_above_pnum = PPNUM_MAX;
1823
1824 /*
1825 * On x86, the first 8 GB initializes quickly and gives us plenty of low memory plus memory above it to start with.
1826 * If ARM ever uses delayed page initialization, this value may need to be quite different.
1827 */
1828 #define DEFAULT_DELAY_ABOVE_PHYS_GB (8)
1829
1830 /*
1831 * When we have to dip into more delayed pages due to low memory, free up
1832 * a large chunk to get things back to normal. This avoids repeated contention
1833 * in the delayed-initialization code when allocating page by page.
1834 */
1835 #define VM_DELAY_PAGE_CHUNK ((1024 * 1024 * 1024) / PAGE_SIZE)
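/*
 * For example (page sizes for illustration only): this is 262,144 pages with
 * 4 KB pages, or 65,536 pages with 16 KB pages, i.e. roughly 1 GB worth of
 * pages is moved out of the delayed pool at a time.
 */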
1836
1837 /*
1838 * Get and initialize the next delayed page.
1839 */
1840 __attribute__((noinline))
1841 static vm_page_t
1842 vm_get_delayed_page(vm_grab_options_t grab_options)
1843 {
1844 vm_page_t p;
1845 ppnum_t pnum;
1846
1847 /*
1848 * Get a new page if we have one.
1849 */
1850 vm_free_page_lock();
1851 if (vm_delayed_count == 0) {
1852 vm_free_page_unlock();
1853 return NULL;
1854 }
1855
1856 if (!pmap_next_page(&pnum)) {
1857 vm_delayed_count = 0;
1858 vm_free_page_unlock();
1859 return NULL;
1860 }
1861
1862
1863 assert(vm_delayed_count > 0);
1864 --vm_delayed_count;
1865
1866 #if defined(__x86_64__)
1867 /* x86 cluster code requires increasing phys_page in vm_pages[] */
1868 if (vm_pages_count > 0) {
1869 assert(pnum > vm_page_get(vm_pages_count - 1)->vmp_phys_page);
1870 }
1871 #endif
1872 p = vm_page_get(vm_pages_count);
1873 assert(p < vm_pages_end);
1874 vm_page_init(p, pnum);
1875 ++vm_pages_count;
1876 ++vm_page_pages;
1877 vm_free_page_unlock();
1878
1879 /*
1880 * These pages were initially counted as wired, undo that now.
1881 */
1882 if (grab_options & VM_PAGE_GRAB_Q_LOCK_HELD) {
1883 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1884 } else {
1885 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
1886 vm_page_lockspin_queues();
1887 }
1888 --vm_page_wire_count;
1889 --vm_page_wire_count_initial;
1890 if (vm_page_wire_count_on_boot != 0) {
1891 --vm_page_wire_count_on_boot;
1892 }
1893 if (!(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
1894 vm_page_unlock_queues();
1895 }
1896
1897
1898 if (fillval) {
1899 fillPage(pnum, fillval);
1900 }
1901 return p;
1902 }
1903
1904 /*
1905 * Free all remaining delayed pages to the free lists.
1906 */
1907 void
1908 vm_free_delayed_pages(void)
1909 {
1910 vm_page_t p;
1911 vm_page_t list = NULL;
1912 uint_t cnt = 0;
1913 vm_offset_t start_free_va;
1914 int64_t free_size;
1915
1916 while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
1917 if (vm_himemory_mode) {
1918 vm_page_release(p, VMP_RELEASE_NONE);
1919 } else {
1920 p->vmp_snext = list;
1921 list = p;
1922 }
1923 ++cnt;
1924 }
1925
1926 /*
1927 * If not in himemory mode, free the pages in reverse order so that
1928 * the low memory pages end up first on the free lists (LIFO).
1929 */
1930 while (list != NULL) {
1931 p = list;
1932 list = p->vmp_snext;
1933 p->vmp_snext = NULL;
1934 vm_page_release(p, VMP_RELEASE_NONE);
1935 }
1936 #if DEVELOPMENT || DEBUG
1937 kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
1938 #endif
1939
1940 /*
1941 * Free up any unused full pages at the end of the vm_pages[] array
1942 */
1943 start_free_va = round_page((vm_offset_t)vm_page_get(vm_pages_count));
1944
1945 #if defined(__x86_64__)
1946 /*
1947 * Since x86 might have used large pages for vm_pages[], we can't
1948 * free starting in the middle of a partially used large page.
1949 */
1950 if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
1951 start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
1952 }
1953 #endif
1954 if (start_free_va < (vm_offset_t)vm_pages_end) {
1955 free_size = trunc_page((vm_offset_t)vm_pages_end - start_free_va);
1956 if (free_size > 0) {
1957 ml_static_mfree(start_free_va, (vm_offset_t)free_size);
1958 vm_pages_end = (void *)start_free_va;
1959
1960 /*
1961 * Note there's no locking here, as only this thread will ever change this value.
1962 * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
1963 */
1964 vm_page_stolen_count -= (free_size >> PAGE_SHIFT);
1965
1966 #if DEVELOPMENT || DEBUG
1967 kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
1968 (long)free_size, (long)start_free_va);
1969 #endif
1970 }
1971 }
1972 }
1973
1974 /*
1975 * Try to free up enough delayed pages to satisfy a contiguous memory allocation.
1976 */
1977 static void
1978 vm_free_delayed_pages_contig(
1979 uint_t npages,
1980 ppnum_t max_pnum,
1981 ppnum_t pnum_mask)
1982 {
1983 vm_page_t p;
1984 ppnum_t pnum;
1985 uint_t cnt = 0;
1986
1987 /*
1988 * Treat 0 as the absolute max page number.
1989 */
1990 if (max_pnum == 0) {
1991 max_pnum = PPNUM_MAX;
1992 }
1993
1994 /*
1995 * Free pages until we reach a properly aligned starting page.
1996 */
1997 for (;;) {
1998 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
1999 if (p == NULL) {
2000 return;
2001 }
2002 pnum = VM_PAGE_GET_PHYS_PAGE(p);
2003 vm_page_release(p, VMP_RELEASE_NONE);
2004 if (pnum >= max_pnum) {
2005 return;
2006 }
2007 if ((pnum & pnum_mask) == 0) {
2008 break;
2009 }
2010 }
2011
2012 /*
2013 * Having a healthy pool of free pages will help performance. We don't
2014 * want to fall back to the delayed code for every page allocation.
2015 */
2016 if (vm_page_free_count < VM_DELAY_PAGE_CHUNK) {
2017 npages += VM_DELAY_PAGE_CHUNK;
2018 }
2019
2020 /*
2021 * Now free up the pages
2022 */
2023 for (cnt = 1; cnt < npages; ++cnt) {
2024 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
2025 if (p == NULL) {
2026 return;
2027 }
2028 vm_page_release(p, VMP_RELEASE_NONE);
2029 }
2030 }
2031
2032 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2033
2034 #define ROUNDUP_NEXTP2(X) (1U << (32 - __builtin_clz((X) - 1)))
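/*
 * For example, ROUNDUP_NEXTP2(5) == 8 and ROUNDUP_NEXTP2(8) == 8. Note that
 * the argument must be greater than 1: for X == 1, (X) - 1 is 0 and
 * __builtin_clz(0) is undefined.
 */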
2035
2036 void
2037 vm_page_init_local_q(unsigned int num_cpus)
2038 {
2039 struct vpl *t_local_q;
2040
2041 /*
2042 * no point in this for a uni-processor system
2043 */
2044 if (num_cpus >= 2) {
2045 ml_cpu_info_t cpu_info;
2046
2047 /*
2048 * Force the allocation alignment to a cacheline,
2049 * because the `vpl` struct has a lock that will be taken
2050 * cross-CPU, and we want to isolate the rest of the per-CPU
2051 * data from it to avoid false sharing when that lock is taken.
2052 */
2053
2054 ml_cpu_get_info(&cpu_info);
2055
2056 t_local_q = zalloc_percpu_permanent(sizeof(struct vpl),
2057 cpu_info.cache_line_size - 1);
2058
2059 zpercpu_foreach(lq, t_local_q) {
2060 VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
2061 vm_page_queue_init(&lq->vpl_queue);
2062 }
2063
2064 /* make the initialization visible to all cores */
2065 os_atomic_store(&vm_page_local_q, t_local_q, release);
2066 }
2067 }
2068
2069 /*
2070 * vm_init_before_launchd
2071 *
2072 * This should be called right before launchd is loaded.
2073 */
2074 void
2075 vm_init_before_launchd(void)
2076 {
2077 vm_page_lockspin_queues();
2078 vm_page_wire_count_on_boot = vm_page_wire_count;
2079 vm_page_unlock_queues();
2080 }
2081
2082
2083 /*
2084 * vm_page_bootstrap:
2085 *
2086 * Initializes the resident memory module.
2087 *
2088 * Allocates memory for the page cells, and
2089 * for the object/offset-to-page hash table headers.
2090 * Each page cell is initialized and placed on the free list.
2091 * Returns the range of available kernel virtual memory.
2092 */
2093 __startup_func
2094 void
2095 vm_page_bootstrap(
2096 vm_offset_t *startp,
2097 vm_offset_t *endp)
2098 {
2099 unsigned int i;
2100 unsigned int log1;
2101 unsigned int log2;
2102 unsigned int size;
2103
2104 /*
2105 * Initialize the page queues.
2106 */
2107
2108 lck_mtx_init(&vm_page_queue_free_lock, &vm_page_lck_grp_free, &vm_page_lck_attr);
2109 lck_mtx_init(&vm_page_queue_lock, &vm_page_lck_grp_queue, &vm_page_lck_attr);
2110 lck_mtx_init(&vm_purgeable_queue_lock, &vm_page_lck_grp_purge, &vm_page_lck_attr);
2111
2112 for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
2113 int group;
2114
2115 purgeable_queues[i].token_q_head = 0;
2116 purgeable_queues[i].token_q_tail = 0;
2117 for (group = 0; group < NUM_VOLATILE_GROUPS; group++) {
2118 queue_init(&purgeable_queues[i].objq[group]);
2119 }
2120
2121 purgeable_queues[i].type = i;
2122 purgeable_queues[i].new_pages = 0;
2123 #if MACH_ASSERT
2124 purgeable_queues[i].debug_count_tokens = 0;
2125 purgeable_queues[i].debug_count_objects = 0;
2126 #endif
2127 }
2129 purgeable_nonvolatile_count = 0;
2130 queue_init(&purgeable_nonvolatile_queue);
2131
2132 vm_page_free_queue_init(&vm_page_queue_free);
2133 #if XNU_VM_HAS_LOPAGE
2134 vm_page_queue_init(&vm_lopage_queue_free);
2135 #endif /* XNU_VM_HAS_LOPAGE */
2136 vm_page_queue_init(&vm_page_queue_active);
2137 vm_page_queue_init(&vm_page_queue_inactive);
2138 #if CONFIG_SECLUDED_MEMORY
2139 vm_page_queue_init(&vm_page_queue_secluded);
2140 #endif /* CONFIG_SECLUDED_MEMORY */
2141 vm_page_queue_init(&vm_page_queue_cleaned);
2142 vm_page_queue_init(&vm_page_queue_throttled);
2143 vm_page_queue_init(&vm_page_queue_anonymous);
2144 queue_init(&vm_objects_wired);
2145
2146 for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
2147 vm_page_queue_init(&vm_page_queue_speculative[i].age_q);
2148
2149 vm_page_queue_speculative[i].age_ts.tv_sec = 0;
2150 vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
2151 }
2152
2153 vm_page_queue_init(&vm_page_queue_donate);
2154 vm_page_queue_init(&vm_page_queue_background);
2155
2156 vm_page_background_count = 0;
2157 vm_page_background_internal_count = 0;
2158 vm_page_background_external_count = 0;
2159 vm_page_background_promoted_count = 0;
2160
2161 vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25);
2162
2163 if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) {
2164 vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX;
2165 }
2166
2167 #if defined(__LP64__)
2168 vm_page_background_mode = VM_PAGE_BG_ENABLED;
2169 vm_page_donate_mode = VM_PAGE_DONATE_ENABLED;
2170 #else
2171 vm_page_background_mode = VM_PAGE_BG_DISABLED;
2172 vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2173 #endif
2174 vm_page_background_exclude_external = 0;
2175
2176 PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode));
2177 PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external));
2178 PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target));
2179
2180 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && vm_page_background_mode != VM_PAGE_BG_ENABLED) {
2181 vm_page_background_mode = VM_PAGE_BG_DISABLED;
2182 }
2183
2184 PE_parse_boot_argn("vm_page_donate_mode", &vm_page_donate_mode, sizeof(vm_page_donate_mode));
2185 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED && vm_page_donate_mode != VM_PAGE_DONATE_ENABLED) {
2186 vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
2187 }
2188
2189 vm_page_donate_target_high = VM_PAGE_DONATE_TARGET_HIGHWATER;
2190 vm_page_donate_target_low = VM_PAGE_DONATE_TARGET_LOWWATER;
2191 vm_page_donate_target = vm_page_donate_target_high;
2192 vm_page_donate_count = 0;
2193
2194 vm_page_free_wanted = 0;
2195 vm_page_free_wanted_privileged = 0;
2196 #if CONFIG_SECLUDED_MEMORY
2197 vm_page_free_wanted_secluded = 0;
2198 #endif /* CONFIG_SECLUDED_MEMORY */
2199
2200 #if defined (__x86_64__)
2201 /* this must be called before vm_page_set_colors() */
2202 vm_page_setup_clump();
2203 #endif
2204
2205 vm_page_set_colors();
2206
2207 for (vm_tag_t t = 0; t < VM_KERN_MEMORY_FIRST_DYNAMIC; t++) {
2208 vm_allocation_sites_static[t].refcount = 2;
2209 vm_allocation_sites_static[t].tag = t;
2210 vm_allocation_sites[t] = &vm_allocation_sites_static[t];
2211 }
2212 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].refcount = 2;
2213 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].tag = VM_KERN_MEMORY_ANY;
2214 vm_allocation_sites[VM_KERN_MEMORY_ANY] = &vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC];
2215
2216 /*
2217 * Steal memory for the map and zone subsystems.
2218 *
2219 * Make sure initialize_ram_ranges() has run before we steal pages for the first time on ARM.
2220 */
2221 (void)pmap_free_pages();
2222
2223 kernel_startup_initialize_upto(STARTUP_SUB_PMAP_STEAL);
2224
2225 /*
2226 * Allocate (and initialize) the virtual-to-physical
2227 * table hash buckets.
2228 *
2229 * The number of buckets should be a power of two to
2230 * get a good hash function. The following computation
2231 * chooses the first power of two that is greater
2232 * than the number of physical pages in the system.
2233 */
2234
2235 if (vm_page_bucket_count == 0) {
2236 unsigned int npages = pmap_free_pages();
2237
2238 vm_page_bucket_count = 1;
2239 while (vm_page_bucket_count < npages) {
2240 vm_page_bucket_count <<= 1;
2241 }
2242 }
2243 vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
2244
2245 vm_page_hash_mask = vm_page_bucket_count - 1;
2246
2247 /*
2248 * Calculate object shift value for hashing algorithm:
2249 * O = log2(sizeof(struct vm_object))
2250 * B = log2(vm_page_bucket_count)
2251 * hash shifts the object left by
2252 * B/2 - O + 1
2253 */
2254 size = vm_page_bucket_count;
2255 for (log1 = 0; size > 1; log1++) {
2256 size /= 2;
2257 }
2258 size = sizeof(struct vm_object);
2259 for (log2 = 0; size > 1; log2++) {
2260 size /= 2;
2261 }
2262 vm_page_hash_shift = log1 / 2 - log2 + 1;
2263
2264 vm_page_bucket_hash = 1 << ((log1 + 1) >> 1); /* Get (ceiling of sqrt of table size) */
2265 vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2); /* Get (ceiling of quadroot of table size) */
2266 vm_page_bucket_hash |= 1; /* Set bit and add 1 - always must be 1 to insure unique series */
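/*
 * Worked example (values assumed purely for illustration): with
 * vm_page_bucket_count == 0x10000, log1 == 16, so vm_page_bucket_hash ==
 * (1 << 8) | (1 << 4) | 1 == 0x111; and if sizeof(struct vm_object) were
 * 256 bytes, log2 == 8 and vm_page_hash_shift == 16 / 2 - 8 + 1 == 1.
 */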
2267
2268 if (vm_page_hash_mask & vm_page_bucket_count) {
2269 printf("vm_page_bootstrap: WARNING -- strange page hash\n");
2270 }
2271
2272 #if VM_PAGE_BUCKETS_CHECK
2273 #if VM_PAGE_FAKE_BUCKETS
2274 /*
2275 * Allocate a decoy set of page buckets, to detect
2276 * any stomping there.
2277 */
2278 vm_page_fake_buckets = (vm_page_bucket_t *)
2279 pmap_steal_memory(vm_page_bucket_count *
2280 sizeof(vm_page_bucket_t), 0);
2281 vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
2282 vm_page_fake_buckets_end =
2283 vm_map_round_page((vm_page_fake_buckets_start +
2284 (vm_page_bucket_count *
2285 sizeof(vm_page_bucket_t))),
2286 PAGE_MASK);
2287 char *cp;
2288 for (cp = (char *)vm_page_fake_buckets_start;
2289 cp < (char *)vm_page_fake_buckets_end;
2290 cp++) {
2291 *cp = 0x5a;
2292 }
2293 #endif /* VM_PAGE_FAKE_BUCKETS */
2294 #endif /* VM_PAGE_BUCKETS_CHECK */
2295
2296 kernel_debug_string_early("vm_page_buckets");
2297 vm_page_buckets = (vm_page_bucket_t *)
2298 pmap_steal_memory(vm_page_bucket_count *
2299 sizeof(vm_page_bucket_t), 0);
2300
2301 kernel_debug_string_early("vm_page_bucket_locks");
2302 vm_page_bucket_locks = (lck_ticket_t *)
2303 pmap_steal_memory(vm_page_bucket_lock_count *
2304 sizeof(lck_ticket_t), 0);
2305
2306 for (i = 0; i < vm_page_bucket_count; i++) {
2307 vm_page_bucket_t *bucket = &vm_page_buckets[i];
2308
2309 bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL);
2310 #if MACH_PAGE_HASH_STATS
2311 bucket->cur_count = 0;
2312 bucket->hi_count = 0;
2313 #endif /* MACH_PAGE_HASH_STATS */
2314 }
2315
2316 for (i = 0; i < vm_page_bucket_lock_count; i++) {
2317 lck_ticket_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket);
2318 }
2319
2320 vm_tag_init();
2321
2322 #if VM_PAGE_BUCKETS_CHECK
2323 vm_page_buckets_check_ready = TRUE;
2324 #endif /* VM_PAGE_BUCKETS_CHECK */
2325
2326 /*
2327 * Machine-dependent code allocates the resident page table.
2328 * It uses vm_page_init to initialize the page frames.
2329 * The code also returns to us the virtual space available
2330 * to the kernel. We don't trust the pmap module
2331 * to get the alignment right.
2332 */
2333
2334 kernel_debug_string_early("pmap_startup");
2335 pmap_startup(&virtual_space_start, &virtual_space_end);
2336 virtual_space_start = round_page(virtual_space_start);
2337 virtual_space_end = trunc_page(virtual_space_end);
2338
2339 *startp = virtual_space_start;
2340 *endp = virtual_space_end;
2341
2342 /*
2343 * Compute the initial "wire" count.
2344 * Up until now, the pages which have been set aside are not under
2345 * the VM system's control, so although they aren't explicitly
2346 * wired, they nonetheless can't be moved. At this moment,
2347 * all VM managed pages are "free", courtesy of pmap_startup.
2348 */
2349 assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
2350 vm_page_wire_count = ((unsigned int) atop_64(max_mem)) -
2351 vm_page_free_count - vm_lopage_free_count;
2352 #if CONFIG_SECLUDED_MEMORY
2353 vm_page_wire_count -= vm_page_secluded_count;
2354 #endif
2355 #if HAS_MTE
2356 /*
2357 * Discount any tag storage pages that we have set aside in
2358 * vm_page_release_startup().
2359 */
2360 vm_page_wire_count -= mte_tag_storage_count;
2361 #endif
2362 vm_page_wire_count_initial = vm_page_wire_count;
2363
2364 /* capture this for later use */
2365 booter_size = ml_get_booter_memory_size();
2366
2367 printf("vm_page_bootstrap: %d free pages, %d wired pages"
2368 #if XNU_VM_HAS_DELAYED_PAGES
2369 ", (up to %d of which are delayed free)"
2370 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2371 "%c",
2372 vm_page_free_count,
2373 vm_page_wire_count,
2374 #if XNU_VM_HAS_DELAYED_PAGES
2375 vm_delayed_count,
2376 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2377 '\n');
2378
2379 kernel_debug_string_early("vm_page_bootstrap complete");
2380 }
2381
2382 #ifndef MACHINE_PAGES
2383 /*
2384 * This is the early boot time allocator for data structures needed to bootstrap the VM system.
2385 * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
2386 * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
2387 */
2388 __static_testable void *
2389 pmap_steal_memory_internal(
2390 vm_size_t size,
2391 vm_size_t alignment,
2392 boolean_t might_free,
2393 unsigned int flags,
2394 pmap_mapping_type_t mapping_type)
2395 {
2396 kern_return_t kr;
2397 vm_offset_t addr;
2398 vm_offset_t end = 0;
2399 vm_offset_t map_addr;
2400 ppnum_t phys_page;
2401 unsigned int pmap_flags;
2402
2403 if (size > UINT64_MAX - sizeof(void *)) {
2404 panic("pmap_steal_memory(): size: 0x%lx", size);
2405 }
2406 /*
2407 * Size needs to be aligned to word size.
2408 */
2409 size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
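/* e.g. on LP64 (sizeof(void *) == 8), a 13-byte request rounds up to 16 bytes here. */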
2410
2411 /*
2412 * Alignment defaults to word size if not specified.
2413 */
2414 if (alignment == 0) {
2415 alignment = sizeof(void*);
2416 }
2417
2418 /*
2419 * Alignment must be no greater than a page and must be a power of two.
2420 */
2421 assert(alignment <= PAGE_SIZE);
2422 assert((alignment & (alignment - 1)) == 0);
2423
2424 /*
2425 * On the first call, get the initial values for virtual address space
2426 * and page align them.
2427 */
2428 if (virtual_space_start == virtual_space_end) {
2429 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
2430 virtual_space_start = round_page(virtual_space_start);
2431 virtual_space_end = trunc_page(virtual_space_end);
2432
2433 #if defined(__x86_64__)
2434 /*
2435 * Release remaining unused section of preallocated KVA and the 4K page tables
2436 * that map it. This makes the VA available for large page mappings.
2437 */
2438 Idle_PTs_release(virtual_space_start, virtual_space_end);
2439 #endif
2440 }
2441
2442 /*
2443 * Allocate the virtual space for this request. On x86, we'll align to a large page
2444 * address if the size is big enough to be backed by at least one large page.
2445 */
2446 #if defined(__x86_64__)
2447 if (size >= I386_LPGBYTES) {
2448 virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
2449 }
2450 #endif
2451 virtual_space_start = (virtual_space_start + (alignment - 1)) & ~(alignment - 1);
2452 addr = virtual_space_start;
2453 virtual_space_start += size;
2454
2455 //kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */
2456
2457 /*
2458 * Allocate and map physical pages to back the new virtual space.
2459 */
2460 map_addr = round_page(addr);
2461 if (os_add_overflow(addr, size, &end)) {
2462 panic("pmap_steal_memory() overflow, addr: %lx, size: 0x%lx", addr, size);
2463 }
2464 while (map_addr < end) {
2465 #if defined(__x86_64__)
2466 /*
2467 * Back with a large page if properly aligned on x86
2468 */
2469 if ((map_addr & I386_LPGMASK) == 0 &&
2470 map_addr + I386_LPGBYTES <= addr + size &&
2471 pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
2472 pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
2473 kr = pmap_enter(kernel_pmap, map_addr, phys_page,
2474 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
2475 VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE, mapping_type);
2476
2477 if (kr != KERN_SUCCESS) {
2478 panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
2479 (unsigned long)map_addr, phys_page);
2480 }
2481 map_addr += I386_LPGBYTES;
2482 vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
2483 vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
2484 vm_page_kern_lpage_count++;
2485 continue;
2486 }
2487 #endif
2488
2489 if (!pmap_next_page_hi(&phys_page, might_free)) {
2490 panic("pmap_steal_memory() size: 0x%llx", (uint64_t)size);
2491 }
2492
2493 #if defined(__x86_64__)
2494 pmap_pre_expand(kernel_pmap, map_addr);
2495 #endif
2496 pmap_flags = flags ? flags : VM_WIMG_USE_DEFAULT;
2497
2498 #if HAS_MTE
2499 if (pmap_flags & VM_MEM_MAP_MTE) {
2500 mteinfo_covered_page_set_stolen_tagged(phys_page);
2501 pmap_make_tagged_page(phys_page);
2502 }
2503 #endif /* HAS_MTE */
2504
2505 kr = pmap_enter(kernel_pmap, map_addr, phys_page,
2506 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
2507 pmap_flags, FALSE, mapping_type);
2508
2509 if (kr != KERN_SUCCESS) {
2510 panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
2511 (unsigned long)map_addr, phys_page);
2512 }
2513 map_addr += PAGE_SIZE;
2514
2515 /*
2516 * Account for newly stolen memory
2517 */
2518 vm_page_wire_count++;
2519 vm_page_stolen_count++;
2520 }
2521
2522 #if defined(__x86_64__)
2523 /*
2524 * The call with might_free is currently the last use of pmap_steal_memory*().
2525 * Notify the pmap layer to record which high pages were allocated so far.
2526 */
2527 if (might_free) {
2528 pmap_hi_pages_done();
2529 }
2530 #endif
2531 #if KASAN
2532 kasan_notify_address(round_page(addr), size);
2533 #endif
2534 return (void *) addr;
2535 }
2536
2537 __mockable void *
2538 pmap_steal_memory(
2539 vm_size_t size,
2540 vm_size_t alignment)
2541 {
2542 return pmap_steal_memory_internal(size, alignment, FALSE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
2543 }
2544
2545 void *
2546 pmap_steal_freeable_memory(
2547 vm_size_t size)
2548 {
2549 return pmap_steal_memory_internal(size, 0, TRUE, 0, PMAP_MAPPING_TYPE_RESTRICTED);
2550 }
2551
2552 #if HAS_MTE
2553 void *
2554 pmap_steal_zone_memory(
2555 vm_size_t size,
2556 vm_size_t alignment)
2557 {
2558 return pmap_steal_memory_internal(size, alignment, FALSE, VM_MEM_MAP_MTE, PMAP_MAPPING_TYPE_RESTRICTED);
2559 }
2560 #endif /* HAS_MTE */
2561
2562
2563 #if CONFIG_SECLUDED_MEMORY
2564 /* boot-args to control secluded memory */
2565 TUNABLE_DT(unsigned int, secluded_mem_mb, "/defaults", "kern.secluded_mem_mb", "secluded_mem_mb", 0, TUNABLE_DT_NONE);
2566 /* IOKit can use secluded memory */
2567 TUNABLE(bool, secluded_for_iokit, "secluded_for_iokit", true);
2568 /* apps can use secluded memory */
2569 TUNABLE(bool, secluded_for_apps, "secluded_for_apps", true);
2570 /* filecache can use secluded memory */
2571 TUNABLE(secluded_filecache_mode_t, secluded_for_filecache, "secluded_for_filecache", SECLUDED_FILECACHE_RDONLY);
2572 uint64_t secluded_shutoff_trigger = 0;
2573 uint64_t secluded_shutoff_headroom = 150 * 1024 * 1024; /* original value from N56 */
2574 #endif /* CONFIG_SECLUDED_MEMORY */
2575
2576
2577 #if defined(__arm64__)
2578 extern void patch_low_glo_vm_page_info(void *, void *, uint32_t);
2579 #endif
2580
2581 void vm_page_release_startup(vm_page_t mem);
2582 __mockable void
2583 pmap_startup(
2584 vm_offset_t *startp,
2585 vm_offset_t *endp)
2586 {
2587 unsigned int npages;
2588 ppnum_t phys_page;
2589 uint64_t mem_sz;
2590 uint64_t start_ns;
2591 uint64_t now_ns;
2592 uint32_t divisor;
2593 #if XNU_VM_HAS_DELAYED_PAGES
2594 uint_t low_page_count = 0;
2595 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2596
2597 /*
2598 * make sure we are aligned on a 64 byte boundary
2599 * for VM_PAGE_PACK_PTR (it clips off the low-order
2600 * 6 bits of the pointer)
2601 */
2602 if (virtual_space_start != virtual_space_end) {
2603 virtual_space_start = round_page(virtual_space_start);
2604 }
2605
2606 /*
2607 * We calculate how many page frames we will have
2608 * and then allocate the page structures in one chunk.
2609 *
2610 * Note that the calculation here doesn't take into account
2611 * the memory needed to map what's being allocated, i.e. the page
2612 * table entries. So the actual number of pages we get will be
2613 * less than this. To do someday: include that in the computation.
2614 *
2615 * Also, for ARM we don't use the count of free_pages, but rather the
2616 * range from the last page to the first page (ignoring holes due to retired pages).
2617 */
2618
2619 /*
2620 * Initialize and release the page frames.
2621 */
2622 kernel_debug_string_early("page_frame_init");
2623 absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
2624 if (fillval) {
2625 kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
2626 }
2627
2628 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2629 mem_sz = ptoa(pmap_free_pages_span());
2630 #if HAS_MTE
2631 if (!is_mte_enabled)
2632 #endif /* HAS_MTE */
2633 #if CONFIG_SPTM
2634 {
2635 uint32_t count = vm_pages_free_mask_len();
2636
2637 _vm_pages_free_masks = pmap_steal_memory(count *
2638 sizeof(__uint128_t), sizeof(__uint128_t));
2639 _vm_pages_free_enqueue_idx = pmap_steal_memory(count, sizeof(uint8_t));
2640 bzero(_vm_pages_free_masks, count * sizeof(__uint128_t));
2641 memset(_vm_pages_free_enqueue_idx, 0xff, count);
2642 }
2643 #endif /* CONFIG_SPTM */
2644 #else
2645 mem_sz = ptoa(pmap_free_pages());
2646 #endif
2647 mem_sz += round_page(virtual_space_start) - virtual_space_start; /* Account for any slop */
2648 divisor = PAGE_SIZE + sizeof(struct vm_page);
2649 npages = (uint32_t)((mem_sz + divisor - 1) / divisor); /* scaled to include the vm_page_ts */
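/*
 * Illustrative sketch of the sizing math (numbers assumed, not actual sizes):
 * if mem_sz were 4 GB, PAGE_SIZE 16 KB and sizeof(struct vm_page) 80 bytes,
 * divisor would be 16464 and npages about 260,871; i.e. the array is sized so
 * that the pages plus their vm_page structures together fit within mem_sz.
 */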
2650
2651
2652 vm_pages = pmap_steal_freeable_memory(npages * sizeof(struct vm_page));
2653 vm_pages_end = vm_page_get(npages);
2654
2655 #if CONFIG_SECLUDED_MEMORY
2656 /*
2657 * Figure out how much secluded memory to have before we start
2658 * releasing pages to the free lists.
2659 * The default, if specified nowhere else, is no secluded mem.
2660 */
2661 vm_page_secluded_target = (unsigned int)atop_64(secluded_mem_mb * 1024ULL * 1024ULL);
2662
2663 /*
2664 * Allow a really large app to effectively use secluded memory until it exits.
2665 */
2666 if (vm_page_secluded_target != 0) {
2667 /*
2668 * Get an amount from boot-args, else use 1/2 of max_mem.
2669 * 1/2 max_mem was chosen from a Peace daemon tentpole test which
2670 * used munch to induce jetsam thrashing of false idle daemons on N56.
2671 */
2672 int secluded_shutoff_mb;
2673 if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
2674 sizeof(secluded_shutoff_mb))) {
2675 secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
2676 } else {
2677 secluded_shutoff_trigger = max_mem / 2;
2678 }
2679
2680 /* ensure the headroom value is sensible and avoid underflows */
2681 assert(secluded_shutoff_trigger == 0 || secluded_shutoff_trigger > secluded_shutoff_headroom);
2682 }
2683 #endif /* CONFIG_SECLUDED_MEMORY */
2684
2685 #if defined(__x86_64__)
2686
2687 /*
2688 * Decide how much memory we delay freeing at boot time.
2689 */
2690 uint32_t delay_above_gb;
2691 if (!PE_parse_boot_argn("delay_above_gb", &delay_above_gb, sizeof(delay_above_gb))) {
2692 delay_above_gb = DEFAULT_DELAY_ABOVE_PHYS_GB;
2693 }
2694
2695 if (delay_above_gb == 0) {
2696 delay_above_pnum = PPNUM_MAX;
2697 } else {
2698 delay_above_pnum = delay_above_gb * (1024 * 1024 * 1024 / PAGE_SIZE);
2699 }
2700
2701 /* make sure we have sane breathing room: 1G above low memory */
2702 if (delay_above_pnum <= max_valid_low_ppnum) {
2703 delay_above_pnum = max_valid_low_ppnum + ((1024 * 1024 * 1024) >> PAGE_SHIFT);
2704 }
2705
2706 if (delay_above_pnum < PPNUM_MAX) {
2707 printf("pmap_startup() delaying init/free of page nums > 0x%x\n", delay_above_pnum);
2708 }
2709
2710 #endif /* defined(__x86_64__) */
2711
2712
2713 vm_free_page_lock();
2714
2715 for (uint32_t i = 0; i < npages && pmap_next_page(&phys_page); i++) {
2716 #if XNU_VM_HAS_DELAYED_PAGES
2717 if (phys_page < max_valid_low_ppnum) {
2718 ++low_page_count;
2719 }
2720
2721 /* Are we at high enough pages to delay the rest? */
2722 if (low_page_count > vm_lopage_free_limit &&
2723 phys_page > delay_above_pnum) {
2724 vm_delayed_count = pmap_free_pages();
2725 assert3u(vm_pages_count + vm_delayed_count, <=, npages);
2726 break;
2727 }
2728 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2729
2730 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2731 if (i == 0) {
2732 vm_pages_first_pnum = phys_page;
2733 patch_low_glo_vm_page_info(vm_pages, vm_pages_end,
2734 vm_pages_first_pnum);
2735 #if HAS_MTE
2736 if (is_mte_enabled) {
2737 vm_pages_tag_storage = vm_page_get(
2738 (mte_tag_storage_start_pnum - vm_pages_first_pnum));
2739 vm_pages_tag_storage_end = vm_tag_storage_page_get(mte_tag_storage_count);
2740 assert3p(vm_pages_tag_storage_end, <=, vm_pages_end);
2741 }
2742 #endif /* HAS_MTE */
2743 }
2744 #else
2745 /* The x86 clump freeing code requires increasing ppn's to work correctly */
2746 if (i > 0) {
2747 assert(phys_page > vm_page_get(i - 1)->vmp_phys_page);
2748 }
2749 #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */
2750
2751 ++vm_pages_count;
2752 vm_page_init(vm_page_get(i), phys_page);
2753 if (fillval) {
2754 fillPage(phys_page, fillval);
2755 }
2756 if (vm_himemory_mode) {
2757 vm_page_release_startup(vm_page_get(i));
2758 }
2759 }
2760
2761 vm_page_pages = vm_pages_count; /* used to report to user space */
2762
2763 if (!vm_himemory_mode) {
2764 for (uint32_t i = npages; i-- > 0;) {
2765 /* skip retired pages */
2766 if (!VMP_ERROR_GET(vm_page_get(i))) {
2767 vm_page_release_startup(vm_page_get(i));
2768 }
2769 }
2770 }
2771
2772 vm_free_page_unlock();
2773
2774 absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns);
2775 printf("pmap_startup() init/release time: %lld microsec\n",
2776 (now_ns - start_ns) / NSEC_PER_USEC);
2777 #if XNU_VM_HAS_DELAYED_PAGES
2778 printf("pmap_startup() delayed init/release of %d pages\n",
2779 vm_delayed_count);
2780 #endif /* XNU_VM_HAS_DELAYED_PAGES */
2781
2782 /*
2783 * Validate packing will work properly. This needs to be done last
2784 * after vm_pages_count has been computed.
2785 */
2786 if (npages >= VM_PAGE_PACKED_FROM_ARRAY) {
2787 panic("pmap_startup(): too many pages to support vm_page packing");
2788 }
2789 if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_pages)) != vm_pages) {
2790 panic("VM_PAGE_PACK_PTR failed on vm_pages - %p", vm_pages);
2791 }
2792 if ((vm_page_t)VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(vm_page_get(vm_pages_count - 1))) !=
2793 vm_page_get(vm_pages_count - 1)) {
2794 panic("VM_PAGE_PACK_PTR failed on vm_pages_end - %p",
2795 vm_page_get(vm_pages_count - 1));
2796 }
2797
2798 VM_CHECK_MEMORYSTATUS;
2799
2800 /*
2801 * We have to re-align virtual_space_start,
2802 * because pmap_steal_memory has been using it.
2803 */
2804 virtual_space_start = round_page(virtual_space_start);
2805 *startp = virtual_space_start;
2806 *endp = virtual_space_end;
2807 }
2808 #endif /* MACHINE_PAGES */
2809
2810 /*
2811 * Create the zone that represents the vm_pages[] array. Nothing ever allocates
2812 * from or frees to this zone. It's just here for reporting purposes via the zprint command.
2813 * This needs to be done after all initially delayed pages are put on the free lists.
2814 */
2815 void
2816 vm_pages_array_finalize(void)
2817 {
2818 (void)zone_create_ext("vm pages array", sizeof(struct vm_page),
2819 ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, ZONE_ID_VM_PAGES, ^(zone_t z) {
2820 uint64_t vm_page_zone_pages, vm_page_array_zone_data_size;
2821
2822 zone_set_exhaustible(z, 0, true);
2823 /*
2824 * Reflect size and usage information for vm_pages[].
2825 */
2826
2827 z->z_elems_avail = (uint32_t)(vm_pages_end - vm_pages);
2828 z->z_elems_free = z->z_elems_avail - vm_pages_count;
2829 zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
2830 vm_pages_count * sizeof(struct vm_page);
2831 vm_page_array_zone_data_size = (uint64_t)vm_pages_end - (uint64_t)vm_pages;
2832 vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
2833 z->z_wired_cur += vm_page_zone_pages;
2834 z->z_wired_hwm = z->z_wired_cur;
2835 z->z_va_cur = z->z_wired_cur;
2836 /* since zone accounts for these, take them out of stolen */
2837 VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
2838 });
2839 }
2840
2841 /*
2842 * Create the vm_pages zone. This is used for the vm_page structures for the pages
2843 * that are scavenged from other boot-time usages by ml_static_mfree(). As such,
2844 * this needs to happen in early VM bootstrap.
2845 */
2846
2847 __startup_func
2848 static void
2849 vm_page_module_init(void)
2850 {
2851 vm_size_t vm_page_with_ppnum_size;
2852
2853 /*
2854 * Since the pointers to elements in this zone will be packed, the element
2855 * size must be rounded up to the packed-pointer alignment, not strictly what sizeof() reports.
2856 */
2857 vm_page_with_ppnum_size =
2858 (sizeof(struct vm_page_with_ppnum) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
2859 ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
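/*
 * This is the usual round-up-to-alignment idiom: for example, if the struct
 * were 76 bytes and VM_PAGE_PACKED_PTR_ALIGNMENT were 64 (example values
 * only), the rounded element size would be (76 + 63) & ~63 == 128.
 */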
2860
2861 vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
2862 ZC_ALIGNMENT_REQUIRED | ZC_VM,
2863 ZONE_ID_ANY, ^(zone_t z) {
2864 /*
2865 * The number "10" is a small number that is larger than the number
2866 * of fictitious pages that any single caller will attempt to allocate
2867 * without blocking.
2868 *
2869 * The largest such number at the moment is kmem_alloc()
2870 * when 2 guard pages are requested. 10 is simply a somewhat larger number,
2871 * taking into account the 50% hysteresis the zone allocator uses.
2872 *
2873 * Note: this works at all because the zone allocator
2874 * doesn't ever allocate fictitious pages.
2875 */
2876 zone_raise_reserve(z, 10);
2877 });
2878 }
2879 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
2880
2881 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
2882 /*
2883 * Radix tree of pages within the [pmap_first_pnum, vm_pages_first_pnum) range,
2884 * in order to support page lookup by pnum (@see vm_page_find_canonical()),
2885 * which corresponds to pages returned to the VM via @c ml_static_mfree().
2886 *
2887 * Kernel vm pages are never freed, which means that this data structure
2888 * is insert only.
2889 *
2890 * Empirically we have about 4-5k such pages, typically in only a few rather dense
2891 * contiguous spans, inside a range of roughly 32k pnums.
2892 *
2893 * A radix tree works well with the distribution of keys, but also allows for
2894 * a straightforward lockless lookup path.
2895 */
2896
2897 #define VM_PAGE_RADIX_FANOUT_SHIFT 8
2898 #define VM_PAGE_RADIX_FANOUT (1u << VM_PAGE_RADIX_FANOUT_SHIFT)
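/*
 * With an 8-bit fanout, each radix level consumes one byte of the index.
 * For example, index 0x01A3F2 decomposes into per-level keys 0x01 (level 2),
 * 0xA3 (level 1) and 0xF2 (level 0); vm_page_radix_key() below extracts
 * exactly these keys.
 */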
2899
2900 typedef uint32_t vm_page_radix_ptr_t;
2901
2902 typedef struct vm_page_radix_node {
2903 vm_page_radix_ptr_t vmpr_array[VM_PAGE_RADIX_FANOUT];
2904 } *vm_page_radix_node_t;
2905
2906 static LCK_GRP_DECLARE(vm_pages_radix_lock_grp, "VM pages radix");
2907 static LCK_MTX_DECLARE(vm_pages_radix_lock, &vm_pages_radix_lock_grp);
2908
2909 static SECURITY_READ_ONLY_LATE(uintptr_t) vm_pages_radix_root;
2910 static uint32_t vm_pages_radix_count;
2911
2912 static vm_page_radix_node_t
2913 vm_page_radix_node_unpack(vm_page_radix_ptr_t ptr)
2914 {
2915 return (vm_page_radix_node_t)VM_UNPACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2916 }
2917
2918 static vm_page_radix_ptr_t
2919 vm_page_radix_node_pack(vm_page_radix_node_t node)
2920 {
2921 vm_offset_t ptr = (vm_offset_t)node;
2922
2923 VM_ASSERT_POINTER_PACKABLE(ptr, VM_PAGE_PACKED_PTR);
2924 return (vm_page_radix_ptr_t)VM_PACK_POINTER(ptr, VM_PAGE_PACKED_PTR);
2925 }
2926
2927 static uint32_t
2928 vm_page_radix_key(uint32_t level, uint32_t index)
2929 {
2930 uint32_t key = index >> (VM_PAGE_RADIX_FANOUT_SHIFT * level);
2931
2932 return key & (VM_PAGE_RADIX_FANOUT - 1);
2933 }
2934
2935 static vm_page_radix_ptr_t *
2936 vm_page_radix_slot(vm_page_radix_node_t node, uint32_t level, uint32_t index)
2937 {
2938 return node->vmpr_array + vm_page_radix_key(level, index);
2939 }
2940
2941 __startup_func
2942 __attribute__((noinline))
2943 static vm_page_radix_node_t
2944 vm_pages_radix_init_root(uint32_t *levelp)
2945 {
2946 uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum - 1;
2947 vm_page_radix_node_t root;
2948 uint32_t level;
2949 vm_size_t size;
2950
2951 /*
2952 * Init a top-level node right away, to cover any index within
2953 * [0, vm_pages_first_pnum - pmap_first_pnum)
2954 */
2955 level = (fls(max_index | 1) - 1) / VM_PAGE_RADIX_FANOUT_SHIFT;
2956 size = (vm_page_radix_key(level, max_index) + 1) *
2957 sizeof(vm_page_radix_ptr_t);
2958
2959 root = zalloc_permanent(size, ZALIGN_64);
2960
2961 /*
2962 * Pack the level into the root pointer low bits,
2963 * so that pointer and level can be read atomically.
2964 *
2965 * See vm_pages_radix_load_root().
2966 */
2967 os_atomic_store(&vm_pages_radix_root, (uintptr_t)root | level, release);
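/*
 * The store above relies on the root being at least 8-byte aligned (it comes
 * from zalloc_permanent(..., ZALIGN_64)), so its low 3 bits are zero and can
 * carry a level in the 0..7 range, which vm_pages_radix_load_root() masks
 * back out.
 */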
2968
2969 *levelp = level;
2970 return root;
2971 }
2972
2973 static vm_page_radix_node_t
2974 vm_pages_radix_node_alloc(vm_page_radix_ptr_t *slot)
2975 {
2976 vm_page_radix_node_t node;
2977
2978 node = zalloc_permanent(sizeof(struct vm_page_radix_node),
2979 VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
2980 os_atomic_store(slot, vm_page_radix_node_pack(node), release);
2981 return node;
2982 }
2983
2984 static vm_page_radix_node_t
2985 vm_pages_radix_load_root(uint32_t *level)
2986 {
2987 const uintptr_t VM_PAGE_RADIX_LEVEL_MASK = 0x7ul;
2988
2989 uintptr_t root = os_atomic_load(&vm_pages_radix_root, dependency);
2990
2991 *level = root & VM_PAGE_RADIX_LEVEL_MASK;
2992 root &= ~VM_PAGE_RADIX_LEVEL_MASK;
2993 return (vm_page_radix_node_t)root;
2994 }
2995
2996 vm_page_t
2997 vm_pages_radix_next(uint32_t *cursor, ppnum_t *pnum)
2998 {
2999 const uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum;
3000 vm_page_radix_node_t node;
3001 uint32_t level, index;
3002
3003 index = *cursor;
3004 node = vm_pages_radix_load_root(&level);
3005
3006 if (node == NULL) {
3007 return VM_PAGE_NULL;
3008 }
3009
3010 while (index < max_index) {
3011 vm_page_radix_ptr_t *slot = vm_page_radix_slot(node, level, index);
3012 vm_page_radix_ptr_t ptr = os_atomic_load(slot, dependency);
3013
3014 if (ptr == 0) {
3015 uint32_t stride = 1 << (VM_PAGE_RADIX_FANOUT_SHIFT * level);
3016
3017 index = (index + stride) & -stride;
3018 if (vm_page_radix_key(level, index) == 0) {
3019 /* restart lookup at the top */
3020 node = vm_pages_radix_load_root(&level);
3021 }
3022 } else if (level > 0) {
3023 node = vm_page_radix_node_unpack(ptr);
3024 level -= 1;
3025 } else {
3026 *cursor = index + 1;
3027 if (pnum) {
3028 *pnum = pmap_first_pnum + index;
3029 }
3030 return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
3031 }
3032 }
3033
3034 if (pnum) {
3035 *pnum = 0;
3036 }
3037 return VM_PAGE_NULL;
3038 }
3039
3040 #if DEBUG || DEVELOPMENT
3041
3042 static int
3043 vm_page_radix_verify_test(int64_t in __unused, int64_t *out)
3044 {
3045 uint32_t count = 0;
3046 vm_page_t mem;
3047
3048 lck_mtx_lock(&vm_pages_radix_lock);
3049
3050 vm_pages_radix_for_each(mem) {
3051 count++;
3052 assert(mem == vm_page_find_canonical(VM_PAGE_GET_PHYS_PAGE(mem)));
3053 }
3054
3055 assert(count == vm_pages_radix_count);
3056
3057 lck_mtx_unlock(&vm_pages_radix_lock);
3058
3059 *out = 1;
3060 return 0;
3061 }
3062 SYSCTL_TEST_REGISTER(vm_page_radix_verify, vm_page_radix_verify_test);
3063
3064 #endif /* DEBUG || DEVELOPMENT */
3065
3066 __attribute__((noinline))
3067 static void
3068 vm_pages_radix_insert(ppnum_t pnum, vm_page_t page)
3069 {
3070 vm_page_radix_ptr_t *slot;
3071 vm_page_radix_node_t node;
3072 uint32_t level, index;
3073
3074 assert(!vm_page_in_array(page));
3075 index = pnum - pmap_first_pnum;
3076
3077 lck_mtx_lock(&vm_pages_radix_lock);
3078
3079 node = vm_pages_radix_load_root(&level);
3080 if (node == NULL) {
3081 node = vm_pages_radix_init_root(&level);
3082 }
3083
3084 for (; level > 0; level--) {
3085 slot = vm_page_radix_slot(node, level, index);
3086 if (*slot == 0) {
3087 node = vm_pages_radix_node_alloc(slot);
3088 } else {
3089 node = vm_page_radix_node_unpack(*slot);
3090 }
3091 }
3092
3093 slot = vm_page_radix_slot(node, 0, index);
3094 assert(*slot == 0);
3095 os_atomic_store(slot, VM_PAGE_PACK_PTR(page), release);
3096 vm_pages_radix_count++;
3097
3098 lck_mtx_unlock(&vm_pages_radix_lock);
3099 }
3100
3101 __abortlike
3102 static void
3103 vm_page_for_ppnum_panic(ppnum_t pnum)
3104 {
3105 if (pnum < pmap_first_pnum) {
3106 panic("physical page is before the start of DRAM: %#x < %#x)",
3107 pnum, pmap_first_pnum);
3108 }
3109 panic("physical page is beyond the end of managed DRAM: %#x >= %#x)",
3110 pnum, vm_pages_first_pnum + vm_pages_count);
3111 }
3112
3113 vm_page_t
3114 vm_page_find_canonical(ppnum_t pnum)
3115 {
3116 vm_page_radix_ptr_t *slot;
3117 vm_page_radix_node_t node;
3118 vm_page_radix_ptr_t ptr;
3119 uint32_t level, index;
3120
3121 if (pnum < pmap_first_pnum) {
3122 vm_page_for_ppnum_panic(pnum);
3123 }
3124
3125 if (pnum >= vm_pages_first_pnum + vm_pages_count) {
3126 /*
3127 * We could receive requests for pages beyond xnu's managed space (e.g. ECC errors).
3128 * These need to be handled gracefully, so we return VM_PAGE_NULL here.
3129 */
3130 return VM_PAGE_NULL;
3131 }
3132
3133 if (__probable(pnum >= vm_pages_first_pnum)) {
3134 return vm_page_get(pnum - vm_pages_first_pnum);
3135 }
3136
3137 index = pnum - pmap_first_pnum;
3138 node = vm_pages_radix_load_root(&level);
3139
3140 for (; node && level > 0; level--) {
3141 slot = vm_page_radix_slot(node, level, index);
3142 ptr = os_atomic_load(slot, dependency);
3143 node = vm_page_radix_node_unpack(ptr);
3144 }
3145
3146 if (__probable(node)) {
3147 slot = vm_page_radix_slot(node, 0, index);
3148 ptr = os_atomic_load(slot, dependency);
3149 return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr);
3150 }
3151
3152 return VM_PAGE_NULL;
3153 }
3154
3155 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
3156
3157 /*!
3158 * @function vm_page_create()
3159 *
3160 * @brief
3161 * Common helper for all vm_page_create* functions.
3162 */
3163 vm_page_t
3164 vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags)
3165 {
3166 vm_page_t m;
3167
3168 m = zalloc_flags(vm_page_zone, flags);
3169 if (m) {
3170 vm_page_init(m, phys_page);
3171 if (phys_page == vm_page_guard_addr) {
3172 counter_inc(&vm_guard_count);
3173 }
3174 }
3175 if (canonical) {
3176 assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
3177 m->vmp_canonical = true;
3178 #if HAS_MTE
3179 m->vmp_using_mte = pmap_is_tagged_page(phys_page);
3180 #endif /* HAS_MTE */
3181 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
3182 vm_pages_radix_insert(phys_page, m);
3183 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
3184 vm_free_page_lock();
3185 vm_page_pages++;
3186 vm_free_page_unlock();
3187 }
3188 return m;
3189 }
3190
3191 /*
3192 * Routine: vm_page_create_canonical
3193 * Purpose:
3194 * After the VM system is up, machine-dependent code
3195 * may stumble across more physical memory. For example,
3196 * memory that it was reserving for a frame buffer.
3197 * vm_page_create_canonical turns this memory into available pages.
3198 */
3199
3200 void
3201 vm_page_create_canonical(ppnum_t phys_page)
3202 {
3203 vm_page_t m;
3204
3205 m = vm_page_create(phys_page, true, Z_WAITOK);
3206 vm_page_release(m, VMP_RELEASE_NONE);
3207 }
3208
3209
3210 /*
3211 * vm_page_hash:
3212 *
3213 * Distributes the object/offset key pair among hash buckets.
3214 *
3215 * NOTE: The bucket count must be a power of 2
3216 */
3217 #define vm_page_hash(object, offset) (\
3218 ( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
3219 & vm_page_hash_mask)
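/*
 * Sketch of how the inputs mix: the object pointer is scaled by
 * vm_page_bucket_hash, the page index of the offset (atop_64(offset)) is
 * xor'ed with the same constant, and the sum is masked down to a bucket
 * index. Because vm_page_hash_mask is (bucket count - 1) and the bucket
 * count is a power of 2, the final mask is equivalent to a modulo by the
 * bucket count.
 */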
3220
3221
3222 /*
3223 * vm_page_insert: [ internal use only ]
3224 *
3225 * Inserts the given mem entry into the object/object-page
3226 * table and object list.
3227 *
3228 * The object must be locked.
3229 */
3230 void
3231 vm_page_insert(
3232 vm_page_t mem,
3233 vm_object_t object,
3234 vm_object_offset_t offset)
3235 {
3236 vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, FALSE, FALSE, NULL);
3237 }
3238
3239 void
3240 vm_page_insert_wired(
3241 vm_page_t mem,
3242 vm_object_t object,
3243 vm_object_offset_t offset,
3244 vm_tag_t tag)
3245 {
3246 vm_page_insert_internal(mem, object, offset, tag, FALSE, TRUE, FALSE, FALSE, NULL);
3247 }
3248
3249 void
3250 vm_page_insert_internal(
3251 vm_page_t mem,
3252 vm_object_t object,
3253 vm_object_offset_t offset,
3254 vm_tag_t tag,
3255 boolean_t queues_lock_held,
3256 boolean_t insert_in_hash,
3257 boolean_t batch_pmap_op,
3258 boolean_t batch_accounting,
3259 uint64_t *delayed_ledger_update)
3260 {
3261 vm_page_bucket_t *bucket;
3262 lck_ticket_t *bucket_lock;
3263 int hash_id;
3264 task_t owner;
3265 int ledger_idx_volatile;
3266 int ledger_idx_nonvolatile;
3267 int ledger_idx_volatile_compressed;
3268 int ledger_idx_nonvolatile_compressed;
3269 int ledger_idx_composite;
3270 int ledger_idx_external_wired;
3271 boolean_t do_footprint;
3272
3273 #if 0
3274 /*
3275 * we may not hold the page queue lock
3276 * so this check isn't safe to make
3277 */
3278 VM_PAGE_CHECK(mem);
3279 #endif
3280
3281 assertf(page_aligned(offset), "0x%llx\n", offset);
3282
3283 assert(!VM_PAGE_WIRED(mem) || !vm_page_is_canonical(mem) ||
3284 (tag != VM_KERN_MEMORY_NONE));
3285
3286 #if HAS_MTE
3287 assert_mte_vmo_matches_vmp(object, mem);
3288 #endif /* HAS_MTE */
3289 vm_object_lock_assert_exclusive(object);
3290 LCK_MTX_ASSERT(&vm_page_queue_lock,
3291 queues_lock_held ? LCK_MTX_ASSERT_OWNED
3292 : LCK_MTX_ASSERT_NOTOWNED);
3293
3294 if (queues_lock_held == FALSE) {
3295 assert(!VM_PAGE_PAGEABLE(mem));
3296 }
3297
3298 if (insert_in_hash == TRUE) {
3299 #if DEBUG || VM_PAGE_BUCKETS_CHECK
3300 if (mem->vmp_tabled || mem->vmp_object) {
3301 panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
3302 "already in (obj=%p,off=0x%llx)",
3303 mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
3304 }
3305 #endif
3306 if (object->internal && (offset >= object->vo_size)) {
3307 panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
3308 mem, object, offset, object->vo_size);
3309 }
3310
3311 assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);
3312
3313 /*
3314 * Record the object/offset pair in this page
3315 */
3316
3317 mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
3318 mem->vmp_offset = offset;
3319
3320 #if CONFIG_SECLUDED_MEMORY
3321 if (object->eligible_for_secluded) {
3322 vm_page_secluded.eligible_for_secluded++;
3323 }
3324 #endif /* CONFIG_SECLUDED_MEMORY */
3325
3326 /*
3327 * Insert it into the object/offset hash table
3328 */
3329 hash_id = vm_page_hash(object, offset);
3330 bucket = &vm_page_buckets[hash_id];
3331 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3332
3333 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3334
3335 mem->vmp_next_m = bucket->page_list;
3336 bucket->page_list = VM_PAGE_PACK_PTR(mem);
3337 assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));
3338
3339 #if MACH_PAGE_HASH_STATS
3340 if (++bucket->cur_count > bucket->hi_count) {
3341 bucket->hi_count = bucket->cur_count;
3342 }
3343 #endif /* MACH_PAGE_HASH_STATS */
3344 mem->vmp_hashed = TRUE;
3345 lck_ticket_unlock(bucket_lock);
3346 }
3347
3348 {
3349 unsigned int cache_attr;
3350
3351 cache_attr = object->wimg_bits & VM_WIMG_MASK;
3352
3353 #if HAS_MTE
3354 /*
3355 * Set the cache attributes only if they are neither the default
3356 * attributes nor VM_WIMG_MTE; in the MTE case we would have already
3357 * set them before inserting the page into this object, so there is
3358 * no need to take the hit of setting them again.
3361 */
3362 if (cache_attr == VM_WIMG_MTE) {
3363 if (vm_object_is_mte_mappable_with_page(object, mem)) {
3364 /*
3365 * By now, we expect non-fictitious pages to have been made
3366 * tagged. This should happen in mteinfo_page_list_fix_tagging()
3367 * when the page is inserted onto the per-CPU free tagged queue.
3368 */
3369 assert(mem->vmp_using_mte);
3370 assert(pmap_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem)) == VM_WIMG_MTE);
3371 } else {
3372 /*
3373 * We don't want the object for fictitious pages to have its
3374 * cache attributes set if the object is MTE.
3375 */
3376 }
3377 } else {
3378 #endif /* HAS_MTE */
3379
3380 if (cache_attr != VM_WIMG_USE_DEFAULT) {
3381 PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
3382 }
3383
3384 #if HAS_MTE
3385 }
3386 #endif
3387 }
3388
3389 /*
3390 * Now link into the object's list of backed pages.
3391 */
3392 vm_page_queue_enter(&object->memq, mem, vmp_listq);
3393 object->memq_hint = mem;
3394 mem->vmp_tabled = TRUE;
3395
3396 /*
3397 * Show that the object has one more resident page.
3398 */
3399
3400 object->resident_page_count++;
3401 if (VM_PAGE_WIRED(mem)) {
3402 assert(mem->vmp_wire_count > 0);
3403 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
3404 VM_OBJECT_WIRED_PAGE_ADD(object, mem);
3405 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
3406 }
3407 assert(object->resident_page_count >= object->wired_page_count);
3408
3409 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
3410 vm_object_set_chead_hint(object);
3411 #endif
3412
3413 #if DEVELOPMENT || DEBUG
3414 if (object->object_is_shared_cache &&
3415 object->pager != NULL &&
3416 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3417 int new, old;
3418 assert(!object->internal);
3419 new = OSAddAtomic(+1, &shared_region_pagers_resident_count);
3420 do {
3421 old = shared_region_pagers_resident_peak;
3422 } while (old < new &&
3423 !OSCompareAndSwap(old, new, &shared_region_pagers_resident_peak));
3424 }
3425 #endif /* DEVELOPMENT || DEBUG */
3426
3427 if (batch_accounting == FALSE) {
3428 if (object->internal) {
3429 OSAddAtomic(1, &vm_page_internal_count);
3430 } else {
3431 OSAddAtomic(1, &vm_page_external_count);
3432 }
3433 }
3434
3435 /*
3436 * It wouldn't make sense to insert a "reusable" page in
3437 * an object (the page would have been marked "reusable" only
3438 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
3439 * in the object at that time).
3440 * But a page could be inserted in an "all_reusable" object, if
3441 * something faults it in (a vm_read() from another task or a
3442 * "use-after-free" issue in user space, for example). It can
3443 * also happen if we're relocating a page from that object to
3444 * a different physical page during a physically-contiguous
3445 * allocation.
3446 */
3447 assert(!mem->vmp_reusable);
3448 if (object->all_reusable) {
3449 OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
3450 }
3451
3452 if (object->purgable == VM_PURGABLE_DENY &&
3453 !object->vo_ledger_tag) {
3454 owner = TASK_NULL;
3455 } else {
3456 owner = VM_OBJECT_OWNER(object);
3457 vm_object_ledger_tag_ledgers(object,
3458 &ledger_idx_volatile,
3459 &ledger_idx_nonvolatile,
3460 &ledger_idx_volatile_compressed,
3461 &ledger_idx_nonvolatile_compressed,
3462 &ledger_idx_composite,
3463 &ledger_idx_external_wired,
3464 &do_footprint);
3465 }
3466 if (owner &&
3467 object->internal &&
3468 (object->purgable == VM_PURGABLE_NONVOLATILE ||
3469 object->purgable == VM_PURGABLE_DENY ||
3470 VM_PAGE_WIRED(mem))) {
3471 if (delayed_ledger_update) {
3472 *delayed_ledger_update += PAGE_SIZE;
3473 } else {
3474 /* more non-volatile bytes */
3475 ledger_credit(owner->ledger,
3476 ledger_idx_nonvolatile,
3477 PAGE_SIZE);
3478 if (do_footprint) {
3479 /* more footprint */
3480 ledger_credit(owner->ledger,
3481 task_ledgers.phys_footprint,
3482 PAGE_SIZE);
3483 } else if (ledger_idx_composite != -1) {
3484 ledger_credit(owner->ledger,
3485 ledger_idx_composite,
3486 PAGE_SIZE);
3487 }
3488 }
3489 } else if (owner &&
3490 object->internal &&
3491 (object->purgable == VM_PURGABLE_VOLATILE ||
3492 object->purgable == VM_PURGABLE_EMPTY)) {
3493 assert(!VM_PAGE_WIRED(mem));
3494 /* more volatile bytes */
3495 ledger_credit(owner->ledger,
3496 ledger_idx_volatile,
3497 PAGE_SIZE);
3498 }
3499
3500 if (object->purgable == VM_PURGABLE_VOLATILE) {
3501 if (VM_PAGE_WIRED(mem)) {
3502 OSAddAtomic(+1, &vm_page_purgeable_wired_count);
3503 } else {
3504 OSAddAtomic(+1, &vm_page_purgeable_count);
3505 }
3506 } else if (object->purgable == VM_PURGABLE_EMPTY &&
3507 mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
3508 /*
3509 * This page belongs to a purged VM object but hasn't
3510 * been purged (because it was "busy").
3511 * It's in the "throttled" queue and hence not
3512 * visible to vm_pageout_scan(). Move it to a pageable
3513 * queue, so that it can eventually be reclaimed, instead
3514 * of lingering in the "empty" object.
3515 */
3516 if (queues_lock_held == FALSE) {
3517 vm_page_lockspin_queues();
3518 }
3519 vm_page_deactivate(mem);
3520 if (queues_lock_held == FALSE) {
3521 vm_page_unlock_queues();
3522 }
3523 }
3524
3525 #if HAS_MTE
3526 /*
3527 * If adding pages to the compressor object, account for whether it's
3528 * tag storage or not.
3529 */
3530 if (object == compressor_object) {
3531 if (vm_page_is_tag_storage(mem)) {
3532 counter_inc(&compressor_tag_storage_pages_in_pool);
3533 } else {
3534 counter_inc(&compressor_non_tag_storage_pages_in_pool);
3535 }
3536 }
3537 #endif /* HAS_MTE */
3538
3539 #if VM_OBJECT_TRACKING_OP_MODIFIED
3540 if (vm_object_tracking_btlog &&
3541 object->internal &&
3542 object->resident_page_count == 0 &&
3543 object->pager == NULL &&
3544 object->shadow != NULL &&
3545 object->shadow->vo_copy == object) {
3546 btlog_record(vm_object_tracking_btlog, object,
3547 VM_OBJECT_TRACKING_OP_MODIFIED,
3548 btref_get(__builtin_frame_address(0), 0));
3549 }
3550 #endif /* VM_OBJECT_TRACKING_OP_MODIFIED */
3551 }
3552
3553 /*
3554 * vm_page_replace:
3555 *
3556 * Exactly like vm_page_insert, except that we first
3557 * remove any existing page at the given offset in object.
3558 *
3559 * The object must be locked.
3560 */
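/*
 * Illustrative sketch only (the caller code below is hypothetical, not
 * from this file): the caller holds the object lock exclusively and owns
 * a busy page that is not yet tabled anywhere:
 *
 *	vm_object_lock(object);
 *	vm_page_replace(new_m, object, offset);
 *	vm_object_unlock(object);
 *
 * Any page that was resident at "offset" is removed from the object and
 * freed before "new_m" is inserted.
 */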
3561 void
3562 vm_page_replace(
3563 vm_page_t mem,
3564 vm_object_t object,
3565 vm_object_offset_t offset)
3566 {
3567 vm_page_bucket_t *bucket;
3568 vm_page_t found_m = VM_PAGE_NULL;
3569 lck_ticket_t *bucket_lock;
3570 int hash_id;
3571
3572 #if 0
3573 /*
3574 * we don't hold the page queue lock
3575 * so this check isn't safe to make
3576 */
3577 VM_PAGE_CHECK(mem);
3578 #endif
3579 #if HAS_MTE
3580 assert_mte_vmo_matches_vmp(object, mem);
3581 #endif /* HAS_MTE */
3582 vm_object_lock_assert_exclusive(object);
3583 #if DEBUG || VM_PAGE_BUCKETS_CHECK
3584 if (mem->vmp_tabled || mem->vmp_object) {
3585 panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
3586 "already in (obj=%p,off=0x%llx)",
3587 mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
3588 }
3589 #endif
3590 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3591
3592 assert(!VM_PAGE_PAGEABLE(mem));
3593
3594 /*
3595 * Record the object/offset pair in this page
3596 */
3597 mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
3598 mem->vmp_offset = offset;
3599
3600 /*
3601 * Insert it into the object_object/offset hash table,
3602 * replacing any page that might have been there.
3603 */
3604
3605 hash_id = vm_page_hash(object, offset);
3606 bucket = &vm_page_buckets[hash_id];
3607 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3608
3609 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3610
3611 if (bucket->page_list) {
3612 vm_page_packed_t *mp = &bucket->page_list;
3613 vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp));
3614
3615 do {
3616 /*
3617 * compare packed object pointers
3618 */
3619 if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
3620 /*
3621 * Remove old page from hash list
3622 */
3623 *mp = m->vmp_next_m;
3624 m->vmp_hashed = FALSE;
3625 m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3626
3627 found_m = m;
3628 break;
3629 }
3630 mp = &m->vmp_next_m;
3631 } while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));
3632
3633 mem->vmp_next_m = bucket->page_list;
3634 } else {
3635 mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3636 }
3637 /*
3638 * insert new page at head of hash list
3639 */
3640 bucket->page_list = VM_PAGE_PACK_PTR(mem);
3641 mem->vmp_hashed = TRUE;
3642
3643 lck_ticket_unlock(bucket_lock);
3644
3645 if (found_m) {
3646 /*
3647 * there was already a page at the specified
3648 * offset for this object... remove it from
3649 * the object and free it back to the free list
3650 */
3651 vm_page_free_unlocked(found_m, FALSE);
3652 }
3653 vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, FALSE, FALSE, FALSE, NULL);
3654 }
3655
3656 /*
3657 * vm_page_remove: [ internal use only ]
3658 *
3659 * Removes the given mem entry from the object/offset-page
3660 * table and the object page list.
3661 *
3662 * The object must be locked.
3663 */
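/*
 * Sketch of the usual internal pattern (cf. vm_page_rename() below),
 * shown here for illustration only: remove the page from its current
 * object, then re-insert it elsewhere, all under the page queues lock:
 *
 *	vm_page_lockspin_queues();
 *	vm_page_remove(mem, TRUE);
 *	vm_page_insert_internal(mem, new_object, new_offset, tag,
 *	    TRUE, TRUE, FALSE, FALSE, NULL);
 *	vm_page_unlock_queues();
 */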
3664
3665 void
3666 vm_page_remove(
3667 vm_page_t mem,
3668 boolean_t remove_from_hash)
3669 {
3670 vm_page_bucket_t *bucket;
3671 vm_page_t this;
3672 lck_ticket_t *bucket_lock;
3673 int hash_id;
3674 task_t owner;
3675 vm_object_t m_object;
3676 int ledger_idx_volatile;
3677 int ledger_idx_nonvolatile;
3678 int ledger_idx_volatile_compressed;
3679 int ledger_idx_nonvolatile_compressed;
3680 int ledger_idx_composite;
3681 int ledger_idx_external_wired;
3682 int do_footprint;
3683
3684 m_object = VM_PAGE_OBJECT(mem);
3685
3686 vm_object_lock_assert_exclusive(m_object);
3687 assert(mem->vmp_tabled);
3688 assert(!mem->vmp_cleaning);
3689 assert(!mem->vmp_laundry);
3690
3691 if (VM_PAGE_PAGEABLE(mem)) {
3692 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3693 }
3694 #if 0
3695 /*
3696 * we don't hold the page queue lock
3697 * so this check isn't safe to make
3698 */
3699 VM_PAGE_CHECK(mem);
3700 #endif
3701 if (remove_from_hash == TRUE) {
3702 /*
3703 * Remove from the object_object/offset hash table
3704 */
3705 hash_id = vm_page_hash(m_object, mem->vmp_offset);
3706 bucket = &vm_page_buckets[hash_id];
3707 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
3708
3709 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
3710
3711 if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
3712 /* optimize for common case */
3713
3714 bucket->page_list = mem->vmp_next_m;
3715 } else {
3716 vm_page_packed_t *prev;
3717
3718 for (prev = &this->vmp_next_m;
3719 (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
3720 prev = &this->vmp_next_m) {
3721 continue;
3722 }
3723 *prev = this->vmp_next_m;
3724 }
3725 #if MACH_PAGE_HASH_STATS
3726 bucket->cur_count--;
3727 #endif /* MACH_PAGE_HASH_STATS */
3728 mem->vmp_hashed = FALSE;
3729 this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
3730 lck_ticket_unlock(bucket_lock);
3731 }
3732 /*
3733 * Now remove from the object's list of backed pages.
3734 */
3735
3736 vm_page_remove_internal(mem);
3737
3738 /*
3739 * And show that the object has one fewer resident
3740 * page.
3741 */
3742
3743 assert(m_object->resident_page_count > 0);
3744 m_object->resident_page_count--;
3745
3746 #if DEVELOPMENT || DEBUG
3747 if (m_object->object_is_shared_cache &&
3748 m_object->pager != NULL &&
3749 m_object->pager->mo_pager_ops == &shared_region_pager_ops) {
3750 assert(!m_object->internal);
3751 OSAddAtomic(-1, &shared_region_pagers_resident_count);
3752 }
3753 #endif /* DEVELOPMENT || DEBUG */
3754
3755 if (m_object->internal) {
3756 #if DEBUG
3757 assert(vm_page_internal_count);
3758 #endif /* DEBUG */
3759
3760 OSAddAtomic(-1, &vm_page_internal_count);
3761 } else {
3762 assert(vm_page_external_count);
3763 OSAddAtomic(-1, &vm_page_external_count);
3764
3765 if (mem->vmp_xpmapped) {
3766 assert(vm_page_xpmapped_external_count);
3767 OSAddAtomic(-1, &vm_page_xpmapped_external_count);
3768 }
3769 }
3770 if (!m_object->internal &&
3771 m_object->cached_list.next &&
3772 m_object->cached_list.prev) {
3773 if (m_object->resident_page_count == 0) {
3774 vm_object_cache_remove(m_object);
3775 }
3776 }
3777
3778 if (VM_PAGE_WIRED(mem)) {
3779 assert(mem->vmp_wire_count > 0);
3780 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
3781 VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
3782 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
3783 }
3784 assert(m_object->resident_page_count >=
3785 m_object->wired_page_count);
3786 if (mem->vmp_reusable) {
3787 assert(m_object->reusable_page_count > 0);
3788 m_object->reusable_page_count--;
3789 assert(m_object->reusable_page_count <=
3790 m_object->resident_page_count);
3791 mem->vmp_reusable = FALSE;
3792 OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
3793 vm_page_stats_reusable.reused_remove++;
3794 } else if (m_object->all_reusable) {
3795 OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
3796 vm_page_stats_reusable.reused_remove++;
3797 }
3798
3799 if (m_object->purgable == VM_PURGABLE_DENY &&
3800 !m_object->vo_ledger_tag) {
3801 owner = TASK_NULL;
3802 } else {
3803 owner = VM_OBJECT_OWNER(m_object);
3804 vm_object_ledger_tag_ledgers(m_object,
3805 &ledger_idx_volatile,
3806 &ledger_idx_nonvolatile,
3807 &ledger_idx_volatile_compressed,
3808 &ledger_idx_nonvolatile_compressed,
3809 &ledger_idx_composite,
3810 &ledger_idx_external_wired,
3811 &do_footprint);
3812 }
3813 if (owner &&
3814 m_object->internal &&
3815 (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
3816 m_object->purgable == VM_PURGABLE_DENY ||
3817 VM_PAGE_WIRED(mem))) {
3818 /* less non-volatile bytes */
3819 ledger_debit(owner->ledger,
3820 ledger_idx_nonvolatile,
3821 PAGE_SIZE);
3822 if (do_footprint) {
3823 /* less footprint */
3824 ledger_debit(owner->ledger,
3825 task_ledgers.phys_footprint,
3826 PAGE_SIZE);
3827 } else if (ledger_idx_composite != -1) {
3828 ledger_debit(owner->ledger,
3829 ledger_idx_composite,
3830 PAGE_SIZE);
3831 }
3832 } else if (owner &&
3833 m_object->internal &&
3834 (m_object->purgable == VM_PURGABLE_VOLATILE ||
3835 m_object->purgable == VM_PURGABLE_EMPTY)) {
3836 assert(!VM_PAGE_WIRED(mem));
3837 /* less volatile bytes */
3838 ledger_debit(owner->ledger,
3839 ledger_idx_volatile,
3840 PAGE_SIZE);
3841 }
3842
3843 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
3844 if (VM_PAGE_WIRED(mem)) {
3845 assert(vm_page_purgeable_wired_count > 0);
3846 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
3847 } else {
3848 assert(vm_page_purgeable_count > 0);
3849 OSAddAtomic(-1, &vm_page_purgeable_count);
3850 }
3851 }
3852
3853 #if HAS_MTE
3854 /*
3855 * If removing pages from the compressor object, account for whether it's
3856 * tag storage or not.
3857 */
3858 if (m_object == compressor_object) {
3859 if (vm_page_is_tag_storage(mem)) {
3860 counter_dec(&compressor_tag_storage_pages_in_pool);
3861 } else {
3862 counter_dec(&compressor_non_tag_storage_pages_in_pool);
3863 }
3864 }
3865
3866 assert_mte_vmo_matches_vmp(m_object, mem);
3867 if (!vm_object_is_mte_mappable(m_object)) {
3868 #endif /* HAS_MTE */
3869 if (m_object->set_cache_attr == TRUE) {
3870 pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
3871 }
3872 #if HAS_MTE
3873 }
3874 #endif /* HAS_MTE */
3875
3876 mem->vmp_tabled = FALSE;
3877 mem->vmp_object = 0;
3878 mem->vmp_offset = (vm_object_offset_t) -1;
3879 }
3880
3881
3882 /*
3883 * vm_page_lookup:
3884 *
3885 * Returns the page associated with the object/offset
3886 * pair specified; if none is found, VM_PAGE_NULL is returned.
3887 *
3888 * The object must be locked. No side effects.
3889 */
3890
3891 #define VM_PAGE_HASH_LOOKUP_THRESHOLD 10
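/*
 * Hedged usage sketch (the caller code is illustrative): the object lock
 * must be held (shared is enough) and "offset" must already be page
 * aligned; the returned page is not busied by the lookup:
 *
 *	vm_object_lock(object);
 *	m = vm_page_lookup(object, offset);
 *	if (m != VM_PAGE_NULL) {
 *		... inspect or busy the resident page here ...
 *	}
 *	vm_object_unlock(object);
 */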
3892
3893 #if DEBUG_VM_PAGE_LOOKUP
3894
3895 struct {
3896 uint64_t vpl_total;
3897 uint64_t vpl_empty_obj;
3898 uint64_t vpl_bucket_NULL;
3899 uint64_t vpl_hit_hint;
3900 uint64_t vpl_hit_hint_next;
3901 uint64_t vpl_hit_hint_prev;
3902 uint64_t vpl_fast;
3903 uint64_t vpl_slow;
3904 uint64_t vpl_hit;
3905 uint64_t vpl_miss;
3906
3907 uint64_t vpl_fast_elapsed;
3908 uint64_t vpl_slow_elapsed;
3909 } vm_page_lookup_stats __attribute__((aligned(8)));
3910
3911 #endif
3912
3913 #define KDP_VM_PAGE_WALK_MAX 1000
3914
3915 vm_page_t
3916 kdp_vm_page_lookup(
3917 vm_object_t object,
3918 vm_object_offset_t offset)
3919 {
3920 vm_page_t cur_page;
3921 int num_traversed = 0;
3922
3923 if (not_in_kdp) {
3924 panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
3925 }
3926
3927 vm_page_queue_iterate(&object->memq, cur_page, vmp_listq) {
3928 if (cur_page->vmp_offset == offset) {
3929 return cur_page;
3930 }
3931 num_traversed++;
3932
3933 if (num_traversed >= KDP_VM_PAGE_WALK_MAX) {
3934 return VM_PAGE_NULL;
3935 }
3936 }
3937
3938 return VM_PAGE_NULL;
3939 }
3940
3941 vm_page_t
3942 vm_page_lookup(
3943 vm_object_t object,
3944 vm_object_offset_t offset)
3945 {
3946 vm_page_t mem;
3947 vm_page_bucket_t *bucket;
3948 vm_page_queue_entry_t qe;
3949 lck_ticket_t *bucket_lock = NULL;
3950 int hash_id;
3951 #if DEBUG_VM_PAGE_LOOKUP
3952 uint64_t start, elapsed;
3953
3954 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total);
3955 #endif
3956
3957 #if KASAN_TBI
3958 if (is_kernel_object(object)) {
3959 offset = vm_memtag_canonicalize_kernel(offset);
3960 }
3961 #endif /* KASAN_TBI */
3962
3963 vm_object_lock_assert_held(object);
3964 assertf(page_aligned(offset), "offset 0x%llx\n", offset);
3965
3966 if (object->resident_page_count == 0) {
3967 #if DEBUG_VM_PAGE_LOOKUP
3968 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_empty_obj);
3969 #endif
3970 return VM_PAGE_NULL;
3971 }
3972
3973 mem = object->memq_hint;
3974
3975 if (mem != VM_PAGE_NULL) {
3976 assert(VM_PAGE_OBJECT(mem) == object);
3977
3978 if (mem->vmp_offset == offset) {
3979 #if DEBUG_VM_PAGE_LOOKUP
3980 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
3981 #endif
3982 return mem;
3983 }
3984 qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);
3985
3986 if (!vm_page_queue_end(&object->memq, qe)) {
3987 vm_page_t next_page;
3988
3989 next_page = (vm_page_t)((uintptr_t)qe);
3990 assert(VM_PAGE_OBJECT(next_page) == object);
3991
3992 if (next_page->vmp_offset == offset) {
3993 object->memq_hint = next_page; /* new hint */
3994 #if DEBUG_VM_PAGE_LOOKUP
3995 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
3996 #endif
3997 return next_page;
3998 }
3999 }
4000 qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);
4001
4002 if (!vm_page_queue_end(&object->memq, qe)) {
4003 vm_page_t prev_page;
4004
4005 prev_page = (vm_page_t)((uintptr_t)qe);
4006 assert(VM_PAGE_OBJECT(prev_page) == object);
4007
4008 if (prev_page->vmp_offset == offset) {
4009 object->memq_hint = prev_page; /* new hint */
4010 #if DEBUG_VM_PAGE_LOOKUP
4011 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
4012 #endif
4013 return prev_page;
4014 }
4015 }
4016 }
4017 /*
4018 * Search the hash table for this object/offset pair
4019 */
4020 hash_id = vm_page_hash(object, offset);
4021 bucket = &vm_page_buckets[hash_id];
4022
4023 /*
4024 * since we hold the object lock, we are guaranteed that no
4025 * new pages can be inserted into this object... this in turn
4026 * guarantees that the page we're looking for can't exist
4027 * if the bucket it hashes to is currently NULL even when looked
4028 * at outside the scope of the hash bucket lock... this is a
4029 * really cheap optimization to avoid taking the lock
4030 */
4031 if (!bucket->page_list) {
4032 #if DEBUG_VM_PAGE_LOOKUP
4033 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_bucket_NULL);
4034 #endif
4035 return VM_PAGE_NULL;
4036 }
4037
4038 #if DEBUG_VM_PAGE_LOOKUP
4039 start = mach_absolute_time();
4040 #endif
4041 if (object->resident_page_count <= VM_PAGE_HASH_LOOKUP_THRESHOLD) {
4042 /*
4043 * on average, it's roughly 3 times faster to run a short memq list
4044 * than to take the spin lock and go through the hash list
4045 */
4046 mem = (vm_page_t)vm_page_queue_first(&object->memq);
4047
4048 while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4049 if (mem->vmp_offset == offset) {
4050 break;
4051 }
4052
4053 mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
4054 }
4055 if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
4056 mem = NULL;
4057 }
4058 } else {
4059 vm_page_object_t packed_object;
4060
4061 packed_object = VM_PAGE_PACK_OBJECT(object);
4062
4063 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
4064
4065 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
4066
4067 for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
4068 mem != VM_PAGE_NULL;
4069 mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
4070 #if 0
4071 /*
4072 * we don't hold the page queue lock
4073 * so this check isn't safe to make
4074 */
4075 VM_PAGE_CHECK(mem);
4076 #endif
4077 if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) {
4078 break;
4079 }
4080 }
4081 lck_ticket_unlock(bucket_lock);
4082 }
4083
4084 #if DEBUG_VM_PAGE_LOOKUP
4085 elapsed = mach_absolute_time() - start;
4086
4087 if (bucket_lock) {
4088 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_slow);
4089 OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_slow_elapsed);
4090 } else {
4091 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_fast);
4092 OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_fast_elapsed);
4093 }
4094 if (mem != VM_PAGE_NULL) {
4095 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit);
4096 } else {
4097 OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss);
4098 }
4099 #endif
4100 if (mem != VM_PAGE_NULL) {
4101 assert(VM_PAGE_OBJECT(mem) == object);
4102
4103 object->memq_hint = mem;
4104 }
4105 return mem;
4106 }
4107
4108
4109 /*
4110 * vm_page_rename:
4111 *
4112 * Move the given memory entry from its
4113 * current object to the specified target object/offset.
4114 *
4115 * The object must be locked.
4116 */
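/*
 * Illustrative sketch only: both the source and the destination objects
 * are expected to be locked exclusively by the caller:
 *
 *	vm_object_lock(old_object);
 *	vm_object_lock(new_object);
 *	vm_page_rename(m, new_object, new_offset);
 *	vm_object_unlock(new_object);
 *	vm_object_unlock(old_object);
 */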
4117 void
4118 vm_page_rename(
4119 vm_page_t mem,
4120 vm_object_t new_object,
4121 vm_object_offset_t new_offset)
4122 {
4123 boolean_t internal_to_external, external_to_internal;
4124 vm_tag_t tag;
4125 vm_object_t m_object;
4126
4127 m_object = VM_PAGE_OBJECT(mem);
4128
4129 assert(m_object != new_object);
4130 assert(m_object);
4131
4132 /*
4133 * Changes to mem->vmp_object require the page lock because
4134 * the pageout daemon uses that lock to get the object.
4135 */
4136 vm_page_lockspin_queues();
4137
4138 internal_to_external = FALSE;
4139 external_to_internal = FALSE;
4140
4141 if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
4142 /*
4143 * it's much easier to get the vm_page_pageable_xxx accounting correct
4144 * if we first move the page to the active queue... it's going to end
4145 * up there anyway, and we don't do vm_page_rename's frequently enough
4146 * for this to matter.
4147 */
4148 vm_page_queues_remove(mem, FALSE);
4149 vm_page_activate(mem);
4150 }
4151 if (VM_PAGE_PAGEABLE(mem)) {
4152 if (m_object->internal && !new_object->internal) {
4153 internal_to_external = TRUE;
4154 }
4155 if (!m_object->internal && new_object->internal) {
4156 external_to_internal = TRUE;
4157 }
4158 }
4159
4160 tag = m_object->wire_tag;
4161 vm_page_remove(mem, TRUE);
4162 vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL);
4163
4164 if (internal_to_external) {
4165 vm_page_pageable_internal_count--;
4166 vm_page_pageable_external_count++;
4167 } else if (external_to_internal) {
4168 vm_page_pageable_external_count--;
4169 vm_page_pageable_internal_count++;
4170 }
4171
4172 vm_page_unlock_queues();
4173 }
4174
4175 /*
4176 * vm_page_init:
4177 *
4178 * Initialize the fields in a new page.
4179 * This takes a structure with random values and initializes it
4180 * so that it can be given to vm_page_release or vm_page_insert.
4181 */
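/*
 * Minimal illustrative sketch (names are hypothetical): a freshly carved
 * out struct vm_page is initialized before it is handed to the rest of
 * the VM; on return it is busy and not on any queue:
 *
 *	struct vm_page *m = ...;	(contents undefined)
 *	vm_page_init(m, phys_page);
 */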
4182 void
4183 vm_page_init(vm_page_t mem, ppnum_t phys_page)
4184 {
4185 assert(phys_page);
4186
4187 #if DEBUG
4188 if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
4189 if (!(pmap_valid_page(phys_page))) {
4190 panic("vm_page_init: non-DRAM phys_page 0x%x", phys_page);
4191 }
4192 }
4193 #endif /* DEBUG */
4194
4195 /*
4196 * Initialize the fields of the vm_page. If adding any new fields to vm_page,
4197 * try to use initial values which match 0. This minimizes the number of writes
4198 * needed for boot-time initialization.
4199 */
4200 assert(VM_PAGE_NOT_ON_Q == 0);
4201 assert(sizeof(*mem) % sizeof(uintptr_t) == 0);
4202 *mem = (struct vm_page) {
4203 .vmp_offset = (vm_object_offset_t)-1,
4204 .vmp_q_state = VM_PAGE_NOT_ON_Q,
4205 .vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
4206 .vmp_canonical = vm_page_in_array(mem),
4207 .vmp_busy = true,
4208 };
4209
4210 VM_PAGE_INIT_PHYS_PAGE(mem, phys_page);
4211
4212 #if 0
4213 /*
4214 * we're leaving this turned off for now... currently pages
4215 * come off the free list and are either immediately dirtied/referenced
4216 * due to zero-fill or COW faults, or are used to read or write files...
4217 * in the file I/O case, the UPL mechanism takes care of clearing
4218 * the state of the HW ref/mod bits in a somewhat fragile way.
4219 * Since we may change the way this works in the future (to toughen it up),
4220 * I'm leaving this as a reminder of where these bits could get cleared
4221 */
4222
4223 /*
4224 * make sure both the h/w referenced and modified bits are
4225 * clear at this point... we are especially dependent on
4226 * not finding a 'stale' h/w modified in a number of spots
4227 * once this page goes back into use
4228 */
4229 pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
4230 #endif
4231 }
4232
4233 vm_page_t
4234 vm_page_create_fictitious(void)
4235 {
4236 return vm_page_create(vm_page_fictitious_addr, false, Z_WAITOK);
4237 }
4238
4239 vm_page_t
4240 vm_page_create_guard(bool canwait)
4241 {
4242 return vm_page_create(vm_page_guard_addr, false, canwait ? Z_WAITOK : Z_NOWAIT);
4243 }
4244
4245 vm_page_t
4246 vm_page_create_private(ppnum_t base_page)
4247 {
4248 assert(base_page != vm_page_fictitious_addr &&
4249 base_page != vm_page_guard_addr);
4250 return vm_page_create(base_page, false, Z_WAITOK);
4251 }
4252
4253 bool
4254 vm_page_is_canonical(const struct vm_page *m)
4255 {
4256 return m->vmp_canonical;
4257 }
4258
4259 bool
4260 vm_page_is_fictitious(const struct vm_page *m)
4261 {
4262 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
4263 if (vm_page_in_array(m)) {
4264 return false;
4265 }
4266 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
4267 switch (VM_PAGE_GET_PHYS_PAGE(m)) {
4268 case vm_page_guard_addr:
4269 case vm_page_fictitious_addr:
4270 return true;
4271 default:
4272 return false;
4273 }
4274 }
4275
4276 bool
4277 vm_page_is_guard(const struct vm_page *m)
4278 {
4279 #if XNU_VM_HAS_LINEAR_PAGES_ARRAY
4280 if (vm_page_in_array(m)) {
4281 return false;
4282 }
4283 #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */
4284 return VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr;
4285 }
4286
4287 bool
4288 vm_page_is_private(const struct vm_page *m)
4289 {
4290 return !vm_page_is_canonical(m) && !vm_page_is_fictitious(m);
4291 }
4292
4293 void
4294 vm_page_make_private(vm_page_t m, ppnum_t base_page)
4295 {
4296 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4297 assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr);
4298
4299 VM_PAGE_SET_PHYS_PAGE(m, base_page);
4300 }
4301
4302 void
4303 vm_page_reset_private(vm_page_t m)
4304 {
4305 assert(vm_page_is_private(m));
4306
4307 VM_PAGE_SET_PHYS_PAGE(m, vm_page_fictitious_addr);
4308 }
4309
4310 /*
4311 * vm_page_release_fictitious:
4312 *
4313 * Release a fictitious page to the zone pool
4314 */
4315 static void
4316 vm_page_release_fictitious(vm_page_t m)
4317 {
4318 assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
4319 (m->vmp_q_state == VM_PAGE_IS_WIRED));
4320 assert(vm_page_is_fictitious(m));
4321 assert(!m->vmp_realtime);
4322
4323 if (vm_page_is_guard(m)) {
4324 counter_dec(&vm_guard_count);
4325 }
4326 zfree(vm_page_zone, m);
4327 }
4328
4329 /*
4330 * vm_pool_low():
4331 *
4332 * Return true if it is not likely that a non-vm_privileged thread
4333 * can get memory without blocking. Advisory only, since the
4334 * situation may change under us.
4335 */
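/*
 * Hedged example (the caller and return value are illustrative): a
 * non-privileged path might defer optional allocations when the free
 * pool looks tight, accepting that the answer may already be stale:
 *
 *	if (vm_pool_low()) {
 *		return KERN_RESOURCE_SHORTAGE;
 *	}
 */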
4336 bool
4337 vm_pool_low(void)
4338 {
4339 /* No locking, at worst we will fib. */
4340 return vm_page_free_count <= vm_page_free_reserved;
4341 }
4342
4343 boolean_t vm_darkwake_mode = FALSE;
4344
4345 /*
4346 * vm_update_darkwake_mode():
4347 *
4348 * Tells the VM that the system is in / out of darkwake.
4349 *
4350 * Today, the VM only lowers/raises the background queue target
4351 * so as to favor consuming more/less background pages when
4352 * darwake is ON/OFF.
4353 * darkwake is ON/OFF.
4354 * We might need to do more things in the future.
4355 */
4356
4357 void
4358 vm_update_darkwake_mode(boolean_t darkwake_mode)
4359 {
4360 #if XNU_TARGET_OS_OSX && defined(__arm64__)
4361 #pragma unused(darkwake_mode)
4362 assert(vm_darkwake_mode == FALSE);
4363 /*
4364 * Darkwake mode isn't supported on Apple silicon macOS.
4365 */
4366 return;
4367 #else /* XNU_TARGET_OS_OSX && __arm64__ */
4368 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
4369
4370 vm_page_lockspin_queues();
4371
4372 if (vm_darkwake_mode == darkwake_mode) {
4373 /*
4374 * No change.
4375 */
4376 vm_page_unlock_queues();
4377 return;
4378 }
4379
4380 vm_darkwake_mode = darkwake_mode;
4381
4382 if (vm_darkwake_mode == TRUE) {
4383 /* save background target to restore later */
4384 vm_page_background_target_snapshot = vm_page_background_target;
4385
4386 /* target is set to 0...no protection for background pages */
4387 vm_page_background_target = 0;
4388 } else if (vm_darkwake_mode == FALSE) {
4389 if (vm_page_background_target_snapshot) {
4390 vm_page_background_target = vm_page_background_target_snapshot;
4391 }
4392 }
4393 vm_page_unlock_queues();
4394 #endif
4395 }
4396
4397 void
4398 vm_page_update_special_state(vm_page_t mem)
4399 {
4400 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR || mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
4401 return;
4402 }
4403
4404 switch (mem->vmp_on_specialq) {
4405 case VM_PAGE_SPECIAL_Q_BG:
4406 {
4407 task_t my_task = current_task_early();
4408
4409 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4410 return;
4411 }
4412
4413 if (my_task) {
4414 if (task_get_darkwake_mode(my_task)) {
4415 return;
4416 }
4417 }
4418
4419 if (my_task) {
4420 if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) {
4421 return;
4422 }
4423 }
4424 vm_page_lockspin_queues();
4425
4426 vm_page_background_promoted_count++;
4427
4428 vm_page_remove_from_specialq(mem);
4429 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4430
4431 vm_page_unlock_queues();
4432 break;
4433 }
4434
4435 case VM_PAGE_SPECIAL_Q_DONATE:
4436 {
4437 task_t my_task = current_task_early();
4438
4439 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4440 return;
4441 }
4442
4443 if (my_task->donates_own_pages == false) {
4444 vm_page_lockspin_queues();
4445
4446 vm_page_remove_from_specialq(mem);
4447 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4448
4449 vm_page_unlock_queues();
4450 }
4451 break;
4452 }
4453
4454 default:
4455 {
4456 assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
4457 VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
4458 break;
4459 }
4460 }
4461 }
4462
4463
4464 void
4465 vm_page_assign_special_state(vm_page_t mem, vm_page_specialq_t mode)
4466 {
4467 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
4468 return;
4469 }
4470
4471 switch (mode) {
4472 case VM_PAGE_SPECIAL_Q_BG:
4473 {
4474 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4475 return;
4476 }
4477
4478 task_t my_task = current_task_early();
4479
4480 if (my_task) {
4481 if (task_get_darkwake_mode(my_task)) {
4482 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
4483 return;
4484 }
4485 }
4486
4487 if (my_task) {
4488 mem->vmp_on_specialq = (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG) ? VM_PAGE_SPECIAL_Q_BG : VM_PAGE_SPECIAL_Q_EMPTY);
4489 }
4490 break;
4491 }
4492
4493 case VM_PAGE_SPECIAL_Q_DONATE:
4494 {
4495 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
4496 return;
4497 }
4498 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
4499 break;
4500 }
4501
4502 default:
4503 break;
4504 }
4505 }
4506
4507
4508 void
4509 vm_page_remove_from_specialq(vm_page_t mem)
4510 {
4511 vm_object_t m_object;
4512
4513 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4514
4515 switch (mem->vmp_on_specialq) {
4516 case VM_PAGE_SPECIAL_Q_BG:
4517 {
4518 if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4519 vm_page_queue_remove(&vm_page_queue_background, mem, vmp_specialq);
4520
4521 mem->vmp_specialq.next = 0;
4522 mem->vmp_specialq.prev = 0;
4523
4524 vm_page_background_count--;
4525
4526 m_object = VM_PAGE_OBJECT(mem);
4527
4528 if (m_object->internal) {
4529 vm_page_background_internal_count--;
4530 } else {
4531 vm_page_background_external_count--;
4532 }
4533 }
4534 break;
4535 }
4536
4537 case VM_PAGE_SPECIAL_Q_DONATE:
4538 {
4539 if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4540 vm_page_queue_remove((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4541 mem->vmp_specialq.next = 0;
4542 mem->vmp_specialq.prev = 0;
4543 vm_page_donate_count--;
4544 if (vm_page_donate_queue_ripe && (vm_page_donate_count < vm_page_donate_target)) {
4545 assert(vm_page_donate_target == vm_page_donate_target_low);
4546 vm_page_donate_target = vm_page_donate_target_high;
4547 vm_page_donate_queue_ripe = false;
4548 }
4549 }
4550
4551 break;
4552 }
4553
4554 default:
4555 {
4556 assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
4557 VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
4558 break;
4559 }
4560 }
4561 }
4562
4563
4564 void
4565 vm_page_add_to_specialq(vm_page_t mem, boolean_t first)
4566 {
4567 vm_object_t m_object;
4568
4569 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4570
4571 if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
4572 return;
4573 }
4574
4575 switch (mem->vmp_on_specialq) {
4576 case VM_PAGE_SPECIAL_Q_BG:
4577 {
4578 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
4579 return;
4580 }
4581
4582 m_object = VM_PAGE_OBJECT(mem);
4583
4584 if (vm_page_background_exclude_external && !m_object->internal) {
4585 return;
4586 }
4587
4588 if (first == TRUE) {
4589 vm_page_queue_enter_first(&vm_page_queue_background, mem, vmp_specialq);
4590 } else {
4591 vm_page_queue_enter(&vm_page_queue_background, mem, vmp_specialq);
4592 }
4593 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
4594
4595 vm_page_background_count++;
4596
4597 if (m_object->internal) {
4598 vm_page_background_internal_count++;
4599 } else {
4600 vm_page_background_external_count++;
4601 }
4602 break;
4603 }
4604
4605 case VM_PAGE_SPECIAL_Q_DONATE:
4606 {
4607 if (first == TRUE) {
4608 vm_page_queue_enter_first((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4609 } else {
4610 vm_page_queue_enter((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
4611 }
4612 vm_page_donate_count++;
4613 if (!vm_page_donate_queue_ripe && (vm_page_donate_count > vm_page_donate_target)) {
4614 assert(vm_page_donate_target == vm_page_donate_target_high);
4615 vm_page_donate_target = vm_page_donate_target_low;
4616 vm_page_donate_queue_ripe = true;
4617 }
4618 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
4619 break;
4620 }
4621
4622 default:
4623 break;
4624 }
4625 }
4626
4627 /*!
4628 * @brief
4629 * Prepares a page that has been successfully grabbed for the caller.
4630 *
4631 * @discussion
4632 * This function will update accounting, emit tracepoints, ...
4633 */
4634 static vm_page_t
4635 vm_page_grab_finalize(vm_grab_options_t grab_options __unused, vm_page_t mem)
4636 {
4637 task_t task;
4638
4639 #if MACH_ASSERT
4640 /*
4641 * For all free pages, no matter their provenance...
4642 * ensure they are not referenced anywhere,
4643 * and their state is clean.
4644 */
4645 if (vm_check_refs_on_alloc) {
4646 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
4647 }
4648 assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
4649 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
4650 mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
4651 mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
4652 mem->vmp_next_m == 0 &&
4653 mem->vmp_object == 0 &&
4654 mem->vmp_wire_count == 0 &&
4655 mem->vmp_busy &&
4656 !mem->vmp_tabled &&
4657 !mem->vmp_laundry &&
4658 !mem->vmp_pmapped &&
4659 !mem->vmp_wpmapped &&
4660 !mem->vmp_realtime);
4661 #endif /* MACH_ASSERT */
4662
4663 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
4664 VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
4665
4666 #if HAS_MTE
4667 if (!(grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
4668 assert(!vm_page_is_tag_storage(mem));
4669 }
4670 if (grab_options & VM_PAGE_GRAB_MTE) {
4671 assert(mem->vmp_using_mte);
4672 VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB_MTE,
4673 DBG_FUNC_NONE, grab_options, 0, 0, 0);
4674 } else
4675 #endif /* HAS_MTE */
4676 {
4677 VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB,
4678 DBG_FUNC_NONE, grab_options, 0, 0, 0);
4679 }
4680
4681 counter_inc(&vm_page_grab_count);
4682
4683 task = current_task_early();
4684 if (task != TASK_NULL) {
4685 ledger_credit(task->ledger, task_ledgers.pages_grabbed, 1);
4686 }
4687 if (task != TASK_NULL && task != kernel_task) {
4688 /*
4689 * tag:DONATE this is where the donate state of the page
4690 * is decided according to what task grabs it
4691 */
4692 if (task->donates_own_pages) {
4693 vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
4694 } else {
4695 vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
4696 }
4697 }
4698
4699 return mem;
4700 }
4701
4702 #if __x86_64__
4703 /*
4704 * This can be switched to FALSE to help debug drivers
4705 * that are having problems with memory > 4G.
4706 */
4707 boolean_t vm_himemory_mode = TRUE;
4708 #endif /* __x86_64__ */
4709
4710 #if XNU_VM_HAS_LOPAGE
4711
4712 vm_page_t
4713 vm_page_grablo(vm_grab_options_t grab_options)
4714 {
4715 vm_page_t mem = VM_PAGE_NULL;
4716
4717 if (!vm_lopage_needed) {
4718 return vm_page_grab_options(grab_options);
4719 }
4720
4721 vm_free_page_lock_spin();
4722 if (vm_lopage_free_count) {
4723 #if LCK_MTX_USE_ARCH
4724 /*
4725 * Intel locks do not really always disable preemption
4726 * for lck_mtx_lock_spin(), and vm_page_free_queue_grab()
4727 * really want that.
4728 * really wants that.
4729 disable_preemption();
4730 #endif
4731 mem = vm_page_free_queue_grab(grab_options,
4732 VM_MEMORY_CLASS_LOPAGE, 1, VM_PAGE_NOT_ON_Q).vmpl_head;
4733 #if LCK_MTX_USE_ARCH
4734 enable_preemption();
4735 #endif
4736 }
4737 vm_free_page_unlock();
4738
4739 if (mem == VM_PAGE_NULL) {
4740 if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
4741 vm_free_page_lock_spin();
4742 vm_lopages_allocated_cpm_failed++;
4743 vm_free_page_unlock();
4744
4745 return VM_PAGE_NULL;
4746 }
4747 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
4748
4749 mem->vmp_busy = TRUE;
4750
4751 vm_page_lockspin_queues();
4752
4753 mem->vmp_gobbled = FALSE;
4754 vm_page_gobble_count--;
4755 vm_page_wire_count--;
4756
4757 vm_lopages_allocated_cpm_success++;
4758 vm_page_unlock_queues();
4759 }
4760
4761 return vm_page_grab_finalize(grab_options, mem);
4762 }
4763
4764 #endif /* XNU_VM_HAS_LOPAGE */
4765 #if CONFIG_SECLUDED_MEMORY
4766
4767 /*!
4768 * @brief
4769 * Attempt to allocate a page from the secluded queue
4770 *
4771 * @discussion
4772 * This function will check that the caller is eligible
4773 * for the secluded pool, and if not, return VM_PAGE_NULL.
4774 */
4775 __attribute__((noinline))
4776 static vm_page_t
4777 vm_page_grab_secluded(vm_grab_options_t grab_options)
4778 {
4779 vm_page_t mem;
4780 vm_object_t object;
4781 int refmod_state;
4782
4783 #if HAS_MTE
4784 if (grab_options & VM_PAGE_GRAB_MTE) {
4785 return VM_PAGE_NULL;
4786 }
4787 #endif /* HAS_MTE */
4788 if (vm_page_secluded_count == 0) {
4789 return VM_PAGE_NULL;
4790 }
4791
4792 if (grab_options & VM_PAGE_GRAB_SECLUDED) {
4793 vm_page_secluded.grab_for_iokit++;
4794 } else if (!task_can_use_secluded_mem(current_task(), TRUE)) {
4795 return VM_PAGE_NULL;
4796 }
4797
4798
4799 /* secluded queue is protected by the VM page queue lock */
4800 vm_page_lock_queues();
4801
4802 if (vm_page_secluded_count == 0) {
4803 /* no secluded pages to grab... */
4804 vm_page_unlock_queues();
4805 return VM_PAGE_NULL;
4806 }
4807
4808 #if 00
4809 /* can we grab from the secluded queue? */
4810 if (vm_page_secluded_count > vm_page_secluded_target ||
4811 (vm_page_secluded_count > 0 &&
4812 task_can_use_secluded_mem(current_task(), TRUE))) {
4813 /* OK */
4814 } else {
4815 /* can't grab from secluded queue... */
4816 vm_page_unlock_queues();
4817 return VM_PAGE_NULL;
4818 }
4819 #endif
4820
4821 /* we can grab a page from secluded queue! */
4822 assert((vm_page_secluded_count_free +
4823 vm_page_secluded_count_inuse) ==
4824 vm_page_secluded_count);
4825 if (current_task()->task_can_use_secluded_mem) {
4826 assert(num_tasks_can_use_secluded_mem > 0);
4827 }
4828 assert(!vm_page_queue_empty(&vm_page_queue_secluded));
4829 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4830 mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
4831 assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
4832 vm_page_queues_remove(mem, TRUE);
4833
4834 object = VM_PAGE_OBJECT(mem);
4835
4836 assert(!vm_page_is_fictitious(mem));
4837 assert(!VM_PAGE_WIRED(mem));
4838 if (object == VM_OBJECT_NULL) {
4839 /* free for grab! */
4840 vm_page_unlock_queues();
4841 vm_page_secluded.grab_success_free++;
4842 goto out_success;
4843 }
4844
4845 assert(!object->internal);
4846 // vm_page_pageable_external_count--;
4847
4848 if (!vm_object_lock_try(object)) {
4849 // printf("SECLUDED: page %p: object %p locked\n", mem, object);
4850 vm_page_secluded.grab_failure_locked++;
4851 reactivate_secluded_page:
4852 vm_page_activate(mem);
4853 vm_page_unlock_queues();
4854 return VM_PAGE_NULL;
4855 }
4856 if (mem->vmp_busy ||
4857 mem->vmp_cleaning ||
4858 mem->vmp_laundry) {
4859 /* can't steal page in this state... */
4860 vm_object_unlock(object);
4861 vm_page_secluded.grab_failure_state++;
4862 goto reactivate_secluded_page;
4863 }
4864 if (mem->vmp_realtime) {
4865 /* don't steal pages used by realtime threads... */
4866 vm_object_unlock(object);
4867 vm_page_secluded.grab_failure_realtime++;
4868 goto reactivate_secluded_page;
4869 }
4870
4871 mem->vmp_busy = TRUE;
4872 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
4873 if (refmod_state & VM_MEM_REFERENCED) {
4874 mem->vmp_reference = TRUE;
4875 }
4876 if (refmod_state & VM_MEM_MODIFIED) {
4877 SET_PAGE_DIRTY(mem, FALSE);
4878 }
4879 if (mem->vmp_dirty || mem->vmp_precious) {
4880 /* can't grab a dirty page; re-activate */
4881 // printf("SECLUDED: dirty page %p\n", mem);
4882 vm_page_wakeup_done(object, mem);
4883 vm_page_secluded.grab_failure_dirty++;
4884 vm_object_unlock(object);
4885 goto reactivate_secluded_page;
4886 }
4887 if (mem->vmp_reference) {
4888 /* it's been used but we do need to grab a page... */
4889 }
4890
4891 vm_page_unlock_queues();
4892
4893 /* finish what vm_page_free() would have done... */
4894 vm_page_free_prepare_object(mem, TRUE);
4895 vm_object_unlock(object);
4896 object = VM_OBJECT_NULL;
4897
4898 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
4899 vm_page_secluded.grab_success_other++;
4900
4901 out_success:
4902
4903 if (grab_options & VM_PAGE_GRAB_SECLUDED) {
4904 vm_page_secluded.grab_for_iokit_success++;
4905 }
4906 return mem;
4907 }
4908
4909 uint64_t
4910 vm_page_secluded_drain(void)
4911 {
4912 vm_page_t local_freeq;
4913 int local_freed;
4914 uint64_t num_reclaimed;
4915 unsigned int saved_secluded_count, saved_secluded_target;
4916
4917 num_reclaimed = 0;
4918 local_freeq = NULL;
4919 local_freed = 0;
4920
4921 vm_page_lock_queues();
4922
4923 saved_secluded_count = vm_page_secluded_count;
4924 saved_secluded_target = vm_page_secluded_target;
4925 vm_page_secluded_target = 0;
4926 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
4927 while (vm_page_secluded_count) {
4928 vm_page_t secluded_page;
4929
4930 assert((vm_page_secluded_count_free +
4931 vm_page_secluded_count_inuse) ==
4932 vm_page_secluded_count);
4933 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
4934 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
4935
4936 vm_page_queues_remove(secluded_page, FALSE);
4937 assert(!vm_page_is_fictitious(secluded_page));
4938 assert(!VM_PAGE_WIRED(secluded_page));
4939
4940 if (secluded_page->vmp_object == 0) {
4941 /* transfer to free queue */
4942 assert(secluded_page->vmp_busy);
4943 secluded_page->vmp_snext = local_freeq;
4944 local_freeq = secluded_page;
4945 local_freed += 1;
4946 } else {
4947 /* transfer to head of active queue */
4948 vm_page_enqueue_active(secluded_page, FALSE);
4949 secluded_page = VM_PAGE_NULL;
4950 }
4951 num_reclaimed++;
4952 }
4953 vm_page_secluded_target = saved_secluded_target;
4954 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
4955
4956 // printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);
4957
4958 vm_page_unlock_queues();
4959
4960 if (local_freed) {
4961 vm_page_free_list(local_freeq, TRUE);
4962 local_freeq = NULL;
4963 local_freed = 0;
4964 }
4965
4966 return num_reclaimed;
4967 }
4968
4969 #endif /* CONFIG_SECLUDED_MEMORY */
4970
4971 /*!
4972 * @brief
4973 * Attempts to allocate a page from the specified per-cpu page queue.
4974 */
4975 static vm_page_t
4976 vm_page_grab_from_cpu(vm_page_t *cpu_list, scalable_counter_t *counter)
4977 {
4978 vm_page_t mem = _vm_page_list_pop(cpu_list);
4979
4980 if (mem != VM_PAGE_NULL) {
4981 #if HIBERNATION
4982 if (hibernate_rebuild_needed) {
4983 panic("should not modify cpu->free_pages while hibernating");
4984 }
4985 #endif /* HIBERNATION */
4986 counter_dec_preemption_disabled(counter);
4987 }
4988 return mem;
4989 }
4990
4991 #if HAS_MTE
4992 /*!
4993 * @brief
4994 * Attempts to allocate pages from free tag storage percpu queue.
4995 */
4996 static vm_page_t
4997 vm_page_grab_claimed_from_cpu(mte_pcpu_t pcpu, vm_grab_options_t options)
4998 {
4999 vm_page_t mem = VM_PAGE_NULL;
5000
5001 if (!(options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE)) {
5002 return VM_PAGE_NULL;
5003 }
5004
5005 if (vm_page_queue_empty(&pcpu->free_claimed_pages)) {
5006 return VM_PAGE_NULL;
5007 }
5008
5009 lck_ticket_lock(&pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);
5010
5011 if (!vm_page_queue_empty(&pcpu->free_claimed_pages)) {
5012 vm_page_queue_remove_first(&pcpu->free_claimed_pages,
5013 mem, vmp_pageq);
5014 counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
5015 counter_inc(&vm_cpu_claimed_count);
5016 /* must be done immediately to synchronize with stealing */
5017 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
5018 mem->vmp_local_id = 0;
5019 }
5020
5021 lck_ticket_unlock(&pcpu->free_claimed_lock);
5022
5023 return mem;
5024 }
5025 #endif /* HAS_MTE */
5026
5027 /*!
5028 * @brief
5029 * Attempts to allocate pages from free queues, and to populate the per-cpu
5030 * queue as a side effect.
5031 *
5032 * @discussion
5033 * This function will take the properties of the allocating thread into account
5034 * to decide how many pages it can allocate.
5035 *
5036 * If the free queues are depleted, then it will return VM_PAGE_NULL.
5037 */
5038 __attribute__((noinline))
5039 static vm_page_t
5040 vm_page_grab_slow(vm_grab_options_t grab_options)
5041 {
5042 #if HAS_MTE
5043 unsigned int mte_draw = 0;
5044 unsigned int mte_slop = 0;
5045 #endif /* HAS_MTE */
5046 unsigned int target = vm_free_magazine_refill_limit;
5047 vm_memory_class_t class = VM_MEMORY_CLASS_REGULAR;
5048 vm_page_t mem = VM_PAGE_NULL;
5049 vm_page_list_t list = { };
5050 vm_page_t *cpu_list = NULL;
5051 scalable_counter_t *counter = NULL;
5052
5053 vm_free_page_lock_spin();
5054 #if LCK_MTX_USE_ARCH
5055 /* Intel doesn't disable preemption with vm_free_page_lock_spin() */
5056 disable_preemption();
5057 #endif /* LCK_MTX_USE_ARCH */
5058 cpu_list = PERCPU_GET(free_pages);
5059 counter = &vm_cpu_free_count;
5060 #if HAS_MTE
5061 if (grab_options & VM_PAGE_GRAB_MTE) {
5062 again:
5063 cpu_list = &PERCPU_GET(mte_pcpu)->free_tagged_pages;
5064 counter = &vm_cpu_free_tagged_count;
5065 target = vm_free_magazine_refill_limit / 2;
5066 class = VM_MEMORY_CLASS_TAGGED;
5067 mte_slop = 0;
5068 } else if (grab_options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
5069 /*
5070 * Note that this is the last time we'll explicitly try to grab
5071 * free, claimable pages. If it comes down to it, we'll grab either
5072 * normal or dead tag storage pages in vm_page_free_queue_grab()
5073 * and hopefully refill the per-CPU free claimable queue.
5074 */
5075 mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
5076 mem = vm_page_grab_claimed_from_cpu(mte_pcpu, grab_options);
5077 }
5078 if (mem == VM_PAGE_NULL)
5079 #endif /* HAS_MTE */
5080 {
5081 mem = vm_page_grab_from_cpu(cpu_list, counter);
5082 }
5083 if (mem != VM_PAGE_NULL) {
5084 #if LCK_MTX_USE_ARCH
5085 enable_preemption();
5086 #endif /* LCK_MTX_USE_ARCH */
5087 vm_free_page_unlock();
5088 return mem;
5089 }
5090
5091 if (vm_page_free_count <= vm_page_free_reserved) {
5092 if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5093 target = 0;
5094 } else if (vm_page_free_count == 0) {
5095 target = 0;
5096 } else {
5097 target = 1;
5098 }
5099 } else {
5100 target = MIN(target, vm_page_free_count - vm_page_free_reserved);
5101 }
5102 #if HAS_MTE
5103 if (grab_options & VM_PAGE_GRAB_MTE) {
5104 mte_draw = target;
5105 target = 0;
5106 if (vm_page_free_taggable_count < mte_draw + vm_page_free_min &&
5107 vm_page_free_count >= mte_draw + vm_page_free_min &&
5108 !(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
5109 /*
5110 * If the mte draw is such that we deplete our reserves,
5111 * but there are enough free untaggable pages available,
5112 * attempt to activate pages in order to rebalance
5113 * toward the taggable pool.
5114 *
5115 * If the operation succeeds, the free page queue lock
5116 * was dropped and we need to re-take it from the top.
5117 */
5118 if (mteinfo_tag_storage_try_activate(mte_draw +
5119 vm_page_free_min - vm_page_free_taggable_count,
5120 /* lock_spin */ true)) {
5121 goto again;
5122 }
5123 }
5124 } else if (target > vm_page_free_count - vm_page_free_taggable_count) {
5125 mte_draw = target - (vm_page_free_count - vm_page_free_taggable_count);
5126 target = (vm_page_free_count - vm_page_free_taggable_count);
5127 } else {
5128 mte_draw = 0;
5129 }
5130
5131 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
5132 if ((current_thread()->options & TH_OPT_VMPRIV) == 0) {
5133 mte_draw = 0;
5134 } else if (vm_page_free_taggable_count == 0) {
5135 mte_draw = 0;
5136 } else if (target) {
5137 mte_draw = 0;
5138 } else {
5139 mte_draw = 1;
5140 }
5141 } else {
5142 mte_draw = MIN(mte_draw,
5143 vm_page_free_taggable_count - vm_page_free_reserved);
5144 }
5145
5146 target += mte_draw;
5147 #endif /* HAS_MTE */
5148
5149 #if HIBERNATION
5150 if (target > 0 && hibernate_rebuild_needed) {
5151 panic("should not modify CPU free_pages while hibernating");
5152 }
5153 #endif /* HIBERNATION */
5154
5155 /*
5156 * Convert the lock hold into a mutex, to signal to waiters that the
5157 * lock may be held for longer.
5158 */
5159 #if !LCK_MTX_USE_ARCH
5160 disable_preemption();
5161 #endif /* !LCK_MTX_USE_ARCH */
5162 vm_free_page_lock_convert();
5163
5164 if (target != 0) {
5165 list = vm_page_free_queue_grab(grab_options, class, target,
5166 VM_PAGE_ON_FREE_LOCAL_Q);
5167 }
5168
5169 #if VM_PAGE_WIRE_COUNT_WARNING
5170 if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) {
5171 printf("mk: vm_page_grab(): high wired page count of %d\n",
5172 vm_page_wire_count);
5173 }
5174 #endif
5175 #if VM_PAGE_GOBBLE_COUNT_WARNING
5176 if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) {
5177 printf("mk: vm_page_grab(): high gobbled page count of %d\n",
5178 vm_page_gobble_count);
5179 }
5180 #endif
5181
5182 if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5183 thread_wakeup(&vm_page_free_wanted);
5184 }
5185
5186 vm_free_page_unlock();
5187
5188 VM_CHECK_MEMORYSTATUS;
5189
5190 if (list.vmpl_head) {
5191 #if HAS_MTE
5192 mteinfo_page_list_fix_tagging(class, &list);
5193 #endif /* HAS_MTE */
5194 /* Steal a page off the list for the caller. */
5195 mem = vm_page_list_pop(&list);
5196
5197 /* Add the remaining pages to the CPU's free list. */
5198 assert(*cpu_list == VM_PAGE_NULL);
5199 *cpu_list = list.vmpl_head;
5200 counter_add_preemption_disabled(counter, list.vmpl_count);
5201 }
5202
5203 enable_preemption();
5204
5205 return mem;
5206 }
5207
5208 vm_page_t
5209 vm_page_grab_options(vm_grab_options_t options)
5210 {
5211 #if HAS_MTE
5212 mte_pcpu_t mte_pcpu;
5213 vm_page_t *cpu_list;
5214 scalable_counter_t *counter;
5215 #endif
5216 vm_page_t mem;
5217
5218 restart:
5219
5220 /*
5221 * Step 1: look at the CPU magazines.
5222 */
5223
5224 disable_preemption();
5225 #if HAS_MTE
5226 mte_pcpu = PERCPU_GET(mte_pcpu);
5227 if (options & VM_PAGE_GRAB_MTE) {
5228 cpu_list = &mte_pcpu->free_tagged_pages;
5229 counter = &vm_cpu_free_tagged_count;
5230 mem = VM_PAGE_NULL;
5231 } else {
5232 cpu_list = PERCPU_GET(free_pages);
5233 counter = &vm_cpu_free_count;
5234 mem = VM_PAGE_NULL;
5235 }
5236
5237 if (options & VM_PAGE_GRAB_ALLOW_TAG_STORAGE) {
5238 mem = vm_page_grab_claimed_from_cpu(mte_pcpu, options);
5239 }
5240 if (mem == VM_PAGE_NULL) {
5241 mem = vm_page_grab_from_cpu(cpu_list, counter);
5242 }
5243 #else
5244 mem = vm_page_grab_from_cpu(PERCPU_GET(free_pages), &vm_cpu_free_count);
5245 #endif /* HAS_MTE */
5246 enable_preemption();
5247
5248 if (mem != VM_PAGE_NULL) {
5249 return vm_page_grab_finalize(options, mem);
5250 }
5251
5252 #if XNU_VM_HAS_DELAYED_PAGES
5253 /*
5254 * If free count is low and we have delayed pages from early boot,
5255 * get one of those instead.
5256 */
5257 if (__improbable(vm_delayed_count > 0 &&
5258 vm_page_free_count <= vm_page_free_target)) {
5259 mem = vm_get_delayed_page(options);
5260 if (mem != VM_PAGE_NULL) {
5261 return vm_page_grab_finalize(options, mem);
5262 }
5263 }
5264 #endif /* XNU_VM_HAS_DELAYED_PAGES */
5265
5266
5267 /*
5268 * Step 2: Try to promote pages from the free queues,
5269 * or the secluded queue if appropriate.
5270 */
5271
5272 mem = vm_page_grab_slow(options);
5273 if (mem != VM_PAGE_NULL) {
5274 return vm_page_grab_finalize(options, mem);
5275 }
5276
5277 #if CONFIG_SECLUDED_MEMORY
5278 mem = vm_page_grab_secluded(options);
5279 if (mem != VM_PAGE_NULL) {
5280 return vm_page_grab_finalize(options, mem);
5281 }
5282 #endif /* CONFIG_SECLUDED_MEMORY */
5283
5284
5285 /*
5286 * Step 3: Privileged threads block and retry, others fail.
5287 */
5288
5289 #if HAS_MTE
5290 if (options & VM_PAGE_GRAB_MTE) {
5291 current_thread()->page_wait_class = VM_MEMORY_CLASS_TAGGED;
5292 } else {
5293 current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
5294 }
5295 #endif /* HAS_MTE */
5296 if ((options & VM_PAGE_GRAB_NOPAGEWAIT) == 0 &&
5297 (current_thread()->options & TH_OPT_VMPRIV) != 0) {
5298 VM_PAGE_WAIT();
5299 goto restart;
5300 }
5301
5302 return VM_PAGE_NULL;
5303 }
5304
5305 vm_grab_options_t
5306 vm_page_grab_options_for_object(vm_object_t object __unused)
5307 {
5308 vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
5309
5310 #if CONFIG_SECLUDED_MEMORY
5311 if (object->can_grab_secluded) {
5312 options |= VM_PAGE_GRAB_SECLUDED;
5313 }
5314 #endif /* CONFIG_SECLUDED_MEMORY */
5315 #if HAS_MTE
5316 if (vm_object_is_mte_mappable(object)) {
5317 options |= VM_PAGE_GRAB_MTE;
5318 }
5319 #endif /* HAS_MTE */
5320
5321 return options;
5322 }
5323
5324 /*!
5325 * @function vm_page_free_queue_steal()
5326 *
5327 * @abstract
5328 * Steal a given page from the free queues.
5329 *
5330 * @discussion
5331 * The given page must be in the given free queue, or state may be corrupted.
5332 *
5333 * Internally, the free queue is not synchronized, so any locking must be done
5334 * outside of this function.
5335 *
5336 * This function, like vm_page_grab(), takes care of waking up
5337 * page out scan as needed.
5338 */
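/*
 * Illustrative sketch only: a caller in this file is expected to hold the
 * free-page lock and to have verified that the page really is on the free
 * queue before stealing it:
 *
 *	vm_free_page_lock_spin();
 *	assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
 *	vm_page_free_queue_steal(options, mem);
 *	vm_free_page_unlock();
 */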
5339 static void
5340 vm_page_free_queue_steal(vm_grab_options_t options, vm_page_t mem)
5341 {
5342 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
5343 vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
5344
5345 assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
5346 assert(!mem->vmp_lopage && mem->vmp_busy);
5347
5348 vm_page_free_queue_remove(class, mem, pnum, VM_PAGE_NOT_ON_Q);
5349 vm_page_grab_finalize(options, mem);
5350
5351 if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) {
5352 thread_wakeup(&vm_page_free_wanted);
5353 }
5354 }
5355
5356 #if HAS_MTE
5357 /*!
5358 * @function _vm_page_wait_wakeup_fill_thread()
5359 *
5360 * @abstract
5361 * Given the number of waiters, return whether the MTE fill thread should
5362 * wake up.
5363 *
5364 * @discussion
5365 * The idea is to wake up the MTE fill thread without explicitly triggering
5366 * pageout_scan(), which means @c vm_page_free_count must be at least
5367 * @c vm_page_free_min. On top of that, it's possible that tag storage pages
5368 * may get relocated, which means that some free untagged pages will be needed
5369 * to activate a tag storage page. This function uses the naive, pessimistic
5370 * heuristic that a given tag storage page does not have many free covered
5371 * pages, and some number of those tag storage pages will need to be relocated.
5372 *
5373 * The free queue lock should be held during this function.
5374 *
5375 * @param n_waiters The number of waiters for tagged memory.
5376 *
5377 * @returns Whether the system has enough free pages to
5378 * wake up the MTE fill thread.
5379 */
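/*
 * Worked example of the heuristic below (numbers are illustrative): with
 * n_waiters == 4 the fill thread is woken only while
 * vm_page_free_count > vm_page_free_min + (3 * 4) / 2, i.e. there must be
 * at least vm_page_free_min + 7 free pages, leaving slack for any tag
 * storage pages that have to be relocated.
 */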
5380 static bool
5381 _vm_page_wait_wakeup_fill_thread(uint32_t n_waiters)
5382 {
5383 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
5384 return vm_page_free_count > vm_page_free_min + (3 * n_waiters) / 2;
5385 }
5386 #endif /* HAS_MTE */
5387
5388 /*
5389 * vm_page_wait:
5390 *
5391 * Wait for a page to become available.
5392 * If there are plenty of free pages, then we don't sleep.
5393 *
5394 * Returns:
5395 * TRUE: There may be another page, try again
5396 * FALSE: We were interrupted out of our wait, don't try again
5397 */
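/*
 * Typical caller pattern (sketch only; the error handling shown is
 * illustrative):
 *
 *	while ((m = vm_page_grab_options(VM_PAGE_GRAB_OPTIONS_NONE)) == VM_PAGE_NULL) {
 *		if (!vm_page_wait(THREAD_UNINT)) {
 *			return KERN_ABORTED;
 *		}
 *	}
 */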
5398
5399 boolean_t
5400 vm_page_wait(int interruptible)
5401 {
5402 /*
5403 * We can't use vm_page_free_reserved to make this
5404 * determination. Consider: some thread might
5405 * need to allocate two pages. The first allocation
5406 * succeeds, the second fails. After the first page is freed,
5407 * a call to vm_page_wait must really block.
5408 */
5409 kern_return_t wait_result = THREAD_NOT_WAITING;
5410 thread_t cur_thread = current_thread();
5411 bool is_privileged = cur_thread->options & TH_OPT_VMPRIV;
5412 bool need_wakeup = false;
5413 event_t wait_event = NULL;
5414 #if HAS_MTE
5415 bool wakeup_refill_thread = false;
5416 #endif /* HAS_MTE */
5417
5418 vm_free_page_lock_spin();
5419
5420 #if HAS_MTE
5421 if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5422 if (is_privileged) {
5423 if (vm_page_free_taggable_count) {
5424 vm_free_page_unlock();
5425 goto out;
5426 }
5427
5428 if (vm_page_free_wanted_tagged_privileged++ == 0) {
5429 wakeup_refill_thread = true;
5430 }
5431
5432 wait_event = (event_t)&vm_page_free_wanted_tagged_privileged;
5433 } else if (vm_page_free_taggable_count >= vm_page_free_target) {
5434 vm_free_page_unlock();
5435 goto out;
5436 } else {
5437 if (vm_page_free_wanted_tagged++ == 0) {
5438 wakeup_refill_thread = true;
5439 }
5440
5441 wait_event = (event_t)&vm_page_free_wanted_tagged;
5442 }
5443 } else
5444 #endif /* !HAS_MTE */
5445 if (is_privileged) {
5446 if (vm_page_free_count) {
5447 vm_free_page_unlock();
5448 goto out;
5449 }
5450
5451 if (vm_page_free_wanted_privileged++ == 0) {
5452 need_wakeup = true;
5453 }
5454
5455 wait_event = (event_t)&vm_page_free_wanted_privileged;
5456 } else if (vm_page_free_count >= vm_page_free_target) {
5457 vm_free_page_unlock();
5458 goto out;
5459 #if CONFIG_SECLUDED_MEMORY
5460 } else if (secluded_for_apps &&
5461 task_can_use_secluded_mem(current_task(), FALSE)) {
5462 #if 00
5463 /* XXX FBDP: need pageq lock for this... */
5464 /* XXX FBDP: might wait even if pages available, */
5465 /* XXX FBDP: hopefully not for too long... */
5466 if (vm_page_secluded_count > 0) {
5467 vm_free_page_unlock();
5468 goto out;
5469 }
5470 #endif
5471 if (vm_page_free_wanted_secluded++ == 0) {
5472 need_wakeup = true;
5473 }
5474
5475 wait_event = (event_t)&vm_page_free_wanted_secluded;
5476 #endif /* CONFIG_SECLUDED_MEMORY */
5477 } else {
5478 if (vm_page_free_wanted++ == 0) {
5479 need_wakeup = true;
5480 }
5481
5482 wait_event = (event_t)&vm_page_free_count;
5483 }
5484
5485 #if HAS_MTE
5486 /*
5487 * If we're here, it means that the free taggable count is low.
5488 * If there are enough free pages in the system, we can ask the
5489 * fill thread to convert some free untagged pages to free tagged
5490 * pages. Otherwise, we will wake up pageout_scan(), which will
5491 * free pages, and on the free path, the fill thread will get woken up
5492 * (see vm_page_free_queue_handle_wakeups_and_unlock()).
5493 *
5494 * The fill thread will run or not run under a variety of conditions
5495 * (see mteinfo_tag_storage_active_should_refill() for more details),
5496 * but what's relevant here is that the fill thread will run so long
5497 * as there are tagged waiters. We should at least ensure that the
5498 * system has enough free untagged memory to service the existing
5499 * tagged waiters.
5500 */
5501 if (wakeup_refill_thread) {
5502 uint32_t total_tagged_waiters = vm_page_free_wanted_tagged_privileged +
5503 vm_page_free_wanted_tagged;
5504 if (_vm_page_wait_wakeup_fill_thread(total_tagged_waiters)) {
5505 /* Enough free pages to cover the tagged waiters; let the fill thread run. */
5506 } else {
5507 /*
5508 * Otherwise, wake up pageout_scan(), and the fill thread will
5509 * run later.
5510 */
5511 wakeup_refill_thread = false;
5512 need_wakeup = true;
5513 }
5514 }
5515
5516 #endif /* HAS_MTE */
5517 if (vm_pageout_running) {
5518 need_wakeup = false;
5519 }
5520
5521 /*
5522 * We don't do a vm_pageout_scan wakeup if we already have
5523 * some waiters because vm_pageout_scan checks for waiters
5524 * before it returns and does so behind the vm_page_queue_free_lock,
5525 * which we own when we bump the waiter counts.
5526 */
5527
5528 if (vps_dynamic_priority_enabled) {
5529 /*
5530 * We are waking up vm_pageout_scan here. If it needs
5531 * the vm_page_queue_free_lock before we unlock it
5532 * we'll end up just blocking and incur an extra
5533 * context switch. Could be a perf. issue.
5534 */
5535
5536 #if HAS_MTE
5537 if (cur_thread->page_wait_class != VM_MEMORY_CLASS_REGULAR) {
5538 panic("vm_page_wait does not support MTE+vps_dynamic_priority_enabled");
5539 }
5540 #endif /* HAS_MTE */
5541 if (need_wakeup) {
5542 thread_wakeup((event_t)&vm_page_free_wanted);
5543 }
5544
5545 /*
5546 * LD: This event is going to get recorded every time because
5547 * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
5548 * We just block in that routine.
5549 */
5550 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
5551 vm_page_free_wanted_privileged,
5552 vm_page_free_wanted,
5553 #if CONFIG_SECLUDED_MEMORY
5554 vm_page_free_wanted_secluded,
5555 #else /* CONFIG_SECLUDED_MEMORY */
5556 0,
5557 #endif /* CONFIG_SECLUDED_MEMORY */
5558 0);
5559 wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
5560 LCK_SLEEP_UNLOCK,
5561 wait_event,
5562 vm_pageout_scan_thread,
5563 interruptible,
5564 0);
5565 } else {
5566 wait_result = assert_wait(wait_event, interruptible);
5567
5568 vm_free_page_unlock();
5569
5570 if (need_wakeup) {
5571 thread_wakeup((event_t)&vm_page_free_wanted);
5572 }
5573 #if HAS_MTE
5574 if (wakeup_refill_thread) {
5575 assert(!need_wakeup);
5576 mteinfo_wake_fill_thread();
5577 }
5578 #endif /* HAS_MTE */
5579
5580 if (wait_result != THREAD_WAITING) {
5581 goto out;
5582 }
5583
5584 #if HAS_MTE
5585 if (cur_thread->page_wait_class == VM_MEMORY_CLASS_TAGGED) {
5586 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5587 DBG_VM_PAGE_MTE_WAIT_BLOCK,
5588 DBG_FUNC_START,
5589 vm_page_free_wanted_tagged_privileged,
5590 vm_page_free_wanted_tagged,
5591 0,
5592 0);
5593 wait_result = thread_block(THREAD_CONTINUE_NULL);
5594 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5595 DBG_VM_PAGE_MTE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5596 goto out;
5597 }
5598 #endif /* HAS_MTE */
5599
5600 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5601 DBG_VM_PAGE_WAIT_BLOCK,
5602 DBG_FUNC_START,
5603 vm_page_free_wanted_privileged,
5604 vm_page_free_wanted,
5605 #if CONFIG_SECLUDED_MEMORY
5606 vm_page_free_wanted_secluded,
5607 #else /* CONFIG_SECLUDED_MEMORY */
5608 0,
5609 #endif /* CONFIG_SECLUDED_MEMORY */
5610 0);
5611 wait_result = thread_block(THREAD_CONTINUE_NULL);
5612 VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
5613 DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
5614 }
5615
5616 out:
5617 #if HAS_MTE
5618 cur_thread->page_wait_class = VM_MEMORY_CLASS_REGULAR;
5619 #endif /* HAS_MTE */
5620 return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
5621 }
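/*
 * Illustrative caller pattern (a sketch, not code from this file):
 * allocation paths typically loop, grabbing a page and calling
 * vm_page_wait() whenever the free list is exhausted, e.g.
 *
 *	while ((mem = vm_page_grab()) == VM_PAGE_NULL) {
 *		if (!vm_page_wait(interruptible)) {
 *			return KERN_ABORTED;
 *		}
 *	}
 *
 * A FALSE return from vm_page_wait() means the wait was interrupted and
 * the caller should give up rather than retry.
 */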
5622
5623 /*
5624 * vm_page_free_prepare:
5625 *
5626 * Removes page from any queue it may be on
5627 * and disassociates it from its VM object.
5628 *
5629 * Object and page queues must be locked prior to entry.
5630 */
5631 static void
5632 vm_page_free_prepare(
5633 vm_page_t mem)
5634 {
5635 vm_page_free_prepare_queues(mem);
5636 vm_page_free_prepare_object(mem, TRUE);
5637 #if CONFIG_SPTM
5638 /**
5639 * The pmap should retype frames as necessary when pmap_recycle_page()
5640 * is called. In order to catch potential cases where this does not
5641 * happen, add an appropriate assert here. This code should be
5642 * executed on every frame that is about to be released to the VM.
5643 */
5644 const sptm_paddr_t paddr = ((uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)) << PAGE_SHIFT;
5645 __unused const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
5646
5647 assert(frame_type == XNU_DEFAULT);
5648 #endif /* CONFIG_SPTM */
5649
5650 #if HAS_MTE
5651 /*
5652 * At this point, any busy bit on `mem` has been cleared. If the refill
5653 * thread wanted this page, update the cell state from PINNED to CLAIMED.
5654 *
5655 * We only expect to come through here when swap-ins/outs have failed.
5656 */
5657 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && mem->vmp_ts_wanted) {
5658 mteinfo_tag_storage_wakeup(mem, false);
5659 }
5660 #endif /* HAS_MTE */
5661 }
5662
5663
5664 void
5665 vm_page_free_prepare_queues(
5666 vm_page_t mem)
5667 {
5668 vm_object_t m_object;
5669
5670 VM_PAGE_CHECK(mem);
5671
5672 assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
5673 assert(!mem->vmp_cleaning);
5674 m_object = VM_PAGE_OBJECT(mem);
5675
5676 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5677 if (m_object) {
5678 vm_object_lock_assert_exclusive(m_object);
5679 }
5680 if (mem->vmp_laundry) {
5681 /*
5682 * We may have to free a page while it's being laundered
5683 * if we lost its pager (due to a forced unmount, for example).
5684 * We need to call vm_pageout_steal_laundry() before removing
5685 * the page from its VM object, so that we can remove it
5686 * from its pageout queue and adjust the laundry accounting
5687 */
5688 vm_pageout_steal_laundry(mem, TRUE);
5689 }
5690
5691 vm_page_queues_remove(mem, TRUE);
5692
5693 if (mem->vmp_realtime) {
5694 mem->vmp_realtime = false;
5695 VM_COUNTER_DEC(&vm_page_realtime_count);
5696 }
5697
5698 if (VM_PAGE_WIRED(mem)) {
5699 assert(mem->vmp_wire_count > 0);
5700
5701 if (m_object) {
5702 task_t owner;
5703 int ledger_idx_volatile;
5704 int ledger_idx_nonvolatile;
5705 int ledger_idx_volatile_compressed;
5706 int ledger_idx_nonvolatile_compressed;
5707 int ledger_idx_composite;
5708 int ledger_idx_external_wired;
5709 boolean_t do_footprint;
5710
5711 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
5712 VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
5713 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
5714
5715 assert(m_object->resident_page_count >=
5716 m_object->wired_page_count);
5717
5718 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
5719 OSAddAtomic(+1, &vm_page_purgeable_count);
5720 assert(vm_page_purgeable_wired_count > 0);
5721 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
5722 }
5723 if (m_object->internal &&
5724 m_object->vo_owner != TASK_NULL &&
5725 (m_object->purgable == VM_PURGABLE_VOLATILE ||
5726 m_object->purgable == VM_PURGABLE_EMPTY)) {
5727 owner = VM_OBJECT_OWNER(m_object);
5728 vm_object_ledger_tag_ledgers(
5729 m_object,
5730 &ledger_idx_volatile,
5731 &ledger_idx_nonvolatile,
5732 &ledger_idx_volatile_compressed,
5733 &ledger_idx_nonvolatile_compressed,
5734 &ledger_idx_composite,
5735 &ledger_idx_external_wired,
5736 &do_footprint);
5737 /*
5738 * While wired, this page was accounted
5739 * as "non-volatile" but it should now
5740 * be accounted as "volatile".
5741 */
5742 /* one less "non-volatile"... */
5743 ledger_debit(owner->ledger,
5744 ledger_idx_nonvolatile,
5745 PAGE_SIZE);
5746 if (do_footprint) {
5747 /* ... and "phys_footprint" */
5748 ledger_debit(owner->ledger,
5749 task_ledgers.phys_footprint,
5750 PAGE_SIZE);
5751 } else if (ledger_idx_composite != -1) {
5752 ledger_debit(owner->ledger,
5753 ledger_idx_composite,
5754 PAGE_SIZE);
5755 }
5756 /* one more "volatile" */
5757 ledger_credit(owner->ledger,
5758 ledger_idx_volatile,
5759 PAGE_SIZE);
5760 }
5761 }
5762 if (vm_page_is_canonical(mem)) {
5763 vm_page_wire_count--;
5764 }
5765
5766 #if HAS_MTE
5767 mteinfo_decrement_wire_count(mem, true);
5768 #endif /* HAS_MTE */
5769
5770 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
5771 mem->vmp_iopl_wired = false;
5772 mem->vmp_wire_count = 0;
5773 assert(!mem->vmp_gobbled);
5774 } else if (mem->vmp_gobbled) {
5775 if (vm_page_is_canonical(mem)) {
5776 vm_page_wire_count--;
5777 }
5778 vm_page_gobble_count--;
5779 }
5780 }
5781
5782 /*
5783 * like vm_page_init, but we have to preserve fields related to phys page
5784 */
5785 inline static void
5786 vm_page_reset_canonical(vm_page_t mem)
5787 {
5788 *mem = (struct vm_page){
5789 .vmp_offset = (vm_object_offset_t)-1,
5790 .vmp_q_state = VM_PAGE_NOT_ON_Q,
5791 .vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY,
5792 #if XNU_VM_HAS_LOPAGE
5793 .vmp_lopage = mem->vmp_lopage,
5794 #endif /* XNU_VM_HAS_LOPAGE */
5795 .vmp_canonical = true,
5796 .vmp_busy = true,
5797 .vmp_realtime = mem->vmp_realtime,
5798 #if HAS_MTE
5799 .vmp_using_mte = mem->vmp_using_mte,
5800 #endif
5801 #if !XNU_VM_HAS_LINEAR_PAGES_ARRAY
5802 .vmp_phys_page = mem->vmp_phys_page,
5803 #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */
5804 };
5805 /* ECC information is out of `struct vm_page` and preserved */
5806 }
5807
5808 void
5809 vm_page_free_prepare_object(vm_page_t mem, boolean_t remove_from_hash)
5810 {
5811 if (mem->vmp_tabled) {
5812 vm_page_remove(mem, remove_from_hash); /* clears tabled, object, offset */
5813 }
5814 vm_page_wakeup(VM_OBJECT_NULL, mem); /* clears wanted */
5815
5816 if (vm_page_is_private(mem)) {
5817 vm_page_reset_private(mem);
5818 }
5819 if (vm_page_is_canonical(mem)) {
5820 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 &&
5821 mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 &&
5822 mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 &&
5823 mem->vmp_next_m == 0);
5824
5825 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5826
5827 vm_page_reset_canonical(mem);
5828 }
5829 }
5830
5831 /*
5832 * vm_page_release:
5833 *
5834 * Return a page to the free list.
5835 *
5836 * Keep in sync with vm_page_free_list().
5837 */
5838
5839 void
5840 vm_page_release(vm_page_t mem, vmp_release_options_t options)
5841 {
5842 if (options & VMP_RELEASE_Q_LOCKED) {
5843 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5844 } else {
5845 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5846 }
5847
5848 assert(vm_page_is_canonical(mem));
5849 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
5850
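/*
 * Unless the caller indicates that vm_page_free_prepare_object() (or an
 * equivalent path) has already recycled this page with the pmap, do it
 * here before the page goes back on the free queues.
 */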
5851 if ((options & VMP_RELEASE_SKIP_FREE_CHECK) == 0) {
5852 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5853 }
5854
5855 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5856
5857
5858 vm_page_free_queue_enter_list(vm_page_list_for_page(mem), options);
5859 }
5860
5861 /*
5862 * This version of vm_page_release() is used only at startup
5863 * when we are single-threaded and pages are being released
5864 * for the first time. Hence, no locks are taken and unnecessary checks are skipped.
5865 * Note: VM_CHECK_MEMORYSTATUS invoked by the caller.
5866 */
5867 void
5868 vm_page_release_startup(vm_page_t mem)
5869 {
5870 #if HAS_MTE
5871 if (pmap_in_tag_storage_range(VM_PAGE_GET_PHYS_PAGE(mem)) && is_mte_enabled) {
5872 /*
5873 * Add the MTE tag page to the FREE_MTE_TAG queue. These pages
5874 * can be used/claimed for other purposes (other than tag pages)
5875 * provided that they can be reclaimed quickly without waiting
5876 * on I/O, e.g. readonly/clean file pages.
5877 */
5878 mteinfo_tag_storage_release_startup(mem);
5879 return;
5880 }
5881 #endif /* HAS_MTE */
5882 vm_page_free_queue_enter_list(vm_page_list_for_page(mem),
5883 VMP_RELEASE_STARTUP);
5884 }
5885
5886 /*
5887 * vm_page_free:
5888 *
5889 * Returns the given page to the free list,
5890 * disassociating it with any VM object.
5891 *
5892 * Object and page queues must be locked prior to entry.
5893 */
5894 void
5895 vm_page_free(vm_page_t mem)
5896 {
5897 vm_page_free_prepare(mem);
5898
5899 if (vm_page_is_canonical(mem)) {
5900 /* page queues are locked */
5901 vm_page_release(mem, VMP_RELEASE_Q_LOCKED |
5902 VMP_RELEASE_SKIP_FREE_CHECK);
5903 } else {
5904 vm_page_release_fictitious(mem);
5905 }
5906 }
5907
5908
5909 void
5910 vm_page_free_unlocked(vm_page_t mem, boolean_t remove_from_hash)
5911 {
5912 vm_page_lockspin_queues();
5913 vm_page_free_prepare_queues(mem);
5914 vm_page_unlock_queues();
5915
5916 vm_page_free_prepare_object(mem, remove_from_hash);
5917
5918 if (vm_page_is_canonical(mem)) {
5919 /* page queues are not locked */
5920 vm_page_release(mem, VMP_RELEASE_SKIP_FREE_CHECK);
5921 } else {
5922 vm_page_release_fictitious(mem);
5923 }
5924 }
5925
5926
5927 /*
5928 * Free a list of pages. The list can be up to several hundred pages,
5929 * as blocked up by vm_pageout_scan().
5930 * The big win is not having to take the free list lock once
5931 * per page.
5932 *
5933 * The VM page queues lock (vm_page_queue_lock) should NOT be held.
5934 * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held.
5935 *
5936 * Keep in sync with vm_page_release().
5937 */
5938 void
5939 vm_page_free_list(vm_page_t freeq, bool prepare_object)
5940 {
5941 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
5942 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
5943
5944 while (freeq) {
5945 vm_page_list_t list = { };
5946
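/*
 * Accumulate up to VMP_FREE_BATCH_SIZE prepared pages locally, then
 * hand the whole batch to the free queues at once so the free-list
 * lock is taken once per batch rather than once per page.
 */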
5947 while (list.vmpl_count < VMP_FREE_BATCH_SIZE && freeq) {
5948 vm_page_t mem = _vm_page_list_pop(&freeq);
5949
5950 assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
5951 (mem->vmp_q_state == VM_PAGE_IS_WIRED));
5952
5953 if (prepare_object) {
5954 vm_page_free_prepare_object(mem, TRUE);
5955 }
5956
5957 if (vm_page_is_fictitious(mem)) {
5958 vm_page_release_fictitious(mem);
5959 continue;
5960 }
5961
5962 if (!prepare_object) {
5963 /* vm_page_free_prepare_object() checked it */
5964 pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem));
5965 }
5966
5967 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
5968
5969
5970 /*
5971 * IMPORTANT: we can't set the page "free" here
5972 * because that would make the page eligible for
5973 * a physically-contiguous allocation (see
5974 * vm_page_find_contiguous()) right away (we don't
5975 * hold the vm_page_queue_free lock). That would
5976 * cause trouble because the page is not actually
5977 * in the free queue yet...
5978 */
5979
5980 vm_page_list_push(&list, mem);
5981 }
5982
5983 if (list.vmpl_count) {
5984 vm_page_free_queue_enter_list(list, VMP_RELEASE_NONE);
5985 }
5986 }
5987 }
5988
5989
5990 /*
5991 * vm_page_wire:
5992 *
5993 * Mark this page as wired down by yet
5994 * another map, removing it from paging queues
5995 * as necessary.
5996 *
5997 * The page's object and the page queues must be locked.
5998 */
5999
6000
6001 void
6002 vm_page_wire(
6003 vm_page_t mem,
6004 vm_tag_t tag,
6005 boolean_t check_memorystatus)
6006 {
6007 vm_object_t m_object;
6008
6009 m_object = VM_PAGE_OBJECT(mem);
6010
6011 // dbgLog(current_thread(), mem->vmp_offset, m_object, 1); /* (TEST/DEBUG) */
6012
6013 VM_PAGE_CHECK(mem);
6014 if (m_object) {
6015 vm_object_lock_assert_exclusive(m_object);
6016 } else {
6017 /*
6018 * In theory, the page should be in an object before it
6019 * gets wired, since we need to hold the object lock
6020 * to update some fields in the page structure.
6021 * However, some code (i386 pmap, for example) might want
6022 * to wire a page before it gets inserted into an object.
6023 * That's somewhat OK, as long as nobody else can get to
6024 * that page and update it at the same time.
6025 */
6026 }
6027 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6028 if (!VM_PAGE_WIRED(mem)) {
6029 if (mem->vmp_laundry) {
6030 vm_pageout_steal_laundry(mem, TRUE);
6031 }
6032
6033 vm_page_queues_remove(mem, TRUE);
6034
6035 assert(mem->vmp_wire_count == 0);
6036 mem->vmp_q_state = VM_PAGE_IS_WIRED;
6037
6038 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
6039 if (mem->vmp_unmodified_ro == true) {
6040 /* Object and PageQ locks are held*/
6041 mem->vmp_unmodified_ro = false;
6042 os_atomic_dec(&compressor_ro_uncompressed, relaxed);
6043 vm_object_compressor_pager_state_clr(VM_PAGE_OBJECT(mem), mem->vmp_offset);
6044 }
6045 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
6046
6047 if (m_object) {
6048 task_t owner;
6049 int ledger_idx_volatile;
6050 int ledger_idx_nonvolatile;
6051 int ledger_idx_volatile_compressed;
6052 int ledger_idx_nonvolatile_compressed;
6053 int ledger_idx_composite;
6054 int ledger_idx_external_wired;
6055 boolean_t do_footprint;
6056
6057 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6058 VM_OBJECT_WIRED_PAGE_ADD(m_object, mem);
6059 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, tag);
6060
6061 assert(m_object->resident_page_count >=
6062 m_object->wired_page_count);
6063 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6064 assert(vm_page_purgeable_count > 0);
6065 OSAddAtomic(-1, &vm_page_purgeable_count);
6066 OSAddAtomic(1, &vm_page_purgeable_wired_count);
6067 }
6068 if (m_object->internal &&
6069 m_object->vo_owner != TASK_NULL &&
6070 (m_object->purgable == VM_PURGABLE_VOLATILE ||
6071 m_object->purgable == VM_PURGABLE_EMPTY)) {
6072 owner = VM_OBJECT_OWNER(m_object);
6073 vm_object_ledger_tag_ledgers(
6074 m_object,
6075 &ledger_idx_volatile,
6076 &ledger_idx_nonvolatile,
6077 &ledger_idx_volatile_compressed,
6078 &ledger_idx_nonvolatile_compressed,
6079 &ledger_idx_composite,
6080 &ledger_idx_external_wired,
6081 &do_footprint);
6082 /* less volatile bytes */
6083 ledger_debit(owner->ledger,
6084 ledger_idx_volatile,
6085 PAGE_SIZE);
6086 /* more not-quite-volatile bytes */
6087 ledger_credit(owner->ledger,
6088 ledger_idx_nonvolatile,
6089 PAGE_SIZE);
6090 if (do_footprint) {
6091 /* more footprint */
6092 ledger_credit(owner->ledger,
6093 task_ledgers.phys_footprint,
6094 PAGE_SIZE);
6095 } else if (ledger_idx_composite != -1) {
6096 ledger_credit(owner->ledger,
6097 ledger_idx_composite,
6098 PAGE_SIZE);
6099 }
6100 }
6101
6102 if (m_object->all_reusable) {
6103 /*
6104 * Wired pages are not counted as "re-usable"
6105 * in "all_reusable" VM objects, so nothing
6106 * to do here.
6107 */
6108 } else if (mem->vmp_reusable) {
6109 /*
6110 * This page is not "re-usable" when it's
6111 * wired, so adjust its state and the
6112 * accounting.
6113 */
6114 vm_page_lockconvert_queues();
6115 vm_object_reuse_pages(m_object,
6116 mem->vmp_offset,
6117 mem->vmp_offset + PAGE_SIZE_64,
6118 FALSE);
6119 }
6120 }
6121 assert(!mem->vmp_reusable);
6122
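/*
 * A gobbled page was already accounted for in vm_page_wire_count, so
 * only bump the counter for pages that weren't gobbled; either way the
 * page stops being "gobbled" once it is properly wired.
 */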
6123 if (vm_page_is_canonical(mem) && !mem->vmp_gobbled) {
6124 vm_page_wire_count++;
6125 }
6126 if (mem->vmp_gobbled) {
6127 vm_page_gobble_count--;
6128 }
6129 mem->vmp_gobbled = FALSE;
6130
6131 if (check_memorystatus == TRUE) {
6132 VM_CHECK_MEMORYSTATUS;
6133 }
6134 }
6135 assert(!mem->vmp_gobbled);
6136 assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
6137 mem->vmp_wire_count++;
6138
6139 #if HAS_MTE
6140 if (mem->vmp_wire_count == 1 && tag != VM_KERN_MEMORY_MTAG) {
6141 /*
6142 * Only notify Mte Info if the caller isn't
6143 * mteinfo_tag_storage_wire_locked().
6144 */
6145 mteinfo_increment_wire_count(mem);
6146 }
6147 #endif /* HAS_MTE */
6148
6149 if (__improbable(mem->vmp_wire_count == 0)) {
6150 panic("vm_page_wire(%p): wire_count overflow", mem);
6151 }
6152 VM_PAGE_CHECK(mem);
6153 }
6154
6155 /*
6156 * vm_page_unwire:
6157 *
6158 * Release one wiring of this page, potentially
6159 * enabling it to be paged again.
6160 *
6161 * The page's object and the page queues must be locked.
6162 */
6163 void
6164 vm_page_unwire(
6165 vm_page_t mem,
6166 boolean_t queueit)
6167 {
6168 vm_object_t m_object;
6169
6170 m_object = VM_PAGE_OBJECT(mem);
6171
6172 // dbgLog(current_thread(), mem->vmp_offset, m_object, 0); /* (TEST/DEBUG) */
6173
6174 VM_PAGE_CHECK(mem);
6175 assert(VM_PAGE_WIRED(mem));
6176 assert(mem->vmp_wire_count > 0);
6177 assert(!mem->vmp_gobbled);
6178 assert(m_object != VM_OBJECT_NULL);
6179 vm_object_lock_assert_exclusive(m_object);
6180 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6181 if (--mem->vmp_wire_count == 0) {
6182 task_t owner;
6183 int ledger_idx_volatile;
6184 int ledger_idx_nonvolatile;
6185 int ledger_idx_volatile_compressed;
6186 int ledger_idx_nonvolatile_compressed;
6187 int ledger_idx_composite;
6188 int ledger_idx_external_wired;
6189 boolean_t do_footprint;
6190
6191 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
6192 mem->vmp_iopl_wired = false;
6193
6194 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
6195 VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
6196 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
6197 if (vm_page_is_canonical(mem)) {
6198 vm_page_wire_count--;
6199 }
6200
6201 #if HAS_MTE
6202 mteinfo_decrement_wire_count(mem, true);
6203 #endif /* HAS_MTE */
6204
6205 assert(m_object->resident_page_count >=
6206 m_object->wired_page_count);
6207 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
6208 OSAddAtomic(+1, &vm_page_purgeable_count);
6209 assert(vm_page_purgeable_wired_count > 0);
6210 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
6211 }
6212 if (m_object->internal &&
6213 m_object->vo_owner != TASK_NULL &&
6214 (m_object->purgable == VM_PURGABLE_VOLATILE ||
6215 m_object->purgable == VM_PURGABLE_EMPTY)) {
6216 owner = VM_OBJECT_OWNER(m_object);
6217 vm_object_ledger_tag_ledgers(
6218 m_object,
6219 &ledger_idx_volatile,
6220 &ledger_idx_nonvolatile,
6221 &ledger_idx_volatile_compressed,
6222 &ledger_idx_nonvolatile_compressed,
6223 &ledger_idx_composite,
6224 &ledger_idx_external_wired,
6225 &do_footprint);
6226 /* more volatile bytes */
6227 ledger_credit(owner->ledger,
6228 ledger_idx_volatile,
6229 PAGE_SIZE);
6230 /* less not-quite-volatile bytes */
6231 ledger_debit(owner->ledger,
6232 ledger_idx_nonvolatile,
6233 PAGE_SIZE);
6234 if (do_footprint) {
6235 /* less footprint */
6236 ledger_debit(owner->ledger,
6237 task_ledgers.phys_footprint,
6238 PAGE_SIZE);
6239 } else if (ledger_idx_composite != -1) {
6240 ledger_debit(owner->ledger,
6241 ledger_idx_composite,
6242 PAGE_SIZE);
6243 }
6244 }
6245 assert(!is_kernel_object(m_object));
6246 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
6247
6248 if (queueit == TRUE) {
6249 if (m_object->purgable == VM_PURGABLE_EMPTY) {
6250 vm_page_deactivate(mem);
6251 } else {
6252 vm_page_activate(mem);
6253 }
6254 }
6255
6256 VM_CHECK_MEMORYSTATUS;
6257 }
6258 VM_PAGE_CHECK(mem);
6259 }
6260
6261 /*
6262 * vm_page_deactivate:
6263 *
6264 * Returns the given page to the inactive list,
6265 * indicating that no physical maps have access
6266 * to this page. [Used by the physical mapping system.]
6267 *
6268 * The page queues must be locked.
6269 */
6270 void
6271 vm_page_deactivate(
6272 vm_page_t m)
6273 {
6274 vm_page_deactivate_internal(m, TRUE);
6275 }
6276
6277
6278 void
6279 vm_page_deactivate_internal(
6280 vm_page_t m,
6281 boolean_t clear_hw_reference)
6282 {
6283 vm_object_t m_object;
6284
6285 m_object = VM_PAGE_OBJECT(m);
6286
6287 VM_PAGE_CHECK(m);
6288 assert(!is_kernel_object(m_object));
6289 assert(!vm_page_is_guard(m));
6290
6291 // dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6); /* (TEST/DEBUG) */
6292 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6293 /*
6294 * This page is no longer very interesting. If it was
6295 * interesting (active or inactive/referenced), then we
6296 * clear the reference bit and (re)enter it in the
6297 * inactive queue. Note wired pages should not have
6298 * their reference bit cleared.
6299 */
6300 assert( !(m->vmp_absent && !m->vmp_unusual));
6301
6302 if (m->vmp_gobbled) { /* can this happen? */
6303 assert( !VM_PAGE_WIRED(m));
6304
6305 if (vm_page_is_canonical(m)) {
6306 vm_page_wire_count--;
6307 }
6308 vm_page_gobble_count--;
6309 m->vmp_gobbled = FALSE;
6310 }
6311 /*
6312 * if this page is currently on the pageout queue, we can't do the
6313 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6314 * and we can't remove it manually since we would need the object lock
6315 * (which is not required here) to decrement the activity_in_progress
6316 * reference which is held on the object while the page is in the pageout queue...
6317 * just let the normal laundry processing proceed
6318 */
6319 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6320 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6321 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
6322 VM_PAGE_WIRED(m)) {
6323 return;
6324 }
6325 if (!m->vmp_absent && clear_hw_reference == TRUE) {
6326 vm_page_lockconvert_queues();
6327 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
6328 }
6329
6330 m->vmp_reference = FALSE;
6331 m->vmp_no_cache = FALSE;
6332
6333 if (!VM_PAGE_INACTIVE(m)) {
6334 vm_page_queues_remove(m, FALSE);
6335
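/*
 * Without dynamic paging, a dirty internal page that can't be paged
 * out is parked on the throttled queue instead of the inactive queue.
 */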
6336 if (!VM_DYNAMIC_PAGING_ENABLED() &&
6337 m->vmp_dirty && m_object->internal &&
6338 (m_object->purgable == VM_PURGABLE_DENY ||
6339 m_object->purgable == VM_PURGABLE_NONVOLATILE ||
6340 m_object->purgable == VM_PURGABLE_VOLATILE)) {
6341 vm_page_check_pageable_safe(m);
6342 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
6343 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
6344 vm_page_throttled_count++;
6345 } else {
6346 if (m_object->named &&
6347 os_ref_get_count_raw(&m_object->ref_count) == 1) {
6348 vm_page_speculate(m, FALSE);
6349 #if DEVELOPMENT || DEBUG
6350 vm_page_speculative_recreated++;
6351 #endif
6352 } else {
6353 vm_page_enqueue_inactive(m, FALSE);
6354 }
6355 }
6356 }
6357 }
6358
6359 /*
6360 * vm_page_enqueue_cleaned
6361 *
6362 * Put the page on the cleaned queue, mark it cleaned, etc.
6363 * Being on the cleaned queue (and having m->clean_queue set)
6364 * does ** NOT ** guarantee that the page is clean!
6365 *
6366 * Call with the queues lock held.
6367 */
6368
6369 void
6370 vm_page_enqueue_cleaned(vm_page_t m)
6371 {
6372 vm_object_t m_object;
6373
6374 m_object = VM_PAGE_OBJECT(m);
6375
6376 assert(!vm_page_is_guard(m));
6377 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6378 assert(!(m->vmp_absent && !m->vmp_unusual));
6379
6380 if (VM_PAGE_WIRED(m)) {
6381 return;
6382 }
6383
6384 if (m->vmp_gobbled) {
6385 if (vm_page_is_canonical(m)) {
6386 vm_page_wire_count--;
6387 }
6388 vm_page_gobble_count--;
6389 m->vmp_gobbled = FALSE;
6390 }
6391 /*
6392 * if this page is currently on the pageout queue, we can't do the
6393 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6394 * and we can't remove it manually since we would need the object lock
6395 * (which is not required here) to decrement the activity_in_progress
6396 * reference which is held on the object while the page is in the pageout queue...
6397 * just let the normal laundry processing proceed
6398 */
6399 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6400 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
6401 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6402 return;
6403 }
6404 vm_page_queues_remove(m, FALSE);
6405
6406 vm_page_check_pageable_safe(m);
6407 vm_page_queue_enter(&vm_page_queue_cleaned, m, vmp_pageq);
6408 m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
6409 vm_page_cleaned_count++;
6410
6411 vm_page_inactive_count++;
6412 if (m_object->internal) {
6413 vm_page_pageable_internal_count++;
6414 } else {
6415 vm_page_pageable_external_count++;
6416 }
6417 vm_page_add_to_specialq(m, TRUE);
6418 VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
6419 }
6420
6421 /*
6422 * vm_page_activate:
6423 *
6424 * Put the specified page on the active list (if appropriate).
6425 *
6426 * The page queues must be locked.
6427 */
6428
6429 void
6430 vm_page_activate(
6431 vm_page_t m)
6432 {
6433 vm_object_t m_object;
6434
6435 m_object = VM_PAGE_OBJECT(m);
6436
6437 VM_PAGE_CHECK(m);
6438 #ifdef FIXME_4778297
6439 assert(!is_kernel_object(m_object));
6440 #endif
6441 assert(!vm_page_is_guard(m));
6442 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6443 assert( !(m->vmp_absent && !m->vmp_unusual));
6444
6445 if (m->vmp_gobbled) {
6446 assert( !VM_PAGE_WIRED(m));
6447 if (vm_page_is_canonical(m)) {
6448 vm_page_wire_count--;
6449 }
6450 vm_page_gobble_count--;
6451 m->vmp_gobbled = FALSE;
6452 }
6453 /*
6454 * if this page is currently on the pageout queue, we can't do the
6455 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6456 * and we can't remove it manually since we would need the object lock
6457 * (which is not required here) to decrement the activity_in_progress
6458 * reference which is held on the object while the page is in the pageout queue...
6459 * just let the normal laundry processing proceed
6460 */
6461 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6462 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6463 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6464 return;
6465 }
6466
6467 #if DEBUG
6468 if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) {
6469 panic("vm_page_activate: already active");
6470 }
6471 #endif
6472
6473 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
6474 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
6475 DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
6476 }
6477
6478 /*
6479 * A freshly activated page should be promoted within the donation queue,
6480 * so we remove it here (preserving its hint) and re-enqueue it in
6481 * vm_page_enqueue_active().
6482 */
6483 vm_page_queues_remove(m, ((m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) ? TRUE : FALSE));
6484
6485 if (!VM_PAGE_WIRED(m)) {
6486 vm_page_check_pageable_safe(m);
6487 if (!VM_DYNAMIC_PAGING_ENABLED() &&
6488 m->vmp_dirty && m_object->internal &&
6489 (m_object->purgable == VM_PURGABLE_DENY ||
6490 m_object->purgable == VM_PURGABLE_NONVOLATILE ||
6491 m_object->purgable == VM_PURGABLE_VOLATILE)) {
6492 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
6493 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
6494 vm_page_throttled_count++;
6495 } else {
6496 #if CONFIG_SECLUDED_MEMORY
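/*
 * When the secluded pool is reserved for the file cache and no task is
 * currently entitled to use secluded memory, eligible external pages go
 * straight onto the secluded queue rather than the active queue.
 */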
6497 if (secluded_for_filecache &&
6498 vm_page_secluded_target != 0 &&
6499 num_tasks_can_use_secluded_mem == 0 &&
6500 m_object->eligible_for_secluded &&
6501 !m->vmp_realtime) {
6502 vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
6503 m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
6504 vm_page_secluded_count++;
6505 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
6506 vm_page_secluded_count_inuse++;
6507 assert(!m_object->internal);
6508 // vm_page_pageable_external_count++;
6509 } else
6510 #endif /* CONFIG_SECLUDED_MEMORY */
6511 vm_page_enqueue_active(m, FALSE);
6512 }
6513 m->vmp_reference = TRUE;
6514 m->vmp_no_cache = FALSE;
6515 }
6516 VM_PAGE_CHECK(m);
6517 }
6518
6519
6520 /*
6521 * vm_page_speculate:
6522 *
6523 * Put the specified page on the speculative list (if appropriate).
6524 *
6525 * The page queues must be locked.
6526 */
6527 void
6528 vm_page_speculate(
6529 vm_page_t m,
6530 boolean_t new)
6531 {
6532 struct vm_speculative_age_q *aq;
6533 vm_object_t m_object;
6534
6535 m_object = VM_PAGE_OBJECT(m);
6536
6537 VM_PAGE_CHECK(m);
6538 vm_page_check_pageable_safe(m);
6539
6540 assert(!vm_page_is_guard(m));
6541 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6542 assert(!(m->vmp_absent && !m->vmp_unusual));
6543 assert(m_object->internal == FALSE);
6544
6545 /*
6546 * if this page is currently on the pageout queue, we can't do the
6547 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6548 * and we can't remove it manually since we would need the object lock
6549 * (which is not required here) to decrement the activity_in_progress
6550 * reference which is held on the object while the page is in the pageout queue...
6551 * just let the normal laundry processing proceed
6552 */
6553 if (m->vmp_laundry || !vm_page_is_canonical(m) ||
6554 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6555 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6556 return;
6557 }
6558
6559 vm_page_queues_remove(m, FALSE);
6560
6561 if (!VM_PAGE_WIRED(m)) {
6562 mach_timespec_t ts;
6563 clock_sec_t sec;
6564 clock_nsec_t nsec;
6565
6566 clock_get_system_nanotime(&sec, &nsec);
6567 ts.tv_sec = (unsigned int) sec;
6568 ts.tv_nsec = nsec;
6569
6570 if (vm_page_speculative_count == 0) {
6571 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6572 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6573
6574 aq = &vm_page_queue_speculative[speculative_age_index];
6575
6576 /*
6577 * set the timer to begin a new group
6578 */
6579 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6580 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6581
6582 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6583 } else {
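/*
 * Pages are already aging; if the current bin's deadline has passed,
 * advance to the next bin (pushing the steal index ahead if they would
 * collide), age out anything still in that bin, and restart its timer.
 */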
6584 aq = &vm_page_queue_speculative[speculative_age_index];
6585
6586 if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
6587 speculative_age_index++;
6588
6589 if (speculative_age_index > vm_page_max_speculative_age_q) {
6590 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6591 }
6592 if (speculative_age_index == speculative_steal_index) {
6593 speculative_steal_index = speculative_age_index + 1;
6594
6595 if (speculative_steal_index > vm_page_max_speculative_age_q) {
6596 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
6597 }
6598 }
6599 aq = &vm_page_queue_speculative[speculative_age_index];
6600
6601 if (!vm_page_queue_empty(&aq->age_q)) {
6602 vm_page_speculate_ageit(aq);
6603 }
6604
6605 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
6606 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
6607
6608 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
6609 }
6610 }
6611 vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
6612 m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
6613 vm_page_speculative_count++;
6614 vm_page_pageable_external_count++;
6615
6616 if (new == TRUE) {
6617 vm_object_lock_assert_exclusive(m_object);
6618
6619 m_object->pages_created++;
6620 #if DEVELOPMENT || DEBUG
6621 vm_page_speculative_created++;
6622 #endif
6623 }
6624 }
6625 VM_PAGE_CHECK(m);
6626 }
6627
6628
6629 /*
6630 * move pages from the specified aging bin to
6631 * the speculative bin that pageout_scan claims from
6632 *
6633 * The page queues must be locked.
6634 */
6635 void
6636 vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
6637 {
6638 struct vm_speculative_age_q *sq;
6639 vm_page_t t;
6640
6641 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
6642
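/*
 * Splice the whole aging bin onto the tail of the "aged" queue: take it
 * over wholesale if the aged queue is empty, otherwise append it, and
 * leave the aging bin reinitialized to empty.
 */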
6643 if (vm_page_queue_empty(&sq->age_q)) {
6644 sq->age_q.next = aq->age_q.next;
6645 sq->age_q.prev = aq->age_q.prev;
6646
6647 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
6648 t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);
6649
6650 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6651 t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6652 } else {
6653 t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
6654 t->vmp_pageq.next = aq->age_q.next;
6655
6656 t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
6657 t->vmp_pageq.prev = sq->age_q.prev;
6658
6659 t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
6660 t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
6661
6662 sq->age_q.prev = aq->age_q.prev;
6663 }
6664 vm_page_queue_init(&aq->age_q);
6665 }
6666
6667
6668 void
6669 vm_page_lru(
6670 vm_page_t m)
6671 {
6672 VM_PAGE_CHECK(m);
6673 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
6674 assert(!vm_page_is_guard(m));
6675
6676 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6677
6678 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
6679 /*
6680 * we don't need to do all the other work that
6681 * vm_page_queues_remove and vm_page_enqueue_inactive
6682 * bring along for the ride
6683 */
6684 assert(!m->vmp_laundry);
6685 assert(!vm_page_is_private(m));
6686
6687 m->vmp_no_cache = FALSE;
6688
6689 vm_page_queue_remove(&vm_page_queue_inactive, m, vmp_pageq);
6690 vm_page_queue_enter(&vm_page_queue_inactive, m, vmp_pageq);
6691
6692 return;
6693 }
6694 /*
6695 * if this page is currently on the pageout queue, we can't do the
6696 * vm_page_queues_remove (which doesn't handle the pageout queue case)
6697 * and we can't remove it manually since we would need the object lock
6698 * (which is not required here) to decrement the activity_in_progress
6699 * reference which is held on the object while the page is in the pageout queue...
6700 * just let the normal laundry processing proceed
6701 */
6702 if (m->vmp_laundry || vm_page_is_private(m) ||
6703 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
6704 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
6705 VM_PAGE_WIRED(m)) {
6706 return;
6707 }
6708
6709 m->vmp_no_cache = FALSE;
6710
6711 vm_page_queues_remove(m, FALSE);
6712
6713 vm_page_enqueue_inactive(m, FALSE);
6714 }
6715
6716
6717 void
6718 vm_page_reactivate_all_throttled(void)
6719 {
6720 vm_page_t first_throttled, last_throttled;
6721 vm_page_t first_active;
6722 vm_page_t m;
6723 int extra_active_count;
6724 int extra_internal_count, extra_external_count;
6725 vm_object_t m_object;
6726
6727 if (!VM_DYNAMIC_PAGING_ENABLED()) {
6728 return;
6729 }
6730
6731 extra_active_count = 0;
6732 extra_internal_count = 0;
6733 extra_external_count = 0;
6734 vm_page_lock_queues();
6735 if (!vm_page_queue_empty(&vm_page_queue_throttled)) {
6736 /*
6737 * Switch "throttled" pages to "active".
6738 */
6739 vm_page_queue_iterate(&vm_page_queue_throttled, m, vmp_pageq) {
6740 VM_PAGE_CHECK(m);
6741 assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
6742
6743 m_object = VM_PAGE_OBJECT(m);
6744
6745 extra_active_count++;
6746 if (m_object->internal) {
6747 extra_internal_count++;
6748 } else {
6749 extra_external_count++;
6750 }
6751
6752 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
6753 VM_PAGE_CHECK(m);
6754 vm_page_add_to_specialq(m, FALSE);
6755 }
6756
6757 /*
6758 * Transfer the entire throttled queue to the regular LRU page queues.
6759 * We insert it at the head of the active queue, so that these pages
6760 * get re-evaluated by the LRU algorithm first, since they've been
6761 * completely out of it until now.
6762 */
6763 first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
6764 last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled);
6765 first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
6766 if (vm_page_queue_empty(&vm_page_queue_active)) {
6767 vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
6768 } else {
6769 first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
6770 }
6771 vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
6772 first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
6773 last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
6774
6775 #if DEBUG
6776 printf("reactivated %d throttled pages\n", vm_page_throttled_count);
6777 #endif
6778 vm_page_queue_init(&vm_page_queue_throttled);
6779 /*
6780 * Adjust the global page counts.
6781 */
6782 vm_page_active_count += extra_active_count;
6783 vm_page_pageable_internal_count += extra_internal_count;
6784 vm_page_pageable_external_count += extra_external_count;
6785 vm_page_throttled_count = 0;
6786 }
6787 assert(vm_page_throttled_count == 0);
6788 assert(vm_page_queue_empty(&vm_page_queue_throttled));
6789 vm_page_unlock_queues();
6790 }
6791
6792
6793 /*
6794 * Move pages from the indicated local queue to the global active queue.
6795 * It's OK to fail if we're below the hard limit and force == FALSE.
6796 * The nolocks == TRUE case is to allow this function to be run on
6797 * the hibernate path.
6798 */
6799
6800 void
6801 vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
6802 {
6803 struct vpl *lq;
6804 vm_page_t first_local, last_local;
6805 vm_page_t first_active;
6806 vm_page_t m;
6807 uint32_t count = 0;
6808
6809 if (vm_page_local_q == NULL) {
6810 return;
6811 }
6812
6813 lq = zpercpu_get_cpu(vm_page_local_q, lid);
6814
6815 if (nolocks == FALSE) {
6816 if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
6817 if (!vm_page_trylockspin_queues()) {
6818 return;
6819 }
6820 } else {
6821 vm_page_lockspin_queues();
6822 }
6823
6824 VPL_LOCK(&lq->vpl_lock);
6825 }
6826 if (lq->vpl_count) {
6827 /*
6828 * Switch "local" pages to "active".
6829 */
6830 assert(!vm_page_queue_empty(&lq->vpl_queue));
6831
6832 vm_page_queue_iterate(&lq->vpl_queue, m, vmp_pageq) {
6833 VM_PAGE_CHECK(m);
6834 vm_page_check_pageable_safe(m);
6835 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
6836 assert(!vm_page_is_fictitious(m));
6837
6838 if (m->vmp_local_id != lid) {
6839 panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
6840 }
6841
6842 m->vmp_local_id = 0;
6843 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
6844 VM_PAGE_CHECK(m);
6845 vm_page_add_to_specialq(m, FALSE);
6846 count++;
6847 }
6848 if (count != lq->vpl_count) {
6849 panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d", count, lq->vpl_count);
6850 }
6851
6852 /*
6853 * Transfer the entire local queue to the regular LRU page queues.
6854 */
6855 first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue);
6856 last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue);
6857 first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
6858
6859 if (vm_page_queue_empty(&vm_page_queue_active)) {
6860 vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6861 } else {
6862 first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
6863 }
6864 vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
6865 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
6866 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
6867
6868 vm_page_queue_init(&lq->vpl_queue);
6869 /*
6870 * Adjust the global page counts.
6871 */
6872 vm_page_active_count += lq->vpl_count;
6873 vm_page_pageable_internal_count += lq->vpl_internal_count;
6874 vm_page_pageable_external_count += lq->vpl_external_count;
6875 lq->vpl_count = 0;
6876 lq->vpl_internal_count = 0;
6877 lq->vpl_external_count = 0;
6878 }
6879 assert(vm_page_queue_empty(&lq->vpl_queue));
6880
6881 if (nolocks == FALSE) {
6882 VPL_UNLOCK(&lq->vpl_lock);
6883
6884 vm_page_balance_inactive(count / 4);
6885 vm_page_unlock_queues();
6886 }
6887 }
6888
6889 /*
6890 * vm_page_part_zero_fill:
6891 *
6892 * Zero-fill a part of the page.
6893 */
6894 #define PMAP_ZERO_PART_PAGE_IMPLEMENTED
6895 void
6896 vm_page_part_zero_fill(
6897 vm_page_t m,
6898 vm_offset_t m_pa,
6899 vm_size_t len)
6900 {
6901 #if 0
6902 /*
6903 * we don't hold the page queue lock
6904 * so this check isn't safe to make
6905 */
6906 VM_PAGE_CHECK(m);
6907 #endif
6908
6909 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
6910 pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len);
6911 #else
6912 vm_page_t tmp;
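/*
 * No pmap support for partial zeroing: grab a scratch page, zero it,
 * copy the parts of `m` outside [m_pa, m_pa + len) into it, and then
 * copy the scratch page back over `m` before freeing it.
 */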
6913 while (1) {
6914 tmp = vm_page_grab();
6915 if (tmp == VM_PAGE_NULL) {
6916 vm_page_wait(THREAD_UNINT);
6917 continue;
6918 }
6919 break;
6920 }
6921 vm_page_zero_fill(
6922 tmp
6923 #if HAS_MTE
6924 , false /* zero_tags */
6925 #endif /* HAS_MTE */
6926 );
6927 if (m_pa != 0) {
6928 vm_page_part_copy(m, 0, tmp, 0, m_pa);
6929 }
6930 if ((m_pa + len) < PAGE_SIZE) {
6931 vm_page_part_copy(m, m_pa + len, tmp,
6932 m_pa + len, PAGE_SIZE - (m_pa + len));
6933 }
6934 vm_page_copy(tmp, m);
6935 VM_PAGE_FREE(tmp);
6936 #endif
6937 }
6938
6939 /*!
6940 * @function vm_page_zero_fill
6941 *
6942 * @abstract
6943 * Zero-fill the specified page.
6944 *
6945 * @param m the page to be zero-filled.
6946 */
6947 #if HAS_MTE && !defined(KASAN)
6948 /*!
6949 * @param zero_tags if true, and the page is MTE-tagged, its corresponding tags will be zeroed.
6950 */
6951 #endif /* HAS_MTE && !defined(KASAN) */
6952 void
6953 vm_page_zero_fill(
6954 vm_page_t m
6955 #if HAS_MTE
6956 , bool zero_tags
6957 #endif /* HAS_MTE */
6958 )
6959 {
6960 int options = 0;
6961 #if 0
6962 /*
6963 * we don't hold the page queue lock
6964 * so this check isn't safe to make
6965 */
6966 VM_PAGE_CHECK(m);
6967 #endif
6968
6969 // dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0); /* (BRINGUP) */
6970 #if HAS_MTE
6971 assert(!zero_tags || VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6972
6973 /*
6974 * TODO: this can be checked more easily using m->vmp_using_mte once
6975 * page reclamation work is complete
6976 */
6977 if (zero_tags && vm_object_is_mte_mappable(VM_PAGE_OBJECT(m))) {
6978 options = cppvZeroPageTags;
6979 KDBG(VMDBG_CODE(DBG_VM_PAGE_MTE_ZFOD) | DBG_FUNC_NONE,
6980 VM_KERNEL_ADDRHIDE(m), VM_KERNEL_ADDRHIDE(VM_PAGE_OBJECT(m)),
6981 m->vmp_offset);
6982 }
6983 #endif /* HAS_MTE */
6984 pmap_zero_page_with_options(VM_PAGE_GET_PHYS_PAGE(m), options);
6985 }
6986
6987 /*
6988 * vm_page_part_copy:
6989 *
6990 * copy part of one page to another
6991 *
6992 * This function is currently only consumed downstream of a
6993 * vm_map_copy_overwrite(). The implementation has a simpler contract
6994 * than vm_page_copy() as there's a restricted set of cases that
6995 * are allowed to be overwriteable. If vm_map_entry_is_overwriteable()
6996 * is expanded, this function may have to be adjusted.
6997 */
6998 void
6999 vm_page_part_copy(
7000 vm_page_t src_m,
7001 vm_offset_t src_pa,
7002 vm_page_t dst_m,
7003 vm_offset_t dst_pa,
7004 vm_size_t len)
7005 {
7006 #if 0
7007 /*
7008 * we don't hold the page queue lock
7009 * so this check isn't safe to make
7010 */
7011 VM_PAGE_CHECK(src_m);
7012 VM_PAGE_CHECK(dst_m);
7013 #endif
7014
7015 /*
7016 * Copying from/into restricted pages is a security issue,
7017 * as it would allow the restricted pages' policies to be bypassed.
7018 */
7019 if (vm_page_is_restricted(src_m)) {
7020 panic("%s: cannot copy from a restricted page", __func__);
7021 }
7022
7023 if (vm_page_is_restricted(dst_m)) {
7024 panic("%s: cannot copy into a restricted page", __func__);
7025 }
7026
7027 #if HAS_MTE
7028 /*
7029 * As an example of a necessary expansion for vm_page_part_copy(),
7030 * MTE objects are currently not overwriteable, but whenever
7031 * rdar://134375521 ([VM MTE] Handle overwriting of MTE objects)
7032 * gets dealt with, we'll have to update the call down here to pass
7033 * the right flags to bcopy_phys().
7034 */
7035 #endif /* HAS_MTE */
7036
7037 pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa,
7038 VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len);
7039 }
7040
7041 /*
7042 * vm_page_copy:
7043 *
7044 * Copy one page to another
7045 */
7046
7047 int vm_page_copy_cs_validations = 0;
7048 int vm_page_copy_cs_tainted = 0;
7049
7050 void
7051 vm_page_copy(
7052 vm_page_t src_m,
7053 vm_page_t dest_m)
7054 {
7055 vm_object_t src_m_object;
7056 int options = 0;
7057
7058 src_m_object = VM_PAGE_OBJECT(src_m);
7059
7060 #if 0
7061 /*
7062 * we don't hold the page queue lock
7063 * so this check isn't safe to make
7064 */
7065 VM_PAGE_CHECK(src_m);
7066 VM_PAGE_CHECK(dest_m);
7067 #endif
7068 vm_object_lock_assert_held(src_m_object);
7069
7070 /*
7071 * Copying from/into restricted pages is a security issue,
7072 * as it would allow the restricted pages' policies to be bypassed.
7073 */
7074 if (vm_page_is_restricted(src_m)) {
7075 panic("%s: cannot copy from a restricted page", __func__);
7076 }
7077
7078 if (vm_page_is_restricted(dest_m)) {
7079 panic("%s: cannot copy into a restricted page", __func__);
7080 }
7081
7082 if (src_m_object != VM_OBJECT_NULL &&
7083 src_m_object->code_signed) {
7084 /*
7085 * We're copying a page from a code-signed object.
7086 * Whoever ends up mapping the copy page might care about
7087 * the original page's integrity, so let's validate the
7088 * source page now.
7089 */
7090 vm_page_copy_cs_validations++;
7091 vm_page_validate_cs(src_m, PAGE_SIZE, 0);
7092 #if DEVELOPMENT || DEBUG
7093 DTRACE_VM4(codesigned_copy,
7094 vm_object_t, src_m_object,
7095 vm_object_offset_t, src_m->vmp_offset,
7096 int, src_m->vmp_cs_validated,
7097 int, src_m->vmp_cs_tainted);
7098 #endif /* DEVELOPMENT || DEBUG */
7099 }
7100
7101 /*
7102 * Propagate the cs_tainted bit to the copy page. Do not propagate
7103 * the cs_validated bit.
7104 */
7105 dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
7106 dest_m->vmp_cs_nx = src_m->vmp_cs_nx;
7107 if (dest_m->vmp_cs_tainted) {
7108 vm_page_copy_cs_tainted++;
7109 }
7110
7111 #if HAS_MTE
7112 /*
7113 * vm_page_copy()-ing from an untagged page into a tagged page would
7114 * happen with tag checking disabled and could itself constitute an
7115 * MTE violation.
7116 */
7117 if (!src_m->vmp_using_mte && dest_m->vmp_using_mte) {
7118 panic("Attempt to write to an MTE tagged page through the physical aperture");
7119 }
7120
7121 if (src_m->vmp_using_mte) {
7122 /* If we are copying from an MTE-enabled page, disable tag checking */
7123 options |= cppvDisableTagCheck;
7124
7125 if (dest_m->vmp_using_mte) {
7126 /*
7127 * If both source and destination are tagged, this means that we are
7128 * either CoWing or relocating a page. Tags need to follow along.
7129 */
7130 options |= cppvCopyTags;
7131 }
7132 }
7133 #endif /* HAS_MTE */
7134
7135 dest_m->vmp_error = VMP_ERROR_GET(src_m); /* sliding src_m might have failed... */
7136 pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m), options);
7137 }
7138
7139 #if MACH_ASSERT
7140 static void
7141 _vm_page_print(
7142 vm_page_t p)
7143 {
7144 printf("vm_page %p: \n", p);
7145 printf(" pageq: next=%p prev=%p\n",
7146 (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
7147 (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
7148 printf(" listq: next=%p prev=%p\n",
7149 (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
7150 (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
7151 printf(" next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
7152 printf(" object=%p offset=0x%llx\n", VM_PAGE_OBJECT(p), p->vmp_offset);
7153 printf(" wire_count=%u\n", p->vmp_wire_count);
7154 printf(" q_state=%u\n", p->vmp_q_state);
7155
7156 printf(" %slaundry, %sref, %sgobbled, %sprivate\n",
7157 (p->vmp_laundry ? "" : "!"),
7158 (p->vmp_reference ? "" : "!"),
7159 (p->vmp_gobbled ? "" : "!"),
7160 (vm_page_is_private(p) ? "" : "!"));
7161 printf(" %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
7162 (p->vmp_busy ? "" : "!"),
7163 (p->vmp_wanted ? "" : "!"),
7164 (p->vmp_tabled ? "" : "!"),
7165 (vm_page_is_fictitious(p) ? "" : "!"),
7166 (p->vmp_pmapped ? "" : "!"),
7167 (p->vmp_wpmapped ? "" : "!"));
7168 printf(" %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
7169 (p->vmp_free_when_done ? "" : "!"),
7170 (p->vmp_absent ? "" : "!"),
7171 (VMP_ERROR_GET(p) ? "" : "!"),
7172 (p->vmp_dirty ? "" : "!"),
7173 (p->vmp_cleaning ? "" : "!"),
7174 (p->vmp_precious ? "" : "!"),
7175 (p->vmp_clustered ? "" : "!"));
7176 printf(" %soverwriting, %srestart, %sunusual\n",
7177 (p->vmp_overwriting ? "" : "!"),
7178 (p->vmp_restart ? "" : "!"),
7179 (p->vmp_unusual ? "" : "!"));
7180 printf(" cs_validated=%d, cs_tainted=%d, cs_nx=%d, %sno_cache\n",
7181 p->vmp_cs_validated,
7182 p->vmp_cs_tainted,
7183 p->vmp_cs_nx,
7184 (p->vmp_no_cache ? "" : "!"));
7185
7186 printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
7187 }
7188
7189 /*
7190 * Check that the list of pages is ordered by
7191 * ascending physical address and has no holes.
7192 */
7193 static int
7194 vm_page_verify_contiguous(
7195 vm_page_t pages,
7196 unsigned int npages)
7197 {
7198 vm_page_t m;
7199 unsigned int page_count;
7200 vm_offset_t prev_addr;
7201
7202 prev_addr = VM_PAGE_GET_PHYS_PAGE(pages);
7203 page_count = 1;
7204 for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
7205 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
7206 printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
7207 m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m));
7208 printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
7209 panic("vm_page_verify_contiguous: not contiguous!");
7210 }
7211 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
7212 ++page_count;
7213 }
7214 if (page_count != npages) {
7215 printf("pages %p actual count 0x%x but requested 0x%x\n",
7216 pages, page_count, npages);
7217 panic("vm_page_verify_contiguous: count error");
7218 }
7219 return 1;
7220 }
7221
7222
7223 /*
7224 * Check the free lists for proper length etc.
7225 */
7226 static boolean_t vm_page_verify_this_free_list_enabled = FALSE;
7227 static unsigned int
7228 vm_page_verify_free_list(
7229 vm_page_queue_head_t *vm_page_queue,
7230 unsigned int color,
7231 vm_page_t look_for_page,
7232 boolean_t expect_page)
7233 {
7234 unsigned int npages;
7235 vm_page_t m;
7236 vm_page_t prev_m;
7237 boolean_t found_page;
7238
7239 if (!vm_page_verify_this_free_list_enabled) {
7240 return 0;
7241 }
7242
7243 found_page = FALSE;
7244 npages = 0;
7245 prev_m = (vm_page_t)((uintptr_t)vm_page_queue);
7246
7247 vm_page_queue_iterate(vm_page_queue, m, vmp_pageq) {
7248 if (m == look_for_page) {
7249 found_page = TRUE;
7250 }
7251 if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) {
7252 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p",
7253 color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
7254 }
7255 if (!m->vmp_busy) {
7256 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy",
7257 color, npages, m);
7258 }
7259 if (color != (unsigned int) -1) {
7260 if (VM_PAGE_GET_COLOR(m) != color) {
7261 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u",
7262 color, npages, m, VM_PAGE_GET_COLOR(m), color);
7263 }
7264 if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) {
7265 panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d",
7266 color, npages, m, m->vmp_q_state);
7267 }
7268 } else {
7269 if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) {
7270 panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d",
7271 npages, m, m->vmp_q_state);
7272 }
7273 }
7274 ++npages;
7275 prev_m = m;
7276 }
7277 if (look_for_page != VM_PAGE_NULL) {
7278 unsigned int other_color;
7279
7280 if (expect_page && !found_page) {
7281 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
7282 color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7283 _vm_page_print(look_for_page);
7284 for (other_color = 0;
7285 other_color < vm_colors;
7286 other_color++) {
7287 if (other_color == color) {
7288 continue;
7289 }
7290 vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[other_color].qhead,
7291 other_color, look_for_page, FALSE);
7292 }
7293 #if XNU_VM_HAS_LOPAGE
7294 if (color == (unsigned int) -1) {
7295 vm_page_verify_free_list(&vm_lopage_queue_free,
7296 (unsigned int) -1, look_for_page, FALSE);
7297 }
7298 #endif /* XNU_VM_HAS_LOPAGE */
7299 panic("vm_page_verify_free_list(color=%u)", color);
7300 }
7301 if (!expect_page && found_page) {
7302 printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
7303 color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
7304 }
7305 }
7306 return npages;
7307 }
7308
7309 static boolean_t vm_page_verify_all_free_lists_enabled = FALSE;
7310 static void
7311 vm_page_verify_free_lists( void )
7312 {
7313 unsigned int color, npages, nlopages;
7314 boolean_t toggle = TRUE;
7315
7316 if (!vm_page_verify_all_free_lists_enabled) {
7317 return;
7318 }
7319
7320 npages = 0;
7321 nlopages = 0;
7322
7323 vm_free_page_lock();
7324
7325 if (vm_page_verify_this_free_list_enabled == TRUE) {
7326 /*
7327 * This variable has been set globally for extra checking of
7328 * each free list Q. Since we didn't set it, we don't own it
7329 * and we shouldn't toggle it.
7330 */
7331 toggle = FALSE;
7332 }
7333
7334 if (toggle == TRUE) {
7335 vm_page_verify_this_free_list_enabled = TRUE;
7336 }
7337
7338 for (color = 0; color < vm_colors; color++) {
7339 npages += vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[color].qhead,
7340 color, VM_PAGE_NULL, FALSE);
7341 }
7342 #if XNU_VM_HAS_LOPAGE
7343 nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
7344 (unsigned int) -1,
7345 VM_PAGE_NULL, FALSE);
7346 #endif /* XNU_VM_HAS_LOPAGE */
7347 if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) {
7348 panic("vm_page_verify_free_lists: "
7349 "npages %u free_count %d nlopages %u lo_free_count %u",
7350 npages, vm_page_free_count, nlopages, vm_lopage_free_count);
7351 }
7352
7353 if (toggle == TRUE) {
7354 vm_page_verify_this_free_list_enabled = FALSE;
7355 }
7356
7357 vm_free_page_unlock();
7358 }
7359
7360 #endif /* MACH_ASSERT */
7361
7362 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
7363
7364 /*
7365 * CONTIGUOUS PAGE ALLOCATION AND HELPER FUNCTIONS
7366 */
7367
7368 /*
7369 * Helper function used to determine if a page can be relocated.
7370 * A page is relocatable if it is in a stable, non-transient state
7371 * and if the page being relocated is compatible with the reason for relocation.
7372 * The page queue lock must be held, and the object lock too, if the page
7373 * is in an object.
7374 */
7375 boolean_t
7376 vm_page_is_relocatable(vm_page_t m, vm_relocate_reason_t reloc_reason)
7377 {
7378
7379 if (VM_PAGE_WIRED(m) || m->vmp_gobbled || m->vmp_laundry || m->vmp_wanted ||
7380 m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
7381 /*
7382 * Page is in a transient state
7383 * or a state we don't want to deal with.
7384 */
7385 return FALSE;
7386 } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
7387 (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
7388 #if XNU_VM_HAS_LOPAGE
7389 (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
7390 #endif /* XNU_VM_HAS_LOPAGE */
7391 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
7392 /*
7393 * Page needs to be on one of our queues (other than the pageout or special
7394 * free queues) or it needs to belong to the compressor pool (which is now
7395 * indicated by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out from
7396 * the check for VM_PAGE_NOT_ON_Q) in order for it to be stable behind the
7397 * locks we hold at this point...
7398 */
7399 return FALSE;
7400 } else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) &&
7401 (!m->vmp_tabled || m->vmp_busy)) {
7402 /*
7403 * pages on the free list are always 'busy'
7404 * so we couldn't test for 'busy' in the check
7405 * for the transient states... pages that are
7406 * 'free' are never 'tabled', so we also couldn't
7407 * test for 'tabled'. So we check here to make
7408 * sure that a non-free page is not busy and is
7409 * tabled on an object...
7410 */
7411 return FALSE;
7412 }
7413
7414 /*
7415 * Lastly, check the page against the relocation reason; the page may
7416 * be in a relocatable state, but not be a page we WANT to relocate for
7417 * the caller's use case.
7418 */
7419 switch (reloc_reason) {
7420 #if HAS_MTE
7421 case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
7422 {
7423 /*
7424 * Relocating the content of tag storage pages so the
7425 * fill thread can reclaim a page is perfectly valid,
7426 * unless the page is busy.
7427 */
7428 if (m->vmp_busy) {
7429 return FALSE;
7430 }
7431 break;
7432 }
7433 case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
7434 #endif /* HAS_MTE */
7435 case VM_RELOCATE_REASON_CONTIGUOUS:
7436 {
7437 #if HAS_MTE
7438 /*
7439 * Tag storage pages may be needed for tag storage. Because
7440 * the contiguous allocator is likely being used for wired
7441 * allocations, this page is not eligible to be relocated in
7442 * this case.
7443 */
7444 if (vm_page_is_tag_storage(m)) {
7445 return FALSE;
7446 }
7447 #endif /* HAS_MTE */
7448 break;
7449 }
7450
7451 default:
7452 {
7453 panic("Invalid relocation reason %u", reloc_reason);
7454 __builtin_unreachable();
7455 }
7456 }
7457
7458 return TRUE;
7459 }
7460
7461 /*
7462 * Free up the given page by possibly relocating its contents to a new page.
7463 * If the page is in an object, the object lock must be held.
7464 *
7465 * Whether or not the page is considered relocatable is contingent on the
7466 * reason it is being relocated.
7467 *
7468 * Return the new page back to the caller if requested, as done in
7469 * vm_object_iopl_wire_full().
7470 *
7471 * The VM page queues lock must also be held.
7472 *
7473 * @returns
7474 * - KERN_SUCCESS if the relocation was successful.
7475 * - KERN_INVALID_OBJECT if @c m1's object is VM_OBJECT_NULL.
7476 * - KERN_FAILURE if the relocation failed due to @c m1's state.
7477 * - KERN_RESOURCE_SHORTAGE if no page could be allocated to relocate @c m1.
7478 */
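/*
 * Editorial sketch of a hypothetical caller (illustrative only, not part of
 * the build). It assumes `m` and `object` were looked up elsewhere and that
 * the caller honors the locking rules stated above; the error handling shown
 * is hypothetical.
 *
 *	vm_page_t       new_m = VM_PAGE_NULL;
 *	int             compressed = 0;
 *	kern_return_t   kr;
 *
 *	vm_page_lock_queues();
 *	vm_object_lock(object);
 *	if (vm_page_is_relocatable(m, VM_RELOCATE_REASON_CONTIGUOUS)) {
 *		kr = vm_page_relocate(m, &compressed,
 *		    VM_RELOCATE_REASON_CONTIGUOUS, &new_m);
 *		if (kr == KERN_RESOURCE_SHORTAGE) {
 *			// no substitute page could be grabbed; back off and retry later
 *		}
 *	}
 *	vm_object_unlock(object);
 *	vm_page_unlock_queues();
 */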
7479 kern_return_t
7480 vm_page_relocate(
7481 vm_page_t m1,
7482 int *compressed_pages,
7483 vm_relocate_reason_t reloc_reason,
7484 vm_page_t* new_page)
7485 {
7486 int refmod = 0;
7487 vm_object_t object = VM_PAGE_OBJECT(m1);
7488 kern_return_t kr;
7489
7490 switch (reloc_reason) {
7491 case VM_RELOCATE_REASON_CONTIGUOUS:
7492 {
7493 #if HAS_MTE
7494 /*
7495 * The contiguous allocator should not be considering tag
7496 * storage pages.
7497 */
7498 assert(!vm_page_is_tag_storage(m1));
7499 #endif /* HAS_MTE */
7500 break;
7501 }
7502 #if HAS_MTE
7503 case VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM:
7504 {
7505 /*
7506 * If we are trying to reclaim tag storage, we should be
7507 * relocating a tag storage page.
7508 */
7509 assert(vm_page_is_tag_storage(m1));
7510 if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7511 vm_page_tag_storage_compressor_relocation_count++;
7512 }
7513 break;
7514 }
7515 case VM_RELOCATE_REASON_TAG_STORAGE_WIRE:
7516 {
7517 assert(vm_page_is_tag_storage(m1) && m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7518 vm_page_tag_storage_wire_relocation_count++;
7519 break;
7520 }
7521 #endif /* HAS_MTE */
7522 default:
7523 {
7524 panic("Unrecognized relocation reason %u\n", reloc_reason);
7525 break;
7526 }
7527 }
7528
7529 if (object == VM_OBJECT_NULL) {
7530 return KERN_INVALID_OBJECT;
7531 }
7532
7533 vm_object_lock_assert_held(object);
7534
7535 if (VM_PAGE_WIRED(m1) ||
7536 m1->vmp_gobbled ||
7537 m1->vmp_laundry ||
7538 m1->vmp_wanted ||
7539 m1->vmp_cleaning ||
7540 m1->vmp_overwriting ||
7541 m1->vmp_free_when_done ||
7542 m1->vmp_busy ||
7543 m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
7544 return KERN_FAILURE;
7545 }
7546
7547 boolean_t disconnected = FALSE;
7548 boolean_t reusable = FALSE;
7549
7550 /*
7551 * Pages from reusable objects can be reclaimed directly.
7552 */
7553 if ((m1->vmp_reusable || object->all_reusable) &&
7554 m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q && !m1->vmp_dirty &&
7555 !m1->vmp_reference) {
7556 /*
7557 * reusable page...
7558 */
7559
7560 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
7561 disconnected = TRUE;
7562 if (refmod == 0) {
7563 /*
7564 * ... not reused: can steal without relocating contents.
7565 */
7566 reusable = TRUE;
7567 }
7568 }
7569
7570 if ((m1->vmp_pmapped && !reusable) || m1->vmp_dirty || m1->vmp_precious) {
7571 vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;
7572 vm_object_offset_t offset;
7573 int copy_page_options = 0;
7574
7575 #if HAS_MTE
7576 if (m1->vmp_using_mte) {
7577 grab_options |= VM_PAGE_GRAB_MTE;
7578 copy_page_options |= cppvCopyTags;
7579 }
7580 #endif /* HAS_MTE */
7581 /* page is not reusable, we need to allocate a new page
7582 * and move its contents there.
7583 */
7584 vm_page_t m2 = vm_page_grab_options(grab_options);
7585
7586 if (m2 == VM_PAGE_NULL) {
7587 return KERN_RESOURCE_SHORTAGE;
7588 }
7589
7590 if (!disconnected) {
7591 if (m1->vmp_pmapped) {
7592 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
7593 } else {
7594 refmod = 0;
7595 }
7596 }
7597
7598 #if HAS_MTE
7599 assert(m1->vmp_using_mte == m2->vmp_using_mte);
7600 if (m1->vmp_using_mte) {
7601 assert(pmap_is_tagged_page(VM_PAGE_GET_PHYS_PAGE(m2)));
7602 copy_page_options |= (cppvCopyTags | cppvDisableTagCheck);
7603 }
7604 #endif /* HAS_MTE */
7605 /* copy the page's contents */
7606 pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2), copy_page_options);
7607
7608 /* copy the page's state */
7609 assert(!VM_PAGE_WIRED(m1));
7610 assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
7611 assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
7612 assert(!m1->vmp_laundry);
7613 m2->vmp_reference = m1->vmp_reference;
7614 assert(!m1->vmp_gobbled);
7615 m2->vmp_no_cache = m1->vmp_no_cache;
7616 m2->vmp_xpmapped = 0;
7617 assert(!m1->vmp_busy);
7618 assert(!m1->vmp_wanted);
7619 assert(vm_page_is_canonical(m1));
7620 m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
7621 m2->vmp_wpmapped = m1->vmp_wpmapped;
7622 assert(!m1->vmp_free_when_done);
7623 m2->vmp_absent = m1->vmp_absent;
7624 m2->vmp_error = VMP_ERROR_GET(m1);
7625 m2->vmp_dirty = m1->vmp_dirty;
7626 assert(!m1->vmp_cleaning);
7627 m2->vmp_precious = m1->vmp_precious;
7628 m2->vmp_clustered = m1->vmp_clustered;
7629 assert(!m1->vmp_overwriting);
7630 m2->vmp_restart = m1->vmp_restart;
7631 m2->vmp_unusual = m1->vmp_unusual;
7632 m2->vmp_cs_validated = m1->vmp_cs_validated;
7633 m2->vmp_cs_tainted = m1->vmp_cs_tainted;
7634 m2->vmp_cs_nx = m1->vmp_cs_nx;
7635
7636 m2->vmp_realtime = m1->vmp_realtime;
7637 m1->vmp_realtime = false;
7638
7639 /*
7640 * If m1 had really been reusable,
7641 * we would have just stolen it, so
7642 * let's not propagate its "reusable"
7643 * bit and assert that m2 is not
7644 * marked as "reusable".
7645 */
7646 // m2->vmp_reusable = m1->vmp_reusable;
7647 assert(!m2->vmp_reusable);
7648
7649 // assert(!m1->vmp_lopage);
7650
7651 if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7652 m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
7653 /*
7654 * We just grabbed m2 up above and so it isn't
7655 * going to be on any special Q as yet and so
7656 * we don't need to 'remove' it from the special
7657 * queues. Just resetting the state should be enough.
7658 */
7659 m2->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
7660 }
7661
7662 /*
7663 * page may need to be flushed if
7664 * it is marshalled into a UPL
7665 * that is going to be used by a device
7666 * that doesn't support coherency
7667 */
7668 m2->vmp_written_by_kernel = TRUE;
7669
7670 /*
7671 * make sure we clear the ref/mod state
7672 * from the pmap layer... else we risk
7673 * inheriting state from the last time
7674 * this page was used...
7675 */
7676 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
7677 VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7678
7679 if (refmod & VM_MEM_REFERENCED) {
7680 m2->vmp_reference = TRUE;
7681 }
7682 if (refmod & VM_MEM_MODIFIED) {
7683 SET_PAGE_DIRTY(m2, TRUE);
7684 }
7685 offset = m1->vmp_offset;
7686
7687 /*
7688 * completely cleans up the state
7689 * of the page so that it is ready
7690 * to be put onto the free list, or
7691 * for this purpose it looks like it
7692 * just came off of the free list
7693 */
7694 vm_page_free_prepare(m1);
7695
7696 /*
7697 * now put the substitute page on the object
7698 */
7699 vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
7700 TRUE, FALSE, FALSE, NULL);
7701
7702 /*
7703 * Return the relocated vm_page_t if the caller wants to know.
7704 */
7705 if (new_page) {
7706 *new_page = m2;
7707 }
7708
7709 if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7710 m2->vmp_pmapped = TRUE;
7711 m2->vmp_wpmapped = TRUE;
7712
7713 kr = pmap_enter_check(kernel_pmap, (vm_map_offset_t)m2->vmp_offset, m2,
7714 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, TRUE);
7715
7716 assert(kr == KERN_SUCCESS);
7717
7718 if (compressed_pages) {
7719 ++*compressed_pages;
7720 }
7721 } else {
7722 /* relocated page was not used by the compressor
7723 * put it on either the active or inactive lists */
7724 if (m2->vmp_reference) {
7725 vm_page_activate(m2);
7726 } else {
7727 vm_page_deactivate(m2);
7728 }
7729 }
7730
7731 /* unset the busy flag (pages on the free queue are busy) and notify if wanted */
7732 vm_page_wakeup_done(object, m2);
7733 } else {
7734 assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7735
7736 /*
7737 * completely cleans up the state
7738 * of the page so that it is ready
7739 * to be put onto the free list, or
7740 * for this purpose it looks like it
7741 * just came off of the free list
7742 */
7743 vm_page_free_prepare(m1);
7744
7745 if (new_page) {
7746 vm_page_t m2;
7747 vm_object_offset_t offset;
7748 vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD;
7749
7750 /* The caller still wanted a page, so let's give them a new one. */
7751 offset = m1->vmp_offset;
7752 #if HAS_MTE
7753 if (m1->vmp_using_mte) {
7754 grab_options |= VM_PAGE_GRAB_MTE;
7755 }
7756 #endif /* HAS_MTE */
7757 m2 = vm_page_grab_options(grab_options);
7758
7759 if (m2 == VM_PAGE_NULL) {
7760 return KERN_RESOURCE_SHORTAGE;
7761 }
7762
7763 /*
7764 * make sure we clear the ref/mod state
7765 * from the pmap layer... else we risk
7766 * inheriting state from the last time
7767 * this page was used...
7768 */
7769 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2),
7770 VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7771
7772 offset = m1->vmp_offset;
7773
7774 /*
7775 * now put the substitute page on the object
7776 */
7777 vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE,
7778 TRUE, FALSE, FALSE, NULL);
7779
7780 *new_page = m2;
7781 }
7782 }
7783
7784 /* we're done here */
7785 return KERN_SUCCESS;
7786 }
7787
7788 /*
7789 * CONTIGUOUS PAGE ALLOCATION
7790 *
7791 * Find a region large enough to contain at least n pages
7792 * of contiguous physical memory.
7793 *
7794 * This is done by traversing the vm_page_t array in a linear fashion
7795 * we assume that the vm_page_t array has the available physical pages in an
7796 * ordered, ascending list... this is currently true of all our implementations
7797 * and must remain so... there can be 'holes' in the array... we also can
7798 * no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
7799 * which used to happen via 'vm_page_convert'... that function was no longer
7800 * being called and was removed...
7801 *
7802 * The basic flow consists of stabilizing some of the interesting state of
7803 * a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
7804 * sweep at the beginning of the array looking for pages that meet our criteria
7805 * for a 'stealable' page... currently we are pretty conservative... if the page
7806 * meets these criteria and is physically contiguous to the previous page in the 'run'
7807 * we keep developing it. If we hit a page that doesn't fit, we reset our state
7808 * and start to develop a new run... if at this point we've already considered
7809 * at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
7810 * and mutex_pause (which will yield the processor), to keep the latency low w/r
7811 * to other threads trying to acquire free pages (or move pages from q to q),
7812 * and then continue from the spot we left off... we only make 1 pass through the
7813 * array. Once we have a 'run' that is long enough, we'll go into the loop
7814 * which steals the pages from the queues they're currently on... pages on the free
7815 * queue can be stolen directly... pages that are on any of the other queues
7816 * must be removed from the object they are tabled on... this requires taking the
7817 * object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
7818 * or if the state of the page behind the vm_object lock is no longer viable, we'll
7819 * dump the pages we've currently stolen back to the free list, and pick up our
7820 * scan from the point where we aborted the 'current' run.
7821 *
7822 *
7823 * Requirements:
7824 * - neither vm_page_queue nor vm_free_list lock can be held on entry
7825 *
7826 * Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
7827 *
7828 * Algorithm: see the schematic outline immediately below this comment.
7829 */
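/*
 * Schematic outline of the scan described above (editorial restatement of
 * the prose; the authoritative logic is vm_page_find_contiguous() below):
 *
 *	take the vm_page_queue and vm_page_free locks
 *	for each vm_page_t in the vm_pages array, resuming where the
 *	    previous call left off (single pass):
 *		if the page is 'stealable' and physically contiguous to the
 *		    current run, extend the run
 *		otherwise, reset the state of the run and start a new one
 *		after MAX_CONSIDERED_BEFORE_YIELD pages with no run in
 *		    progress, drop both locks, mutex_pause(), re-acquire
 *		    them and continue
 *	once a run of the requested length is found:
 *		pass 1: pull the free pages in the run off the free queues
 *		pass 2: relocate/steal the in-use pages (object lock taken
 *		    with a 'try'); if that fails, dump the pages collected so
 *		    far back onto the free list and resume the scan just past
 *		    the aborted run
 */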
7830
7831 #define MAX_CONSIDERED_BEFORE_YIELD 1000
7832
7833
7834 #define RESET_STATE_OF_RUN() \
7835 MACRO_BEGIN \
7836 prevcontaddr = -2; \
7837 start_pnum = -1; \
7838 free_considered = 0; \
7839 substitute_needed = 0; \
7840 npages = 0; \
7841 MACRO_END
7842
7843 /*
7844 * Can we steal in-use (i.e. not free) pages when searching for
7845 * physically-contiguous pages ?
7846 */
7847 #define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL 1
7848
7849 static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
7850 #if DEBUG
7851 int vm_page_find_contig_debug = 0;
7852 #endif
7853
7854 static vm_page_t
7855 vm_page_find_contiguous(
7856 unsigned int contig_pages,
7857 ppnum_t max_pnum,
7858 ppnum_t pnum_mask,
7859 boolean_t wire,
7860 int flags)
7861 {
7862 vm_page_list_t list = { };
7863 ppnum_t prevcontaddr = 0;
7864 ppnum_t start_pnum = 0;
7865 unsigned int npages = 0, considered = 0, scanned = 0;
7866 unsigned int page_idx = 0, start_idx = 0, last_idx = 0, orig_last_idx = 0;
7867 unsigned int idx_last_contig_page_found = 0;
7868 int free_considered = 0, free_available = 0;
7869 int substitute_needed = 0;
7870 int zone_gc_called = 0;
7871 boolean_t wrapped;
7872 kern_return_t kr;
7873 #if DEBUG
7874 clock_sec_t tv_start_sec = 0, tv_end_sec = 0;
7875 clock_usec_t tv_start_usec = 0, tv_end_usec = 0;
7876 #endif
7877
7878 int yielded = 0;
7879 int dumped_run = 0;
7880 int stolen_pages = 0;
7881 int compressed_pages = 0;
7882
7883
7884 if (contig_pages == 0) {
7885 return VM_PAGE_NULL;
7886 }
7887
7888 full_scan_again:
7889
7890 #if MACH_ASSERT
7891 vm_page_verify_free_lists();
7892 #endif
7893 #if DEBUG
7894 clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
7895 #endif
7896 PAGE_REPLACEMENT_ALLOWED(TRUE);
7897
7898 #if XNU_VM_HAS_DELAYED_PAGES
7899 /*
7900 * If there are still delayed pages, try to free up some that match.
7901 */
7902 if (__improbable(vm_delayed_count != 0 && contig_pages != 0)) {
7903 vm_free_delayed_pages_contig(contig_pages, max_pnum, pnum_mask);
7904 }
7905 #endif /* XNU_VM_HAS_DELAYED_PAGES */
7906
7907 vm_page_lock_queues();
7908 vm_free_page_lock();
7909
7910 RESET_STATE_OF_RUN();
7911
7912 scanned = 0;
7913 considered = 0;
7914 free_available = vm_page_free_count - vm_page_free_reserved;
7915
7916 wrapped = FALSE;
7917
7918 if (flags & KMA_LOMEM) {
7919 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
7920 } else {
7921 idx_last_contig_page_found = vm_page_find_contiguous_last_idx;
7922 }
7923
7924 orig_last_idx = idx_last_contig_page_found;
7925 last_idx = orig_last_idx;
7926
7927 for (page_idx = last_idx, start_idx = last_idx;
7928 npages < contig_pages && page_idx < vm_pages_count;
7929 page_idx++) {
7930 vm_page_t m = NULL;
7931
7932 retry:
7933 if (wrapped &&
7934 npages == 0 &&
7935 page_idx >= orig_last_idx) {
7936 /*
7937 * We're back where we started and we haven't
7938 * found any suitable contiguous range. Let's
7939 * give up.
7940 */
7941 break;
7942 }
7943 scanned++;
7944 m = vm_page_get(page_idx);
7945
7946 assert(vm_page_is_canonical(m));
7947
7948 if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
7949 /* no more low pages... */
7950 break;
7951 }
7952 if (!npages && ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) {
7953 /*
7954 * not aligned
7955 */
7956 RESET_STATE_OF_RUN();
7957 } else if (!vm_page_is_relocatable(m,
7958 VM_RELOCATE_REASON_CONTIGUOUS)) {
7959 /*
7960 * page is not relocatable */
7961 RESET_STATE_OF_RUN();
7962 } else {
7963 if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) {
7964 if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) {
7965 RESET_STATE_OF_RUN();
7966 goto did_consider;
7967 } else {
7968 npages = 1;
7969 start_idx = page_idx;
7970 start_pnum = VM_PAGE_GET_PHYS_PAGE(m);
7971 }
7972 } else {
7973 npages++;
7974 }
7975 prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
7976
7977 VM_PAGE_CHECK(m);
7978 if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
7979 free_considered++;
7980 } else {
7981 /*
7982 * This page is not free.
7983 * If we can't steal used pages,
7984 * we have to give up this run
7985 * and keep looking.
7986 * Otherwise, we might need to
7987 * move the contents of this page
7988 * into a substitute page.
7989 */
7990 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
7991 if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
7992 substitute_needed++;
7993 }
7994 #else
7995 RESET_STATE_OF_RUN();
7996 #endif
7997 }
7998
7999 if ((free_considered + substitute_needed) > free_available) {
8000 /*
8001 * if we let this run continue
8002 * we will end up dropping the vm_page_free_count
8003 * below the reserve limit... we need to abort
8004 * this run, but we can at least re-consider this
8005 * page... thus the jump back to 'retry'
8006 */
8007 RESET_STATE_OF_RUN();
8008
8009 if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
8010 considered++;
8011 goto retry;
8012 }
8013 /*
8014 * free_available == 0
8015 * so can't consider any free pages... if
8016 * we went to retry in this case, we'd
8017 * get stuck looking at the same page
8018 * w/o making any forward progress
8019 * we also want to take this path if we've already
8020 * reached our limit that controls the lock latency
8021 */
8022 }
8023 }
8024 did_consider:
8025 if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
8026 PAGE_REPLACEMENT_ALLOWED(FALSE);
8027
8028 vm_free_page_unlock();
8029 vm_page_unlock_queues();
8030
8031 mutex_pause(0);
8032
8033 PAGE_REPLACEMENT_ALLOWED(TRUE);
8034
8035 vm_page_lock_queues();
8036 vm_free_page_lock();
8037
8038 RESET_STATE_OF_RUN();
8039 /*
8040 * reset our free page limit since we
8041 * dropped the lock protecting the vm_page_free_queue
8042 */
8043 free_available = vm_page_free_count - vm_page_free_reserved;
8044 considered = 0;
8045
8046 yielded++;
8047
8048 goto retry;
8049 }
8050 considered++;
8051 } /* main for-loop end */
8052
8053 if (npages != contig_pages) {
8054 if (!wrapped) {
8055 /*
8056 * We didn't find a contiguous range but we didn't
8057 * start from the very first page.
8058 * Start again from the very first page.
8059 */
8060 RESET_STATE_OF_RUN();
8061 if (flags & KMA_LOMEM) {
8062 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = 0;
8063 } else {
8064 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
8065 }
8066 last_idx = 0;
8067 page_idx = last_idx;
8068 wrapped = TRUE;
8069 goto retry;
8070 }
8071 vm_free_page_unlock();
8072 } else {
8073 vm_page_t m1;
8074 unsigned int cur_idx;
8075 unsigned int tmp_start_idx;
8076 vm_object_t locked_object = VM_OBJECT_NULL;
8077 bool abort_run = false;
8078
8079 assert(page_idx - start_idx == contig_pages);
8080
8081 tmp_start_idx = start_idx;
8082
8083 /*
8084 * first pass through to pull the free pages
8085 * off of the free queue so that in case we
8086 * need substitute pages, we won't grab any
8087 * of the free pages in the run... we'll clear
8088 * the 'free' bit in the 2nd pass, and even in
8089 * an abort_run case, we'll collect all of the
8090 * free pages in this run and return them to the free list
8091 */
8092 while (start_idx < page_idx) {
8093 vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
8094
8095 m1 = vm_page_get(start_idx++);
8096
8097 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
8098 assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
8099 #endif
8100 if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
8101 vm_page_free_queue_steal(options, m1);
8102 }
8103 }
8104 if (flags & KMA_LOMEM) {
8105 vm_page_lomem_find_contiguous_last_idx = page_idx;
8106 } else {
8107 vm_page_find_contiguous_last_idx = page_idx;
8108 }
8109
8110 /*
8111 * we can drop the free queue lock at this point since
8112 * we've pulled any 'free' candidates off of the list
8113 * we need it dropped so that we can do a vm_page_grab
8114 * when substituting for pmapped/dirty pages
8115 */
8116 vm_free_page_unlock();
8117
8118 start_idx = tmp_start_idx;
8119 cur_idx = page_idx - 1;
8120
8121 while (start_idx++ < page_idx) {
8122 /*
8123 * must go through the list from back to front
8124 * so that the page list is created in the
8125 * correct order - low -> high phys addresses
8126 */
8127 m1 = vm_page_get(cur_idx--);
8128
8129 if (m1->vmp_object == 0) {
8130 /*
8131 * page has already been removed from
8132 * the free list in the 1st pass
8133 */
8134 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8135 assert(m1->vmp_offset == (vm_object_offset_t) -1);
8136 assert(m1->vmp_busy);
8137 assert(!m1->vmp_wanted);
8138 assert(!m1->vmp_laundry);
8139 } else {
8140 /*
8141 * try to relocate/steal the page
8142 */
8143 if (abort_run) {
8144 continue;
8145 }
8146
8147 assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
8148
8149 vm_object_t object = VM_PAGE_OBJECT(m1);
8150
8151 if (object != locked_object) {
8152 if (locked_object) {
8153 vm_object_unlock(locked_object);
8154 locked_object = VM_OBJECT_NULL;
8155 }
8156 if (vm_object_lock_try(object)) {
8157 locked_object = object;
8158 } else {
8159 /* object must be locked to relocate its pages */
8160 tmp_start_idx = cur_idx;
8161 abort_run = true;
8162 continue;
8163 }
8164 }
8165
8166 kr = vm_page_relocate(m1, &compressed_pages, VM_RELOCATE_REASON_CONTIGUOUS, NULL);
8167 if (kr != KERN_SUCCESS) {
8168 if (locked_object) {
8169 vm_object_unlock(locked_object);
8170 locked_object = VM_OBJECT_NULL;
8171 }
8172 tmp_start_idx = cur_idx;
8173 abort_run = true;
8174 continue;
8175 }
8176
8177 stolen_pages++;
8178 }
8179
8180 /* m1 is ours at this point ... */
8181
8182 if (m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
8183 /*
8184 * The Q state is preserved on m1 because vm_page_queues_remove doesn't
8185 * change it for pages marked as used-by-compressor.
8186 */
8187 vm_page_assign_special_state(m1, VM_PAGE_SPECIAL_Q_BG);
8188 }
8189 VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
8190 vm_page_list_push(&list, m1);
8191 }
8192
8193 if (locked_object) {
8194 vm_object_unlock(locked_object);
8195 locked_object = VM_OBJECT_NULL;
8196 }
8197
8198 if (abort_run) {
8199 /*
8200 * want the index of the last
8201 * page in this run that was
8202 * successfully 'stolen', so back
8203 * it up 1 for the auto-decrement on use
8204 * and 1 more to bump back over this page
8205 */
8206 page_idx = tmp_start_idx + 2;
8207 if (page_idx >= vm_pages_count) {
8208 if (wrapped) {
8209 if (list.vmpl_count) {
8210 vm_page_unlock_queues();
8211 vm_page_free_list(list.vmpl_head, FALSE);
8212 vm_page_lock_queues();
8213 list = (vm_page_list_t){ };
8214 }
8215 dumped_run++;
8216 goto done_scanning;
8217 }
8218 page_idx = last_idx = 0;
8219 wrapped = TRUE;
8220 }
8221 abort_run = false;
8222
8223 /*
8224 * The current run was aborted: reset the run state and resume
8225 * the scan from the adjusted page_idx (we only restart from the
8226 * very first page if we just wrapped above).
8227 */
8228 RESET_STATE_OF_RUN();
8229
8230 if (flags & KMA_LOMEM) {
8231 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = page_idx;
8232 } else {
8233 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
8234 }
8235
8236 last_idx = page_idx;
8237
8238 if (list.vmpl_count) {
8239 vm_page_unlock_queues();
8240 vm_page_free_list(list.vmpl_head, FALSE);
8241 vm_page_lock_queues();
8242 list = (vm_page_list_t){ };
8243 }
8244 dumped_run++;
8245
8246 vm_free_page_lock();
8247 /*
8248 * reset our free page limit since we
8249 * dropped the lock protecting the vm_page_free_queue
8250 */
8251 free_available = vm_page_free_count - vm_page_free_reserved;
8252 goto retry;
8253 }
8254 #if HAS_MTE
8255 else if (list.vmpl_has_tagged) {
8256 const unified_page_list_t pmap_batch_list = {
8257 .page_slist = list.vmpl_head,
8258 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
8259 };
8260
8261 /*
8262 * We successfully found a contiguous range we could
8263 * steal all the pages from. As a last step, make
8264 * certain all pages are regular pages, or convert
8265 * any non-regular pages to regular pages.
8266 */
8267 vm_page_unlock_queues();
8268
8269 /* Make any tagged pages we stole non-tagged. */
8270 pmap_unmake_tagged_pages(&pmap_batch_list);
8271
8272 vm_free_page_lock();
8273
8274 /* Clear the vm_page-level tagged state on any pages we stole. */
8275 vm_page_list_foreach(m1, list) {
8276 if (m1->vmp_using_mte) {
8277 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(m1);
8278
8279 m1->vmp_using_mte = false;
8280 mteinfo_covered_page_clear_tagged(pnum);
8281 }
8282 }
8283 list.vmpl_has_tagged = false;
8284 list.vmpl_has_untagged = true;
8285
8286 vm_free_page_unlock();
8287 vm_page_lock_queues();
8288 }
8289 #endif /* HAS_MTE */
8290
8291 vm_page_list_foreach(m1, list) {
8292 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
8293 assert(m1->vmp_wire_count == 0);
8294
8295 if (wire == TRUE) {
8296 m1->vmp_wire_count++;
8297 m1->vmp_q_state = VM_PAGE_IS_WIRED;
8298
8299 #if HAS_MTE
8300 if (m1->vmp_wire_count == 1) {
8301 mteinfo_increment_wire_count(m1);
8302 }
8303 #endif /* HAS_MTE */
8304 } else {
8305 m1->vmp_gobbled = TRUE;
8306 }
8307 }
8308 if (wire == FALSE) {
8309 vm_page_gobble_count += npages;
8310 }
8311
8312 /*
8313 * gobbled pages are also counted as wired pages
8314 */
8315 vm_page_wire_count += npages;
8316
8317 assert(vm_page_verify_contiguous(list.vmpl_head, npages));
8318 }
8319 done_scanning:
8320 PAGE_REPLACEMENT_ALLOWED(FALSE);
8321
8322 vm_page_unlock_queues();
8323
8324 #if DEBUG
8325 clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
8326
8327 tv_end_sec -= tv_start_sec;
8328 if (tv_end_usec < tv_start_usec) {
8329 tv_end_sec--;
8330 tv_end_usec += 1000000;
8331 }
8332 tv_end_usec -= tv_start_usec;
8333 if (tv_end_usec >= 1000000) {
8334 tv_end_sec++;
8335 tv_end_usec -= 1000000;
8336 }
8337 if (vm_page_find_contig_debug) {
8338 printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds... started at %d... scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages\n",
8339 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8340 (long)tv_end_sec, tv_end_usec, orig_last_idx,
8341 scanned, yielded, dumped_run, stolen_pages, compressed_pages);
8342 }
8343
8344 #endif
8345 #if MACH_ASSERT
8346 vm_page_verify_free_lists();
8347 #endif
8348 if (list.vmpl_count == 0 && zone_gc_called < 2) {
8349 printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
8350 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
8351 scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
8352
8353 if (consider_buffer_cache_collect != NULL) {
8354 (void)(*consider_buffer_cache_collect)(1);
8355 }
8356
8357 zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
8358
8359 zone_gc_called++;
8360
8361 printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
8362 goto full_scan_again;
8363 }
8364
8365 return list.vmpl_head;
8366 }
8367
8368 /*
8369 * Allocate a list of contiguous, wired pages.
8370 */
8371 kern_return_t
8372 cpm_allocate(
8373 vm_size_t size,
8374 vm_page_t *list,
8375 ppnum_t max_pnum,
8376 ppnum_t pnum_mask,
8377 boolean_t wire,
8378 int flags)
8379 {
8380 vm_page_t pages;
8381 unsigned int npages;
8382
8383 if (size % PAGE_SIZE != 0) {
8384 return KERN_INVALID_ARGUMENT;
8385 }
8386
8387 npages = (unsigned int) (size / PAGE_SIZE);
8388 if (npages != size / PAGE_SIZE) {
8389 /* 32-bit overflow */
8390 return KERN_INVALID_ARGUMENT;
8391 }
8392
8393 /*
8394 * Obtain a pointer to a subset of the free
8395 * list large enough to satisfy the request;
8396 * the region will be physically contiguous.
8397 */
8398 pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
8399
8400 if (pages == VM_PAGE_NULL) {
8401 return KERN_NO_SPACE;
8402 }
8403 /*
8404 * determine need for wakeups
8405 */
8406 if (vm_page_free_count < vm_page_free_min) {
8407 vm_free_page_lock();
8408 if (vm_pageout_running == FALSE) {
8409 vm_free_page_unlock();
8410 thread_wakeup((event_t) &vm_page_free_wanted);
8411 } else {
8412 vm_free_page_unlock();
8413 }
8414 }
8415
8416 VM_CHECK_MEMORYSTATUS;
8417
8418 /*
8419 * The CPM pages should now be available and
8420 * ordered by ascending physical address.
8421 */
8422 assert(vm_page_verify_contiguous(pages, npages));
8423
8424 if (flags & KMA_ZERO) {
8425 for (vm_page_t m = pages; m; m = NEXT_PAGE(m)) {
8426 vm_page_zero_fill(
8427 m
8428 #if HAS_MTE
8429 , false
8430 #endif /* HAS_MTE */
8431 );
8432 }
8433 }
8434
8435 *list = pages;
8436 return KERN_SUCCESS;
8437 }
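/*
 * Illustrative caller sketch for cpm_allocate() (editorial, not part of the
 * build). The size and flags chosen here are hypothetical; the walk over the
 * returned list uses NEXT_PAGE(), matching the assertions above.
 *
 *	vm_page_t       pages;
 *	kern_return_t   kr;
 *
 *	// 8 wired, physically contiguous, zero-filled pages; no physical
 *	// ceiling (max_pnum == 0) and no alignment mask (pnum_mask == 0).
 *	kr = cpm_allocate(8 * PAGE_SIZE, &pages, 0, 0, TRUE, KMA_ZERO);
 *	if (kr == KERN_SUCCESS) {
 *		for (vm_page_t m = pages; m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
 *			// pages are ordered by ascending physical address
 *			(void) VM_PAGE_GET_PHYS_PAGE(m);
 *		}
 *	}
 */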
8438
8439
8440 unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
8441
8442 /*
8443 * when working on a 'run' of pages, it is necessary to hold
8444 * the vm_page_queue_lock (a hot global lock) for certain operations
8445 * on the page... however, the majority of the work can be done
8446 * while merely holding the object lock... in fact there are certain
8447 * collections of pages that don't require any work brokered by the
8448 * vm_page_queue_lock... to mitigate the time spent behind the global
8449 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
8450 * while doing all of the work that doesn't require the vm_page_queue_lock...
8451 * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
8452 * necessary work for each page... we will grab the busy bit on the page
8453 * if it's not already held so that vm_page_do_delayed_work can drop the object lock
8454 * if it can't immediately take the vm_page_queue_lock in order to compete
8455 * for the locks in the same order that vm_pageout_scan takes them.
8456 * the operation names are modeled after the names of the routines that
8457 * need to be called in order to make the changes very obvious in the
8458 * original loop
8459 *
8460 * On certain configurations, this function may return failure if any of
8461 * the pages in the run has a mapping state that doesn't allow the specified
8462 * operation. In that case, it will still fully process the run of pages
8463 * in order to avoid requiring the caller to partially undo the work done
8464 * here.
8465 */
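/*
 * Editorial sketch of the two-pass pattern described above (illustrative
 * only, not part of the build). The batch size, the way the dwp entries are
 * filled in and the DW_* operations chosen are hypothetical; real callers in
 * the pageout/UPL paths use their own batching helpers.
 *
 *	struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
 *	int dw_count = 0;
 *
 *	vm_object_lock(object);
 *	// pass 1: object-lock-only work; queue one delayed operation per page
 *	for (each interesting page m of object) {
 *		dw_array[dw_count].dw_m = m;
 *		dw_array[dw_count].dw_mask = DW_clear_busy | DW_PAGE_WAKEUP;
 *		if (++dw_count == DEFAULT_DELAYED_WORK_LIMIT) {
 *			// pass 2: take the page queues lock once per batch
 *			vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE,
 *			    dw_array, dw_count);
 *			dw_count = 0;
 *		}
 *	}
 *	if (dw_count) {
 *		vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE,
 *		    dw_array, dw_count);
 *	}
 *	vm_object_unlock(object);
 */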
8466
8467 kern_return_t
8468 vm_page_do_delayed_work(
8469 vm_object_t object,
8470 vm_tag_t tag,
8471 struct vm_page_delayed_work *dwp,
8472 int dw_count)
8473 {
8474 kern_return_t kr = KERN_SUCCESS;
8475 int j;
8476 vm_page_t m;
8477 vm_page_t local_free_q = VM_PAGE_NULL;
8478
8479 /*
8480 * pageout_scan takes the vm_page_lock_queues first
8481 * then tries for the object lock... to avoid what
8482 * is effectively a lock inversion, we'll go to the
8483 * trouble of taking them in that same order... otherwise
8484 * if this object contains the majority of the pages resident
8485 * in the UBC (or a small set of large objects actively being
8486 * worked on contain the majority of the pages), we could
8487 * cause the pageout_scan thread to 'starve' in its attempt
8488 * to find pages to move to the free queue, since it has to
8489 * successfully acquire the object lock of any candidate page
8490 * before it can steal/clean it.
8491 */
8492 if (!vm_page_trylock_queues()) {
8493 vm_object_unlock(object);
8494
8495 /*
8496 * "Turnstile enabled vm_pageout_scan" can be runnable
8497 * for a very long time without getting on a core.
8498 * If this is a higher priority thread it could be
8499 * waiting here for a very long time respecting the fact
8500 * that pageout_scan would like its object after VPS does
8501 * a mutex_pause(0).
8502 * So we cap the number of yields in the vm_object_lock_avoid()
8503 * case to a single mutex_pause(0) which will give vm_pageout_scan
8504 * 10us to run and grab the object if needed.
8505 */
8506 vm_page_lock_queues();
8507
8508 for (j = 0;; j++) {
8509 if ((!vm_object_lock_avoid(object) ||
8510 (vps_dynamic_priority_enabled && (j > 0))) &&
8511 _vm_object_lock_try(object)) {
8512 break;
8513 }
8514 vm_page_unlock_queues();
8515 mutex_pause(j);
8516 vm_page_lock_queues();
8517 }
8518 }
8519 for (j = 0; j < dw_count; j++, dwp++) {
8520 m = dwp->dw_m;
8521
8522 if (dwp->dw_mask & DW_vm_pageout_throttle_up) {
8523 vm_pageout_throttle_up(m);
8524 }
8525 #if CONFIG_PHANTOM_CACHE
8526 if (dwp->dw_mask & DW_vm_phantom_cache_update) {
8527 vm_phantom_cache_update(m);
8528 }
8529 #endif
8530 if (dwp->dw_mask & DW_vm_page_wire) {
8531 vm_page_wire(m, tag, FALSE);
8532 if (dwp->dw_mask & DW_vm_page_iopl_wire) {
8533 #if CONFIG_SPTM
8534 /*
8535 * The SPTM's security model prevents us from allowing writable I/O
8536 * mappings of executable pages. We need to check that here,
8537 * in the same place that we set vmp_iopl_wired, because this
8538 * function may have transiently dropped the VM object lock
8539 * before reaching this point, which means that frontloading
8540 * this check in the caller may not work in all cases.
8541 */
8542 if ((dwp->dw_mask & DW_vm_page_iopl_wire_write) && PMAP_PAGE_IS_USER_EXECUTABLE(m)) {
8543 if (kr == KERN_SUCCESS) {
8544 kr = KERN_PROTECTION_FAILURE;
8545 vm_map_guard_exception(VM_PAGE_GET_PHYS_PAGE(m), kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE);
8546 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
8547 KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE),
8548 (uintptr_t)(VM_PAGE_GET_PHYS_PAGE(m)));
8549 }
8550 } else {
8551 m->vmp_iopl_wired = true;
8552 }
8553 #else
8554 m->vmp_iopl_wired = true;
8555 #endif /* CONFIG_SPTM */
8556 }
8557 } else if (dwp->dw_mask & DW_vm_page_unwire) {
8558 boolean_t queueit;
8559
8560 queueit = (dwp->dw_mask & (DW_vm_page_free | DW_vm_page_deactivate_internal)) ? FALSE : TRUE;
8561
8562 vm_page_unwire(m, queueit);
8563 }
8564 if (dwp->dw_mask & DW_vm_page_free) {
8565 vm_page_free_prepare_queues(m);
8566
8567 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
8568 /*
8569 * Add this page to our list of reclaimed pages,
8570 * to be freed later.
8571 */
8572 m->vmp_snext = local_free_q;
8573 local_free_q = m;
8574 } else {
8575 if (dwp->dw_mask & DW_vm_page_deactivate_internal) {
8576 vm_page_deactivate_internal(m, FALSE);
8577 } else if (dwp->dw_mask & DW_vm_page_activate) {
8578 if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
8579 vm_page_activate(m);
8580 }
8581 } else if (dwp->dw_mask & DW_vm_page_speculate) {
8582 vm_page_speculate(m, TRUE);
8583 } else if (dwp->dw_mask & DW_enqueue_cleaned) {
8584 /*
8585 * if we didn't hold the object lock and did this,
8586 * we might disconnect the page, then someone might
8587 * soft fault it back in, then we would put it on the
8588 * cleaned queue, and so we would have a referenced (maybe even dirty)
8589 * page on that queue, which we don't want
8590 */
8591 int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8592
8593 if ((refmod_state & VM_MEM_REFERENCED)) {
8594 /*
8595 * this page has been touched since it got cleaned; let's activate it
8596 * if it hasn't already been
8597 */
8598 VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
8599 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
8600
8601 if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
8602 vm_page_activate(m);
8603 }
8604 } else {
8605 m->vmp_reference = FALSE;
8606 vm_page_enqueue_cleaned(m);
8607 }
8608 } else if (dwp->dw_mask & DW_vm_page_lru) {
8609 vm_page_lru(m);
8610 } else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
8611 if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
8612 vm_page_queues_remove(m, TRUE);
8613 }
8614 }
8615 if (dwp->dw_mask & DW_set_reference) {
8616 m->vmp_reference = TRUE;
8617 } else if (dwp->dw_mask & DW_clear_reference) {
8618 m->vmp_reference = FALSE;
8619 }
8620
8621 if (dwp->dw_mask & DW_move_page) {
8622 if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
8623 vm_page_queues_remove(m, FALSE);
8624
8625 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
8626
8627 vm_page_enqueue_inactive(m, FALSE);
8628 }
8629 }
8630 if (dwp->dw_mask & DW_clear_busy) {
8631 m->vmp_busy = FALSE;
8632 }
8633
8634 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8635 vm_page_wakeup(object, m);
8636 }
8637 #if HAS_MTE
8638 if (dwp->dw_mask & DW_vm_page_wakeup_tag_storage) {
8639 assert(m->vmp_ts_wanted);
8640 mteinfo_tag_storage_wakeup(m, false);
8641 }
8642 #endif /* HAS_MTE */
8643 }
8644 }
8645 vm_page_unlock_queues();
8646
8647 if (local_free_q) {
8648 vm_page_free_list(local_free_q, TRUE);
8649 }
8650
8651 VM_CHECK_MEMORYSTATUS;
8652
8653 return kr;
8654 }
8655
8656 __abortlike
8657 static void
8658 __vm_page_alloc_list_failed_panic(
8659 vm_size_t page_count,
8660 kma_flags_t flags,
8661 kern_return_t kr)
8662 {
8663 panic("vm_page_alloc_list(%zd, 0x%x) failed unexpectedly with %d",
8664 (size_t)page_count, flags, kr);
8665 }
8666
8667 kern_return_t
8668 vm_page_alloc_list(vm_size_t page_count, kma_flags_t flags, vm_page_t *list)
8669 {
8670 vm_page_t page_list = VM_PAGE_NULL;
8671 vm_page_t mem;
8672 kern_return_t kr = KERN_SUCCESS;
8673 int page_grab_count = 0;
8674 task_t task;
8675
8676 for (vm_size_t i = 0; i < page_count; i++) {
8677 for (;;) {
8678 vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE;
8679
8680 #if HAS_MTE
8681 if (flags & KMA_TAG) {
8682 options |= VM_PAGE_GRAB_MTE;
8683 }
8684 if (vm_mte_tag_storage_for_compressor && (flags & KMA_COMPRESSOR)) {
8685 /*
8686 * These pages will be used in the compressor pool.
8687 * Prefer tag storage pages for these allocations.
8688 */
8689 options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
8690 }
8691 #endif /* HAS_MTE */
8692 if (flags & KMA_NOPAGEWAIT) {
8693 options |= VM_PAGE_GRAB_NOPAGEWAIT;
8694 }
8695 if (flags & KMA_LOMEM) {
8696 mem = vm_page_grablo(options);
8697 } else {
8698 mem = vm_page_grab_options(options);
8699 }
8700
8701 if (mem != VM_PAGE_NULL) {
8702 break;
8703 }
8704
8705 if (flags & KMA_NOPAGEWAIT) {
8706 kr = KERN_RESOURCE_SHORTAGE;
8707 goto out;
8708 }
8709 if ((flags & KMA_LOMEM) && vm_lopage_needed) {
8710 kr = KERN_RESOURCE_SHORTAGE;
8711 goto out;
8712 }
8713
8714 /* VM privileged threads should have waited in vm_page_grab() and not get here. */
8715 assert(!(current_thread()->options & TH_OPT_VMPRIV));
8716
8717 if ((flags & KMA_NOFAIL) == 0 && ptoa_64(page_count) > max_mem / 4) {
8718 uint64_t unavailable = ptoa_64(vm_page_wire_count + vm_page_free_target);
8719 if (unavailable > max_mem || ptoa_64(page_count) > (max_mem - unavailable)) {
8720 kr = KERN_RESOURCE_SHORTAGE;
8721 goto out;
8722 }
8723 }
8724 VM_PAGE_WAIT();
8725 }
8726
8727 page_grab_count++;
8728 mem->vmp_snext = page_list;
8729 page_list = mem;
8730 }
8731
8732 if ((KMA_ZERO | KMA_NOENCRYPT) & flags) {
8733 for (mem = page_list; mem; mem = mem->vmp_snext) {
8734 vm_page_zero_fill(
8735 mem
8736 #if HAS_MTE
8737 , false /* zero_tags */
8738 #endif /* HAS_MTE */
8739 );
8740 }
8741 }
8742
8743 out:
8744 task = current_task_early();
8745 if (task != NULL) {
8746 ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
8747 }
8748 counter_add(&vm_page_grab_count_kern, page_grab_count);
8749
8750 if (kr == KERN_SUCCESS) {
8751 *list = page_list;
8752 } else if (flags & KMA_NOFAIL) {
8753 __vm_page_alloc_list_failed_panic(page_count, flags, kr);
8754 } else {
8755 vm_page_free_list(page_list, FALSE);
8756 }
8757
8758 return kr;
8759 }
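/*
 * Illustrative caller sketch for vm_page_alloc_list() (editorial, not part
 * of the build). The flag combination is hypothetical; the traversal and
 * release use vm_page_get_next() and vm_page_free_list() from this file.
 *
 *	vm_page_t       page_list = VM_PAGE_NULL;
 *	kern_return_t   kr;
 *
 *	kr = vm_page_alloc_list(16, KMA_ZERO | KMA_NOPAGEWAIT, &page_list);
 *	if (kr == KERN_SUCCESS) {
 *		for (vm_page_t m = page_list; m != VM_PAGE_NULL;
 *		    m = vm_page_get_next(m)) {
 *			// pages are linked through vmp_snext and zero-filled
 *		}
 *		vm_page_free_list(page_list, FALSE);
 *	}
 */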
8760
8761 void
8762 vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
8763 {
8764 page->vmp_offset = offset;
8765 }
8766
8767 vm_page_t
8768 vm_page_get_next(vm_page_t page)
8769 {
8770 return page->vmp_snext;
8771 }
8772
8773 vm_object_offset_t
8774 vm_page_get_offset(vm_page_t page)
8775 {
8776 return page->vmp_offset;
8777 }
8778
8779 ppnum_t
8780 vm_page_get_phys_page(vm_page_t page)
8781 {
8782 return VM_PAGE_GET_PHYS_PAGE(page);
8783 }
8784
8785
8786 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8787
8788 #if HIBERNATION
8789
8790 static uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
8791
8792 struct hibernate_statistics {
8793 int hibernate_considered;
8794 int hibernate_reentered_on_q;
8795 int hibernate_found_dirty;
8796 int hibernate_skipped_cleaning;
8797 int hibernate_skipped_transient;
8798 int hibernate_skipped_precious;
8799 int hibernate_skipped_external;
8800 int hibernate_queue_nolock;
8801 int hibernate_queue_paused;
8802 int hibernate_throttled;
8803 int hibernate_throttle_timeout;
8804 int hibernate_drained;
8805 int hibernate_drain_timeout;
8806 int cd_lock_failed;
8807 int cd_found_precious;
8808 int cd_found_wired;
8809 int cd_found_busy;
8810 int cd_found_unusual;
8811 int cd_found_cleaning;
8812 int cd_found_laundry;
8813 int cd_found_dirty;
8814 int cd_found_xpmapped;
8815 int cd_skipped_xpmapped;
8816 int cd_local_free;
8817 int cd_total_free;
8818 int cd_vm_page_wire_count;
8819 int cd_vm_struct_pages_unneeded;
8820 int cd_pages;
8821 int cd_discarded;
8822 int cd_count_wire;
8823 } hibernate_stats;
8824
8825 #if CONFIG_SPTM
8826 /**
8827 * On SPTM-based systems don't save any executable pages into the hibernation
8828 * image. The SPTM has stronger guarantees around not allowing write access to
8829 * the executable pages than on older systems, which prevents XNU from being
8830 * able to restore any pages mapped as executable.
8831 */
8832 #define HIBERNATE_XPMAPPED_LIMIT 0ULL
8833 #else /* CONFIG_SPTM */
8834 /*
8835 * clamp the number of 'xpmapped' pages we'll sweep into the hibernation image
8836 * so that we don't overrun the estimated image size, which would
8837 * result in a hibernation failure.
8838 *
8839 * We use a size value instead of pages because we don't want to take up more space
8840 * on disk if the system has a 16K page size vs 4K. Also, we are not guaranteed
8841 * to have that additional space available.
8842 *
8843 * Since this was set at 40000 pages on X86 we are going to use 160MB as our
8844 * xpmapped size.
8845 */
8846 #define HIBERNATE_XPMAPPED_LIMIT ((160 * 1024 * 1024ULL) / PAGE_SIZE)
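/*
 * For reference (editorial note): 160 MB works out to 40,960 pages with a
 * 4 KB page size and 10,240 pages with a 16 KB page size, so the page-count
 * limit above scales with the configured PAGE_SIZE.
 */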
8847 #endif /* CONFIG_SPTM */
8848
8849 static int
8850 hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
8851 {
8852 wait_result_t wait_result;
8853
8854 vm_page_lock_queues();
8855
8856 while (!vm_page_queue_empty(&q->pgo_pending)) {
8857 q->pgo_draining = TRUE;
8858
8859 assert_wait_timeout((event_t) (&q->pgo_laundry + 1), THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
8860
8861 vm_page_unlock_queues();
8862
8863 wait_result = thread_block(THREAD_CONTINUE_NULL);
8864
8865 if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) {
8866 hibernate_stats.hibernate_drain_timeout++;
8867
8868 if (q == &vm_pageout_queue_external) {
8869 return 0;
8870 }
8871
8872 return 1;
8873 }
8874 vm_page_lock_queues();
8875
8876 hibernate_stats.hibernate_drained++;
8877 }
8878 vm_page_unlock_queues();
8879
8880 return 0;
8881 }
8882
8883
8884 boolean_t hibernate_skip_external = FALSE;
8885
8886 static int
8887 hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
8888 {
8889 vm_page_t m;
8890 vm_object_t l_object = NULL;
8891 vm_object_t m_object = NULL;
8892 int refmod_state = 0;
8893 int try_failed_count = 0;
8894 int retval = 0;
8895 int current_run = 0;
8896 struct vm_pageout_queue *iq;
8897 struct vm_pageout_queue *eq;
8898 struct vm_pageout_queue *tq;
8899
8900 KDBG(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START,
8901 VM_KERNEL_UNSLIDE_OR_PERM(q), qcount);
8902
8903 iq = &vm_pageout_queue_internal;
8904 eq = &vm_pageout_queue_external;
8905
8906 vm_page_lock_queues();
8907
8908 while (qcount && !vm_page_queue_empty(q)) {
8909 if (current_run++ == 1000) {
8910 if (hibernate_should_abort()) {
8911 retval = 1;
8912 break;
8913 }
8914 current_run = 0;
8915 }
8916
8917 m = (vm_page_t) vm_page_queue_first(q);
8918 m_object = VM_PAGE_OBJECT(m);
8919
8920 /*
8921 * check to see if we currently are working
8922 * with the same object... if so, we've
8923 * already got the lock
8924 */
8925 if (m_object != l_object) {
8926 /*
8927 * the object associated with candidate page is
8928 * different from the one we were just working
8929 * with... dump the lock if we still own it
8930 */
8931 if (l_object != NULL) {
8932 vm_object_unlock(l_object);
8933 l_object = NULL;
8934 }
8935 /*
8936 * Try to lock the object; since we've already got the
8937 * page queues lock, we can only 'try' for this one.
8938 * if the 'try' fails, we need to do a mutex_pause
8939 * to allow the owner of the object lock a chance to
8940 * run...
8941 */
8942 if (!vm_object_lock_try_scan(m_object)) {
8943 if (try_failed_count > 20) {
8944 hibernate_stats.hibernate_queue_nolock++;
8945
8946 goto reenter_pg_on_q;
8947 }
8948
8949 vm_page_unlock_queues();
8950 mutex_pause(try_failed_count++);
8951 vm_page_lock_queues();
8952
8953 hibernate_stats.hibernate_queue_paused++;
8954 continue;
8955 } else {
8956 l_object = m_object;
8957 }
8958 }
8959 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m)) {
8960 /*
8961 * page is not to be cleaned
8962 * put it back on the head of its queue
8963 */
8964 if (m->vmp_cleaning) {
8965 hibernate_stats.hibernate_skipped_cleaning++;
8966 } else {
8967 hibernate_stats.hibernate_skipped_transient++;
8968 }
8969
8970 goto reenter_pg_on_q;
8971 }
8972 if (m_object->vo_copy == VM_OBJECT_NULL) {
8973 if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
8974 /*
8975 * let the normal hibernate image path
8976 * deal with these
8977 */
8978 goto reenter_pg_on_q;
8979 }
8980 }
8981 if (!m->vmp_dirty && m->vmp_pmapped) {
8982 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
8983
8984 if ((refmod_state & VM_MEM_MODIFIED)) {
8985 SET_PAGE_DIRTY(m, FALSE);
8986 }
8987 } else {
8988 refmod_state = 0;
8989 }
8990
8991 if (!m->vmp_dirty) {
8992 /*
8993 * page is not to be cleaned
8994 * put it back on the head of its queue
8995 */
8996 if (m->vmp_precious) {
8997 hibernate_stats.hibernate_skipped_precious++;
8998 }
8999
9000 goto reenter_pg_on_q;
9001 }
9002
9003 if (hibernate_skip_external == TRUE && !m_object->internal) {
9004 hibernate_stats.hibernate_skipped_external++;
9005
9006 goto reenter_pg_on_q;
9007 }
9008 tq = NULL;
9009
9010 if (m_object->internal) {
9011 if (VM_PAGE_Q_THROTTLED(iq)) {
9012 tq = iq;
9013 }
9014 } else if (VM_PAGE_Q_THROTTLED(eq)) {
9015 tq = eq;
9016 }
9017
9018 if (tq != NULL) {
9019 wait_result_t wait_result;
9020 int wait_count = 5;
9021
9022 if (l_object != NULL) {
9023 vm_object_unlock(l_object);
9024 l_object = NULL;
9025 }
9026
9027 while (retval == 0) {
9028 tq->pgo_throttled = TRUE;
9029
9030 assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);
9031
9032 vm_page_unlock_queues();
9033
9034 wait_result = thread_block(THREAD_CONTINUE_NULL);
9035
9036 vm_page_lock_queues();
9037
9038 if (wait_result != THREAD_TIMED_OUT) {
9039 break;
9040 }
9041 if (!VM_PAGE_Q_THROTTLED(tq)) {
9042 break;
9043 }
9044
9045 if (hibernate_should_abort()) {
9046 retval = 1;
9047 }
9048
9049 if (--wait_count == 0) {
9050 hibernate_stats.hibernate_throttle_timeout++;
9051
9052 if (tq == eq) {
9053 hibernate_skip_external = TRUE;
9054 break;
9055 }
9056 retval = 1;
9057 }
9058 }
9059 if (retval) {
9060 break;
9061 }
9062
9063 hibernate_stats.hibernate_throttled++;
9064
9065 continue;
9066 }
9067 /*
9068 * we've already factored out pages in the laundry which
9069 * means this page can't be on the pageout queue so it's
9070 * safe to do the vm_page_queues_remove
9071 */
9072 vm_page_queues_remove(m, TRUE);
9073
9074 if (m_object->internal == TRUE) {
9075 pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL);
9076 }
9077
9078 vm_pageout_cluster(m);
9079
9080 hibernate_stats.hibernate_found_dirty++;
9081
9082 goto next_pg;
9083
9084 reenter_pg_on_q:
9085 vm_page_queue_remove(q, m, vmp_pageq);
9086 vm_page_queue_enter(q, m, vmp_pageq);
9087
9088 hibernate_stats.hibernate_reentered_on_q++;
9089 next_pg:
9090 hibernate_stats.hibernate_considered++;
9091
9092 qcount--;
9093 try_failed_count = 0;
9094 }
9095 if (l_object != NULL) {
9096 vm_object_unlock(l_object);
9097 l_object = NULL;
9098 }
9099
9100 vm_page_unlock_queues();
9101
9102 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);
9103
9104 return retval;
9105 }
9106
9107
9108 static int
9109 hibernate_flush_dirty_pages(int pass)
9110 {
9111 struct vm_speculative_age_q *aq;
9112 uint32_t i;
9113
9114 if (vm_page_local_q) {
9115 zpercpu_foreach_cpu(lid) {
9116 vm_page_reactivate_local(lid, TRUE, FALSE);
9117 }
9118 }
9119
9120 for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
9121 int qcount;
9122 vm_page_t m;
9123
9124 aq = &vm_page_queue_speculative[i];
9125
9126 if (vm_page_queue_empty(&aq->age_q)) {
9127 continue;
9128 }
9129 qcount = 0;
9130
9131 vm_page_lockspin_queues();
9132
9133 vm_page_queue_iterate(&aq->age_q, m, vmp_pageq) {
9134 qcount++;
9135 }
9136 vm_page_unlock_queues();
9137
9138 if (qcount) {
9139 if (hibernate_flush_queue(&aq->age_q, qcount)) {
9140 return 1;
9141 }
9142 }
9143 }
9144 if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) {
9145 return 1;
9146 }
9147 /* XXX FBDP TODO: flush secluded queue */
9148 if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) {
9149 return 1;
9150 }
9151 if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) {
9152 return 1;
9153 }
9154 if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
9155 return 1;
9156 }
9157
9158 if (pass == 1) {
9159 vm_compressor_record_warmup_start();
9160 }
9161
9162 if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
9163 if (pass == 1) {
9164 vm_compressor_record_warmup_end();
9165 }
9166 return 1;
9167 }
9168 if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
9169 if (pass == 1) {
9170 vm_compressor_record_warmup_end();
9171 }
9172 return 1;
9173 }
9174 if (pass == 1) {
9175 vm_compressor_record_warmup_end();
9176 }
9177
9178 if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) {
9179 return 1;
9180 }
9181
9182 return 0;
9183 }
9184
9185
9186 void
9187 hibernate_reset_stats(void)
9188 {
9189 bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
9190 }
9191
9192
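/*
 * Top-level memory flush performed before the hibernation image is
 * written: flush dirty pages to the compressor, flush the compressor
 * itself, and then ask the buffer cache and the zone allocator to give
 * back wired pages.  Returns non-zero on failure or abort.
 */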
9193 int
9194 hibernate_flush_memory(void)
9195 {
9196 int retval;
9197
9198 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
9199
9200 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);
9201
9202 hibernate_cleaning_in_progress = TRUE;
9203 hibernate_skip_external = FALSE;
9204
9205 if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
9206 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
9207
9208 vm_compressor_flush();
9209
9210 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);
9211
9212 if (consider_buffer_cache_collect != NULL) {
9213 unsigned int orig_wire_count;
9214
9215 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
9216 orig_wire_count = vm_page_wire_count;
9217
9218 (void)(*consider_buffer_cache_collect)(1);
9219 zone_gc(ZONE_GC_DRAIN);
9220
9221 HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);
9222
9223 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
9224 }
9225 }
9226 hibernate_cleaning_in_progress = FALSE;
9227
9228 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);
9229
9230 if (retval) {
9231 HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
9232 }
9233
9234
9235 HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
9236 hibernate_stats.hibernate_considered,
9237 hibernate_stats.hibernate_reentered_on_q,
9238 hibernate_stats.hibernate_found_dirty);
9239 HIBPRINT(" skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
9240 hibernate_stats.hibernate_skipped_cleaning,
9241 hibernate_stats.hibernate_skipped_transient,
9242 hibernate_stats.hibernate_skipped_precious,
9243 hibernate_stats.hibernate_skipped_external,
9244 hibernate_stats.hibernate_queue_nolock);
9245 HIBPRINT(" queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
9246 hibernate_stats.hibernate_queue_paused,
9247 hibernate_stats.hibernate_throttled,
9248 hibernate_stats.hibernate_throttle_timeout,
9249 hibernate_stats.hibernate_drained,
9250 hibernate_stats.hibernate_drain_timeout);
9251
9252 return retval;
9253 }
9254
9255
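/*
 * Clear every bank bitmap in the page list and set the out-of-range bits
 * in the last word of each bank so they are never treated as pages.
 */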
9256 static void
9257 hibernate_page_list_zero(hibernate_page_list_t *list)
9258 {
9259 uint32_t bank;
9260 hibernate_bitmap_t * bitmap;
9261
9262 bitmap = &list->bank_bitmap[0];
9263 for (bank = 0; bank < list->bank_count; bank++) {
9264 uint32_t last_bit;
9265
9266 bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
9267 // set out-of-bound bits at end of bitmap.
9268 last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
9269 if (last_bit) {
9270 bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
9271 }
9272
9273 bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
9274 }
9275 }
9276
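/*
 * Decide whether this page can be discarded on wakeup instead of being
 * written to the image: it must belong to an object we can lock, be
 * unwired, not busy/precious/cleaning/laundry, and either be clean or
 * belong to a volatile/empty purgeable object.  Statistics are only
 * updated when this is not a preflight pass.
 */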
9277 static boolean_t
9278 hibernate_consider_discard(vm_page_t m, boolean_t preflight)
9279 {
9280 vm_object_t object = NULL;
9281 int refmod_state;
9282 boolean_t discard = FALSE;
9283
9284 do {
9285 if (vm_page_is_private(m)) {
9286 panic("hibernate_consider_discard: private");
9287 }
9288
9289 object = VM_PAGE_OBJECT(m);
9290
9291 if (!vm_object_lock_try(object)) {
9292 object = NULL;
9293 if (!preflight) {
9294 hibernate_stats.cd_lock_failed++;
9295 }
9296 break;
9297 }
9298 if (VM_PAGE_WIRED(m)) {
9299 if (!preflight) {
9300 hibernate_stats.cd_found_wired++;
9301 }
9302 break;
9303 }
9304 if (m->vmp_precious) {
9305 if (!preflight) {
9306 hibernate_stats.cd_found_precious++;
9307 }
9308 break;
9309 }
9310 if (m->vmp_busy || !object->alive) {
9311 /*
9312 * Somebody is playing with this page.
9313 */
9314 if (!preflight) {
9315 hibernate_stats.cd_found_busy++;
9316 }
9317 break;
9318 }
9319 if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
9320 /*
9321 * If it's unusual in any way, ignore it
9322 */
9323 if (!preflight) {
9324 hibernate_stats.cd_found_unusual++;
9325 }
9326 break;
9327 }
9328 if (m->vmp_cleaning) {
9329 if (!preflight) {
9330 hibernate_stats.cd_found_cleaning++;
9331 }
9332 break;
9333 }
9334 if (m->vmp_laundry) {
9335 if (!preflight) {
9336 hibernate_stats.cd_found_laundry++;
9337 }
9338 break;
9339 }
9340 if (!m->vmp_dirty) {
9341 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
9342
9343 if (refmod_state & VM_MEM_REFERENCED) {
9344 m->vmp_reference = TRUE;
9345 }
9346 if (refmod_state & VM_MEM_MODIFIED) {
9347 SET_PAGE_DIRTY(m, FALSE);
9348 }
9349 }
9350
9351 /*
9352 * If it's clean or purgeable we can discard the page on wakeup.
9353 */
9354 discard = (!m->vmp_dirty)
9355 || (VM_PURGABLE_VOLATILE == object->purgable)
9356 || (VM_PURGABLE_EMPTY == object->purgable);
9357
9358
9359 if (discard == FALSE) {
9360 if (!preflight) {
9361 hibernate_stats.cd_found_dirty++;
9362 }
9363 } else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
9364 if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
9365 if (!preflight) {
9366 hibernate_stats.cd_found_xpmapped++;
9367 }
9368 discard = FALSE;
9369 } else {
9370 if (!preflight) {
9371 hibernate_stats.cd_skipped_xpmapped++;
9372 }
9373 }
9374 }
9375 } while (FALSE);
9376
9377 if (object) {
9378 vm_object_unlock(object);
9379 }
9380
9381 return discard;
9382 }
9383
9384
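/*
 * Free a page that hibernate_consider_discard() approved.  For pages of
 * volatile purgeable objects the object is transitioned to EMPTY and
 * vm_page_purgeable_count is adjusted accordingly.
 */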
9385 static void
9386 hibernate_discard_page(vm_page_t m)
9387 {
9388 vm_object_t m_object;
9389
9390 if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
9391 /*
9392 * If it's unusual in any way, ignore it
9393 */
9394 return;
9395 }
9396
9397 m_object = VM_PAGE_OBJECT(m);
9398
9399 #if MACH_ASSERT || DEBUG
9400 if (!vm_object_lock_try(m_object)) {
9401 panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
9402 }
9403 #else
9404 /* No need to lock page queue for token delete, hibernate_vm_unlock()
9405 * makes sure these locks are uncontended before sleep */
9406 #endif /* MACH_ASSERT || DEBUG */
9407
9408 if (m->vmp_pmapped == TRUE) {
9409 __unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
9410 }
9411
9412 if (m->vmp_laundry) {
9413 panic("hibernate_discard_page(%p) laundry", m);
9414 }
9415 if (vm_page_is_private(m)) {
9416 panic("hibernate_discard_page(%p) private", m);
9417 }
9418 if (vm_page_is_fictitious(m)) {
9419 panic("hibernate_discard_page(%p) fictitious", m);
9420 }
9421
9422 if (VM_PURGABLE_VOLATILE == m_object->purgable) {
9423 /* object should be on a queue */
9424 assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL));
9425 purgeable_q_t old_queue = vm_purgeable_object_remove(m_object);
9426 assert(old_queue);
9427 if (m_object->purgeable_when_ripe) {
9428 vm_purgeable_token_delete_first(old_queue);
9429 }
9430 vm_object_lock_assert_exclusive(m_object);
9431 VM_OBJECT_SET_PURGABLE(m_object, VM_PURGABLE_EMPTY);
9432
9433 /*
9434 * Purgeable ledgers: pages of VOLATILE and EMPTY objects are
9435 * accounted in the "volatile" ledger, so no change here.
9436 * We have to update vm_page_purgeable_count, though, since we're
9437 * effectively purging this object.
9438 */
9439 unsigned int delta;
9440 assert(m_object->resident_page_count >= m_object->wired_page_count);
9441 delta = (m_object->resident_page_count - m_object->wired_page_count);
9442 assert(vm_page_purgeable_count >= delta);
9443 assert(delta > 0);
9444 OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count);
9445 }
9446
9447 vm_page_free(m);
9448
9449 #if MACH_ASSERT || DEBUG
9450 vm_object_unlock(m_object);
9451 #endif /* MACH_ASSERT || DEBUG */
9452 }
9453
9454 /*
9455 * Grab locks for hibernate_page_list_setall()
9456 */
9457 void
9458 hibernate_vm_lock_queues(void)
9459 {
9460 vm_object_lock(compressor_object);
9461 vm_page_lock_queues();
9462 vm_free_page_lock();
9463 lck_mtx_lock(&vm_purgeable_queue_lock);
9464
9465 if (vm_page_local_q) {
9466 zpercpu_foreach(lq, vm_page_local_q) {
9467 VPL_LOCK(&lq->vpl_lock);
9468 }
9469 }
9470 }
9471
9472 void
9473 hibernate_vm_unlock_queues(void)
9474 {
9475 if (vm_page_local_q) {
9476 zpercpu_foreach(lq, vm_page_local_q) {
9477 VPL_UNLOCK(&lq->vpl_lock);
9478 }
9479 }
9480 lck_mtx_unlock(&vm_purgeable_queue_lock);
9481 vm_free_page_unlock();
9482 vm_page_unlock_queues();
9483 vm_object_unlock(compressor_object);
9484 }
9485
9486 #if CONFIG_SPTM
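/*
 * True for SPTM frame types (user JIT, user debug, or internal user
 * executable frames) that must be carried in the wired page list rather
 * than the pageable one.
 */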
9487 static bool
9488 hibernate_sptm_should_force_page_to_wired_pagelist(vm_page_t vmp)
9489 {
9490 const sptm_paddr_t paddr = ptoa_64(VM_PAGE_GET_PHYS_PAGE(vmp));
9491 const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
9492 const vm_object_t vmp_objp = VM_PAGE_OBJECT(vmp);
9493
9494 return frame_type == XNU_USER_JIT || frame_type == XNU_USER_DEBUG ||
9495 (frame_type == XNU_USER_EXEC && vmp_objp->internal == TRUE);
9496 }
9497 #endif
9498
9499 /*
9500 * A zero bit in the bitmaps => the page needs to be saved. All pages default to being saved;
9501 * pages known to the VM not to need saving are subtracted.
9502 * Wired pages to be saved are present in page_list_wired, pageable in page_list.
9503 */
9504
9505 void
9506 hibernate_page_list_setall(hibernate_page_list_t * page_list,
9507 hibernate_page_list_t * page_list_wired,
9508 hibernate_page_list_t * page_list_pal,
9509 boolean_t preflight,
9510 boolean_t will_discard,
9511 uint32_t * pagesOut)
9512 {
9513 uint64_t start, end, nsec;
9514 vm_page_t m;
9515 vm_page_t next;
9516 __block uint32_t pages = page_list->page_count;
9517 __block uint32_t count_wire = pages;
9518 uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
9519 uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
9520 uint32_t count_discard_active = 0;
9521 uint32_t count_discard_inactive = 0;
9522 uint32_t count_retired = 0;
9523 uint32_t count_discard_cleaned = 0;
9524 uint32_t count_discard_purgeable = 0;
9525 uint32_t count_discard_speculative = 0;
9526 uint32_t count_discard_vm_struct_pages = 0;
9527 uint32_t bank;
9528 hibernate_bitmap_t * bitmap;
9529 hibernate_bitmap_t * bitmap_wired;
9530 boolean_t discard_all;
9531 boolean_t discard = FALSE;
9532
9533 HIBLOG("hibernate_page_list_setall(preflight %d) start\n", preflight);
9534
9535 if (preflight) {
9536 page_list = NULL;
9537 page_list_wired = NULL;
9538 page_list_pal = NULL;
9539 discard_all = FALSE;
9540 } else {
9541 discard_all = will_discard;
9542 }
9543
9544 #if MACH_ASSERT || DEBUG
9545 if (!preflight) {
9546 assert(hibernate_vm_locks_are_safe());
9547 vm_page_lock_queues();
9548 if (vm_page_local_q) {
9549 zpercpu_foreach(lq, vm_page_local_q) {
9550 VPL_LOCK(&lq->vpl_lock);
9551 }
9552 }
9553 }
9554 #endif /* MACH_ASSERT || DEBUG */
9555
9556
9557 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);
9558
9559 clock_get_uptime(&start);
9560
9561 if (!preflight) {
9562 hibernate_page_list_zero(page_list);
9563 hibernate_page_list_zero(page_list_wired);
9564 hibernate_page_list_zero(page_list_pal);
9565
9566 hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
9567 hibernate_stats.cd_pages = pages;
9568 }
9569
9570 if (vm_page_local_q) {
9571 zpercpu_foreach_cpu(lid) {
9572 vm_page_reactivate_local(lid, TRUE, !preflight);
9573 }
9574 }
9575
9576 if (preflight) {
9577 vm_object_lock(compressor_object);
9578 vm_page_lock_queues();
9579 vm_free_page_lock();
9580 }
9581
9582 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
9583
9584 hibernation_vmqueues_inspection = TRUE;
9585
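/*
 * Account for a page that is sitting on a free queue: it does not need
 * to be saved, so mark it in both bitmaps and take it out of the running
 * page and wire counts.
 */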
9586 __auto_type hib_free_boilerplate = ^(vm_page_t page) {
9587 assert((page->vmp_q_state == VM_PAGE_ON_FREE_Q) ||
9588 #if XNU_VM_HAS_LOPAGE
9589 (page->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
9590 #endif /* XNU_VM_HAS_LOPAGE */
9591 (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q));
9592
9593 pages--;
9594 count_wire--;
9595
9596 if (!preflight) {
9597 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(page));
9598 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(page));
9599
9600 hibernate_stats.cd_total_free++;
9601
9602 if (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) {
9603 hibernate_stats.cd_local_free++;
9604 }
9605 }
9606 };
9607
9608 if (!preflight) {
9609 percpu_foreach(free_pages_head, free_pages) {
9610 _vm_page_list_foreach(m, *free_pages_head) {
9611 assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9612 hib_free_boilerplate(m);
9613 }
9614 }
9615 #if HAS_MTE
9616 percpu_foreach(mte_pcpu, mte_pcpu) {
9617 _vm_page_list_foreach(m, mte_pcpu->free_tagged_pages) {
9618 assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9619 hib_free_boilerplate(m);
9620 }
9621 vm_page_queue_iterate(&mte_pcpu->free_claimed_pages,
9622 m, vmp_pageq) {
9623 assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
9624 hib_free_boilerplate(m);
9625 }
9626 }
9627 #endif /* HAS_MTE */
9628 }
9629
9630 #if CONFIG_SPTM
9631 if (vm_pages_free_masks()) {
9632 uint32_t bits = vm_pages_free_mask_len() * MAX_COLORS;
9633 bitmap_t *map = vm_pages_free_masks_as_bitmap(0);
9634
9635 for (int bit = bitmap_first(map, bits);
9636 bit >= 0; bit = bitmap_next(map, bit)) {
9637 ppnum_t pnum = pmap_first_pnum + bit;
9638 vm_page_t mem = vm_page_find_canonical(pnum);
9639
9640 hib_free_boilerplate(mem);
9641 }
9642 } else
9643 #endif /* CONFIG_SPTM */
9644 {
9645 vm_page_free_queue_foreach(&vm_page_queue_free, hib_free_boilerplate);
9646 }
9647 #if HAS_MTE
9648 mteinfo_free_queue_foreach(hib_free_boilerplate);
9649 #endif /* HAS_MTE */
9650 #if XNU_VM_HAS_LOPAGE
9651 vm_page_free_queue_foreach(&vm_lopage_queue_free, hib_free_boilerplate);
9652 #endif /* XNU_VM_HAS_LOPAGE */
9653
9654 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9655 while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) {
9656 assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
9657
9658 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9659 discard = FALSE;
9660 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
9661 && hibernate_consider_discard(m, preflight)) {
9662 if (!preflight) {
9663 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9664 }
9665 count_discard_inactive++;
9666 discard = discard_all;
9667 } else {
9668 count_throttled++;
9669 }
9670 count_wire--;
9671 if (!preflight) {
9672 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9673 }
9674
9675 if (discard) {
9676 hibernate_discard_page(m);
9677 }
9678 m = next;
9679 }
9680
9681 m = (vm_page_t)vm_page_queue_first(&vm_page_queue_anonymous);
9682 while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
9683 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
9684 bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
9685 #if CONFIG_SPTM
9686 force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9687 #endif
9688 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9689 discard = FALSE;
9690 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9691 hibernate_consider_discard(m, preflight)) {
9692 if (!preflight) {
9693 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9694 }
9695 if (m->vmp_dirty) {
9696 count_discard_purgeable++;
9697 } else {
9698 count_discard_inactive++;
9699 }
9700 discard = discard_all;
9701 } else {
9702 /*
9703 * If the page must be force-added to the wired page list, prevent it from appearing
9704 * in the unwired page list.
9705 */
9706 if (force_to_wired_list) {
9707 if (!preflight) {
9708 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9709 }
9710 } else {
9711 count_anonymous++;
9712 }
9713 }
9714 /*
9715 * If the page is NOT being forced into the wired page list, remove it from the
9716 * wired page list here.
9717 */
9718 if (!force_to_wired_list) {
9719 count_wire--;
9720 if (!preflight) {
9721 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9722 }
9723 }
9724 if (discard) {
9725 hibernate_discard_page(m);
9726 }
9727 m = next;
9728 }
9729
9730 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
9731 while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
9732 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
9733
9734 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9735 discard = FALSE;
9736 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9737 hibernate_consider_discard(m, preflight)) {
9738 if (!preflight) {
9739 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9740 }
9741 if (m->vmp_dirty) {
9742 count_discard_purgeable++;
9743 } else {
9744 count_discard_cleaned++;
9745 }
9746 discard = discard_all;
9747 } else {
9748 count_cleaned++;
9749 }
9750 count_wire--;
9751 if (!preflight) {
9752 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9753 }
9754 if (discard) {
9755 hibernate_discard_page(m);
9756 }
9757 m = next;
9758 }
9759
9760 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9761 while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
9762 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
9763 bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
9764 #if CONFIG_SPTM
9765 force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9766 #endif
9767 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9768 discard = FALSE;
9769 if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) &&
9770 hibernate_consider_discard(m, preflight)) {
9771 if (!preflight) {
9772 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9773 }
9774 if (m->vmp_dirty) {
9775 count_discard_purgeable++;
9776 } else {
9777 count_discard_active++;
9778 }
9779 discard = discard_all;
9780 } else {
9781 /*
9782 * If the page must be force-added to the wired page list, prevent it from appearing
9783 * in the unwired page list.
9784 */
9785 if (force_to_wired_list) {
9786 if (!preflight) {
9787 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9788 }
9789 } else {
9790 count_active++;
9791 }
9792 }
9793 /*
9794 * If the page is NOT being forced into the wired page list, remove it from the
9795 * wired page list here.
9796 */
9797 if (!force_to_wired_list) {
9798 count_wire--;
9799 if (!preflight) {
9800 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9801 }
9802 }
9803 if (discard) {
9804 hibernate_discard_page(m);
9805 }
9806 m = next;
9807 }
9808
9809 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9810 while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
9811 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
9812 bool force_to_wired_list = false; /* Default to NOT forcing page into the wired page list */
9813 #if CONFIG_SPTM
9814 force_to_wired_list = hibernate_sptm_should_force_page_to_wired_pagelist(m);
9815 #endif
9816 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9817 discard = FALSE;
9818 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9819 hibernate_consider_discard(m, preflight)) {
9820 if (!preflight) {
9821 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9822 }
9823 if (m->vmp_dirty) {
9824 count_discard_purgeable++;
9825 } else {
9826 count_discard_inactive++;
9827 }
9828 discard = discard_all;
9829 } else {
9830 /*
9831 * If the page must be force-added to the wired page list, prevent it from appearing
9832 * in the unwired page list.
9833 */
9834 if (force_to_wired_list) {
9835 if (!preflight) {
9836 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9837 }
9838 } else {
9839 count_inactive++;
9840 }
9841 }
9842 /*
9843 * If the page is NOT being forced into the wired page list, remove it from the
9844 * wired page list here.
9845 */
9846 if (!force_to_wired_list) {
9847 count_wire--;
9848 if (!preflight) {
9849 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9850 }
9851 }
9852 if (discard) {
9853 hibernate_discard_page(m);
9854 }
9855 m = next;
9856 }
9857 /* XXX FBDP TODO: secluded queue */
9858
9859 for (uint32_t i = 0; i <= vm_page_max_speculative_age_q; i++) {
9860 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
9861 while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
9862 assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
9863 "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
9864 m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);
9865
9866 next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
9867 discard = FALSE;
9868 if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
9869 hibernate_consider_discard(m, preflight)) {
9870 if (!preflight) {
9871 hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9872 }
9873 count_discard_speculative++;
9874 discard = discard_all;
9875 } else {
9876 count_speculative++;
9877 }
9878 count_wire--;
9879 if (!preflight) {
9880 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9881 }
9882 if (discard) {
9883 hibernate_discard_page(m);
9884 }
9885 m = next;
9886 }
9887 }
9888
9889 vm_page_queue_iterate(&compressor_object->memq, m, vmp_listq) {
9890 assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
9891
9892 count_compressor++;
9893 count_wire--;
9894 if (!preflight) {
9895 hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
9896 }
9897 }
9898
9899
9900 if (preflight == FALSE && discard_all == TRUE) {
9901 KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START);
9902
9903 HIBLOG("hibernate_teardown started\n");
9904 count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
9905 HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);
9906
9907 pages -= count_discard_vm_struct_pages;
9908 count_wire -= count_discard_vm_struct_pages;
9909
9910 hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;
9911
9912 KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_END);
9913 }
9914
9915 if (!preflight) {
9916 // pull wired from hibernate_bitmap
9917 bitmap = &page_list->bank_bitmap[0];
9918 bitmap_wired = &page_list_wired->bank_bitmap[0];
9919 for (bank = 0; bank < page_list->bank_count; bank++) {
9920 for (uint32_t i = 0; i < bitmap->bitmapwords; i++) {
9921 bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
9922 }
9923 bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords];
9924 bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
9925 }
9926 }
9927
9928 // machine dependent adjustments
9929 hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);
9930
9931 if (!preflight) {
9932 hibernate_stats.cd_count_wire = count_wire;
9933 hibernate_stats.cd_discarded = count_discard_active +
9934 count_discard_inactive + count_discard_purgeable +
9935 count_discard_speculative + count_discard_cleaned +
9936 count_discard_vm_struct_pages;
9937 }
9938
9939 clock_get_uptime(&end);
9940 absolutetime_to_nanoseconds(end - start, &nsec);
9941 HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);
9942
9943 HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, "
9944 "zf %d, throt %d, compr %d, xpmapped %d\n"
9945 " %s discard act %d inact %d purgeable %d "
9946 "spec %d cleaned %d retired %d\n",
9947 pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative,
9948 count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
9949 discard_all ? "did" : "could",
9950 count_discard_active, count_discard_inactive, count_discard_purgeable,
9951 count_discard_speculative, count_discard_cleaned, count_retired);
9952
9953 if (hibernate_stats.cd_skipped_xpmapped) {
9954 HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n",
9955 hibernate_stats.cd_skipped_xpmapped);
9956 }
9957
9958 *pagesOut = pages - count_discard_active - count_discard_inactive -
9959 count_discard_purgeable - count_discard_speculative -
9960 count_discard_cleaned - count_retired;
9961
9962 if (preflight && will_discard) {
9963 *pagesOut -= count_compressor + count_throttled +
9964 count_anonymous + count_inactive + count_cleaned +
9965 count_speculative + count_active;
9966
9967 /*
9968 * We try to keep at most HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image
9969 * even if these are clean and so we need to size the hibernation image accordingly.
9970 *
9971 * NB: We have to assume all HIBERNATE_XPMAPPED_LIMIT pages might show up because 'dirty'
9972 * xpmapped pages aren't distinguishable from other 'dirty' pages in preflight. So we might
9973 * only see part of the xpmapped pages if we look at 'cd_found_xpmapped' which solely tracks
9974 * clean xpmapped pages.
9975 *
9976 * Since these pages are all cleaned by the time we are in the post-preflight phase, we might
9977 * see a much larger number in 'cd_found_xpmapped' now than we did in the preflight phase
9978 */
9979 *pagesOut += HIBERNATE_XPMAPPED_LIMIT;
9980 }
9981
9982 hibernation_vmqueues_inspection = FALSE;
9983
9984 #if MACH_ASSERT || DEBUG
9985 if (!preflight) {
9986 if (vm_page_local_q) {
9987 zpercpu_foreach(lq, vm_page_local_q) {
9988 VPL_UNLOCK(&lq->vpl_lock);
9989 }
9990 }
9991 vm_page_unlock_queues();
9992 }
9993 #endif /* MACH_ASSERT || DEBUG */
9994
9995 if (preflight) {
9996 vm_free_page_unlock();
9997 vm_page_unlock_queues();
9998 vm_object_unlock(compressor_object);
9999 }
10000
10001 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
10002 }
10003
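/*
 * Called after the image pass: walk the pageable queues and free every
 * page whose bit is set in page_list, i.e. the pages we decided could be
 * discarded instead of saved.
 */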
10004 void
10005 hibernate_page_list_discard(hibernate_page_list_t * page_list)
10006 {
10007 uint64_t start, end, nsec;
10008 vm_page_t m;
10009 vm_page_t next;
10010 uint32_t i;
10011 uint32_t count_discard_active = 0;
10012 uint32_t count_discard_inactive = 0;
10013 uint32_t count_discard_purgeable = 0;
10014 uint32_t count_discard_cleaned = 0;
10015 uint32_t count_discard_speculative = 0;
10016
10017
10018 #if MACH_ASSERT || DEBUG
10019 vm_page_lock_queues();
10020 if (vm_page_local_q) {
10021 zpercpu_foreach(lq, vm_page_local_q) {
10022 VPL_LOCK(&lq->vpl_lock);
10023 }
10024 }
10025 #endif /* MACH_ASSERT || DEBUG */
10026
10027 clock_get_uptime(&start);
10028
10029 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10030 while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
10031 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
10032
10033 next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10034 if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10035 if (m->vmp_dirty) {
10036 count_discard_purgeable++;
10037 } else {
10038 count_discard_inactive++;
10039 }
10040 hibernate_discard_page(m);
10041 }
10042 m = next;
10043 }
10044
10045 for (i = 0; i <= vm_page_max_speculative_age_q; i++) {
10046 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
10047 while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
10048 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
10049
10050 next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10051 if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10052 count_discard_speculative++;
10053 hibernate_discard_page(m);
10054 }
10055 m = next;
10056 }
10057 }
10058
10059 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10060 while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
10061 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
10062
10063 next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10064 if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10065 if (m->vmp_dirty) {
10066 count_discard_purgeable++;
10067 } else {
10068 count_discard_inactive++;
10069 }
10070 hibernate_discard_page(m);
10071 }
10072 m = next;
10073 }
10074 /* XXX FBDP TODO: secluded queue */
10075
10076 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10077 while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
10078 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
10079
10080 next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10081 if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10082 if (m->vmp_dirty) {
10083 count_discard_purgeable++;
10084 } else {
10085 count_discard_active++;
10086 }
10087 hibernate_discard_page(m);
10088 }
10089 m = next;
10090 }
10091
10092 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
10093 while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
10094 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
10095
10096 next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
10097 if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
10098 if (m->vmp_dirty) {
10099 count_discard_purgeable++;
10100 } else {
10101 count_discard_cleaned++;
10102 }
10103 hibernate_discard_page(m);
10104 }
10105 m = next;
10106 }
10107
10108 #if MACH_ASSERT || DEBUG
10109 if (vm_page_local_q) {
10110 zpercpu_foreach(lq, vm_page_local_q) {
10111 VPL_UNLOCK(&lq->vpl_lock);
10112 }
10113 }
10114 vm_page_unlock_queues();
10115 #endif /* MACH_ASSERT || DEBUG */
10116
10117 clock_get_uptime(&end);
10118 absolutetime_to_nanoseconds(end - start, &nsec);
10119 HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
10120 nsec / 1000000ULL,
10121 count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
10122 }
10123
10124 boolean_t hibernate_paddr_map_inited = FALSE;
10125 unsigned int hibernate_teardown_last_valid_compact_indx = -1;
10126 vm_page_t hibernate_rebuild_hash_list = NULL;
10127
10128 unsigned int hibernate_teardown_found_tabled_pages = 0;
10129 unsigned int hibernate_teardown_found_created_pages = 0;
10130 unsigned int hibernate_teardown_found_free_pages = 0;
10131 unsigned int hibernate_teardown_vm_page_free_count;
10132
10133
10134 struct ppnum_mapping {
10135 struct ppnum_mapping *ppnm_next;
10136 ppnum_t ppnm_base_paddr;
10137 unsigned int ppnm_sindx;
10138 unsigned int ppnm_eindx;
10139 };
10140
10141 struct ppnum_mapping *ppnm_head;
10142 struct ppnum_mapping *ppnm_last_found = NULL;
10143
10144
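/*
 * Build the ppnum_mapping list describing runs of physically contiguous
 * entries in vm_pages[], so an array index can later be translated back
 * to a physical page number (see hibernate_lookup_paddr()).
 */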
10145 void
10146 hibernate_create_paddr_map(void)
10147 {
10148 unsigned int i;
10149 ppnum_t next_ppnum_in_run = 0;
10150 struct ppnum_mapping *ppnm = NULL;
10151
10152 if (hibernate_paddr_map_inited == FALSE) {
10153 for (i = 0; i < vm_pages_count; i++) {
10154 if (ppnm) {
10155 ppnm->ppnm_eindx = i;
10156 }
10157
10158 if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) != next_ppnum_in_run) {
10159 ppnm = zalloc_permanent_type(struct ppnum_mapping);
10160
10161 ppnm->ppnm_next = ppnm_head;
10162 ppnm_head = ppnm;
10163
10164 ppnm->ppnm_sindx = i;
10165 ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i));
10166 }
10167 next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(vm_page_get(i)) + 1;
10168 }
10169 ppnm->ppnm_eindx = vm_pages_count;
10170
10171 hibernate_paddr_map_inited = TRUE;
10172 }
10173 }
10174
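/*
 * Translate a vm_pages[] index into its physical page number using the
 * run map built by hibernate_create_paddr_map(), caching the last run
 * that matched.
 */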
10175 static ppnum_t
10176 hibernate_lookup_paddr(unsigned int indx)
10177 {
10178 struct ppnum_mapping *ppnm = NULL;
10179
10180 ppnm = ppnm_last_found;
10181
10182 if (ppnm) {
10183 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10184 goto done;
10185 }
10186 }
10187 for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
10188 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
10189 ppnm_last_found = ppnm;
10190 break;
10191 }
10192 }
10193 if (ppnm == NULL) {
10194 panic("hibernate_lookup_paddr of %d failed", indx);
10195 }
10196 done:
10197 return ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx);
10198 }
10199
10200
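/*
 * Mark every whole page contained in the kernel virtual range
 * [saddr, eaddr) as not needing to be saved, in both the pageable and
 * wired page lists.  Returns the number of pages so marked.
 */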
10201 static uint32_t
10202 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
10203 {
10204 addr64_t saddr_aligned;
10205 addr64_t eaddr_aligned;
10206 addr64_t addr;
10207 ppnum_t paddr;
10208 unsigned int mark_as_unneeded_pages = 0;
10209
10210 saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
10211 eaddr_aligned = eaddr & ~PAGE_MASK_64;
10212
10213 for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
10214 paddr = pmap_find_phys(kernel_pmap, addr);
10215
10216 assert(paddr);
10217
10218 hibernate_page_bitset(page_list, TRUE, paddr);
10219 hibernate_page_bitset(page_list_wired, TRUE, paddr);
10220
10221 mark_as_unneeded_pages++;
10222 }
10223 return mark_as_unneeded_pages;
10224 }
10225
10226
10227 static void
10228 hibernate_hash_insert_page(vm_page_t mem)
10229 {
10230 vm_page_bucket_t *bucket;
10231 int hash_id;
10232 vm_object_t m_object;
10233
10234 m_object = VM_PAGE_OBJECT(mem);
10235
10236 assert(mem->vmp_hashed);
10237 assert(m_object);
10238 assert(mem->vmp_offset != (vm_object_offset_t) -1);
10239
10240 /*
10241 * Insert it into the object/offset hash table
10242 */
10243 hash_id = vm_page_hash(m_object, mem->vmp_offset);
10244 bucket = &vm_page_buckets[hash_id];
10245
10246 mem->vmp_next_m = bucket->page_list;
10247 bucket->page_list = VM_PAGE_PACK_PTR(mem);
10248 }
10249
10250
10251 static void
10252 hibernate_free_range_flush(vm_page_list_t *list)
10253 {
10254 vm_page_free_queue_enter_list(*list, VMP_RELEASE_HIBERNATE);
10255 *list = (vm_page_list_t){ };
10256 }
10257
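/*
 * Re-initialize the vm_page_t's in [sindx, eindx) as free pages and push
 * them onto the local batch list, flushing to the free queues whenever
 * the batch reaches its maximum size.
 */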
10258 static void
10259 hibernate_free_range(vm_page_list_t *list, int sindx, int eindx)
10260 {
10261 for (; sindx < eindx; sindx++) {
10262 vm_page_t mem = vm_page_get(sindx);
10263 ppnum_t pnum = hibernate_lookup_paddr(sindx);
10264
10265 vm_page_init(mem, pnum);
10266 #if HAS_MTE
10267 mem->vmp_using_mte = pmap_is_tagged_page(pnum);
10268 #endif /* HAS_MTE */
10269 vm_page_list_push(list, mem);
10270
10271 /* Max batch size of these lists is 255 due to vmp_free_list_result_t */
10272 if (list->vmpl_count >= UINT8_MAX) {
10273 hibernate_free_range_flush(list);
10274 }
10275 }
10276 }
10277
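/*
 * Undo hibernate_teardown_vm_structs() after resume: move compacted
 * vm_page_t's back to their original slots, re-enter hashed pages into
 * the vm_page hash, and rebuild the free queues from the holes.
 */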
10278 void
10279 hibernate_rebuild_vm_structs(void)
10280 {
10281 int cindx, sindx, eindx;
10282 vm_page_list_t list = { };
10283 vm_page_t mem, tmem, mem_next;
10284 AbsoluteTime startTime, endTime;
10285 uint64_t nsec;
10286
10287 if (!hibernate_rebuild_needed) {
10288 return;
10289 }
10290
10291 KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START);
10292 HIBLOG("hibernate_rebuild started\n");
10293
10294 clock_get_uptime(&startTime);
10295
10296 pal_hib_rebuild_pmap_structs();
10297
10298 bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
10299 eindx = vm_pages_count;
10300
10301 /*
10302 * Mark all the vm_pages[] that have not been initialized yet as being
10303 * transient. This is needed to ensure that the buddy page search is correct.
10304 * Without this, random data in these vm_pages[] can trip up the buddy search
10305 */
10306 for (int i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) {
10307 vm_page_get(i)->vmp_q_state = VM_PAGE_NOT_ON_Q;
10308 }
10309
10310 for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
10311 mem = vm_page_get(cindx);
10312 assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
10313 /*
10314 * hibernate_teardown_vm_structs leaves the location where
10315 * this vm_page_t must be located in "next".
10316 */
10317 tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10318 mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
10319 assert(tmem >= mem);
10320
10321 sindx = (int)(tmem - vm_page_get(0));
10322
10323 if (mem != tmem) {
10324 /*
10325 * this vm_page_t was moved by hibernate_teardown_vm_structs,
10326 * so move it back to its real location
10327 */
10328 *tmem = *mem;
10329 mem = tmem;
10330 }
10331 if (mem->vmp_hashed) {
10332 hibernate_hash_insert_page(mem);
10333 }
10334 /*
10335 * the 'hole' between this vm_page_t and the previous
10336 * vm_page_t we moved needs to be initialized as
10337 * a range of free vm_page_t's
10338 */
10339 hibernate_free_range(&list, sindx + 1, eindx);
10340
10341 eindx = sindx;
10342 }
10343 hibernate_free_range(&list, 0, sindx);
10344 hibernate_free_range_flush(&list);
10345
10346 VM_CHECK_MEMORYSTATUS;
10347
10348 assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);
10349
10350 /*
10351 * process the list of vm_page_t's that were entered in the hash,
10352 * but were not located in the vm_pages array... these are
10353 * vm_page_t's that were created on the fly (i.e. fictitious)
10354 */
10355 for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
10356 mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10357
10358 mem->vmp_next_m = 0;
10359 hibernate_hash_insert_page(mem);
10360 }
10361 hibernate_rebuild_hash_list = NULL;
10362
10363 clock_get_uptime(&endTime);
10364 SUB_ABSOLUTETIME(&endTime, &startTime);
10365 absolutetime_to_nanoseconds(endTime, &nsec);
10366
10367 HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);
10368
10369 hibernate_rebuild_needed = false;
10370
10371 KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END);
10372 }
10373
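/*
 * Compact vm_pages[] by copying in-use entries down into slots occupied
 * by free pages, so that the tail of the array, the vm_page hash buckets
 * and (optionally) some pmap structures do not have to be written to the
 * hibernation image.  Returns the number of pages marked as unneeded.
 */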
10374 static uint32_t
10375 hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
10376 {
10377 unsigned int compact_target_indx;
10378 unsigned int mark_as_unneeded_pages = 0;
10379 unsigned int unneeded_vm_page_bucket_pages = 0;
10380 unsigned int unneeded_vm_pages_pages = 0;
10381 unsigned int unneeded_pmap_pages = 0;
10382 addr64_t start_of_unneeded = 0;
10383 addr64_t end_of_unneeded = 0;
10384
10385
10386 if (hibernate_should_abort()) {
10387 return 0;
10388 }
10389
10390 hibernate_rebuild_needed = true;
10391
10392 HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, "
10393 "active_pages %d, inactive_pages %d, speculative_pages %d, "
10394 "cleaned_pages %d, compressor_pages %d\n",
10395 vm_page_wire_count, vm_page_free_count,
10396 vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
10397 vm_page_cleaned_count, compressor_object->resident_page_count);
10398
10399 for (uint32_t i = 0; i < vm_page_bucket_count; i++) {
10400 vm_page_bucket_t *bucket = &vm_page_buckets[i];
10401 vm_page_t mem, mem_next;
10402
10403 for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
10404 assert(mem->vmp_hashed);
10405
10406 mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
10407
10408 if (!vm_page_in_array(mem)) {
10409 mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
10410 hibernate_rebuild_hash_list = mem;
10411 }
10412 }
10413 }
10414 unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0],
10415 (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
10416 mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;
10417
10418 hibernate_teardown_vm_page_free_count = vm_page_free_count;
10419
10420 compact_target_indx = 0;
10421
10422 vm_free_page_lock();
10423
10424 for (uint32_t i = 0; i < vm_pages_count; i++) {
10425 vm_page_t mem = vm_page_get(i);
10426 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem);
10427 vm_memory_class_t class = vm_page_get_memory_class(mem, pnum);
10428
10429 if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
10430 vm_page_free_queue_remove(class, mem, pnum,
10431 VM_PAGE_ON_FREE_Q);
10432 hibernate_teardown_found_free_pages++;
10433
10434 if (vm_page_get(compact_target_indx)->vmp_q_state != VM_PAGE_ON_FREE_Q) {
10435 compact_target_indx = i;
10436 }
10437 } else {
10438 /*
10439 * record this vm_page_t's original location
10440 * we need this even if it doesn't get moved
10441 * as an indicator to the rebuild function that
10442 * we don't have to move it
10443 */
10444 mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);
10445
10446 if (vm_page_get(compact_target_indx)->vmp_q_state == VM_PAGE_ON_FREE_Q) {
10447 /*
10448 * we've got a hole to fill, so
10449 * move this vm_page_t to its new home
10450 */
10451 *vm_page_get(compact_target_indx) = *mem;
10452 mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
10453
10454 hibernate_teardown_last_valid_compact_indx = compact_target_indx;
10455 compact_target_indx++;
10456 } else {
10457 hibernate_teardown_last_valid_compact_indx = i;
10458 }
10459 }
10460 }
10461
10462 vm_free_page_unlock();
10463
10464 unneeded_vm_pages_pages = hibernate_mark_as_unneeded(
10465 (addr64_t)vm_page_get(hibernate_teardown_last_valid_compact_indx + 1),
10466 (addr64_t)vm_page_get(vm_pages_count - 1),
10467 page_list, page_list_wired);
10468 mark_as_unneeded_pages += unneeded_vm_pages_pages;
10469
10470 pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);
10471
10472 if (start_of_unneeded) {
10473 unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded,
10474 end_of_unneeded, page_list, page_list_wired);
10475 mark_as_unneeded_pages += unneeded_pmap_pages;
10476 }
10477 HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n",
10478 unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);
10479
10480 return mark_as_unneeded_pages;
10481 }
10482
10483 #endif /* HIBERNATION */
10484
10485 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10486
10487 #include <mach_vm_debug.h>
10488 #if MACH_VM_DEBUG
10489
10490 #include <mach_debug/hash_info.h>
10491 #include <vm/vm_debug_internal.h>
10492
10493 /*
10494 * Routine: vm_page_info
10495 * Purpose:
10496 * Return information about the global VP table.
10497 * Fills the buffer with as much information as possible
10498 * and returns the desired size of the buffer.
10499 * Conditions:
10500 * Nothing locked. The caller should provide
10501 * possibly-pageable memory.
10502 */
10503
10504 unsigned int
10505 vm_page_info(
10506 hash_info_bucket_t *info,
10507 unsigned int count)
10508 {
10509 unsigned int i;
10510 lck_ticket_t *bucket_lock;
10511
10512 if (vm_page_bucket_count < count) {
10513 count = vm_page_bucket_count;
10514 }
10515
10516 for (i = 0; i < count; i++) {
10517 vm_page_bucket_t *bucket = &vm_page_buckets[i];
10518 unsigned int bucket_count = 0;
10519 vm_page_t m;
10520
10521 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10522 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10523
10524 for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10525 m != VM_PAGE_NULL;
10526 m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) {
10527 bucket_count++;
10528 }
10529
10530 lck_ticket_unlock(bucket_lock);
10531
10532 /* don't touch pageable memory while holding locks */
10533 info[i].hib_count = bucket_count;
10534 }
10535
10536 return vm_page_bucket_count;
10537 }
10538 #endif /* MACH_VM_DEBUG */
10539
10540 #if VM_PAGE_BUCKETS_CHECK
10541 void
10542 vm_page_buckets_check(void)
10543 {
10544 unsigned int i;
10545 vm_page_t p;
10546 unsigned int p_hash;
10547 vm_page_bucket_t *bucket;
10548 lck_ticket_t *bucket_lock;
10549
10550 if (!vm_page_buckets_check_ready) {
10551 return;
10552 }
10553
10554 #if HIBERNATION
10555 if (hibernate_rebuild_needed ||
10556 hibernate_rebuild_hash_list) {
10557 panic("BUCKET_CHECK: hibernation in progress: "
10558 "rebuild_needed=%d rebuild_hash_list=%p\n",
10559 hibernate_rebuild_needed,
10560 hibernate_rebuild_hash_list);
10561 }
10562 #endif /* HIBERNATION */
10563
10564 #if VM_PAGE_FAKE_BUCKETS
10565 char *cp;
10566 for (cp = (char *) vm_page_fake_buckets_start;
10567 cp < (char *) vm_page_fake_buckets_end;
10568 cp++) {
10569 if (*cp != 0x5a) {
10570 panic("BUCKET_CHECK: corruption at %p in fake buckets "
10571 "[0x%llx:0x%llx]\n",
10572 cp,
10573 (uint64_t) vm_page_fake_buckets_start,
10574 (uint64_t) vm_page_fake_buckets_end);
10575 }
10576 }
10577 #endif /* VM_PAGE_FAKE_BUCKETS */
10578
10579 for (i = 0; i < vm_page_bucket_count; i++) {
10580 vm_object_t p_object;
10581
10582 bucket = &vm_page_buckets[i];
10583 if (!bucket->page_list) {
10584 continue;
10585 }
10586
10587 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
10588 lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket);
10589 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
10590
10591 while (p != VM_PAGE_NULL) {
10592 p_object = VM_PAGE_OBJECT(p);
10593
10594 if (!p->vmp_hashed) {
10595 panic("BUCKET_CHECK: page %p (%p,0x%llx) "
10596 "hash %d in bucket %d at %p "
10597 "is not hashed\n",
10598 p, p_object, p->vmp_offset,
10599 p_hash, i, bucket);
10600 }
10601 p_hash = vm_page_hash(p_object, p->vmp_offset);
10602 if (p_hash != i) {
10603 panic("BUCKET_CHECK: corruption in bucket %d "
10604 "at %p: page %p object %p offset 0x%llx "
10605 "hash %d\n",
10606 i, bucket, p, p_object, p->vmp_offset,
10607 p_hash);
10608 }
10609 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
10610 }
10611 lck_ticket_unlock(bucket_lock);
10612 }
10613
10614 // printf("BUCKET_CHECK: checked buckets\n");
10615 }
10616 #endif /* VM_PAGE_BUCKETS_CHECK */
10617
10618 /*
10619 * 'vm_fault_enter' will place newly created pages (zero-fill and COW) onto the
10620 * local queues if they exist... it's the only spot in the system where we add pages
10621 * to those queues... once on those queues, those pages can only move to one of the
10622 * global page queues or the free queues... they NEVER move from local q to local q.
10623 * the 'local' state is stable when vm_page_queues_remove is called since we're behind
10624 * the global vm_page_queue_lock at this point... we still need to take the local lock
10625 * in case this operation is being run on a different CPU than the one that owns the local queue,
10626 * but we don't have to worry about the page moving to a global queue or becoming wired
10627 * while we're grabbing the local lock since those operations would require the global
10628 * vm_page_queue_lock to be held, and we already own it.
10629 *
10630 * this is why it's safe to utilize the wire_count field in the vm_page_t as the local_id...
10631 * 'wired' and local are ALWAYS mutually exclusive conditions.
10632 */
10633
10634 void
10635 vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_specialq)
10636 {
10637 boolean_t was_pageable = TRUE;
10638 vm_object_t m_object;
10639
10640 m_object = VM_PAGE_OBJECT(mem);
10641
10642 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10643
10644 if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) {
10645 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10646 if (remove_from_specialq == TRUE) {
10647 vm_page_remove_from_specialq(mem);
10648 }
10649 /*if (mem->vmp_on_specialq != VM_PAGE_SPECIAL_Q_EMPTY) {
10650 * assert(mem->vmp_specialq.next != 0);
10651 * assert(mem->vmp_specialq.prev != 0);
10652 * } else {*/
10653 if (mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
10654 assert(mem->vmp_specialq.next == 0);
10655 assert(mem->vmp_specialq.prev == 0);
10656 }
10657 return;
10658 }
10659
10660 if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
10661 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10662 assert(mem->vmp_specialq.next == 0 &&
10663 mem->vmp_specialq.prev == 0 &&
10664 mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
10665 return;
10666 }
10667 if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
10668 /*
10669 * might put these guys on a list for debugging purposes
10670 * if we do, we'll need to remove this assert
10671 */
10672 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
10673 assert(mem->vmp_specialq.next == 0 &&
10674 mem->vmp_specialq.prev == 0);
10675 /*
10676 * Recall that vmp_on_specialq also means a request to put
10677 * it on the special Q. So we don't want to reset that bit
10678 * just because a wiring request came in. We might want to
10679 * put it on the special queue post-unwiring.
10680 *
10681 * &&
10682 * mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
10683 */
10684 return;
10685 }
10686
10687 assert(m_object != compressor_object);
10688 assert(!is_kernel_object(m_object));
10689 assert(!vm_page_is_fictitious(mem));
10690
10691 switch (mem->vmp_q_state) {
10692 case VM_PAGE_ON_ACTIVE_LOCAL_Q:
10693 {
10694 struct vpl *lq;
10695
10696 lq = zpercpu_get_cpu(vm_page_local_q, mem->vmp_local_id);
10697 VPL_LOCK(&lq->vpl_lock);
10698 vm_page_queue_remove(&lq->vpl_queue, mem, vmp_pageq);
10699 mem->vmp_local_id = 0;
10700 lq->vpl_count--;
10701 if (m_object->internal) {
10702 lq->vpl_internal_count--;
10703 } else {
10704 lq->vpl_external_count--;
10705 }
10706 VPL_UNLOCK(&lq->vpl_lock);
10707 was_pageable = FALSE;
10708 break;
10709 }
10710 case VM_PAGE_ON_ACTIVE_Q:
10711 {
10712 vm_page_queue_remove(&vm_page_queue_active, mem, vmp_pageq);
10713 vm_page_active_count--;
10714 break;
10715 }
10716
10717 case VM_PAGE_ON_INACTIVE_INTERNAL_Q:
10718 {
10719 assert(m_object->internal == TRUE);
10720
10721 vm_page_inactive_count--;
10722 vm_page_queue_remove(&vm_page_queue_anonymous, mem, vmp_pageq);
10723 vm_page_anonymous_count--;
10724
10725 vm_purgeable_q_advance_all();
10726 vm_page_balance_inactive(3);
10727 break;
10728 }
10729
10730 case VM_PAGE_ON_INACTIVE_EXTERNAL_Q:
10731 {
10732 assert(m_object->internal == FALSE);
10733
10734 vm_page_inactive_count--;
10735 vm_page_queue_remove(&vm_page_queue_inactive, mem, vmp_pageq);
10736 vm_purgeable_q_advance_all();
10737 vm_page_balance_inactive(3);
10738 break;
10739 }
10740
10741 case VM_PAGE_ON_INACTIVE_CLEANED_Q:
10742 {
10743 assert(m_object->internal == FALSE);
10744
10745 vm_page_inactive_count--;
10746 vm_page_queue_remove(&vm_page_queue_cleaned, mem, vmp_pageq);
10747 vm_page_cleaned_count--;
10748 vm_page_balance_inactive(3);
10749 break;
10750 }
10751
10752 case VM_PAGE_ON_THROTTLED_Q:
10753 {
10754 assert(m_object->internal == TRUE);
10755
10756 vm_page_queue_remove(&vm_page_queue_throttled, mem, vmp_pageq);
10757 vm_page_throttled_count--;
10758 was_pageable = FALSE;
10759 break;
10760 }
10761
10762 case VM_PAGE_ON_SPECULATIVE_Q:
10763 {
10764 assert(m_object->internal == FALSE);
10765
10766 vm_page_remque(&mem->vmp_pageq);
10767 vm_page_speculative_count--;
10768 vm_page_balance_inactive(3);
10769 break;
10770 }
10771
10772 #if CONFIG_SECLUDED_MEMORY
10773 case VM_PAGE_ON_SECLUDED_Q:
10774 {
10775 vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
10776 vm_page_secluded_count--;
10777 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
10778 if (m_object == VM_OBJECT_NULL) {
10779 vm_page_secluded_count_free--;
10780 was_pageable = FALSE;
10781 } else {
10782 assert(!m_object->internal);
10783 vm_page_secluded_count_inuse--;
10784 was_pageable = FALSE;
10785 // was_pageable = TRUE;
10786 }
10787 break;
10788 }
10789 #endif /* CONFIG_SECLUDED_MEMORY */
10790
10791 default:
10792 {
10793 /*
10794 * if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
10795 * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
10796 * the caller is responsible for determining if the page is on that queue, and if so, must
10797 * either first remove it (it needs both the page queues lock and the object lock to do
10798 * this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove
10799 *
10800 * we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
10801 * or any of the undefined states
10802 */
10803 panic("vm_page_queues_remove - bad page q_state (%p, %d)", mem, mem->vmp_q_state);
10804 break;
10805 }
10806 }
10807 VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
10808 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
10809
10810 if (remove_from_specialq == TRUE) {
10811 vm_page_remove_from_specialq(mem);
10812 }
10813 if (was_pageable) {
10814 if (m_object->internal) {
10815 vm_page_pageable_internal_count--;
10816 } else {
10817 vm_page_pageable_external_count--;
10818 }
10819 }
10820 }
10821
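/*
 * Remove a page from its object's memq, first moving the object's
 * memq_hint off of this page if it currently points at it.
 */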
10822 void
10823 vm_page_remove_internal(vm_page_t page)
10824 {
10825 vm_object_t __object = VM_PAGE_OBJECT(page);
10826 if (page == __object->memq_hint) {
10827 vm_page_t __new_hint;
10828 vm_page_queue_entry_t __qe;
10829 __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
10830 if (vm_page_queue_end(&__object->memq, __qe)) {
10831 __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
10832 if (vm_page_queue_end(&__object->memq, __qe)) {
10833 __qe = NULL;
10834 }
10835 }
10836 __new_hint = (vm_page_t)((uintptr_t) __qe);
10837 __object->memq_hint = __new_hint;
10838 }
10839 vm_page_queue_remove(&__object->memq, page, vmp_listq);
10840 #if CONFIG_SECLUDED_MEMORY
10841 if (__object->eligible_for_secluded) {
10842 vm_page_secluded.eligible_for_secluded--;
10843 }
10844 #endif /* CONFIG_SECLUDED_MEMORY */
10845 #if HAS_MTE
10846 assert_mte_vmo_matches_vmp(__object, page);
10847 #endif /* HAS_MTE */
10848 }
10849
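/*
 * Place a page that is currently on no queue onto the appropriate
 * inactive queue (anonymous for internal objects, inactive for external
 * ones), at the head or tail depending on "first".
 */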
10850 void
10851 vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
10852 {
10853 vm_object_t m_object;
10854
10855 m_object = VM_PAGE_OBJECT(mem);
10856
10857 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10858 assert(!vm_page_is_fictitious(mem));
10859 assert(!mem->vmp_laundry);
10860 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10861 vm_page_check_pageable_safe(mem);
10862
10863 if (m_object->internal) {
10864 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
10865
10866 if (first == TRUE) {
10867 vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vmp_pageq);
10868 } else {
10869 vm_page_queue_enter(&vm_page_queue_anonymous, mem, vmp_pageq);
10870 }
10871
10872 vm_page_anonymous_count++;
10873 vm_page_pageable_internal_count++;
10874 } else {
10875 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
10876
10877 if (first == TRUE) {
10878 vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vmp_pageq);
10879 } else {
10880 vm_page_queue_enter(&vm_page_queue_inactive, mem, vmp_pageq);
10881 }
10882
10883 vm_page_pageable_external_count++;
10884 }
10885 vm_page_inactive_count++;
10886 token_new_pagecount++;
10887
10888 vm_page_add_to_specialq(mem, FALSE);
10889 }
10890
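/*
 * Place a page that is currently on no queue onto the active queue, at
 * the head or tail depending on "first", and rebalance the inactive
 * queue target.
 */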
10891 void
10892 vm_page_enqueue_active(vm_page_t mem, boolean_t first)
10893 {
10894 vm_object_t m_object;
10895
10896 m_object = VM_PAGE_OBJECT(mem);
10897
10898 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
10899 assert(!vm_page_is_fictitious(mem));
10900 assert(!mem->vmp_laundry);
10901 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
10902 vm_page_check_pageable_safe(mem);
10903
10904 mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
10905 if (first == TRUE) {
10906 vm_page_queue_enter_first(&vm_page_queue_active, mem, vmp_pageq);
10907 } else {
10908 vm_page_queue_enter(&vm_page_queue_active, mem, vmp_pageq);
10909 }
10910 vm_page_active_count++;
10911
10912 if (m_object->internal) {
10913 vm_page_pageable_internal_count++;
10914 } else {
10915 vm_page_pageable_external_count++;
10916 }
10917
10918 vm_page_add_to_specialq(mem, FALSE);
10919 vm_page_balance_inactive(3);
10920 }
10921
10922 /*
10923 * Pages from special kernel objects shouldn't
10924 * be placed on pageable queues.
10925 */
10926 void
10927 vm_page_check_pageable_safe(vm_page_t page)
10928 {
10929 vm_object_t page_object;
10930
10931 page_object = VM_PAGE_OBJECT(page);
10932
10933 if (is_kernel_object(page_object)) {
10934 panic("vm_page_check_pageable_safe: trying to add page"
10935 " from a kernel object to pageable queue");
10936 }
10937
10938 if (page_object == compressor_object) {
10939 panic("vm_page_check_pageable_safe: trying to add page"
10940 " from compressor object (%p) to pageable queue", compressor_object);
10941 }
10942 }
10943
10944 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
10945 * wired page diagnose
10946 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
10947
10948 #include <libkern/OSKextLibPrivate.h>
10949
10950 #define KA_SIZE(namelen, subtotalscount) \
10951 (sizeof(struct vm_allocation_site) + (namelen) + 1 + ((subtotalscount) * sizeof(struct vm_allocation_total)))
10952
10953 #define KA_NAME(alloc) \
10954 ((char *)(&(alloc)->subtotals[(alloc->subtotalscount)]))
10955
10956 #define KA_NAME_LEN(alloc) \
10957 (VM_TAG_NAME_LEN_MAX & (alloc->flags >> VM_TAG_NAME_LEN_SHIFT))
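/*
 * Layout assumed by the macros above: a vm_allocation_site is followed in
 * memory by "subtotalscount" vm_allocation_total entries and then by the
 * NUL-terminated site name, whose length is stored in the upper bits of
 * "flags" (see VM_TAG_NAME_LEN_SHIFT).  KA_SIZE() computes the size of that
 * whole blob, KA_NAME() points at the name, and KA_NAME_LEN() recovers its
 * length.
 */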
10958
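/*
 * vm_tag_bt() infers an allocation tag from the caller's backtrace: it walks
 * the current thread's kernel stack by following frame pointers (bounds
 * checked against the thread's kernel stack and stripped of PAC signatures
 * where applicable) and returns the tag registered for the first return
 * address that falls outside the core kernel text (including built-in kmod
 * text), i.e. in a kext.  If the thread has an explicit allocation_name set,
 * that tag wins instead.
 */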
10959 vm_tag_t
10960 vm_tag_bt(void)
10961 {
10962 uintptr_t* frameptr;
10963 uintptr_t* frameptr_next;
10964 uintptr_t retaddr;
10965 uintptr_t kstackb, kstackt;
10966 const vm_allocation_site_t * site;
10967 thread_t cthread;
10968 kern_allocation_name_t name;
10969
10970 cthread = current_thread();
10971 if (__improbable(cthread == NULL)) {
10972 return VM_KERN_MEMORY_OSFMK;
10973 }
10974
10975 if ((name = thread_get_kernel_state(cthread)->allocation_name)) {
10976 if (!name->tag) {
10977 vm_tag_alloc(name);
10978 }
10979 return name->tag;
10980 }
10981
10982 kstackb = cthread->kernel_stack;
10983 kstackt = kstackb + kernel_stack_size;
10984
10985 /* Load stack frame pointer (EBP on x86) into frameptr */
10986 frameptr = __builtin_frame_address(0);
10987 site = NULL;
10988 while (frameptr != NULL) {
10989 /* Verify thread stack bounds */
10990 if (((uintptr_t)(frameptr + 2) > kstackt) || ((uintptr_t)frameptr < kstackb)) {
10991 break;
10992 }
10993
10994 /* Next frame pointer is pointed to by the previous one */
10995 frameptr_next = (uintptr_t*) *frameptr;
10996 #if defined(HAS_APPLE_PAC)
10997 frameptr_next = ptrauth_strip(frameptr_next, ptrauth_key_frame_pointer);
10998 #endif
10999
11000 /* Pull return address from one spot above the frame pointer */
11001 retaddr = *(frameptr + 1);
11002
11003 #if defined(HAS_APPLE_PAC)
11004 retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
11005 #endif
11006
11007 if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
11008 || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
11009 site = OSKextGetAllocationSiteForCaller(retaddr);
11010 break;
11011 }
11012 frameptr = frameptr_next;
11013 }
11014
11015 if (site) {
11016 return site->tag;
11017 }
11018
11019 #if MACH_ASSERT
11020 /*
11021 * Kernel tests appear here as unrecognized call sites and would get
11022 * no memory tag. Give them a default tag to prevent panics later.
11023 */
11024 if (thread_get_test_option(test_option_vm_prevent_wire_tag_panic)) {
11025 return VM_KERN_MEMORY_OSFMK;
11026 }
11027 #endif
11028
11029 return VM_KERN_MEMORY_NONE;
11030 }
11031
11032 static uint64_t free_tag_bits[VM_MAX_TAG_VALUE / 64];
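/*
 * free_tag_bits is a bitmap of dynamically assignable tags: tag "t" is free
 * when bit (63 - (t & 63)) of word (t >> 6) is set.  vm_tag_alloc_locked()
 * claims the lowest-numbered free tag in a word via __builtin_clzll(), and
 * vm_tag_free_locked() puts a tag back by setting its bit again.
 */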
11033
11034 void
11035 vm_tag_alloc_locked(vm_allocation_site_t * site, vm_allocation_site_t ** releasesiteP)
11036 {
11037 vm_tag_t tag;
11038 uint64_t avail;
11039 uint32_t idx;
11040 vm_allocation_site_t * prev;
11041
11042 if (site->tag) {
11043 return;
11044 }
11045
11046 idx = 0;
11047 while (TRUE) {
11048 avail = free_tag_bits[idx];
11049 if (avail) {
11050 tag = (vm_tag_t)__builtin_clzll(avail);
11051 avail &= ~(1ULL << (63 - tag));
11052 free_tag_bits[idx] = avail;
11053 tag += (idx << 6);
11054 break;
11055 }
11056 idx++;
11057 if (idx >= ARRAY_COUNT(free_tag_bits)) {
11058 for (idx = 0; idx < ARRAY_COUNT(vm_allocation_sites); idx++) {
11059 prev = vm_allocation_sites[idx];
11060 if (!prev) {
11061 continue;
11062 }
11063 if (!KA_NAME_LEN(prev)) {
11064 continue;
11065 }
11066 if (!prev->tag) {
11067 continue;
11068 }
11069 if (prev->total) {
11070 continue;
11071 }
11072 if (1 != prev->refcount) {
11073 continue;
11074 }
11075
11076 assert(idx == prev->tag);
11077 tag = (vm_tag_t)idx;
11078 prev->tag = VM_KERN_MEMORY_NONE;
11079 *releasesiteP = prev;
11080 break;
11081 }
11082 if (idx >= ARRAY_COUNT(vm_allocation_sites)) {
11083 tag = VM_KERN_MEMORY_ANY;
11084 }
11085 break;
11086 }
11087 }
11088 site->tag = tag;
11089
11090 OSAddAtomic16(1, &site->refcount);
11091
11092 if (VM_KERN_MEMORY_ANY != tag) {
11093 vm_allocation_sites[tag] = site;
11094 }
11095
11096 if (tag > vm_allocation_tag_highest) {
11097 vm_allocation_tag_highest = tag;
11098 }
11099 }
11100
11101 static void
11102 vm_tag_free_locked(vm_tag_t tag)
11103 {
11104 uint64_t avail;
11105 uint32_t idx;
11106 uint64_t bit;
11107
11108 if (VM_KERN_MEMORY_ANY == tag) {
11109 return;
11110 }
11111
11112 idx = (tag >> 6);
11113 avail = free_tag_bits[idx];
11114 tag &= 63;
11115 bit = (1ULL << (63 - tag));
11116 assert(!(avail & bit));
11117 free_tag_bits[idx] = (avail | bit);
11118 }
11119
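/*
 * vm_tag_init() marks every dynamically assignable tag as free: the range
 * from VM_KERN_MEMORY_FIRST_DYNAMIC up to (but not including)
 * VM_KERN_MEMORY_ANY, and everything above VM_KERN_MEMORY_ANY up to
 * VM_MAX_TAG_VALUE.  VM_KERN_MEMORY_ANY itself is reserved as the overflow
 * tag handed out when the bitmap is exhausted.
 */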
11120 static void
11121 vm_tag_init(void)
11122 {
11123 vm_tag_t tag;
11124 for (tag = VM_KERN_MEMORY_FIRST_DYNAMIC; tag < VM_KERN_MEMORY_ANY; tag++) {
11125 vm_tag_free_locked(tag);
11126 }
11127
11128 for (tag = VM_KERN_MEMORY_ANY + 1; tag < VM_MAX_TAG_VALUE; tag++) {
11129 vm_tag_free_locked(tag);
11130 }
11131 }
11132
11133 vm_tag_t
11134 vm_tag_alloc(vm_allocation_site_t * site)
11135 {
11136 vm_allocation_site_t * releasesite;
11137
11138 if (!site->tag) {
11139 releasesite = NULL;
11140 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11141 vm_tag_alloc_locked(site, &releasesite);
11142 lck_ticket_unlock(&vm_allocation_sites_lock);
11143 if (releasesite) {
11144 kern_allocation_name_release(releasesite);
11145 }
11146 }
11147
11148 return site->tag;
11149 }
11150
11151 #ifndef ARRAY_SIZE
11152 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
11153 #endif /* ARRAY_SIZE */
11154 #define VM_KERN_MEMORY_ELEM(name) [VM_KERN_MEMORY_##name] = "VM_KERN_MEMORY_" #name
11155 const char *vm_kern_memory_names[] = {
11156 VM_KERN_MEMORY_ELEM(NONE),
11157 VM_KERN_MEMORY_ELEM(OSFMK),
11158 VM_KERN_MEMORY_ELEM(BSD),
11159 VM_KERN_MEMORY_ELEM(IOKIT),
11160 VM_KERN_MEMORY_ELEM(LIBKERN),
11161 VM_KERN_MEMORY_ELEM(OSKEXT),
11162 VM_KERN_MEMORY_ELEM(KEXT),
11163 VM_KERN_MEMORY_ELEM(IPC),
11164 VM_KERN_MEMORY_ELEM(STACK),
11165 VM_KERN_MEMORY_ELEM(CPU),
11166 VM_KERN_MEMORY_ELEM(PMAP),
11167 VM_KERN_MEMORY_ELEM(PTE),
11168 VM_KERN_MEMORY_ELEM(ZONE),
11169 VM_KERN_MEMORY_ELEM(KALLOC),
11170 VM_KERN_MEMORY_ELEM(COMPRESSOR),
11171 VM_KERN_MEMORY_ELEM(COMPRESSED_DATA),
11172 VM_KERN_MEMORY_ELEM(PHANTOM_CACHE),
11173 VM_KERN_MEMORY_ELEM(WAITQ),
11174 VM_KERN_MEMORY_ELEM(DIAG),
11175 VM_KERN_MEMORY_ELEM(LOG),
11176 VM_KERN_MEMORY_ELEM(FILE),
11177 VM_KERN_MEMORY_ELEM(MBUF),
11178 VM_KERN_MEMORY_ELEM(UBC),
11179 VM_KERN_MEMORY_ELEM(SECURITY),
11180 VM_KERN_MEMORY_ELEM(MLOCK),
11181 VM_KERN_MEMORY_ELEM(REASON),
11182 VM_KERN_MEMORY_ELEM(SKYWALK),
11183 VM_KERN_MEMORY_ELEM(LTABLE),
11184 VM_KERN_MEMORY_ELEM(HV),
11185 VM_KERN_MEMORY_ELEM(KALLOC_DATA),
11186 VM_KERN_MEMORY_ELEM(RETIRED),
11187 VM_KERN_MEMORY_ELEM(KALLOC_TYPE),
11188 VM_KERN_MEMORY_ELEM(TRIAGE),
11189 VM_KERN_MEMORY_ELEM(RECOUNT),
11190 VM_KERN_MEMORY_ELEM(MTAG),
11191 VM_KERN_MEMORY_ELEM(EXCLAVES),
11192 VM_KERN_MEMORY_ELEM(EXCLAVES_SHARED),
11193 VM_KERN_MEMORY_ELEM(KALLOC_SHARED),
11194 VM_KERN_MEMORY_ELEM(CPUTRACE),
11195 };
11196
11197 _Static_assert(ARRAY_SIZE(vm_kern_memory_names) == VM_KERN_MEMORY_FIRST_DYNAMIC,
11198 "vm_kern_memory_names must map all counter tags");
11199
11200 #define VM_KERN_COUNT_ELEM(name) [VM_KERN_COUNT_##name] = "VM_KERN_COUNT_" #name
11201 const char *vm_kern_count_names[] = {
11202 VM_KERN_COUNT_ELEM(MANAGED),
11203 VM_KERN_COUNT_ELEM(RESERVED),
11204 VM_KERN_COUNT_ELEM(WIRED),
11205 VM_KERN_COUNT_ELEM(WIRED_MANAGED),
11206 VM_KERN_COUNT_ELEM(STOLEN),
11207 VM_KERN_COUNT_ELEM(LOPAGE),
11208 VM_KERN_COUNT_ELEM(MAP_KERNEL),
11209 VM_KERN_COUNT_ELEM(MAP_ZONE),
11210 VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE),
11211 VM_KERN_COUNT_ELEM(WIRED_BOOT),
11212 VM_KERN_COUNT_ELEM(BOOT_STOLEN),
11213 VM_KERN_COUNT_ELEM(WIRED_STATIC_KERNELCACHE),
11214 VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE_DATA),
11215 VM_KERN_COUNT_ELEM(MAP_KERNEL_DATA),
11216 VM_KERN_COUNT_ELEM(EXCLAVES_CARVEOUT),
11217 };
11218
11219 #if VM_BTLOG_TAGS
11220 #define VM_KERN_MEMORY_STR_MAX_LEN (32)
11221 TUNABLE_STR(vmtaglog, VM_KERN_MEMORY_STR_MAX_LEN, "vmtaglog", "");
11222 #define VM_TAG_BTLOG_SIZE (16u << 10)
11223
11224 btlog_t vmtaglog_btlog;
11225 vm_tag_t vmtaglog_tag;
11226
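/*
 * The "vmtaglog" boot-arg selects a single tag whose wired allocations get
 * backtrace-logged into vmtaglog_btlog.  It accepts one of the
 * VM_KERN_MEMORY_* names matched by vm_tag_str_to_idx() below, or the
 * strings "dynamic" / "any" (e.g. vmtaglog=VM_KERN_MEMORY_IOKIT).
 * vm_tag_log() records a backtrace keyed by VM object when the tag's total
 * grows, and erases the record once the object has no wired pages left.
 */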
11227 static void
11228 vm_tag_log(vm_object_t object, int64_t delta, void *fp)
11229 {
11230 if (is_kernel_object(object)) {
11231 /* kernel object backtraces are tracked in vm entries */
11232 return;
11233 }
11234 if (delta > 0) {
11235 btref_t ref = btref_get(fp, BTREF_GET_NOWAIT);
11236 btlog_record(vmtaglog_btlog, object, 0, ref);
11237 } else if (object->wired_page_count == 0) {
11238 btlog_erase(vmtaglog_btlog, object);
11239 }
11240 }
11241
11242 _Static_assert(ARRAY_SIZE(vm_kern_count_names) == VM_KERN_COUNTER_COUNT,
11243 "vm_kern_count_names must map all counter tags");
11244
11245 static vm_tag_t
11246 vm_tag_str_to_idx(char tagstr[VM_KERN_MEMORY_STR_MAX_LEN])
11247 {
11248 for (vm_tag_t i = VM_KERN_MEMORY_OSFMK; i < ARRAY_SIZE(vm_kern_memory_names); i++) {
11249 if (!strncmp(vm_kern_memory_names[i], tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11250 return i;
11251 }
11252 }
11253
11254 if (!strncmp("dynamic", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11255 return VM_KERN_MEMORY_FIRST_DYNAMIC;
11256 }
11257
11258 if (!strncmp("any", tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) {
11259 return VM_KERN_MEMORY_ANY;
11260 }
11261
11262 printf("Unable to find vm tag %s for btlog\n", tagstr);
11263 return VM_KERN_MEMORY_NONE;
11264 }
11265
11266 __startup_func
11267 static void
11268 vm_btlog_init(void)
11269 {
11270 vmtaglog_tag = vm_tag_str_to_idx(vmtaglog);
11271
11272 if (vmtaglog_tag != VM_KERN_MEMORY_NONE) {
11273 vmtaglog_btlog = btlog_create(BTLOG_HASH, VM_TAG_BTLOG_SIZE, 0);
11274 }
11275 }
11276 STARTUP(ZALLOC, STARTUP_RANK_FIRST, vm_btlog_init);
11277 #endif /* VM_BTLOG_TAGS */
11278
11279 void
11280 vm_tag_update_size(vm_tag_t tag, int64_t delta, vm_object_t object)
11281 {
11282 assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
11283
11284 kern_allocation_update_size(vm_allocation_sites[tag], delta, object);
11285 }
11286
11287 uint64_t
11288 vm_tag_get_size(vm_tag_t tag)
11289 {
11290 vm_allocation_site_t *allocation;
11291
11292 assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
11293
11294 allocation = vm_allocation_sites[tag];
11295 return allocation ? os_atomic_load(&allocation->total, relaxed) : 0;
11296 }
11297
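/*
 * kern_allocation_update_size() atomically adjusts a site's running total by
 * "delta" bytes, tracks the high-water mark on DEBUG/DEVELOPMENT kernels,
 * and lazily assigns the site a tag the first time its total becomes
 * non-zero.  With VM_BTLOG_TAGS it also feeds vm_tag_log() for the tag
 * selected at boot.
 */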
11298 void
11299 kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta, __unused vm_object_t object)
11300 {
11301 uint64_t value;
11302
11303 value = os_atomic_add(&allocation->total, delta, relaxed);
11304 if (delta < 0) {
11305 assertf(value + (uint64_t)-delta > value,
11306 "tag %d, site %p", allocation->tag, allocation);
11307 }
11308
11309 #if DEBUG || DEVELOPMENT
11310 /* release to publish the new total */
11311 os_atomic_max(&allocation->peak, value, release);
11312 #endif /* DEBUG || DEVELOPMENT */
11313
11314 if (value == (uint64_t)delta && !allocation->tag) {
11315 vm_tag_alloc(allocation);
11316 }
11317
11318 #if VM_BTLOG_TAGS
11319 if (vmtaglog_matches(allocation->tag) && object) {
11320 vm_tag_log(object, delta, __builtin_frame_address(0));
11321 }
11322 #endif /* VM_BTLOG_TAGS */
11323 }
11324
11325 #if DEBUG || DEVELOPMENT
11326
11327 void
11328 vm_tag_reset_all_peaks(void)
11329 {
11330 vm_log("resetting peak size for all kernel tags\n");
11331 for (vm_tag_t tag = 0; tag <= vm_allocation_tag_highest; tag++) {
11332 vm_tag_reset_peak(tag);
11333 }
11334 }
11335
11336 kern_return_t
11337 vm_tag_reset_peak(vm_tag_t tag)
11338 {
11339 if (tag > vm_allocation_tag_highest) {
11340 return KERN_INVALID_ARGUMENT;
11341 }
11342
11343 vm_allocation_site_t *site = vm_allocation_sites[tag];
11344 vm_log_info("resetting peak size for kernel tag %s\n",
11345 KA_NAME(site));
11346
11347 uint64_t new_peak = os_atomic_load(&site->total, relaxed);
11348 /* acquire updates to the total */
11349 os_atomic_min(&site->peak, new_peak, acquire);
11350
11351 return KERN_SUCCESS;
11352 }
11353
11354 #endif /* DEBUG || DEVELOPMENT */
11355
11356 #if VM_TAG_SIZECLASSES
11357
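/*
 * vm_allocation_zones_init() sets up the per-tag, per-sizeclass accounting
 * used below: a pointer array with one slot per possible tag, plus
 * preallocated sizeclass stats for a few early tags so that
 * vm_tag_zone_stats_alloc() never has to allocate (and thus recurse) while
 * those tags are being charged during early boot.
 */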
11358 void
11359 vm_allocation_zones_init(void)
11360 {
11361 vm_offset_t addr;
11362 vm_size_t size;
11363
11364 const vm_tag_t early_tags[] = {
11365 VM_KERN_MEMORY_DIAG,
11366 VM_KERN_MEMORY_KALLOC,
11367 VM_KERN_MEMORY_KALLOC_DATA,
11368 VM_KERN_MEMORY_KALLOC_TYPE,
11369 VM_KERN_MEMORY_LIBKERN,
11370 VM_KERN_MEMORY_OSFMK,
11371 VM_KERN_MEMORY_RECOUNT,
11372 };
11373
11374 size = VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *)
11375 + ARRAY_COUNT(early_tags) * VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
11376
11377 kmem_alloc(kernel_map, &addr, round_page(size),
11378 KMA_NOFAIL | KMA_KOBJECT | KMA_ZERO | KMA_PERMANENT,
11379 VM_KERN_MEMORY_DIAG);
11380
11381 vm_allocation_zone_totals = (vm_allocation_zone_total_t **) addr;
11382 addr += VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *);
11383
11384 // prepopulate early tag ranges so allocations
11385 // in vm_tag_update_zone_size() and early boot won't recurse
11386 for (size_t i = 0; i < ARRAY_COUNT(early_tags); i++) {
11387 vm_allocation_zone_totals[early_tags[i]] = (vm_allocation_zone_total_t *)addr;
11388 addr += VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
11389 }
11390 }
11391
11392 __attribute__((noinline))
11393 static vm_tag_t
11394 vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
11395 {
11396 vm_allocation_zone_total_t *stats;
11397 vm_size_t size = sizeof(*stats) * VM_TAG_SIZECLASSES;
11398
11399 flags = Z_VM_TAG(Z_ZERO | flags, VM_KERN_MEMORY_DIAG);
11400 stats = kalloc_data(size, flags);
11401 if (!stats) {
11402 return VM_KERN_MEMORY_NONE;
11403 }
11404 if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
11405 kfree_data(stats, size);
11406 }
11407 return tag;
11408 }
11409
11410 vm_tag_t
11411 vm_tag_will_update_zone(vm_tag_t tag, uint32_t zflags)
11412 {
11413 assert(VM_KERN_MEMORY_NONE != tag);
11414 assert(tag < VM_MAX_TAG_VALUE);
11415
11416 if (__probable(vm_allocation_zone_totals[tag])) {
11417 return tag;
11418 }
11419 return vm_tag_zone_stats_alloc(tag, zflags);
11420 }
11421
11422 void
11423 vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
11424 {
11425 vm_allocation_zone_total_t *stats;
11426 vm_size_t value;
11427
11428 assert(VM_KERN_MEMORY_NONE != tag);
11429 assert(tag < VM_MAX_TAG_VALUE);
11430
11431 if (zidx >= VM_TAG_SIZECLASSES) {
11432 return;
11433 }
11434
11435 stats = vm_allocation_zone_totals[tag];
11436 assert(stats);
11437 stats += zidx;
11438
11439 value = os_atomic_add(&stats->vazt_total, delta, relaxed);
11440 if (delta < 0) {
11441 assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
11442 return;
11443 } else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
11444 os_atomic_max(&stats->vazt_peak, value, relaxed);
11445 }
11446 }
11447
11448 #endif /* VM_TAG_SIZECLASSES */
11449
11450 void
11451 kern_allocation_update_subtotal(kern_allocation_name_t allocation, vm_tag_t subtag, int64_t delta)
11452 {
11453 kern_allocation_name_t other;
11454 struct vm_allocation_total * total;
11455 uint32_t subidx;
11456
11457 assert(VM_KERN_MEMORY_NONE != subtag);
11458 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11459 for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
11460 total = &allocation->subtotals[subidx];
11461 if (subtag == total->tag) {
11462 break;
11463 }
11464 }
11465 if (subidx >= allocation->subtotalscount) {
11466 for (subidx = 0; subidx < allocation->subtotalscount; subidx++) {
11467 total = &allocation->subtotals[subidx];
11468 if ((VM_KERN_MEMORY_NONE == total->tag)
11469 || !total->total) {
11470 total->tag = (vm_tag_t)subtag;
11471 break;
11472 }
11473 }
11474 }
11475 assert(subidx < allocation->subtotalscount);
11476 if (subidx >= allocation->subtotalscount) {
11477 lck_ticket_unlock(&vm_allocation_sites_lock);
11478 return;
11479 }
11480 if (delta < 0) {
11481 assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
11482 }
11483 OSAddAtomic64(delta, &total->total);
11484 lck_ticket_unlock(&vm_allocation_sites_lock);
11485
11486 other = vm_allocation_sites[subtag];
11487 assert(other);
11488 if (delta < 0) {
11489 assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
11490 }
11491 OSAddAtomic64(delta, &other->mapped);
11492 }
11493
11494 const char *
11495 kern_allocation_get_name(kern_allocation_name_t allocation)
11496 {
11497 return KA_NAME(allocation);
11498 }
11499
11500 kern_allocation_name_t
11501 kern_allocation_name_allocate(const char * name, uint16_t subtotalscount)
11502 {
11503 kern_allocation_name_t allocation;
11504 uint16_t namelen;
11505
11506 namelen = (uint16_t)strnlen(name, MACH_MEMORY_INFO_NAME_MAX_LEN - 1);
11507
11508 allocation = kalloc_data(KA_SIZE(namelen, subtotalscount), Z_WAITOK | Z_ZERO);
11509 allocation->refcount = 1;
11510 allocation->subtotalscount = subtotalscount;
11511 allocation->flags = (uint16_t)(namelen << VM_TAG_NAME_LEN_SHIFT);
11512 strlcpy(KA_NAME(allocation), name, namelen + 1);
11513
11514 vm_tag_alloc(allocation);
11515 return allocation;
11516 }
11517
11518 void
11519 kern_allocation_name_release(kern_allocation_name_t allocation)
11520 {
11521 assert(allocation->refcount > 0);
11522 if (1 == OSAddAtomic16(-1, &allocation->refcount)) {
11523 kfree_data(allocation,
11524 KA_SIZE(KA_NAME_LEN(allocation), allocation->subtotalscount));
11525 }
11526 }
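/*
 * A rough usage sketch for the named-allocation API above (the driver name
 * and "bytes_wired" count are made up for illustration):
 *
 *	kern_allocation_name_t name;
 *
 *	name = kern_allocation_name_allocate("com.example.mydriver", 0);
 *	kern_allocation_update_size(name, bytes_wired, NULL);
 *	...
 *	kern_allocation_update_size(name, -bytes_wired, NULL);
 *	kern_allocation_name_release(name);
 *
 * kern_allocation_name_allocate() reserves (or reuses) a dynamic tag via
 * vm_tag_alloc(), and the final release frees the site once its refcount
 * drops to zero.
 */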
11527
11528 #if !VM_TAG_ACTIVE_UPDATE
11529 static void
11530 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
11531 {
11532 if (!object->wired_page_count) {
11533 return;
11534 }
11535 if (!is_kernel_object(object)) {
11536 assert(object->wire_tag < num_info);
11537 info[object->wire_tag].size += ptoa_64(object->wired_page_count);
11538 }
11539 }
11540
11541 typedef void (*vm_page_iterate_proc)(mach_memory_info_t * info,
11542 unsigned int num_info, vm_object_t object);
11543
11544 static void
11545 vm_page_iterate_purgeable_objects(mach_memory_info_t * info, unsigned int num_info,
11546 vm_page_iterate_proc proc, purgeable_q_t queue,
11547 int group)
11548 {
11549 vm_object_t object;
11550
11551 for (object = (vm_object_t) queue_first(&queue->objq[group]);
11552 !queue_end(&queue->objq[group], (queue_entry_t) object);
11553 object = (vm_object_t) queue_next(&object->objq)) {
11554 proc(info, num_info, object);
11555 }
11556 }
11557
11558 static void
11559 vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
11560 vm_page_iterate_proc proc)
11561 {
11562 vm_object_t object;
11563
11564 lck_spin_lock_grp(&vm_objects_wired_lock, &vm_page_lck_grp_bucket);
11565 queue_iterate(&vm_objects_wired,
11566 object,
11567 vm_object_t,
11568 wired_objq)
11569 {
11570 proc(info, num_info, object);
11571 }
11572 lck_spin_unlock(&vm_objects_wired_lock);
11573 }
11574 #endif /* ! VM_TAG_ACTIVE_UPDATE */
11575
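/*
 * process_account() turns the raw vm_allocation_sites[] data into the
 * mach_memory_info array reported to user space: it copies each site's
 * totals, labels entries as named, kmod or kernel call sites, optionally
 * expands per-sizeclass zone stats into additional entries, and for sites
 * with subtotals shifts the accounted bytes from the backing tags onto the
 * named site so the memory isn't double-counted.
 */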
11576 static uint64_t
11577 process_account(mach_memory_info_t * info, unsigned int num_info,
11578 uint64_t zones_collectable_bytes, boolean_t iterated, bool redact_info __unused)
11579 {
11580 size_t namelen;
11581 unsigned int idx, count, nextinfo;
11582 vm_allocation_site_t * site;
11583 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11584
11585 for (idx = 0; idx <= vm_allocation_tag_highest; idx++) {
11586 site = vm_allocation_sites[idx];
11587 if (!site) {
11588 continue;
11589 }
11590 info[idx].mapped = site->mapped;
11591 info[idx].tag = site->tag;
11592 if (!iterated) {
11593 info[idx].size = site->total;
11594 #if DEBUG || DEVELOPMENT
11595 info[idx].peak = site->peak;
11596 #endif /* DEBUG || DEVELOPMENT */
11597 } else {
11598 if (!site->subtotalscount && (site->total != info[idx].size)) {
11599 printf("tag mismatch[%d] 0x%qx, iter 0x%qx\n", idx, site->total, info[idx].size);
11600 info[idx].size = site->total;
11601 }
11602 }
11603 info[idx].flags |= VM_KERN_SITE_WIRED;
11604 if (idx < VM_KERN_MEMORY_FIRST_DYNAMIC) {
11605 info[idx].site = idx;
11606 info[idx].flags |= VM_KERN_SITE_TAG;
11607 if (VM_KERN_MEMORY_ZONE == idx) {
11608 info[idx].flags |= VM_KERN_SITE_HIDE;
11609 info[idx].flags &= ~VM_KERN_SITE_WIRED;
11610 info[idx].collectable_bytes = zones_collectable_bytes;
11611 }
11612 info[idx].flags |= VM_KERN_SITE_NAMED;
11613 strlcpy(info[idx].name, vm_kern_memory_names[idx], MACH_MEMORY_INFO_NAME_MAX_LEN);
11614 } else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) {
11615 info[idx].site = 0;
11616 info[idx].flags |= VM_KERN_SITE_NAMED;
11617 if (namelen > sizeof(info[idx].name)) {
11618 namelen = sizeof(info[idx].name);
11619 }
11620 strncpy(&info[idx].name[0], KA_NAME(site), namelen);
11621 } else if (VM_TAG_KMOD & site->flags) {
11622 info[idx].site = OSKextGetKmodIDForSite(site, NULL, 0);
11623 info[idx].flags |= VM_KERN_SITE_KMOD;
11624 } else {
11625 info[idx].site = VM_KERNEL_UNSLIDE(site);
11626 info[idx].flags |= VM_KERN_SITE_KERNEL;
11627 }
11628 }
11629
11630 nextinfo = (vm_allocation_tag_highest + 1);
11631 count = nextinfo;
11632 if (count >= num_info) {
11633 count = num_info;
11634 }
11635
11636 for (idx = 0; idx < count; idx++) {
11637 site = vm_allocation_sites[idx];
11638 if (!site) {
11639 continue;
11640 }
11641 #if VM_TAG_SIZECLASSES
11642 vm_allocation_zone_total_t * zone;
11643 unsigned int zidx;
11644
11645 if (!redact_info
11646 && vm_allocation_zone_totals
11647 && (zone = vm_allocation_zone_totals[idx])
11648 && (nextinfo < num_info)) {
11649 for (zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
11650 if (!zone[zidx].vazt_peak) {
11651 continue;
11652 }
11653 info[nextinfo] = info[idx];
11654 info[nextinfo].zone = zone_index_from_tag_index(zidx);
11655 info[nextinfo].flags &= ~VM_KERN_SITE_WIRED;
11656 info[nextinfo].flags |= VM_KERN_SITE_ZONE;
11657 info[nextinfo].flags |= VM_KERN_SITE_KALLOC;
11658 info[nextinfo].size = zone[zidx].vazt_total;
11659 info[nextinfo].peak = zone[zidx].vazt_peak;
11660 info[nextinfo].mapped = 0;
11661 nextinfo++;
11662 }
11663 }
11664 #endif /* VM_TAG_SIZECLASSES */
11665 if (site->subtotalscount) {
11666 uint64_t mapped, mapcost, take;
11667 uint32_t sub;
11668 vm_tag_t alloctag;
11669
11670 info[idx].size = site->total;
11671 mapped = info[idx].size;
11672 info[idx].mapped = mapped;
11673 mapcost = 0;
11674 for (sub = 0; sub < site->subtotalscount; sub++) {
11675 alloctag = site->subtotals[sub].tag;
11676 assert(alloctag < num_info);
11677 if (info[alloctag].name[0] && alloctag >= VM_KERN_MEMORY_FIRST_DYNAMIC) {
11678 continue;
11679 }
11680 take = site->subtotals[sub].total;
11681 if (take > info[alloctag].size) {
11682 take = info[alloctag].size;
11683 }
11684 if (take > mapped) {
11685 take = mapped;
11686 }
11687 info[alloctag].mapped -= take;
11688 info[alloctag].size -= take;
11689 mapped -= take;
11690 mapcost += take;
11691 }
11692 info[idx].size = mapcost;
11693 }
11694 }
11695 lck_ticket_unlock(&vm_allocation_sites_lock);
11696
11697 return 0;
11698 }
11699
11700 uint32_t
11701 vm_page_diagnose_estimate(void)
11702 {
11703 vm_allocation_site_t * site;
11704 uint32_t count = zone_view_count;
11705 uint32_t idx;
11706
11707 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
11708 for (idx = 0; idx < VM_MAX_TAG_VALUE; idx++) {
11709 site = vm_allocation_sites[idx];
11710 if (!site) {
11711 continue;
11712 }
11713 count++;
11714 #if VM_TAG_SIZECLASSES
11715 if (vm_allocation_zone_totals) {
11716 vm_allocation_zone_total_t * zone;
11717 zone = vm_allocation_zone_totals[idx];
11718 if (!zone) {
11719 continue;
11720 }
11721 for (uint32_t zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
11722 count += (zone[zidx].vazt_peak != 0);
11723 }
11724 }
11725 #endif
11726 }
11727 lck_ticket_unlock(&vm_allocation_sites_lock);
11728
11729 /* some slop for new tags created */
11730 count += 8;
11731 count += VM_KERN_COUNTER_COUNT;
11732
11733 return count;
11734 }
11735
11736 static void
11737 vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
11738 bool percpu)
11739 {
11740 zpercpu_foreach(zs, zstats) {
11741 info->size += zs->zs_mem_allocated - zs->zs_mem_freed;
11742 }
11743 if (percpu) {
11744 info->size *= zpercpu_count();
11745 }
11746 info->flags |= VM_KERN_SITE_NAMED | VM_KERN_SITE_ZONE_VIEW;
11747 }
11748
11749 static void
11750 vm_page_add_info(
11751 mach_memory_info_t *info,
11752 zone_stats_t stats,
11753 bool per_cpu,
11754 const char *parent_heap_name,
11755 const char *parent_zone_name,
11756 const char *view_name)
11757 {
11758 vm_page_diagnose_zone_stats(info, stats, per_cpu);
11759 snprintf(info->name, sizeof(info->name),
11760 "%s%s[%s]", parent_heap_name, parent_zone_name, view_name);
11761 }
11762
11763 static void
11764 vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
11765 {
11766 vm_page_add_info(info, z->z_stats, z->z_percpu, zone_heap_name(z),
11767 z->z_name, "raw");
11768 }
11769
11770 static void
11771 vm_page_add_view(
11772 mach_memory_info_t *info,
11773 zone_stats_t stats,
11774 const char *parent_heap_name,
11775 const char *parent_zone_name,
11776 const char *view_name)
11777 {
11778 vm_page_add_info(info, stats, false, parent_heap_name, parent_zone_name,
11779 view_name);
11780 }
11781
11782 static uint32_t
11783 vm_page_diagnose_heap_views(
11784 mach_memory_info_t *info,
11785 kalloc_heap_t kh,
11786 const char *parent_heap_name,
11787 const char *parent_zone_name)
11788 {
11789 uint32_t i = 0;
11790
11791 while (kh) {
11792 vm_page_add_view(info + i, kh->kh_stats, parent_heap_name,
11793 parent_zone_name, kh->kh_name);
11794 kh = kh->kh_views;
11795 i++;
11796 }
11797 return i;
11798 }
11799
11800 static uint32_t
11801 vm_page_diagnose_heap(mach_memory_info_t *info, kalloc_heap_t kheap)
11802 {
11803 uint32_t i = 0;
11804
11805 for (; i < KHEAP_NUM_ZONES; i++) {
11806 vm_page_diagnose_zone(info + i, zone_by_id(kheap->kh_zstart + i));
11807 }
11808
11809 i += vm_page_diagnose_heap_views(info + i, kheap->kh_views, kheap->kh_name,
11810 NULL);
11811 return i;
11812 }
11813
11814 static int
11815 vm_page_diagnose_kt_heaps(mach_memory_info_t *info)
11816 {
11817 uint32_t idx = 0;
11818 vm_page_add_view(info + idx, KHEAP_KT_VAR->kh_stats, KHEAP_KT_VAR->kh_name,
11819 "", "raw");
11820 idx++;
11821
11822 for (uint32_t i = 0; i < KT_VAR_MAX_HEAPS; i++) {
11823 struct kheap_info heap = kalloc_type_heap_array[i];
11824 char heap_num_tmp[MAX_ZONE_NAME] = "";
11825 const char *heap_num;
11826
11827 snprintf(&heap_num_tmp[0], MAX_ZONE_NAME, "%u", i);
11828 heap_num = &heap_num_tmp[0];
11829
11830 for (kalloc_type_var_view_t ktv = heap.kt_views; ktv;
11831 ktv = (kalloc_type_var_view_t) ktv->kt_next) {
11832 if (ktv->kt_stats && ktv->kt_stats != KHEAP_KT_VAR->kh_stats) {
11833 vm_page_add_view(info + idx, ktv->kt_stats, KHEAP_KT_VAR->kh_name,
11834 heap_num, ktv->kt_name);
11835 idx++;
11836 }
11837 }
11838
11839 idx += vm_page_diagnose_heap_views(info + idx, heap.kh_views,
11840 KHEAP_KT_VAR->kh_name, heap_num);
11841 }
11842
11843 return idx;
11844 }
11845
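/*
 * vm_page_diagnose() fills "info" in three chunks that must line up with
 * vm_page_diagnose_estimate(): per-tag entries at the front, then zone-view
 * entries, with the fixed VM_KERN_COUNT_* counters occupying the very end of
 * the array.  The SET_COUNT/SET_MAP macros below populate those trailing
 * counter slots.
 */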
11846 kern_return_t
11847 vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, bool redact_info)
11848 {
11849 uint64_t wired_size;
11850 uint64_t wired_managed_size;
11851 uint64_t wired_reserved_size;
11852 boolean_t iterate;
11853 mach_memory_info_t * counts;
11854 uint32_t i;
11855
11856 vmlp_api_start(VM_PAGE_DIAGNOSE);
11857
11858 bzero(info, num_info * sizeof(mach_memory_info_t));
11859
11860 if (!vm_page_wire_count_initial) {
11861 vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_ABORTED);
11862 return KERN_ABORTED;
11863 }
11864
11865 wired_size = ptoa_64(vm_page_wire_count);
11866 wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
11867 #if XNU_TARGET_OS_OSX
11868 wired_size += ptoa_64(vm_lopage_free_count + vm_page_throttled_count);
11869 wired_reserved_size += ptoa_64(vm_page_throttled_count);
11870 #endif /* XNU_TARGET_OS_OSX */
11871 #if CONFIG_EXCLAVES
11872 wired_reserved_size -= exclaves_carveout_size;
11873 #endif /* CONFIG_EXCLAVES */
11874 wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);
11875
11876 wired_size += booter_size;
11877
11878 assert(num_info >= VM_KERN_COUNTER_COUNT);
11879 num_info -= VM_KERN_COUNTER_COUNT;
11880 counts = &info[num_info];
11881
11882 #define SET_COUNT(xcount, xsize, xflags) MACRO_BEGIN \
11883 counts[xcount].tag = VM_MAX_TAG_VALUE + xcount; \
11884 counts[xcount].site = (xcount); \
11885 counts[xcount].size = (xsize); \
11886 counts[xcount].mapped = (xsize); \
11887 counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED | xflags; \
11888 strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
11889 MACRO_END;
11890
11891 SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0);
11892 SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0);
11893 SET_COUNT(VM_KERN_COUNT_WIRED_MANAGED, wired_managed_size, 0);
11894 SET_COUNT(VM_KERN_COUNT_RESERVED, wired_reserved_size, VM_KERN_SITE_WIRED);
11895 SET_COUNT(VM_KERN_COUNT_STOLEN, ptoa_64(vm_page_stolen_count), VM_KERN_SITE_WIRED);
11896 SET_COUNT(VM_KERN_COUNT_LOPAGE, ptoa_64(vm_lopage_free_count), VM_KERN_SITE_WIRED);
11897 SET_COUNT(VM_KERN_COUNT_WIRED_BOOT, ptoa_64(vm_page_wire_count_on_boot), 0);
11898 SET_COUNT(VM_KERN_COUNT_BOOT_STOLEN, booter_size, VM_KERN_SITE_WIRED);
11899 SET_COUNT(VM_KERN_COUNT_WIRED_STATIC_KERNELCACHE, ptoa_64(vm_page_kernelcache_count), 0);
11900 #if CONFIG_EXCLAVES
11901 SET_COUNT(VM_KERN_COUNT_EXCLAVES_CARVEOUT, exclaves_carveout_size + exclaves_bundle_size, VM_KERN_SITE_WIRED);
11902 #endif /* CONFIG_EXCLAVES */
11903
11904 #define SET_MAP(xcount, xsize, xfree, xlargest) MACRO_BEGIN \
11905 counts[xcount].site = (xcount); \
11906 counts[xcount].size = (xsize); \
11907 counts[xcount].mapped = (xsize); \
11908 counts[xcount].free = (xfree); \
11909 counts[xcount].largest = (xlargest); \
11910 counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED; \
11911 strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \
11912 MACRO_END;
11913
11914 vm_map_size_t map_size, map_free, map_largest;
11915
11916 vm_map_sizes(kernel_map, &map_size, &map_free, &map_largest);
11917 SET_MAP(VM_KERN_COUNT_MAP_KERNEL, map_size, map_free, map_largest);
11918
11919 zone_map_sizes(&map_size, &map_free, &map_largest);
11920 SET_MAP(VM_KERN_COUNT_MAP_ZONE, map_size, map_free, map_largest);
11921
11922 assert(num_info >= zone_view_count);
11923 num_info -= zone_view_count;
11924 counts = &info[num_info];
11925 i = 0;
11926
11927 if (!redact_info) {
11928 if (zone_is_data_buffers_kheap(KHEAP_DATA_BUFFERS->kh_heap_id)) {
11929 i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS);
11930 }
11931 if (zone_is_data_shared_kheap(KHEAP_DATA_SHARED->kh_heap_id)) {
11932 i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_SHARED);
11933 }
11934
11935 if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) {
11936 i += vm_page_diagnose_kt_heaps(counts + i);
11937 }
11938 assert(i <= zone_view_count);
11939
11940 zone_index_foreach(zidx) {
11941 zone_t z = &zone_array[zidx];
11942 zone_security_flags_t zsflags = zone_security_array[zidx];
11943 zone_view_t zv = z->z_views;
11944
11945 if (zv == NULL) {
11946 continue;
11947 }
11948
11949 zone_stats_t zv_stats_head = z->z_stats;
11950 bool has_raw_view = false;
11951
11952 for (; zv; zv = zv->zv_next) {
11953 /*
11954 * kalloc_types that allocate from the same zone are linked
11955 * as views. Only print the ones that have their own stats.
11956 */
11957 if (zv->zv_stats == zv_stats_head) {
11958 continue;
11959 }
11960 has_raw_view = true;
11961 vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
11962 z->z_percpu);
11963 snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
11964 zone_heap_name(z), z->z_name, zv->zv_name);
11965 i++;
11966 assert(i <= zone_view_count);
11967 }
11968
11969 /*
11970 * Print raw views for non-kalloc zones, and for kalloc_type zones whose views were printed above
11971 */
11972 bool kalloc_type = zsflags.z_kalloc_type;
11973 if ((zsflags.z_kheap_id == KHEAP_ID_NONE && !kalloc_type) ||
11974 (kalloc_type && has_raw_view)) {
11975 vm_page_diagnose_zone(counts + i, z);
11976 i++;
11977 assert(i <= zone_view_count);
11978 }
11979 }
11980 }
11981
11982 iterate = !VM_TAG_ACTIVE_UPDATE;
11983 if (iterate) {
11984 enum { kMaxKernelDepth = 1 };
11985 vm_map_t maps[kMaxKernelDepth];
11986 vm_map_entry_t entries[kMaxKernelDepth];
11987 vm_map_t map;
11988 vm_map_entry_t entry;
11989 vm_object_offset_t offset;
11990 vm_page_t page;
11991 int stackIdx, count;
11992
11993 #if !VM_TAG_ACTIVE_UPDATE
11994 vm_page_iterate_objects(info, num_info, &vm_page_count_object);
11995 #endif /* ! VM_TAG_ACTIVE_UPDATE */
11996
11997 map = kernel_map;
11998 stackIdx = 0;
11999 while (map) {
12000 vm_map_lock(map);
12001 for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
12002 if (entry->is_sub_map) {
12003 assert(stackIdx < kMaxKernelDepth);
12004 maps[stackIdx] = map;
12005 entries[stackIdx] = entry;
12006 stackIdx++;
12007 map = VME_SUBMAP(entry);
12008 entry = NULL;
12009 break;
12010 }
12011
12012 vmlp_range_event_entry(map, entry);
12013
12014 if (is_kernel_object(VME_OBJECT(entry))) {
12015 count = 0;
12016 vm_object_lock(VME_OBJECT(entry));
12017 for (offset = entry->vme_start; offset < entry->vme_end; offset += page_size) {
12018 page = vm_page_lookup(VME_OBJECT(entry), offset);
12019 if (page && VM_PAGE_WIRED(page)) {
12020 count++;
12021 }
12022 }
12023 vm_object_unlock(VME_OBJECT(entry));
12024
12025 if (count) {
12026 assert(VME_ALIAS(entry) != VM_KERN_MEMORY_NONE);
12027 assert(VME_ALIAS(entry) < num_info);
12028 info[VME_ALIAS(entry)].size += ptoa_64(count);
12029 }
12030 }
12031 while (map && (entry == vm_map_last_entry(map))) {
12032 vm_map_unlock(map);
12033 if (!stackIdx) {
12034 map = NULL;
12035 } else {
12036 --stackIdx;
12037 map = maps[stackIdx];
12038 entry = entries[stackIdx];
12039 }
12040 }
12041 }
12042 }
12043 }
12044
12045 process_account(info, num_info, zones_collectable_bytes, iterate, redact_info);
12046
12047 vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_SUCCESS);
12048 return KERN_SUCCESS;
12049 }
12050
12051 #if DEBUG || DEVELOPMENT
12052
12053 kern_return_t
12054 vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_size_t * zone_size)
12055 {
12056 kern_return_t ret;
12057 vm_size_t zsize;
12058 vm_map_t map;
12059 vm_map_entry_t entry;
12060
12061 vmlp_api_start(VM_KERN_ALLOCATION_INFO);
12062
12063 zsize = zone_element_info((void *) addr, tag);
12064 if (zsize) {
12065 *zone_size = *size = zsize;
12066 vmlp_api_end(VM_KERN_ALLOCATION_INFO, KERN_SUCCESS);
12067 return KERN_SUCCESS;
12068 }
12069
12070 *zone_size = 0;
12071 ret = KERN_INVALID_ADDRESS;
12072 for (map = kernel_map; map;) {
12073 vm_map_lock(map);
12074 if (!vm_map_lookup_entry(map, addr, &entry)) {
12075 break;
12076 }
12077 if (entry->is_sub_map) {
12078 if (map != kernel_map) {
12079 break;
12080 }
12081 map = VME_SUBMAP(entry);
12082 continue;
12083 }
12084 if (entry->vme_start != addr) {
12085 break;
12086 }
12087
12088 vmlp_range_event_entry(map, entry);
12089
12090 *tag = (vm_tag_t)VME_ALIAS(entry);
12091 *size = (entry->vme_end - addr);
12092 ret = KERN_SUCCESS;
12093 break;
12094 }
12095 if (map != kernel_map) {
12096 vm_map_unlock(map);
12097 }
12098 vm_map_unlock(kernel_map);
12099
12100 vmlp_api_end(VM_KERN_ALLOCATION_INFO, ret);
12101 return ret;
12102 }
12103
12104 // Some DEBUG/DEVELOPMENT code to get a process to page out its shared cache TEXT pages;
12105 // only used by the DK driver in LPW testing.
12106 uint64_t
12107 vm_task_evict_shared_cache(task_t task)
12108 {
12109 enum { kMaxKernelDepth = 3 };
12110 vm_map_t maps[kMaxKernelDepth];
12111 vm_map_entry_t entries[kMaxKernelDepth];
12112 vm_map_t map;
12113 vm_object_t textObject, shadow;
12114 vm_map_entry_t entry;
12115 vm_object_offset_t textOffset, textSize;
12116 int stackIdx;
12117 uint64_t count;
12118
12119 count = counter_load(&task->pageins);
12120 map = get_task_map(task);
12121 textObject = NULL;
12122 stackIdx = 0;
12123 while (map) {
12124 vm_map_lock_read(map);
12125 for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
12126 if (entry->is_sub_map) {
12127 assert(stackIdx < kMaxKernelDepth);
12128 maps[stackIdx] = map;
12129 entries[stackIdx] = entry;
12130 stackIdx++;
12131 map = VME_SUBMAP(entry);
12132 entry = NULL;
12133 break;
12134 }
12135 if (stackIdx && (VM_PROT_EXECUTE | VM_PROT_READ) == entry->protection) {
12136 textObject = VME_OBJECT(entry);
12137 vm_object_lock(textObject);
12138 while ((shadow = textObject->shadow)) {
12139 vm_object_lock(shadow);
12140 vm_object_unlock(textObject);
12141 textObject = shadow;
12142 }
12143 vm_object_reference_locked(textObject);
12144 vm_object_unlock(textObject);
12145 textOffset = VME_OFFSET(entry);
12146 textSize = entry->vme_end - entry->vme_start;
12147 entry = vm_map_last_entry(map);
12148 }
12149 while (map && (entry == vm_map_last_entry(map))) {
12150 vm_map_unlock_read(map);
12151 if (!stackIdx) {
12152 map = NULL;
12153 } else {
12154 --stackIdx;
12155 map = maps[stackIdx];
12156 entry = entries[stackIdx];
12157 if (textObject) {
12158 entry = vm_map_last_entry(map);
12159 }
12160 }
12161 }
12162 }
12163 }
12164
12165 if (textObject) {
12166 vm_object_sync(textObject, textOffset, textSize, true, false, false);
12167 vm_object_deallocate(textObject);
12168 }
12169 return count;
12170 }
12171
12172 uint64_t
12173 vm_task_pageins(task_t task)
12174 {
12175 return counter_load(&task->pageins);
12176 }
12177
12178 #endif /* DEBUG || DEVELOPMENT */
12179
12180 uint32_t
12181 vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
12182 {
12183 vm_allocation_site_t * site;
12184 uint32_t kmodId;
12185
12186 kmodId = 0;
12187 lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
12188 if ((site = vm_allocation_sites[tag])) {
12189 if (VM_TAG_KMOD & site->flags) {
12190 kmodId = OSKextGetKmodIDForSite(site, name, namelen);
12191 }
12192 }
12193 lck_ticket_unlock(&vm_allocation_sites_lock);
12194
12195 return kmodId;
12196 }
12197
12198
12199 #if CONFIG_SECLUDED_MEMORY
12200 /*
12201 * Note that there is no locking around other accesses to vm_page_secluded_target.
12202 * That should be OK, since these functions are the only places where it can be changed
12203 * after initialization. Other users (like vm_pageout) may see a stale value briefly,
12204 * but will eventually get the correct one. This brief mismatch is OK, as pageout
12205 * and page freeing auto-adjust vm_page_secluded_count to match the target
12206 * over time.
12207 */
12208 unsigned int vm_page_secluded_suppress_cnt = 0;
12209 unsigned int vm_page_secluded_save_target;
12210
12211 LCK_GRP_DECLARE(secluded_suppress_slock_grp, "secluded_suppress_slock");
12212 LCK_SPIN_DECLARE(secluded_suppress_slock, &secluded_suppress_slock_grp);
12213
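/*
 * Secluded-memory suppression is reference counted across tasks: the first
 * task to start suppression saves the current vm_page_secluded_target and
 * forces it to zero, and the last task to stop suppression restores the
 * saved value.  Both transitions happen under secluded_suppress_slock.
 */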
12214 void
12215 start_secluded_suppression(task_t task)
12216 {
12217 if (task->task_suppressed_secluded) {
12218 return;
12219 }
12220 lck_spin_lock(&secluded_suppress_slock);
12221 if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
12222 task->task_suppressed_secluded = TRUE;
12223 vm_page_secluded_save_target = vm_page_secluded_target;
12224 vm_page_secluded_target = 0;
12225 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
12226 }
12227 lck_spin_unlock(&secluded_suppress_slock);
12228 }
12229
12230 void
12231 stop_secluded_suppression(task_t task)
12232 {
12233 lck_spin_lock(&secluded_suppress_slock);
12234 if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
12235 task->task_suppressed_secluded = FALSE;
12236 vm_page_secluded_target = vm_page_secluded_save_target;
12237 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
12238 }
12239 lck_spin_unlock(&secluded_suppress_slock);
12240 }
12241
12242 #endif /* CONFIG_SECLUDED_MEMORY */
12243
12244 /*
12245 * Move the list of retired pages on the vm_page_queue_retired to
12246 * their final resting place on retired_pages_object.
12247 */
12248 void
12249 vm_retire_boot_pages(void)
12250 {
12251 }
12252
12253 /*
12254 * This holds the reported physical address if an ECC error leads to a panic.
12255 * SMC will store it in PMU SRAM under the 'sECC' key.
12256 */
12257 uint64_t ecc_panic_physical_address = 0;
12258
12259