1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_page.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Resident memory management module.
63 */
64
65 #include <debug.h>
66 #include <libkern/OSAtomic.h>
67 #include <libkern/OSDebug.h>
68
69 #include <mach/clock_types.h>
70 #include <mach/vm_prot.h>
71 #include <mach/vm_statistics.h>
72 #include <mach/sdt.h>
73 #include <kern/counter.h>
74 #include <kern/host_statistics.h>
75 #include <kern/sched_prim.h>
76 #include <kern/policy_internal.h>
77 #include <kern/task.h>
78 #include <kern/thread.h>
79 #include <kern/kalloc.h>
80 #include <kern/zalloc_internal.h>
81 #include <kern/ledger.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_init.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_kern.h> /* kmem_alloc() */
88 #include <kern/misc_protos.h>
89 #include <mach_debug/zone_info.h>
90 #include <vm/cpm.h>
91 #include <pexpert/pexpert.h>
92 #include <san/kasan.h>
93
94 #include <vm/vm_protos.h>
95 #include <vm/memory_object.h>
96 #include <vm/vm_purgeable_internal.h>
97 #include <vm/vm_compressor.h>
98 #if defined (__x86_64__)
99 #include <i386/misc_protos.h>
100 #endif
101
102 #if CONFIG_PHANTOM_CACHE
103 #include <vm/vm_phantom_cache.h>
104 #endif
105
106 #if HIBERNATION
107 #include <IOKit/IOHibernatePrivate.h>
108 #include <machine/pal_hibernate.h>
109 #endif /* HIBERNATION */
110
111 #include <sys/kdebug.h>
112
113 #if defined(HAS_APPLE_PAC)
114 #include <ptrauth.h>
115 #endif
116 #if defined(__arm64__)
117 #include <arm/cpu_internal.h>
118 #endif /* defined(__arm64__) */
119
120 #if MACH_ASSERT
121
122 #define ASSERT_PMAP_FREE(mem) pmap_assert_free(VM_PAGE_GET_PHYS_PAGE(mem))
123
124 #else /* MACH_ASSERT */
125
126 #define ASSERT_PMAP_FREE(mem) /* nothing */
127
128 #endif /* MACH_ASSERT */
129
130 extern boolean_t vm_pageout_running;
131 extern thread_t vm_pageout_scan_thread;
132 extern boolean_t vps_dynamic_priority_enabled;
133
134 char vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
135 char vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
136 char vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
137 char vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
138
139 #if CONFIG_SECLUDED_MEMORY
140 struct vm_page_secluded_data vm_page_secluded;
141 #endif /* CONFIG_SECLUDED_MEMORY */
142
143 #if DEVELOPMENT || DEBUG
144 extern struct memory_object_pager_ops shared_region_pager_ops;
145 unsigned int shared_region_pagers_resident_count = 0;
146 unsigned int shared_region_pagers_resident_peak = 0;
147 #endif /* DEVELOPMENT || DEBUG */
148
149 int PERCPU_DATA(start_color);
150 vm_page_t PERCPU_DATA(free_pages);
151 boolean_t hibernate_cleaning_in_progress = FALSE;
152 boolean_t vm_page_free_verify = TRUE;
153
154 uint32_t vm_lopage_free_count = 0;
155 uint32_t vm_lopage_free_limit = 0;
156 uint32_t vm_lopage_lowater = 0;
157 boolean_t vm_lopage_refill = FALSE;
158 boolean_t vm_lopage_needed = FALSE;
159
160 lck_mtx_ext_t vm_page_queue_lock_ext;
161 lck_mtx_ext_t vm_page_queue_free_lock_ext;
162 lck_mtx_ext_t vm_purgeable_queue_lock_ext;
163
164 int speculative_age_index = 0;
165 int speculative_steal_index = 0;
166 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1];
167
168 boolean_t hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibernation code is looking at the VM queues.
169 * Updated and checked behind the vm_page_queues_lock. */
170
171 static void vm_page_free_prepare(vm_page_t page);
172 static vm_page_t vm_page_grab_fictitious_common(ppnum_t, boolean_t);
173
174 static void vm_tag_init(void);
175
176 /* for debugging purposes */
177 SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
178 VM_PAGE_PACKED_FROM_ARRAY;
179 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
180 VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
181
182 /*
183 * Associated with page of user-allocatable memory is a
184 * page structure.
185 */
186
187 /*
188 * These variables record the values returned by vm_page_bootstrap,
189 * for debugging purposes. The implementation of pmap_steal_memory
190 * and pmap_startup here also uses them internally.
191 */
192
193 vm_offset_t virtual_space_start;
194 vm_offset_t virtual_space_end;
195 uint32_t vm_page_pages;
196
197 /*
198 * The vm_page_lookup() routine, which provides for fast
199 * (virtual memory object, offset) to page lookup, employs
200 * the following hash table. The vm_page_{insert,remove}
201 * routines install and remove associations in the table.
202 * [This table is often called the virtual-to-physical,
203 * or VP, table.]
204 */
/*
 * A hash bucket: heads a singly-linked list (packed pointers) of
 * vm_page structures whose (object, offset) pair hashes to this slot.
 */
typedef struct {
	vm_page_packed_t page_list;      /* head of the bucket's page chain */
#if MACH_PAGE_HASH_STATS
	int             cur_count;      /* current count */
	int             hi_count;       /* high water mark */
#endif /* MACH_PAGE_HASH_STATS */
} vm_page_bucket_t;
212
213
214 #define BUCKETS_PER_LOCK 16
215
216 SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets; /* Array of buckets */
217 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_count = 0; /* How big is array? */
218 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_mask; /* Mask for hash function */
219 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_shift; /* Shift for hash function */
220 SECURITY_READ_ONLY_LATE(uint32_t) vm_page_bucket_hash; /* Basic bucket hash */
221 SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_lock_count = 0; /* How big is array of locks? */
222
223 #ifndef VM_TAG_ACTIVE_UPDATE
224 #error VM_TAG_ACTIVE_UPDATE
225 #endif
226 #ifndef VM_TAG_SIZECLASSES
227 #error VM_TAG_SIZECLASSES
228 #endif
229
230 /* for debugging */
231 SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
232 SECURITY_READ_ONLY_LATE(lck_spin_t *) vm_page_bucket_locks;
233
234 vm_allocation_site_t vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
235 vm_allocation_site_t * vm_allocation_sites[VM_MAX_TAG_VALUE];
236 #if VM_TAG_SIZECLASSES
237 static vm_allocation_zone_total_t **vm_allocation_zone_totals;
238 #endif /* VM_TAG_SIZECLASSES */
239
240 vm_tag_t vm_allocation_tag_highest;
241
242 #if VM_PAGE_BUCKETS_CHECK
243 boolean_t vm_page_buckets_check_ready = FALSE;
244 #if VM_PAGE_FAKE_BUCKETS
245 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
246 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
247 #endif /* VM_PAGE_FAKE_BUCKETS */
248 #endif /* VM_PAGE_BUCKETS_CHECK */
249
250 #if MACH_PAGE_HASH_STATS
251 /* This routine is only for debug. It is intended to be called by
252 * hand by a developer using a kernel debugger. This routine prints
253 * out vm_page_hash table statistics to the kernel debug console.
254 */
255 void
hash_debug(void)256 hash_debug(void)
257 {
258 int i;
259 int numbuckets = 0;
260 int highsum = 0;
261 int maxdepth = 0;
262
263 for (i = 0; i < vm_page_bucket_count; i++) {
264 if (vm_page_buckets[i].hi_count) {
265 numbuckets++;
266 highsum += vm_page_buckets[i].hi_count;
267 if (vm_page_buckets[i].hi_count > maxdepth) {
268 maxdepth = vm_page_buckets[i].hi_count;
269 }
270 }
271 }
272 printf("Total number of buckets: %d\n", vm_page_bucket_count);
273 printf("Number used buckets: %d = %d%%\n",
274 numbuckets, 100 * numbuckets / vm_page_bucket_count);
275 printf("Number unused buckets: %d = %d%%\n",
276 vm_page_bucket_count - numbuckets,
277 100 * (vm_page_bucket_count - numbuckets) / vm_page_bucket_count);
278 printf("Sum of bucket max depth: %d\n", highsum);
279 printf("Average bucket depth: %d.%2d\n",
280 highsum / vm_page_bucket_count,
281 highsum % vm_page_bucket_count);
282 printf("Maximum bucket depth: %d\n", maxdepth);
283 }
284 #endif /* MACH_PAGE_HASH_STATS */
285
286 /*
287 * The virtual page size is currently implemented as a runtime
288 * variable, but is constant once initialized using vm_set_page_size.
289 * This initialization must be done in the machine-dependent
290 * bootstrap sequence, before calling other machine-independent
291 * initializations.
292 *
293 * All references to the virtual page size outside this
294 * module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
295 * constants.
296 */
297 #if defined(__arm__) || defined(__arm64__)
298 vm_size_t page_size;
299 vm_size_t page_mask;
300 int page_shift;
301 #else
302 vm_size_t page_size = PAGE_SIZE;
303 vm_size_t page_mask = PAGE_MASK;
304 int page_shift = PAGE_SHIFT;
305 #endif
306
307 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages = VM_PAGE_NULL;
308 SECURITY_READ_ONLY_LATE(vm_page_t) vm_page_array_beginning_addr;
309 vm_page_t vm_page_array_ending_addr;
310
311 unsigned int vm_pages_count = 0;
312
313 /*
314 * Resident pages that represent real memory
315 * are allocated from a set of free lists,
316 * one per color.
317 */
318 unsigned int vm_colors;
319 unsigned int vm_color_mask; /* mask is == (vm_colors-1) */
320 unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */
321 unsigned int vm_free_magazine_refill_limit = 0;
322
323
324 struct vm_page_queue_free_head {
325 vm_page_queue_head_t qhead;
326 } VM_PAGE_PACKED_ALIGNED;
327
328 struct vm_page_queue_free_head vm_page_queue_free[MAX_COLORS];
329
330
331 unsigned int vm_page_free_wanted;
332 unsigned int vm_page_free_wanted_privileged;
333 #if CONFIG_SECLUDED_MEMORY
334 unsigned int vm_page_free_wanted_secluded;
335 #endif /* CONFIG_SECLUDED_MEMORY */
336 unsigned int vm_page_free_count;
337
338 /*
339 * Occasionally, the virtual memory system uses
340 * resident page structures that do not refer to
341 * real pages, for example to leave a page with
342 * important state information in the VP table.
343 *
344 * These page structures are allocated the way
345 * most other kernel structures are.
346 */
347 SECURITY_READ_ONLY_LATE(zone_t) vm_page_zone;
348 vm_locks_array_t vm_page_locks;
349
350 LCK_ATTR_DECLARE(vm_page_lck_attr, 0, 0);
351 LCK_GRP_DECLARE(vm_page_lck_grp_free, "vm_page_free");
352 LCK_GRP_DECLARE(vm_page_lck_grp_queue, "vm_page_queue");
353 LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
354 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
355 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
356 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
357 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
358 LCK_SPIN_DECLARE_ATTR(vm_allocation_sites_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
359
360 unsigned int vm_page_local_q_soft_limit = 250;
361 unsigned int vm_page_local_q_hard_limit = 500;
362 struct vpl *__zpercpu vm_page_local_q;
363
364 /* N.B. Guard and fictitious pages must not
365 * be assigned a zero phys_page value.
366 */
367 /*
368 * Fictitious pages don't have a physical address,
369 * but we must initialize phys_page to something.
370 * For debugging, this should be a strange value
371 * that the pmap module can recognize in assertions.
372 */
373 const ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
374
375 /*
376 * Guard pages are not accessible so they don't
377 * need a physical address, but we need to enter
378 * one in the pmap.
379 * Let's make it recognizable and make sure that
380 * we don't use a real physical page with that
381 * physical address.
382 */
383 const ppnum_t vm_page_guard_addr = (ppnum_t) -2;
384
385 /*
386 * Resident page structures are also chained on
387 * queues that are used by the page replacement
388 * system (pageout daemon). These queues are
389 * defined here, but are shared by the pageout
390 * module. The inactive queue is broken into
391 * file backed and anonymous for convenience as the
392 * pageout daemon often assignes a higher
393 * importance to anonymous pages (less likely to pick)
394 */
395 vm_page_queue_head_t vm_page_queue_active VM_PAGE_PACKED_ALIGNED;
396 vm_page_queue_head_t vm_page_queue_inactive VM_PAGE_PACKED_ALIGNED;
397 #if CONFIG_SECLUDED_MEMORY
398 vm_page_queue_head_t vm_page_queue_secluded VM_PAGE_PACKED_ALIGNED;
399 #endif /* CONFIG_SECLUDED_MEMORY */
400 vm_page_queue_head_t vm_page_queue_anonymous VM_PAGE_PACKED_ALIGNED; /* inactive memory queue for anonymous pages */
401 vm_page_queue_head_t vm_page_queue_throttled VM_PAGE_PACKED_ALIGNED;
402
403 queue_head_t vm_objects_wired;
404
405 void vm_update_darkwake_mode(boolean_t);
406
407 #if CONFIG_BACKGROUND_QUEUE
408 vm_page_queue_head_t vm_page_queue_background VM_PAGE_PACKED_ALIGNED;
409 uint32_t vm_page_background_target;
410 uint32_t vm_page_background_target_snapshot;
411 uint32_t vm_page_background_count;
412 uint64_t vm_page_background_promoted_count;
413
414 uint32_t vm_page_background_internal_count;
415 uint32_t vm_page_background_external_count;
416
417 uint32_t vm_page_background_mode;
418 uint32_t vm_page_background_exclude_external;
419 #endif
420
421 unsigned int vm_page_active_count;
422 unsigned int vm_page_inactive_count;
423 unsigned int vm_page_kernelcache_count;
424 #if CONFIG_SECLUDED_MEMORY
425 unsigned int vm_page_secluded_count;
426 unsigned int vm_page_secluded_count_free;
427 unsigned int vm_page_secluded_count_inuse;
428 unsigned int vm_page_secluded_count_over_target;
429 #endif /* CONFIG_SECLUDED_MEMORY */
430 unsigned int vm_page_anonymous_count;
431 unsigned int vm_page_throttled_count;
432 unsigned int vm_page_speculative_count;
433
434 unsigned int vm_page_wire_count;
435 unsigned int vm_page_wire_count_on_boot = 0;
436 unsigned int vm_page_stolen_count = 0;
437 unsigned int vm_page_wire_count_initial;
438 unsigned int vm_page_gobble_count = 0;
439 unsigned int vm_page_kern_lpage_count = 0;
440
441 uint64_t booter_size; /* external so it can be found in core dumps */
442
443 #define VM_PAGE_WIRE_COUNT_WARNING 0
444 #define VM_PAGE_GOBBLE_COUNT_WARNING 0
445
446 unsigned int vm_page_purgeable_count = 0; /* # of pages purgeable now */
447 unsigned int vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
448 uint64_t vm_page_purged_count = 0; /* total count of purged pages */
449
450 unsigned int vm_page_xpmapped_external_count = 0;
451 unsigned int vm_page_external_count = 0;
452 unsigned int vm_page_internal_count = 0;
453 unsigned int vm_page_pageable_external_count = 0;
454 unsigned int vm_page_pageable_internal_count = 0;
455
456 #if DEVELOPMENT || DEBUG
457 unsigned int vm_page_speculative_recreated = 0;
458 unsigned int vm_page_speculative_created = 0;
459 unsigned int vm_page_speculative_used = 0;
460 #endif
461
462 vm_page_queue_head_t vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED;
463
464 unsigned int vm_page_cleaned_count = 0;
465
466 uint64_t max_valid_dma_address = 0xffffffffffffffffULL;
467 ppnum_t max_valid_low_ppnum = PPNUM_MAX;
468
469
470 /*
471 * Several page replacement parameters are also
472 * shared with this module, so that page allocation
473 * (done here in vm_page_alloc) can trigger the
474 * pageout daemon.
475 */
476 unsigned int vm_page_free_target = 0;
477 unsigned int vm_page_free_min = 0;
478 unsigned int vm_page_throttle_limit = 0;
479 unsigned int vm_page_inactive_target = 0;
480 #if CONFIG_SECLUDED_MEMORY
481 unsigned int vm_page_secluded_target = 0;
482 #endif /* CONFIG_SECLUDED_MEMORY */
483 unsigned int vm_page_anonymous_min = 0;
484 unsigned int vm_page_free_reserved = 0;
485
486
487 /*
488 * The VM system has a couple of heuristics for deciding
489 * that pages are "uninteresting" and should be placed
490 * on the inactive queue as likely candidates for replacement.
491 * These variables let the heuristics be controlled at run-time
492 * to make experimentation easier.
493 */
494
495 boolean_t vm_page_deactivate_hint = TRUE;
496
497 struct vm_page_stats_reusable vm_page_stats_reusable;
498
499 /*
500 * vm_set_page_size:
501 *
502 * Sets the page size, perhaps based upon the memory
503 * size. Must be called before any use of page-size
504 * dependent functions.
505 *
506 * Sets page_shift and page_mask from page_size.
507 */
508 void
vm_set_page_size(void)509 vm_set_page_size(void)
510 {
511 page_size = PAGE_SIZE;
512 page_mask = PAGE_MASK;
513 page_shift = PAGE_SHIFT;
514
515 if ((page_mask & page_size) != 0) {
516 panic("vm_set_page_size: page size not a power of two");
517 }
518
519 for (page_shift = 0;; page_shift++) {
520 if ((1U << page_shift) == page_size) {
521 break;
522 }
523 }
524 }
525
526 #if defined (__x86_64__)
527
528 #define MAX_CLUMP_SIZE 16
529 #define DEFAULT_CLUMP_SIZE 4
530
531 unsigned int vm_clump_size, vm_clump_mask, vm_clump_shift, vm_clump_promote_threshold;
532
533 #if DEVELOPMENT || DEBUG
534 unsigned long vm_clump_stats[MAX_CLUMP_SIZE + 1];
535 unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
536
537 static inline void
vm_clump_update_stats(unsigned int c)538 vm_clump_update_stats(unsigned int c)
539 {
540 assert(c <= vm_clump_size);
541 if (c > 0 && c <= vm_clump_size) {
542 vm_clump_stats[c] += c;
543 }
544 vm_clump_allocs += c;
545 }
546 #endif /* if DEVELOPMENT || DEBUG */
547
548 /* Called once to setup the VM clump knobs */
549 static void
vm_page_setup_clump(void)550 vm_page_setup_clump( void )
551 {
552 unsigned int override, n;
553
554 vm_clump_size = DEFAULT_CLUMP_SIZE;
555 if (PE_parse_boot_argn("clump_size", &override, sizeof(override))) {
556 vm_clump_size = override;
557 }
558
559 if (vm_clump_size > MAX_CLUMP_SIZE) {
560 panic("vm_page_setup_clump:: clump_size is too large!");
561 }
562 if (vm_clump_size < 1) {
563 panic("vm_page_setup_clump:: clump_size must be >= 1");
564 }
565 if ((vm_clump_size & (vm_clump_size - 1)) != 0) {
566 panic("vm_page_setup_clump:: clump_size must be a power of 2");
567 }
568
569 vm_clump_promote_threshold = vm_clump_size;
570 vm_clump_mask = vm_clump_size - 1;
571 for (vm_clump_shift = 0, n = vm_clump_size; n > 1; n >>= 1, vm_clump_shift++) {
572 ;
573 }
574
575 #if DEVELOPMENT || DEBUG
576 bzero(vm_clump_stats, sizeof(vm_clump_stats));
577 vm_clump_allocs = vm_clump_inserts = vm_clump_inrange = vm_clump_promotes = 0;
578 #endif /* if DEVELOPMENT || DEBUG */
579 }
580
581 #endif /* #if defined (__x86_64__) */
582
583 #define COLOR_GROUPS_TO_STEAL 4
584
585 /* Called once during statup, once the cache geometry is known.
586 */
587 static void
vm_page_set_colors(void)588 vm_page_set_colors( void )
589 {
590 unsigned int n, override;
591
592 #if defined (__x86_64__)
593 /* adjust #colors because we need to color outside the clump boundary */
594 vm_cache_geometry_colors >>= vm_clump_shift;
595 #endif
596 if (PE_parse_boot_argn("colors", &override, sizeof(override))) { /* colors specified as a boot-arg? */
597 n = override;
598 } else if (vm_cache_geometry_colors) { /* do we know what the cache geometry is? */
599 n = vm_cache_geometry_colors;
600 } else {
601 n = DEFAULT_COLORS; /* use default if all else fails */
602 }
603 if (n == 0) {
604 n = 1;
605 }
606 if (n > MAX_COLORS) {
607 n = MAX_COLORS;
608 }
609
610 /* the count must be a power of 2 */
611 if ((n & (n - 1)) != 0) {
612 n = DEFAULT_COLORS; /* use default if all else fails */
613 }
614 vm_colors = n;
615 vm_color_mask = n - 1;
616
617 vm_free_magazine_refill_limit = vm_colors * COLOR_GROUPS_TO_STEAL;
618
619 #if defined (__x86_64__)
620 /* adjust for reduction in colors due to clumping and multiple cores */
621 if (real_ncpus) {
622 vm_free_magazine_refill_limit *= (vm_clump_size * real_ncpus);
623 }
624 #endif
625 }
626
627 /*
628 * During single threaded early boot we don't initialize all pages.
629 * This avoids some delay during boot. They'll be initialized and
630 * added to the free list as needed or after we are multithreaded by
631 * what becomes the pageout thread.
632 */
633 static boolean_t fill = FALSE;
634 static unsigned int fillval;
635 uint_t vm_delayed_count = 0; /* when non-zero, indicates we may have more pages to init */
636 ppnum_t delay_above_pnum = PPNUM_MAX;
637
638 /*
639 * For x86 first 8 Gig initializes quickly and gives us lots of lowmem + mem above to start off with.
640 * If ARM ever uses delayed page initialization, this value may need to be quite different.
641 */
642 #define DEFAULT_DELAY_ABOVE_PHYS_GB (8)
643
644 /*
645 * When we have to dip into more delayed pages due to low memory, free up
646 * a large chunk to get things back to normal. This avoids contention on the
647 * delayed code allocating page by page.
648 */
649 #define VM_DELAY_PAGE_CHUNK ((1024 * 1024 * 1024) / PAGE_SIZE)
650
651 /*
652 * Get and initialize the next delayed page.
653 */
/*
 * Get and initialize the next delayed page.
 *
 * Pulls the next physical page number from pmap_next_page(), initializes
 * the vm_page structure at the tail of vm_pages[], and undoes the
 * boot-time wired accounting that covered it.  Returns NULL when no
 * delayed pages remain.
 *
 * grab_options: pass VM_PAGE_GRAB_Q_LOCK_HELD if the caller already
 * holds vm_page_queue_lock; otherwise this routine takes and drops
 * that lock itself around the wire-count updates.
 */
static vm_page_t
vm_get_delayed_page(int grab_options)
{
	vm_page_t p;
	ppnum_t   pnum;

	/*
	 * Get a new page if we have one.
	 */
	lck_mtx_lock(&vm_page_queue_free_lock);
	if (vm_delayed_count == 0) {
		lck_mtx_unlock(&vm_page_queue_free_lock);
		return NULL;
	}
	if (!pmap_next_page(&pnum)) {
		/* pmap ran out earlier than the count suggested; mark us done */
		vm_delayed_count = 0;
		lck_mtx_unlock(&vm_page_queue_free_lock);
		return NULL;
	}

	assert(vm_delayed_count > 0);
	--vm_delayed_count;

#if defined(__x86_64__)
	/* x86 cluster code requires increasing phys_page in vm_pages[] */
	if (vm_pages_count > 0) {
		assert(pnum > vm_pages[vm_pages_count - 1].vmp_phys_page);
	}
#endif
	p = &vm_pages[vm_pages_count];
	assert(p < vm_page_array_ending_addr);
	vm_page_init(p, pnum, FALSE);
	++vm_pages_count;
	++vm_page_pages;
	lck_mtx_unlock(&vm_page_queue_free_lock);

	/*
	 * These pages were initially counted as wired, undo that now.
	 */
	if (grab_options & VM_PAGE_GRAB_Q_LOCK_HELD) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
		vm_page_lockspin_queues();
	}
	--vm_page_wire_count;
	--vm_page_wire_count_initial;
	if (vm_page_wire_count_on_boot != 0) {
		--vm_page_wire_count_on_boot;
	}
	if (!(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
		vm_page_unlock_queues();
	}


	if (fill) {
		/* debugging aid: stamp the new page with a known fill pattern */
		fillPage(pnum, fillval);
	}
	return p;
}
714
715 static void vm_page_module_init_delayed(void);
716
717 /*
718 * Free all remaining delayed pages to the free lists.
719 */
/*
 * Free all remaining delayed pages to the free lists.
 *
 * Drains every page still held back by the delayed-initialization
 * scheme, releases them to the free lists (low pages first unless
 * himemory mode), trims the now-unused tail of the vm_pages[] array
 * itself, and finally creates the VM page array zone.
 */
void
vm_free_delayed_pages(void)
{
	vm_page_t   p;
	vm_page_t   list = NULL;
	uint_t      cnt = 0;
	vm_offset_t start_free_va;
	int64_t     free_size;

	while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
		if (vm_himemory_mode) {
			/* himemory mode: release immediately, order doesn't matter */
			vm_page_release(p, FALSE);
		} else {
			/* otherwise chain onto a list so we can reverse the order below */
			p->vmp_snext = list;
			list = p;
		}
		++cnt;
	}

	/*
	 * Free the pages in reverse order if not himemory mode.
	 * Hence the low memory pages will be first on free lists. (LIFO)
	 */
	while (list != NULL) {
		p = list;
		list = p->vmp_snext;
		p->vmp_snext = NULL;
		vm_page_release(p, FALSE);
	}
#if DEVELOPMENT || DEBUG
	kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
#endif

	/*
	 * Free up any unused full pages at the end of the vm_pages[] array
	 */
	start_free_va = round_page((vm_offset_t)&vm_pages[vm_pages_count]);

#if defined(__x86_64__)
	/*
	 * Since x86 might have used large pages for vm_pages[], we can't
	 * free starting in the middle of a partially used large page.
	 */
	if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
		start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
	}
#endif
	if (start_free_va < (vm_offset_t)vm_page_array_ending_addr) {
		free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_va);
		if (free_size > 0) {
			ml_static_mfree(start_free_va, (vm_offset_t)free_size);
			vm_page_array_ending_addr = (void *)start_free_va;

			/*
			 * Note there's no locking here, as only this thread will ever change this value.
			 * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
			 */
			vm_page_stolen_count -= (free_size >> PAGE_SHIFT);

#if DEVELOPMENT || DEBUG
			kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
			    (long)free_size, (long)start_free_va);
#endif
		}
	}


	/*
	 * now we can create the VM page array zone
	 */
	vm_page_module_init_delayed();
}
792
793 /*
794 * Try and free up enough delayed pages to match a contig memory allocation.
795 */
796 static void
vm_free_delayed_pages_contig(uint_t npages,ppnum_t max_pnum,ppnum_t pnum_mask)797 vm_free_delayed_pages_contig(
798 uint_t npages,
799 ppnum_t max_pnum,
800 ppnum_t pnum_mask)
801 {
802 vm_page_t p;
803 ppnum_t pnum;
804 uint_t cnt = 0;
805
806 /*
807 * Treat 0 as the absolute max page number.
808 */
809 if (max_pnum == 0) {
810 max_pnum = PPNUM_MAX;
811 }
812
813 /*
814 * Free till we get a properly aligned start page
815 */
816 for (;;) {
817 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
818 if (p == NULL) {
819 return;
820 }
821 pnum = VM_PAGE_GET_PHYS_PAGE(p);
822 vm_page_release(p, FALSE);
823 if (pnum >= max_pnum) {
824 return;
825 }
826 if ((pnum & pnum_mask) == 0) {
827 break;
828 }
829 }
830
831 /*
832 * Having a healthy pool of free pages will help performance. We don't
833 * want to fall back to the delayed code for every page allocation.
834 */
835 if (vm_page_free_count < VM_DELAY_PAGE_CHUNK) {
836 npages += VM_DELAY_PAGE_CHUNK;
837 }
838
839 /*
840 * Now free up the pages
841 */
842 for (cnt = 1; cnt < npages; ++cnt) {
843 p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
844 if (p == NULL) {
845 return;
846 }
847 vm_page_release(p, FALSE);
848 }
849 }
850
851 #define ROUNDUP_NEXTP2(X) (1U << (32 - __builtin_clz((X) - 1)))
852
853 void
vm_page_init_local_q(unsigned int num_cpus)854 vm_page_init_local_q(unsigned int num_cpus)
855 {
856 struct vpl *t_local_q;
857
858 /*
859 * no point in this for a uni-processor system
860 */
861 if (num_cpus >= 2) {
862 ml_cpu_info_t cpu_info;
863
864 /*
865 * Force the allocation alignment to a cacheline,
866 * because the `vpl` struct has a lock and will be taken
867 * cross CPU so we want to isolate the rest of the per-CPU
868 * data to avoid false sharing due to this lock being taken.
869 */
870
871 ml_cpu_get_info(&cpu_info);
872
873 t_local_q = zalloc_percpu_permanent(sizeof(struct vpl),
874 cpu_info.cache_line_size - 1);
875
876 zpercpu_foreach(lq, t_local_q) {
877 VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
878 vm_page_queue_init(&lq->vpl_queue);
879 }
880
881 /* make the initialization visible to all cores */
882 os_atomic_store(&vm_page_local_q, t_local_q, release);
883 }
884 }
885
886 /*
887 * vm_init_before_launchd
888 *
889 * This should be called right before launchd is loaded.
890 */
891 void
vm_init_before_launchd()892 vm_init_before_launchd()
893 {
894 vm_page_lockspin_queues();
895 vm_page_wire_count_on_boot = vm_page_wire_count;
896 vm_page_unlock_queues();
897 }
898
899
900 /*
901 * vm_page_bootstrap:
902 *
903 * Initializes the resident memory module.
904 *
905 * Allocates memory for the page cells, and
906 * for the object/offset-to-page hash table headers.
907 * Each page cell is initialized and placed on the free list.
908 * Returns the range of available kernel virtual memory.
909 */
910 __startup_func
911 void
vm_page_bootstrap(vm_offset_t * startp,vm_offset_t * endp)912 vm_page_bootstrap(
913 vm_offset_t *startp,
914 vm_offset_t *endp)
915 {
916 unsigned int i;
917 unsigned int log1;
918 unsigned int log2;
919 unsigned int size;
920
921 /*
922 * Initialize the page queues.
923 */
924
925 lck_mtx_init_ext(&vm_page_queue_free_lock, &vm_page_queue_free_lock_ext, &vm_page_lck_grp_free, &vm_page_lck_attr);
926 lck_mtx_init_ext(&vm_page_queue_lock, &vm_page_queue_lock_ext, &vm_page_lck_grp_queue, &vm_page_lck_attr);
927 lck_mtx_init_ext(&vm_purgeable_queue_lock, &vm_purgeable_queue_lock_ext, &vm_page_lck_grp_purge, &vm_page_lck_attr);
928
929 for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
930 int group;
931
932 purgeable_queues[i].token_q_head = 0;
933 purgeable_queues[i].token_q_tail = 0;
934 for (group = 0; group < NUM_VOLATILE_GROUPS; group++) {
935 queue_init(&purgeable_queues[i].objq[group]);
936 }
937
938 purgeable_queues[i].type = i;
939 purgeable_queues[i].new_pages = 0;
940 #if MACH_ASSERT
941 purgeable_queues[i].debug_count_tokens = 0;
942 purgeable_queues[i].debug_count_objects = 0;
943 #endif
944 }
945 ;
946 purgeable_nonvolatile_count = 0;
947 queue_init(&purgeable_nonvolatile_queue);
948
949 for (i = 0; i < MAX_COLORS; i++) {
950 vm_page_queue_init(&vm_page_queue_free[i].qhead);
951 }
952
953 vm_page_queue_init(&vm_lopage_queue_free);
954 vm_page_queue_init(&vm_page_queue_active);
955 vm_page_queue_init(&vm_page_queue_inactive);
956 #if CONFIG_SECLUDED_MEMORY
957 vm_page_queue_init(&vm_page_queue_secluded);
958 #endif /* CONFIG_SECLUDED_MEMORY */
959 vm_page_queue_init(&vm_page_queue_cleaned);
960 vm_page_queue_init(&vm_page_queue_throttled);
961 vm_page_queue_init(&vm_page_queue_anonymous);
962 queue_init(&vm_objects_wired);
963
964 for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
965 vm_page_queue_init(&vm_page_queue_speculative[i].age_q);
966
967 vm_page_queue_speculative[i].age_ts.tv_sec = 0;
968 vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
969 }
970 #if CONFIG_BACKGROUND_QUEUE
971 vm_page_queue_init(&vm_page_queue_background);
972
973 vm_page_background_count = 0;
974 vm_page_background_internal_count = 0;
975 vm_page_background_external_count = 0;
976 vm_page_background_promoted_count = 0;
977
978 vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25);
979
980 if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) {
981 vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX;
982 }
983
984 vm_page_background_mode = VM_PAGE_BG_LEVEL_1;
985 vm_page_background_exclude_external = 0;
986
987 PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode));
988 PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external));
989 PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target));
990
991 if (vm_page_background_mode > VM_PAGE_BG_LEVEL_1) {
992 vm_page_background_mode = VM_PAGE_BG_LEVEL_1;
993 }
994 #endif
995 vm_page_free_wanted = 0;
996 vm_page_free_wanted_privileged = 0;
997 #if CONFIG_SECLUDED_MEMORY
998 vm_page_free_wanted_secluded = 0;
999 #endif /* CONFIG_SECLUDED_MEMORY */
1000
1001 #if defined (__x86_64__)
1002 /* this must be called before vm_page_set_colors() */
1003 vm_page_setup_clump();
1004 #endif
1005
1006 vm_page_set_colors();
1007
1008 bzero(vm_page_inactive_states, sizeof(vm_page_inactive_states));
1009 vm_page_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1010 vm_page_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1011 vm_page_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1012
1013 bzero(vm_page_pageable_states, sizeof(vm_page_pageable_states));
1014 vm_page_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1015 vm_page_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1016 vm_page_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1017 vm_page_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1018 vm_page_pageable_states[VM_PAGE_ON_SPECULATIVE_Q] = 1;
1019 vm_page_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1;
1020 #if CONFIG_SECLUDED_MEMORY
1021 vm_page_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1022 #endif /* CONFIG_SECLUDED_MEMORY */
1023
1024 bzero(vm_page_non_speculative_pageable_states, sizeof(vm_page_non_speculative_pageable_states));
1025 vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1026 vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1027 vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1028 vm_page_non_speculative_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1029 vm_page_non_speculative_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1;
1030 #if CONFIG_SECLUDED_MEMORY
1031 vm_page_non_speculative_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1032 #endif /* CONFIG_SECLUDED_MEMORY */
1033
1034 bzero(vm_page_active_or_inactive_states, sizeof(vm_page_active_or_inactive_states));
1035 vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1036 vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1037 vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1038 vm_page_active_or_inactive_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1039 #if CONFIG_SECLUDED_MEMORY
1040 vm_page_active_or_inactive_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1041 #endif /* CONFIG_SECLUDED_MEMORY */
1042
1043 for (vm_tag_t t = 0; t < VM_KERN_MEMORY_FIRST_DYNAMIC; t++) {
1044 vm_allocation_sites_static[t].refcount = 2;
1045 vm_allocation_sites_static[t].tag = t;
1046 vm_allocation_sites[t] = &vm_allocation_sites_static[t];
1047 }
1048 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].refcount = 2;
1049 vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].tag = VM_KERN_MEMORY_ANY;
1050 vm_allocation_sites[VM_KERN_MEMORY_ANY] = &vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC];
1051
1052 /*
1053 * Steal memory for the map and zone subsystems.
1054 */
1055 kernel_startup_initialize_upto(STARTUP_SUB_PMAP_STEAL);
1056
1057 /*
1058 * Allocate (and initialize) the virtual-to-physical
1059 * table hash buckets.
1060 *
1061 * The number of buckets should be a power of two to
1062 * get a good hash function. The following computation
1063 * chooses the first power of two that is greater
1064 * than the number of physical pages in the system.
1065 */
1066
1067 if (vm_page_bucket_count == 0) {
1068 unsigned int npages = pmap_free_pages();
1069
1070 vm_page_bucket_count = 1;
1071 while (vm_page_bucket_count < npages) {
1072 vm_page_bucket_count <<= 1;
1073 }
1074 }
1075 vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
1076
1077 vm_page_hash_mask = vm_page_bucket_count - 1;
1078
1079 /*
1080 * Calculate object shift value for hashing algorithm:
1081 * O = log2(sizeof(struct vm_object))
1082 * B = log2(vm_page_bucket_count)
1083 * hash shifts the object left by
1084 * B/2 - O
1085 */
1086 size = vm_page_bucket_count;
1087 for (log1 = 0; size > 1; log1++) {
1088 size /= 2;
1089 }
1090 size = sizeof(struct vm_object);
1091 for (log2 = 0; size > 1; log2++) {
1092 size /= 2;
1093 }
1094 vm_page_hash_shift = log1 / 2 - log2 + 1;
1095
1096 vm_page_bucket_hash = 1 << ((log1 + 1) >> 1); /* Get (ceiling of sqrt of table size) */
1097 vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2); /* Get (ceiling of quadroot of table size) */
1098 vm_page_bucket_hash |= 1; /* Set bit and add 1 - always must be 1 to insure unique series */
1099
1100 if (vm_page_hash_mask & vm_page_bucket_count) {
1101 printf("vm_page_bootstrap: WARNING -- strange page hash\n");
1102 }
1103
1104 #if VM_PAGE_BUCKETS_CHECK
1105 #if VM_PAGE_FAKE_BUCKETS
1106 /*
1107 * Allocate a decoy set of page buckets, to detect
1108 * any stomping there.
1109 */
1110 vm_page_fake_buckets = (vm_page_bucket_t *)
1111 pmap_steal_memory(vm_page_bucket_count *
1112 sizeof(vm_page_bucket_t));
1113 vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
1114 vm_page_fake_buckets_end =
1115 vm_map_round_page((vm_page_fake_buckets_start +
1116 (vm_page_bucket_count *
1117 sizeof(vm_page_bucket_t))),
1118 PAGE_MASK);
1119 char *cp;
1120 for (cp = (char *)vm_page_fake_buckets_start;
1121 cp < (char *)vm_page_fake_buckets_end;
1122 cp++) {
1123 *cp = 0x5a;
1124 }
1125 #endif /* VM_PAGE_FAKE_BUCKETS */
1126 #endif /* VM_PAGE_BUCKETS_CHECK */
1127
1128 kernel_debug_string_early("vm_page_buckets");
1129 vm_page_buckets = (vm_page_bucket_t *)
1130 pmap_steal_memory(vm_page_bucket_count *
1131 sizeof(vm_page_bucket_t));
1132
1133 kernel_debug_string_early("vm_page_bucket_locks");
1134 vm_page_bucket_locks = (lck_spin_t *)
1135 pmap_steal_memory(vm_page_bucket_lock_count *
1136 sizeof(lck_spin_t));
1137
1138 for (i = 0; i < vm_page_bucket_count; i++) {
1139 vm_page_bucket_t *bucket = &vm_page_buckets[i];
1140
1141 bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL);
1142 #if MACH_PAGE_HASH_STATS
1143 bucket->cur_count = 0;
1144 bucket->hi_count = 0;
1145 #endif /* MACH_PAGE_HASH_STATS */
1146 }
1147
1148 for (i = 0; i < vm_page_bucket_lock_count; i++) {
1149 lck_spin_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket, &vm_page_lck_attr);
1150 }
1151
1152 vm_tag_init();
1153
1154 #if VM_PAGE_BUCKETS_CHECK
1155 vm_page_buckets_check_ready = TRUE;
1156 #endif /* VM_PAGE_BUCKETS_CHECK */
1157
1158 /*
1159 * Machine-dependent code allocates the resident page table.
1160 * It uses vm_page_init to initialize the page frames.
1161 * The code also returns to us the virtual space available
1162 * to the kernel. We don't trust the pmap module
1163 * to get the alignment right.
1164 */
1165
1166 kernel_debug_string_early("pmap_startup");
1167 pmap_startup(&virtual_space_start, &virtual_space_end);
1168 virtual_space_start = round_page(virtual_space_start);
1169 virtual_space_end = trunc_page(virtual_space_end);
1170
1171 *startp = virtual_space_start;
1172 *endp = virtual_space_end;
1173
1174 /*
1175 * Compute the initial "wire" count.
1176 * Up until now, the pages which have been set aside are not under
1177 * the VM system's control, so although they aren't explicitly
1178 * wired, they nonetheless can't be moved. At this moment,
1179 * all VM managed pages are "free", courtesy of pmap_startup.
1180 */
1181 assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
1182 vm_page_wire_count = ((unsigned int) atop_64(max_mem)) -
1183 vm_page_free_count - vm_lopage_free_count;
1184 #if CONFIG_SECLUDED_MEMORY
1185 vm_page_wire_count -= vm_page_secluded_count;
1186 #endif
1187 vm_page_wire_count_initial = vm_page_wire_count;
1188
1189 /* capture this for later use */
1190 booter_size = ml_get_booter_memory_size();
1191
1192 printf("vm_page_bootstrap: %d free pages, %d wired pages, (up to %d of which are delayed free)\n",
1193 vm_page_free_count, vm_page_wire_count, vm_delayed_count);
1194
1195 kernel_debug_string_early("vm_page_bootstrap complete");
1196 }
1197
1198 #ifndef MACHINE_PAGES
1199 /*
1200 * This is the early boot time allocator for data structures needed to bootstrap the VM system.
1201 * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
1202 * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
1203 */
static void *
pmap_steal_memory_internal(
	vm_size_t size,
	boolean_t might_free)
{
	kern_return_t kr;
	vm_offset_t addr;
	vm_offset_t map_addr;
	ppnum_t phys_page;

	/*
	 * Size needs to be aligned to word size.
	 */
	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

	/*
	 * On the first call, get the initial values for virtual address space
	 * and page align them. (virtual_space_start == virtual_space_end is
	 * used as the "not yet initialized" sentinel.)
	 */
	if (virtual_space_start == virtual_space_end) {
		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
		virtual_space_start = round_page(virtual_space_start);
		virtual_space_end = trunc_page(virtual_space_end);

#if defined(__x86_64__)
		/*
		 * Release remaining unused section of preallocated KVA and the 4K page tables
		 * that map it. This makes the VA available for large page mappings.
		 */
		Idle_PTs_release(virtual_space_start, virtual_space_end);
#endif
	}

	/*
	 * Allocate the virtual space for this request. On x86, we'll align to a large page
	 * address if the size is big enough to back with at least 1 large page.
	 * Note: the VA cursor only ever advances; stolen memory is never returned.
	 */
#if defined(__x86_64__)
	if (size >= I386_LPGBYTES) {
		virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
	}
#endif
	addr = virtual_space_start;
	virtual_space_start += size;

	//kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size);	/* (TEST/DEBUG) */

	/*
	 * Allocate and map physical pages to back the new virtual space.
	 */
	map_addr = round_page(addr);
	while (map_addr < addr + size) {
#if defined(__x86_64__)
		/*
		 * Back with a large page if properly aligned on x86.
		 * All four conditions must hold: alignment, enough remaining
		 * length, a pre-expandable page table, and an available
		 * physically-contiguous large page.
		 */
		if ((map_addr & I386_LPGMASK) == 0 &&
		    map_addr + I386_LPGBYTES <= addr + size &&
		    pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
		    pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
			kr = pmap_enter(kernel_pmap, map_addr, phys_page,
			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
			    VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE);

			if (kr != KERN_SUCCESS) {
				panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
				    (unsigned long)map_addr, phys_page);
			}
			map_addr += I386_LPGBYTES;
			/* account a large page as its equivalent number of base pages */
			vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_kern_lpage_count++;
			continue;
		}
#endif

		/* fall back to a single base page; running out here is fatal at boot */
		if (!pmap_next_page_hi(&phys_page, might_free)) {
			panic("pmap_steal_memory() size: 0x%llx", (uint64_t)size);
		}

#if defined(__x86_64__)
		pmap_pre_expand(kernel_pmap, map_addr);
#endif

		kr = pmap_enter(kernel_pmap, map_addr, phys_page,
		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
		    VM_WIMG_USE_DEFAULT, FALSE);

		if (kr != KERN_SUCCESS) {
			panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
			    (unsigned long)map_addr, phys_page);
		}
		map_addr += PAGE_SIZE;

		/*
		 * Account for newly stolen memory
		 */
		vm_page_wire_count++;
		vm_page_stolen_count++;
	}

#if defined(__x86_64__)
	/*
	 * The call with might_free is currently the last use of pmap_steal_memory*().
	 * Notify the pmap layer to record which high pages were allocated so far.
	 */
	if (might_free) {
		pmap_hi_pages_done();
	}
#endif
#if KASAN
	kasan_notify_address(round_page(addr), size);
#endif
	return (void *) addr;
}
1319
/*
 * Steal boot-time memory that remains allocated for the lifetime of the
 * system (pages are wired and never returned to the free lists).
 */
void *
pmap_steal_memory(
	vm_size_t size)
{
	return pmap_steal_memory_internal(size, FALSE);
}
1326
/*
 * Steal boot-time memory whose backing pages may later be released
 * (might_free = TRUE variant of pmap_steal_memory()).
 */
void *
pmap_steal_freeable_memory(
	vm_size_t size)
{
	return pmap_steal_memory_internal(size, TRUE);
}
1333
1334 #if defined(__arm64__)
1335 /*
1336 * Retire a page at startup.
1337 * These pages will eventually wind up on the retired_pages_object
1338 * in vm_retire_boot_pages().
1339 */
1340 static vm_page_queue_head_t vm_page_queue_retired VM_PAGE_PACKED_ALIGNED;
static void
vm_page_retire_startup(vm_page_t p)
{
	/* mark the page off-queue and permanently unusable */
	p->vmp_q_state = VM_PAGE_NOT_ON_Q;
	p->vmp_error = true;
	p->vmp_unusual = true;
	/* park it on the boot-time retired list (see comment above re: vm_retire_boot_pages()) */
	vm_page_queue_enter(&vm_page_queue_retired, p, vmp_pageq);
	printf("To be retired at boot: page at 0x%llx\n", (long long)ptoa(VM_PAGE_GET_PHYS_PAGE(p)));
}
1350 #endif /* defined(__arm64__) */
1351
#if CONFIG_SECLUDED_MEMORY
/* boot-args to control secluded memory */
unsigned int secluded_mem_mb = 0;       /* # of MBs of RAM to seclude */
int secluded_for_iokit = 1;             /* IOKit can use secluded memory */
int secluded_for_apps = 1;              /* apps can use secluded memory */
int secluded_for_filecache = 2;         /* filecache can use secluded memory */
#if 11
int secluded_for_fbdp = 0;
#endif
/* set in pmap_startup() from the "secluded_shutoff_mb" boot-arg, else max_mem/2 */
uint64_t secluded_shutoff_trigger = 0;
uint64_t secluded_shutoff_headroom = 150 * 1024 * 1024; /* original value from N56 */
#endif /* CONFIG_SECLUDED_MEMORY */
1364
1365
#if defined(__arm__) || defined(__arm64__)
extern void patch_low_glo_vm_page_info(void *, void *, uint32_t);
unsigned int vm_first_phys_ppnum = 0;   /* ppnum backing vm_pages[0]; set once in pmap_startup() */
#endif

void vm_page_release_startup(vm_page_t mem);
/*
 * Size, allocate, initialize and (mostly) release the vm_pages[] array
 * describing all VM-managed physical pages, then return the remaining
 * kernel virtual space to the caller via *startp / *endp.
 */
void
pmap_startup(
	vm_offset_t *startp,
	vm_offset_t *endp)
{
	unsigned int i, npages;
	ppnum_t phys_page;
	uint64_t mem_sz;
	uint64_t start_ns;
	uint64_t now_ns;
	uint_t low_page_count = 0;

#if defined(__LP64__)
	/*
	 * make sure we are aligned on a 64 byte boundary
	 * for VM_PAGE_PACK_PTR (it clips off the low-order
	 * 6 bits of the pointer)
	 */
	if (virtual_space_start != virtual_space_end) {
		virtual_space_start = round_page(virtual_space_start);
	}
#endif

	/*
	 * We calculate how many page frames we will have
	 * and then allocate the page structures in one chunk.
	 *
	 * Note that the calculation here doesn't take into account
	 * the memory needed to map what's being allocated, i.e. the page
	 * table entries. So the actual number of pages we get will be
	 * less than this. To do someday: include that in the computation.
	 *
	 * Also for ARM, we don't use the count of free_pages, but rather the
	 * range from last page to first page (ignore holes due to retired pages).
	 */
#if defined(__arm__) || defined(__arm64__)
	mem_sz = pmap_free_pages_span() * (uint64_t)PAGE_SIZE;
#else /* defined(__arm__) || defined(__arm64__) */
	mem_sz = pmap_free_pages() * (uint64_t)PAGE_SIZE;
#endif /* defined(__arm__) || defined(__arm64__) */
	mem_sz += round_page(virtual_space_start) - virtual_space_start;        /* Account for any slop */
	npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages)));    /* scaled to include the vm_page_ts */

	vm_pages = (vm_page_t) pmap_steal_freeable_memory(npages * sizeof *vm_pages);

	/*
	 * Check if we want to initialize pages to a known value
	 */
	if (PE_parse_boot_argn("fill", &fillval, sizeof(fillval))) {
		fill = TRUE;
	}
#if DEBUG
	/* This slows down booting the DEBUG kernel, particularly on
	 * large memory systems, but is worthwhile in deterministically
	 * trapping uninitialized memory usage.
	 */
	if (!fill) {
		fill = TRUE;
		fillval = 0xDEB8F177;
	}
#endif
	if (fill) {
		kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Figure out how much secluded memory to have before we start
	 * release pages to free lists.
	 * The default, if specified nowhere else, is no secluded mem.
	 */
	secluded_mem_mb = 0;
	if (max_mem > 1 * 1024 * 1024 * 1024) {
		/* default to 90MB for devices with > 1GB of RAM */
		secluded_mem_mb = 90;
	}
	/* override with value from device tree, if provided */
	PE_get_default("kern.secluded_mem_mb",
	    &secluded_mem_mb, sizeof(secluded_mem_mb));
	/* override with value from boot-args, if provided */
	PE_parse_boot_argn("secluded_mem_mb",
	    &secluded_mem_mb,
	    sizeof(secluded_mem_mb));

	vm_page_secluded_target = (unsigned int)
	    ((secluded_mem_mb * 1024ULL * 1024ULL) / PAGE_SIZE);
	PE_parse_boot_argn("secluded_for_iokit",
	    &secluded_for_iokit,
	    sizeof(secluded_for_iokit));
	PE_parse_boot_argn("secluded_for_apps",
	    &secluded_for_apps,
	    sizeof(secluded_for_apps));
	PE_parse_boot_argn("secluded_for_filecache",
	    &secluded_for_filecache,
	    sizeof(secluded_for_filecache));
#if 11
	PE_parse_boot_argn("secluded_for_fbdp",
	    &secluded_for_fbdp,
	    sizeof(secluded_for_fbdp));
#endif

	/*
	 * Allow a really large app to effectively use secluded memory until it exits.
	 */
	if (vm_page_secluded_target != 0) {
		/*
		 * Get an amount from boot-args, else use 1/2 of max_mem.
		 * 1/2 max_mem was chosen from a Peace daemon tentpole test which
		 * used munch to induce jetsam thrashing of false idle daemons on N56.
		 */
		int secluded_shutoff_mb;
		if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
		    sizeof(secluded_shutoff_mb))) {
			secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
		} else {
			secluded_shutoff_trigger = max_mem / 2;
		}

		/* ensure the headroom value is sensible and avoid underflows */
		assert(secluded_shutoff_trigger == 0 || secluded_shutoff_trigger > secluded_shutoff_headroom);
	}

#endif /* CONFIG_SECLUDED_MEMORY */

#if defined(__x86_64__)

	/*
	 * Decide how much memory we delay freeing at boot time.
	 */
	uint32_t delay_above_gb;
	if (!PE_parse_boot_argn("delay_above_gb", &delay_above_gb, sizeof(delay_above_gb))) {
		delay_above_gb = DEFAULT_DELAY_ABOVE_PHYS_GB;
	}

	if (delay_above_gb == 0) {
		/* 0 disables delayed freeing entirely */
		delay_above_pnum = PPNUM_MAX;
	} else {
		delay_above_pnum = delay_above_gb * (1024 * 1024 * 1024 / PAGE_SIZE);
	}

	/* make sure we have sane breathing room: 1G above low memory */
	if (delay_above_pnum <= max_valid_low_ppnum) {
		delay_above_pnum = max_valid_low_ppnum + ((1024 * 1024 * 1024) >> PAGE_SHIFT);
	}

	if (delay_above_pnum < PPNUM_MAX) {
		printf("pmap_startup() delaying init/free of page nums > 0x%x\n", delay_above_pnum);
	}

#endif /* defined(__x86_64__) */

	/*
	 * Initialize and release the page frames.
	 */
	kernel_debug_string_early("page_frame_init");

	vm_page_array_beginning_addr = &vm_pages[0];
	vm_page_array_ending_addr = &vm_pages[npages];  /* used by ptr packing/unpacking code */
#if VM_PAGE_PACKED_FROM_ARRAY
	if (npages >= VM_PAGE_PACKED_FROM_ARRAY) {
		panic("pmap_startup(): too many pages to support vm_page packing");
	}
#endif

	vm_delayed_count = 0;
#if defined(__arm64__)
	vm_page_queue_init(&vm_page_queue_retired);
#endif /* defined(__arm64__) */

	absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
	vm_pages_count = 0;
	for (i = 0; i < npages; i++) {
		/* Did we run out of pages? */
		if (!pmap_next_page(&phys_page)) {
			break;
		}

		if (phys_page < max_valid_low_ppnum) {
			++low_page_count;
		}

		/* Are we at high enough pages to delay the rest? */
		if (low_page_count > vm_lopage_free_limit && phys_page > delay_above_pnum) {
			vm_delayed_count = pmap_free_pages();
			break;
		}

#if defined(__arm__) || defined(__arm64__)
		if (i == 0) {
			/* remember the base ppnum; vm_pages[i] maps ppnum (i + vm_first_phys_ppnum) */
			vm_first_phys_ppnum = phys_page;
			patch_low_glo_vm_page_info((void *)vm_page_array_beginning_addr,
			    (void *)vm_page_array_ending_addr, vm_first_phys_ppnum);
#if defined(__arm64__)
		} else {
			/*
			 * pmap_next_page() may skip over pages reported bad by iboot.
			 * Fill the gap with initialized-but-retired vm_page_ts so
			 * index i stays equal to (ppnum - vm_first_phys_ppnum).
			 */
			while (i < phys_page - vm_first_phys_ppnum && i < npages) {
				++vm_pages_count;
				vm_page_init(&vm_pages[i], i + vm_first_phys_ppnum, FALSE);
				vm_page_retire_startup(&vm_pages[i]);
				++i;
			}
			if (i >= npages) {
				break;
			}
			assert(i == phys_page - vm_first_phys_ppnum);
#endif /* defined(__arm64__) */
		}
#endif /* defined(__arm__) || defined(__arm64__) */

#if defined(__x86_64__)
		/* The x86 clump freeing code requires increasing ppn's to work correctly */
		if (i > 0) {
			assert(phys_page > vm_pages[i - 1].vmp_phys_page);
		}
#endif
		++vm_pages_count;
		vm_page_init(&vm_pages[i], phys_page, FALSE);
		if (fill) {
			fillPage(phys_page, fillval);
		}
		if (vm_himemory_mode) {
			vm_page_release_startup(&vm_pages[i]);
		}
	}
	vm_page_pages = vm_pages_count;         /* used to report to user space */

	/*
	 * In !vm_himemory_mode, pages were not released above; release them
	 * now in reverse order (highest ppnum first) so low pages end up at
	 * the head of the free lists.
	 */
	if (!vm_himemory_mode) {
		do {
			if (!vm_pages[--i].vmp_error) {  /* skip retired pages */
				vm_page_release_startup(&vm_pages[i]);
			}
		} while (i != 0);
	}

	absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns);
	printf("pmap_startup() init/release time: %lld microsec\n", (now_ns - start_ns) / NSEC_PER_USEC);
	printf("pmap_startup() delayed init/release of %d pages\n", vm_delayed_count);

#if defined(__LP64__)
	/* sanity-check pointer packing against the first and last array entries */
	if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[0]))) != &vm_pages[0]) {
		panic("VM_PAGE_PACK_PTR failed on &vm_pages[0] - %p", (void *)&vm_pages[0]);
	}

	if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[vm_pages_count - 1]))) != &vm_pages[vm_pages_count - 1]) {
		panic("VM_PAGE_PACK_PTR failed on &vm_pages[vm_pages_count-1] - %p", (void *)&vm_pages[vm_pages_count - 1]);
	}
#endif

	VM_CHECK_MEMORYSTATUS;

	/*
	 * We have to re-align virtual_space_start,
	 * because pmap_steal_memory has been using it.
	 */
	virtual_space_start = round_page(virtual_space_start);
	*startp = virtual_space_start;
	*endp = virtual_space_end;
}
1632 #endif /* MACHINE_PAGES */
1633
1634 /*
1635 * Create the zone that represents the vm_pages[] array. Nothing ever allocates
1636 * or frees to this zone. It's just here for reporting purposes via zprint command.
1637 * This needs to be done after all initially delayed pages are put on the free lists.
1638 */
static void
vm_page_module_init_delayed(void)
{
	(void)zone_create_ext("vm pages array", sizeof(struct vm_page),
	    ZC_NOGZALLOC, ZONE_ID_VM_PAGES, ^(zone_t z) {
		uint64_t vm_page_zone_pages, vm_page_array_zone_data_size;

		zone_set_exhaustible(z, 0);
		/*
		 * Reflect size and usage information for vm_pages[].
		 */

		/* capacity = full array span; "free" = slots past vm_pages_count */
		z->z_elems_avail = (uint32_t)(vm_page_array_ending_addr - vm_pages);
		z->z_elems_free = z->z_elems_avail - vm_pages_count;
		zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
		vm_pages_count * sizeof(struct vm_page);
		vm_page_array_zone_data_size = (uint64_t)vm_page_array_ending_addr - (uint64_t)vm_pages;
		vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
		z->z_wired_cur += vm_page_zone_pages;
		z->z_wired_hwm = z->z_wired_cur;
		z->z_va_cur = z->z_wired_cur;
		/* since zone accounts for these, take them out of stolen */
		VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
	});
}
1664
1665 /*
1666 * Create the vm_pages zone. This is used for the vm_page structures for the pages
1667 * that are scavanged from other boot time usages by ml_static_mfree(). As such,
1668 * this needs to happen in early VM bootstrap.
1669 */
1670
__startup_func
static void
vm_page_module_init(void)
{
	vm_size_t vm_page_with_ppnum_size;

	/*
	 * Since the pointers to elements in this zone will be packed, they
	 * must have appropriate size. Not strictly what sizeof() reports.
	 */
	vm_page_with_ppnum_size =
	    (sizeof(struct vm_page_with_ppnum) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
	    ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);

	vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
	    ZC_NOGZALLOC | ZC_ALIGNMENT_REQUIRED | ZC_VM_LP64 | ZC_NOTBITAG,
	    ZONE_ID_ANY, ^(zone_t z) {
		/*
		 * The number "10" is a small number that is larger than the number
		 * of fictitious pages that any single caller will attempt to allocate
		 * without blocking.
		 *
		 * The largest such number at the moment is kmem_alloc()
		 * when 2 guard pages are asked. 10 is simply a somewhat larger number,
		 * taking into account the 50% hysteresis the zone allocator uses.
		 *
		 * Note: this works at all because the zone allocator
		 * doesn't ever allocate fictitious pages.
		 */
		z->z_elems_rsv = 10;
	});
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
1704
1705 /*
1706 * Routine: vm_page_create
1707 * Purpose:
1708 * After the VM system is up, machine-dependent code
1709 * may stumble across more physical memory. For example,
1710 * memory that it was reserving for a frame buffer.
1711 * vm_page_create turns this memory into available pages.
1712 */
1713
1714 void
vm_page_create(ppnum_t start,ppnum_t end)1715 vm_page_create(
1716 ppnum_t start,
1717 ppnum_t end)
1718 {
1719 ppnum_t phys_page;
1720 vm_page_t m;
1721
1722 for (phys_page = start;
1723 phys_page < end;
1724 phys_page++) {
1725 m = vm_page_grab_fictitious_common(phys_page, TRUE);
1726 m->vmp_fictitious = FALSE;
1727 pmap_clear_noencrypt(phys_page);
1728
1729 lck_mtx_lock(&vm_page_queue_free_lock);
1730 vm_page_pages++;
1731 lck_mtx_unlock(&vm_page_queue_free_lock);
1732 vm_page_release(m, FALSE);
1733 }
1734 }
1735
1736 #if defined(__arm64__)
1737 /*
1738 * Like vm_page_create(), except we want to immediately retire the page,
1739 * not put it on the free list.
1740 */
void
vm_page_create_retired(
	ppnum_t phys_page)
{
	vm_page_t m;

	/* materialize a real page structure for this ppnum */
	m = vm_page_grab_fictitious_common(phys_page, TRUE);
	m->vmp_fictitious = FALSE;
	pmap_clear_noencrypt(phys_page);
	/* mark it permanently bad/unusual so it is never handed out */
	m->vmp_error = true;
	m->vmp_unusual = true;
	/* wire it so the pageout machinery leaves it alone */
	vm_page_lock_queues();
	m->vmp_q_state = VM_PAGE_IS_WIRED;
	m->vmp_wire_count++;
	vm_page_unlock_queues();

	lck_mtx_lock(&vm_page_queue_free_lock);
	vm_page_pages++;
	lck_mtx_unlock(&vm_page_queue_free_lock);

	/* park the page on the retired-pages object, keyed by its physical address */
	vm_object_lock(retired_pages_object);
	vm_page_insert_wired(m, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(m)), VM_KERN_MEMORY_RETIRED);
	vm_object_unlock(retired_pages_object);
	pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(m));
}
1766 #endif /* defined(__arm64__) */
1767
1768 /*
1769 * vm_page_hash:
1770 *
1771 * Distributes the object/offset key pair among hash buckets.
1772 *
1773 * NOTE: The bucket count must be a power of 2
1774 */
1775 #define vm_page_hash(object, offset) (\
1776 ( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
1777 & vm_page_hash_mask)
1778
1779
1780 /*
1781 * vm_page_insert: [ internal use only ]
1782 *
1783 * Inserts the given mem entry into the object/object-page
1784 * table and object list.
1785 *
1786 * The object must be locked.
1787 */
void
vm_page_insert(
	vm_page_t mem,
	vm_object_t object,
	vm_object_offset_t offset)
{
	/*
	 * Default insertion: no wire tag, queues lock not held,
	 * insert into the page hash, no pmap/accounting batching,
	 * no delayed ledger update.
	 */
	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, FALSE, FALSE, NULL);
}
1796
void
vm_page_insert_wired(
	vm_page_t mem,
	vm_object_t object,
	vm_object_offset_t offset,
	vm_tag_t tag)
{
	/* same defaults as vm_page_insert(), but attributes the wiring to "tag" */
	vm_page_insert_internal(mem, object, offset, tag, FALSE, TRUE, FALSE, FALSE, NULL);
}
1806
1807 void
vm_page_insert_internal(vm_page_t mem,vm_object_t object,vm_object_offset_t offset,vm_tag_t tag,boolean_t queues_lock_held,boolean_t insert_in_hash,boolean_t batch_pmap_op,boolean_t batch_accounting,uint64_t * delayed_ledger_update)1808 vm_page_insert_internal(
1809 vm_page_t mem,
1810 vm_object_t object,
1811 vm_object_offset_t offset,
1812 vm_tag_t tag,
1813 boolean_t queues_lock_held,
1814 boolean_t insert_in_hash,
1815 boolean_t batch_pmap_op,
1816 boolean_t batch_accounting,
1817 uint64_t *delayed_ledger_update)
1818 {
1819 vm_page_bucket_t *bucket;
1820 lck_spin_t *bucket_lock;
1821 int hash_id;
1822 task_t owner;
1823 int ledger_idx_volatile;
1824 int ledger_idx_nonvolatile;
1825 int ledger_idx_volatile_compressed;
1826 int ledger_idx_nonvolatile_compressed;
1827 boolean_t do_footprint;
1828
1829 #if 0
1830 /*
1831 * we may not hold the page queue lock
1832 * so this check isn't safe to make
1833 */
1834 VM_PAGE_CHECK(mem);
1835 #endif
1836
1837 assertf(page_aligned(offset), "0x%llx\n", offset);
1838
1839 assert(!VM_PAGE_WIRED(mem) || mem->vmp_private || mem->vmp_fictitious || (tag != VM_KERN_MEMORY_NONE));
1840
1841 vm_object_lock_assert_exclusive(object);
1842 LCK_MTX_ASSERT(&vm_page_queue_lock,
1843 queues_lock_held ? LCK_MTX_ASSERT_OWNED
1844 : LCK_MTX_ASSERT_NOTOWNED);
1845
1846 if (queues_lock_held == FALSE) {
1847 assert(!VM_PAGE_PAGEABLE(mem));
1848 }
1849
1850 if (insert_in_hash == TRUE) {
1851 #if DEBUG || VM_PAGE_BUCKETS_CHECK
1852 if (mem->vmp_tabled || mem->vmp_object) {
1853 panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
1854 "already in (obj=%p,off=0x%llx)",
1855 mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
1856 }
1857 #endif
1858 if (object->internal && (offset >= object->vo_size)) {
1859 panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
1860 mem, object, offset, object->vo_size);
1861 }
1862
1863 assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);
1864
1865 /*
1866 * Record the object/offset pair in this page
1867 */
1868
1869 mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
1870 mem->vmp_offset = offset;
1871
1872 #if CONFIG_SECLUDED_MEMORY
1873 if (object->eligible_for_secluded) {
1874 vm_page_secluded.eligible_for_secluded++;
1875 }
1876 #endif /* CONFIG_SECLUDED_MEMORY */
1877
1878 /*
1879 * Insert it into the object_object/offset hash table
1880 */
1881 hash_id = vm_page_hash(object, offset);
1882 bucket = &vm_page_buckets[hash_id];
1883 bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
1884
1885 lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);
1886
1887 mem->vmp_next_m = bucket->page_list;
1888 bucket->page_list = VM_PAGE_PACK_PTR(mem);
1889 assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));
1890
1891 #if MACH_PAGE_HASH_STATS
1892 if (++bucket->cur_count > bucket->hi_count) {
1893 bucket->hi_count = bucket->cur_count;
1894 }
1895 #endif /* MACH_PAGE_HASH_STATS */
1896 mem->vmp_hashed = TRUE;
1897 lck_spin_unlock(bucket_lock);
1898 }
1899
1900 {
1901 unsigned int cache_attr;
1902
1903 cache_attr = object->wimg_bits & VM_WIMG_MASK;
1904
1905 if (cache_attr != VM_WIMG_USE_DEFAULT) {
1906 PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
1907 }
1908 }
1909 /*
1910 * Now link into the object's list of backed pages.
1911 */
1912 vm_page_queue_enter(&object->memq, mem, vmp_listq);
1913 object->memq_hint = mem;
1914 mem->vmp_tabled = TRUE;
1915
1916 /*
1917 * Show that the object has one more resident page.
1918 */
1919
1920 object->resident_page_count++;
1921 if (VM_PAGE_WIRED(mem)) {
1922 assert(mem->vmp_wire_count > 0);
1923 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
1924 VM_OBJECT_WIRED_PAGE_ADD(object, mem);
1925 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
1926 }
1927 assert(object->resident_page_count >= object->wired_page_count);
1928
1929 #if DEVELOPMENT || DEBUG
1930 if (object->object_is_shared_cache &&
1931 object->pager != NULL &&
1932 object->pager->mo_pager_ops == &shared_region_pager_ops) {
1933 int new, old;
1934 assert(!object->internal);
1935 new = OSAddAtomic(+1, &shared_region_pagers_resident_count);
1936 do {
1937 old = shared_region_pagers_resident_peak;
1938 } while (old < new &&
1939 !OSCompareAndSwap(old, new, &shared_region_pagers_resident_peak));
1940 }
1941 #endif /* DEVELOPMENT || DEBUG */
1942
1943 if (batch_accounting == FALSE) {
1944 if (object->internal) {
1945 OSAddAtomic(1, &vm_page_internal_count);
1946 } else {
1947 OSAddAtomic(1, &vm_page_external_count);
1948 }
1949 }
1950
1951 /*
1952 * It wouldn't make sense to insert a "reusable" page in
1953 * an object (the page would have been marked "reusable" only
1954 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
1955 * in the object at that time).
1956 * But a page could be inserted in a "all_reusable" object, if
1957 * something faults it in (a vm_read() from another task or a
1958 * "use-after-free" issue in user space, for example). It can
1959 * also happen if we're relocating a page from that object to
1960 * a different physical page during a physically-contiguous
1961 * allocation.
1962 */
1963 assert(!mem->vmp_reusable);
1964 if (object->all_reusable) {
1965 OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
1966 }
1967
1968 if (object->purgable == VM_PURGABLE_DENY &&
1969 !object->vo_ledger_tag) {
1970 owner = TASK_NULL;
1971 } else {
1972 owner = VM_OBJECT_OWNER(object);
1973 vm_object_ledger_tag_ledgers(object,
1974 &ledger_idx_volatile,
1975 &ledger_idx_nonvolatile,
1976 &ledger_idx_volatile_compressed,
1977 &ledger_idx_nonvolatile_compressed,
1978 &do_footprint);
1979 }
1980 if (owner &&
1981 (object->purgable == VM_PURGABLE_NONVOLATILE ||
1982 object->purgable == VM_PURGABLE_DENY ||
1983 VM_PAGE_WIRED(mem))) {
1984 if (delayed_ledger_update) {
1985 *delayed_ledger_update += PAGE_SIZE;
1986 } else {
1987 /* more non-volatile bytes */
1988 ledger_credit(owner->ledger,
1989 ledger_idx_nonvolatile,
1990 PAGE_SIZE);
1991 if (do_footprint) {
1992 /* more footprint */
1993 ledger_credit(owner->ledger,
1994 task_ledgers.phys_footprint,
1995 PAGE_SIZE);
1996 }
1997 }
1998 } else if (owner &&
1999 (object->purgable == VM_PURGABLE_VOLATILE ||
2000 object->purgable == VM_PURGABLE_EMPTY)) {
2001 assert(!VM_PAGE_WIRED(mem));
2002 /* more volatile bytes */
2003 ledger_credit(owner->ledger,
2004 ledger_idx_volatile,
2005 PAGE_SIZE);
2006 }
2007
2008 if (object->purgable == VM_PURGABLE_VOLATILE) {
2009 if (VM_PAGE_WIRED(mem)) {
2010 OSAddAtomic(+1, &vm_page_purgeable_wired_count);
2011 } else {
2012 OSAddAtomic(+1, &vm_page_purgeable_count);
2013 }
2014 } else if (object->purgable == VM_PURGABLE_EMPTY &&
2015 mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
2016 /*
2017 * This page belongs to a purged VM object but hasn't
2018 * been purged (because it was "busy").
2019 * It's in the "throttled" queue and hence not
2020 * visible to vm_pageout_scan(). Move it to a pageable
2021 * queue, so that it can eventually be reclaimed, instead
2022 * of lingering in the "empty" object.
2023 */
2024 if (queues_lock_held == FALSE) {
2025 vm_page_lockspin_queues();
2026 }
2027 vm_page_deactivate(mem);
2028 if (queues_lock_held == FALSE) {
2029 vm_page_unlock_queues();
2030 }
2031 }
2032
2033 #if VM_OBJECT_TRACKING_OP_MODIFIED
2034 if (vm_object_tracking_btlog &&
2035 object->internal &&
2036 object->resident_page_count == 0 &&
2037 object->pager == NULL &&
2038 object->shadow != NULL &&
2039 object->shadow->copy == object) {
2040 btlog_record(vm_object_tracking_btlog, object,
2041 VM_OBJECT_TRACKING_OP_MODIFIED,
2042 btref_get(__builtin_frame_address(0), 0));
2043 }
2044 #endif /* VM_OBJECT_TRACKING_OP_MODIFIED */
2045 }
2046
2047 /*
2048 * vm_page_replace:
2049 *
2050 * Exactly like vm_page_insert, except that we first
2051 * remove any existing page at the given offset in object.
2052 *
2053 * The object must be locked.
2054 */
void
vm_page_replace(
	vm_page_t		mem,
	vm_object_t		object,
	vm_object_offset_t	offset)
{
	vm_page_bucket_t *bucket;
	vm_page_t	 found_m = VM_PAGE_NULL;	/* pre-existing page at (object, offset), if any */
	lck_spin_t	*bucket_lock;
	int		hash_id;

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
	vm_object_lock_assert_exclusive(object);
#if DEBUG || VM_PAGE_BUCKETS_CHECK
	if (mem->vmp_tabled || mem->vmp_object) {
		panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
		    "already in (obj=%p,off=0x%llx)",
		    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
	}
#endif
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	assert(!VM_PAGE_PAGEABLE(mem));

	/*
	 * Record the object/offset pair in this page
	 */
	mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
	mem->vmp_offset = offset;

	/*
	 * Insert it into the object_object/offset hash table,
	 * replacing any page that might have been there.
	 */

	hash_id = vm_page_hash(object, offset);
	bucket = &vm_page_buckets[hash_id];
	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

	lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

	if (bucket->page_list) {
		vm_page_packed_t *mp = &bucket->page_list;
		vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp));

		/*
		 * Walk the bucket's singly-linked chain; "mp" always points at
		 * the packed link that references "m", so unlinking is a single
		 * store through *mp.
		 */
		do {
			/*
			 * compare packed object pointers
			 */
			if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
				/*
				 * Remove old page from hash list
				 */
				*mp = m->vmp_next_m;
				m->vmp_hashed = FALSE;
				m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);

				found_m = m;
				break;
			}
			mp = &m->vmp_next_m;
		} while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));

		mem->vmp_next_m = bucket->page_list;
	} else {
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
	}
	/*
	 * insert new page at head of hash list
	 */
	bucket->page_list = VM_PAGE_PACK_PTR(mem);
	mem->vmp_hashed = TRUE;

	lck_spin_unlock(bucket_lock);

	if (found_m) {
		/*
		 * there was already a page at the specified
		 * offset for this object... remove it from
		 * the object and free it back to the free list
		 */
		vm_page_free_unlocked(found_m, FALSE);
	}
	/*
	 * Finish tabling the new page.  The FALSE arguments presumably skip
	 * the hash insertion (already done above, cf. the insert_in_hash
	 * path in vm_page_insert_internal), queue-lock / batching options —
	 * TODO(review): confirm against vm_page_insert_internal's prototype.
	 */
	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, FALSE, FALSE, FALSE, NULL);
}
2146
2147 /*
2148 * vm_page_remove: [ internal use only ]
2149 *
2150 * Removes the given mem entry from the object/offset-page
2151 * table and the object page list.
2152 *
2153 * The object must be locked.
2154 */
2155
void
vm_page_remove(
	vm_page_t	mem,
	boolean_t	remove_from_hash)
{
	vm_page_bucket_t *bucket;
	vm_page_t	this;
	lck_spin_t	*bucket_lock;
	int		hash_id;
	task_t		owner;
	vm_object_t	m_object;
	int		ledger_idx_volatile;
	int		ledger_idx_nonvolatile;
	int		ledger_idx_volatile_compressed;
	int		ledger_idx_nonvolatile_compressed;
	int		do_footprint;

	m_object = VM_PAGE_OBJECT(mem);

	vm_object_lock_assert_exclusive(m_object);
	assert(mem->vmp_tabled);
	assert(!mem->vmp_cleaning);
	assert(!mem->vmp_laundry);

	if (VM_PAGE_PAGEABLE(mem)) {
		/* pageable pages may only be removed with the queues locked */
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	}
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
	if (remove_from_hash == TRUE) {
		/*
		 * Remove from the object_object/offset hash table
		 */
		hash_id = vm_page_hash(m_object, mem->vmp_offset);
		bucket = &vm_page_buckets[hash_id];
		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

		if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
			/* optimize for common case */

			bucket->page_list = mem->vmp_next_m;
		} else {
			vm_page_packed_t *prev;

			/* walk the chain until "this" is the page being removed */
			for (prev = &this->vmp_next_m;
			    (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
			    prev = &this->vmp_next_m) {
				continue;
			}
			*prev = this->vmp_next_m;
		}
#if MACH_PAGE_HASH_STATS
		bucket->cur_count--;
#endif /* MACH_PAGE_HASH_STATS */
		mem->vmp_hashed = FALSE;
		/* "this" == mem here; clear its stale next link */
		this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
		lck_spin_unlock(bucket_lock);
	}
	/*
	 * Now remove from the object's list of backed pages.
	 */

	vm_page_remove_internal(mem);

	/*
	 * And show that the object has one fewer resident
	 * page.
	 */

	assert(m_object->resident_page_count > 0);
	m_object->resident_page_count--;

#if DEVELOPMENT || DEBUG
	if (m_object->object_is_shared_cache &&
	    m_object->pager != NULL &&
	    m_object->pager->mo_pager_ops == &shared_region_pager_ops) {
		assert(!m_object->internal);
		OSAddAtomic(-1, &shared_region_pagers_resident_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (m_object->internal) {
#if DEBUG
		assert(vm_page_internal_count);
#endif /* DEBUG */

		OSAddAtomic(-1, &vm_page_internal_count);
	} else {
		assert(vm_page_external_count);
		OSAddAtomic(-1, &vm_page_external_count);

		if (mem->vmp_xpmapped) {
			assert(vm_page_xpmapped_external_count);
			OSAddAtomic(-1, &vm_page_xpmapped_external_count);
		}
	}
	/*
	 * If this was the last resident page of a cached external object,
	 * drop the object from the object cache.  The cached_list checks
	 * presumably distinguish objects actually on the cache list —
	 * TODO(review): confirm against vm_object_cache_remove().
	 */
	if (!m_object->internal &&
	    m_object->cached_list.next &&
	    m_object->cached_list.prev) {
		if (m_object->resident_page_count == 0) {
			vm_object_cache_remove(m_object);
		}
	}

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);
		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
	}
	assert(m_object->resident_page_count >=
	    m_object->wired_page_count);
	if (mem->vmp_reusable) {
		/* per-page reusable accounting */
		assert(m_object->reusable_page_count > 0);
		m_object->reusable_page_count--;
		assert(m_object->reusable_page_count <=
		    m_object->resident_page_count);
		mem->vmp_reusable = FALSE;
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	} else if (m_object->all_reusable) {
		/* object-wide reusable: every resident page counts as reusable */
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	}

	/*
	 * Ledger accounting: mirror of the credits taken when the page was
	 * inserted (see vm_page_insert_internal).
	 */
	if (m_object->purgable == VM_PURGABLE_DENY &&
	    !m_object->vo_ledger_tag) {
		owner = TASK_NULL;
	} else {
		owner = VM_OBJECT_OWNER(m_object);
		vm_object_ledger_tag_ledgers(m_object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);
	}
	if (owner &&
	    (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
	    m_object->purgable == VM_PURGABLE_DENY ||
	    VM_PAGE_WIRED(mem))) {
		/* less non-volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_nonvolatile,
		    PAGE_SIZE);
		if (do_footprint) {
			/* less footprint */
			ledger_debit(owner->ledger,
			    task_ledgers.phys_footprint,
			    PAGE_SIZE);
		}
	} else if (owner &&
	    (m_object->purgable == VM_PURGABLE_VOLATILE ||
	    m_object->purgable == VM_PURGABLE_EMPTY)) {
		assert(!VM_PAGE_WIRED(mem));
		/* less volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_volatile,
		    PAGE_SIZE);
	}
	if (m_object->purgable == VM_PURGABLE_VOLATILE) {
		if (VM_PAGE_WIRED(mem)) {
			assert(vm_page_purgeable_wired_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
		} else {
			assert(vm_page_purgeable_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_count);
		}
	}

	if (m_object->set_cache_attr == TRUE) {
		/* reset per-page cache attributes that tracked the object's */
		pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
	}

	/* the page no longer belongs to any object */
	mem->vmp_tabled = FALSE;
	mem->vmp_object = 0;
	mem->vmp_offset = (vm_object_offset_t) -1;
}
2341
2342
2343 /*
2344 * vm_page_lookup:
2345 *
2346 * Returns the page associated with the object/offset
2347 * pair specified; if none is found, VM_PAGE_NULL is returned.
2348 *
2349 * The object must be locked. No side effects.
2350 */
2351
2352 #define VM_PAGE_HASH_LOOKUP_THRESHOLD 10
2353
2354 #if DEBUG_VM_PAGE_LOOKUP
2355
/*
 * Counters describing which path vm_page_lookup() resolved through;
 * only compiled in when DEBUG_VM_PAGE_LOOKUP is set.
 */
struct {
	uint64_t vpl_total;          /* total vm_page_lookup() calls */
	uint64_t vpl_empty_obj;      /* object had no resident pages */
	uint64_t vpl_bucket_NULL;    /* hash bucket empty, lock-free bail-out */
	uint64_t vpl_hit_hint;       /* memq_hint matched directly */
	uint64_t vpl_hit_hint_next;  /* page after the hint matched */
	uint64_t vpl_hit_hint_prev;  /* page before the hint matched */
	uint64_t vpl_fast;           /* short memq walk used (no bucket lock) */
	uint64_t vpl_slow;           /* hash chain walked under the bucket lock */
	uint64_t vpl_hit;            /* lookups that found a page */
	uint64_t vpl_miss;           /* lookups that returned VM_PAGE_NULL */

	uint64_t vpl_fast_elapsed;   /* cumulative mach_absolute_time() on fast path */
	uint64_t vpl_slow_elapsed;   /* cumulative mach_absolute_time() on slow path */
} vm_page_lookup_stats __attribute__((aligned(8)));
2371
2372 #endif
2373
2374 #define KDP_VM_PAGE_WALK_MAX 1000
2375
/*
 * Debugger-only lookup: linearly walks the object's memq (no locks,
 * no hash) because the kernel debugger cannot take locks.  Gives up
 * after KDP_VM_PAGE_WALK_MAX pages to bound debugger stall time.
 */
vm_page_t
kdp_vm_page_lookup(
	vm_object_t		object,
	vm_object_offset_t	offset)
{
	vm_page_t	cur_page;
	int		num_traversed = 0;

	if (not_in_kdp) {
		/* NOTE(review): panic() already prefixes "panic: "; the literal here doubles it */
		panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
	}

	vm_page_queue_iterate(&object->memq, cur_page, vmp_listq) {
		if (cur_page->vmp_offset == offset) {
			return cur_page;
		}
		num_traversed++;

		if (num_traversed >= KDP_VM_PAGE_WALK_MAX) {
			/* too many pages: treat as not found rather than hang the debugger */
			return VM_PAGE_NULL;
		}
	}

	return VM_PAGE_NULL;
}
2401
vm_page_t
vm_page_lookup(
	vm_object_t		object,
	vm_object_offset_t	offset)
{
	vm_page_t	mem;
	vm_page_bucket_t *bucket;
	vm_page_queue_entry_t	qe;
	lck_spin_t	*bucket_lock = NULL;	/* non-NULL iff the slow (hash) path ran */
	int		hash_id;
#if DEBUG_VM_PAGE_LOOKUP
	uint64_t	start, elapsed;

	OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total);
#endif

#if CONFIG_KERNEL_TBI
	/* strip pointer-tag bits so hashing/comparison sees the raw offset */
	if (VM_KERNEL_ADDRESS(offset)) {
		offset = VM_KERNEL_STRIP_UPTR(offset);
	}
#endif /* CONFIG_KERNEL_TBI */

	vm_object_lock_assert_held(object);
	assertf(page_aligned(offset), "offset 0x%llx\n", offset);

	if (object->resident_page_count == 0) {
		/* no resident pages at all: nothing to find */
#if DEBUG_VM_PAGE_LOOKUP
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_empty_obj);
#endif
		return VM_PAGE_NULL;
	}

	/*
	 * First try the object's hint (the last page looked up/inserted),
	 * then its immediate neighbors on the memq — sequential access
	 * patterns hit one of these three without touching the hash.
	 */
	mem = object->memq_hint;

	if (mem != VM_PAGE_NULL) {
		assert(VM_PAGE_OBJECT(mem) == object);

		if (mem->vmp_offset == offset) {
#if DEBUG_VM_PAGE_LOOKUP
			OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
#endif
			return mem;
		}
		qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);

		if (!vm_page_queue_end(&object->memq, qe)) {
			vm_page_t	next_page;

			next_page = (vm_page_t)((uintptr_t)qe);
			assert(VM_PAGE_OBJECT(next_page) == object);

			if (next_page->vmp_offset == offset) {
				object->memq_hint = next_page; /* new hint */
#if DEBUG_VM_PAGE_LOOKUP
				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
#endif
				return next_page;
			}
		}
		qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);

		if (!vm_page_queue_end(&object->memq, qe)) {
			vm_page_t prev_page;

			prev_page = (vm_page_t)((uintptr_t)qe);
			assert(VM_PAGE_OBJECT(prev_page) == object);

			if (prev_page->vmp_offset == offset) {
				object->memq_hint = prev_page; /* new hint */
#if DEBUG_VM_PAGE_LOOKUP
				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
#endif
				return prev_page;
			}
		}
	}
	/*
	 * Search the hash table for this object/offset pair
	 */
	hash_id = vm_page_hash(object, offset);
	bucket = &vm_page_buckets[hash_id];

	/*
	 * since we hold the object lock, we are guaranteed that no
	 * new pages can be inserted into this object... this in turn
	 * guarantess that the page we're looking for can't exist
	 * if the bucket it hashes to is currently NULL even when looked
	 * at outside the scope of the hash bucket lock... this is a
	 * really cheap optimiztion to avoid taking the lock
	 */
	if (!bucket->page_list) {
#if DEBUG_VM_PAGE_LOOKUP
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_bucket_NULL);
#endif
		return VM_PAGE_NULL;
	}

#if DEBUG_VM_PAGE_LOOKUP
	start = mach_absolute_time();
#endif
	if (object->resident_page_count <= VM_PAGE_HASH_LOOKUP_THRESHOLD) {
		/*
		 * on average, it's roughly 3 times faster to run a short memq list
		 * than to take the spin lock and go through the hash list
		 */
		mem = (vm_page_t)vm_page_queue_first(&object->memq);

		while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
			if (mem->vmp_offset == offset) {
				break;
			}

			mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
		}
		if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
			mem = NULL;	/* walked off the end: not resident */
		}
	} else {
		vm_page_object_t	packed_object;

		packed_object = VM_PAGE_PACK_OBJECT(object);

		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
		    mem != VM_PAGE_NULL;
		    mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
#if 0
			/*
			 * we don't hold the page queue lock
			 * so this check isn't safe to make
			 */
			VM_PAGE_CHECK(mem);
#endif
			/* compare packed object pointers to avoid unpacking per entry */
			if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) {
				break;
			}
		}
		lck_spin_unlock(bucket_lock);
	}

#if DEBUG_VM_PAGE_LOOKUP
	elapsed = mach_absolute_time() - start;

	if (bucket_lock) {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_slow);
		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_slow_elapsed);
	} else {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_fast);
		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_fast_elapsed);
	}
	if (mem != VM_PAGE_NULL) {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit);
	} else {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss);
	}
#endif
	if (mem != VM_PAGE_NULL) {
		assert(VM_PAGE_OBJECT(mem) == object);

		/* remember this page to accelerate the next lookup */
		object->memq_hint = mem;
	}
	return mem;
}
2568
2569
2570 /*
2571 * vm_page_rename:
2572 *
2573 * Move the given memory entry from its
2574 * current object to the specified target object/offset.
2575 *
2576 * The object must be locked.
2577 */
void
vm_page_rename(
	vm_page_t		mem,
	vm_object_t		new_object,
	vm_object_offset_t	new_offset)
{
	boolean_t	internal_to_external, external_to_internal;
	vm_tag_t	tag;
	vm_object_t	m_object;

	m_object = VM_PAGE_OBJECT(mem);

	assert(m_object != new_object);	/* a rename within the same object makes no sense */
	assert(m_object);

	/*
	 * Changes to mem->vmp_object require the page lock because
	 * the pageout daemon uses that lock to get the object.
	 */
	vm_page_lockspin_queues();

	internal_to_external = FALSE;
	external_to_internal = FALSE;

	if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
		/*
		 * it's much easier to get the vm_page_pageable_xxx accounting correct
		 * if we first move the page to the active queue... it's going to end
		 * up there anyway, and we don't do vm_page_rename's frequently enough
		 * for this to matter.
		 */
		vm_page_queues_remove(mem, FALSE);
		vm_page_activate(mem);
	}
	if (VM_PAGE_PAGEABLE(mem)) {
		/* track whether the page crosses the internal/external boundary */
		if (m_object->internal && !new_object->internal) {
			internal_to_external = TRUE;
		}
		if (!m_object->internal && new_object->internal) {
			external_to_internal = TRUE;
		}
	}

	/* carry the old object's wire tag across the move */
	tag = m_object->wire_tag;
	vm_page_remove(mem, TRUE);
	vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL);

	if (internal_to_external) {
		vm_page_pageable_internal_count--;
		vm_page_pageable_external_count++;
	} else if (external_to_internal) {
		vm_page_pageable_external_count--;
		vm_page_pageable_internal_count++;
	}

	vm_page_unlock_queues();
}
2635
2636 /*
2637 * vm_page_init:
2638 *
2639 * Initialize the fields in a new page.
2640 * This takes a structure with random values and initializes it
2641 * so that it can be given to vm_page_release or vm_page_insert.
2642 */
void
vm_page_init(
	vm_page_t	mem,
	ppnum_t		phys_page,
	boolean_t	lopage)
{
	uint_t		i;
	uintptr_t	*p;

	assert(phys_page);

#if DEBUG
	/* fictitious/guard pages have sentinel "addresses" that are not DRAM */
	if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
		if (!(pmap_valid_page(phys_page))) {
			panic("vm_page_init: non-DRAM phys_page 0x%x", phys_page);
		}
	}
#endif /* DEBUG */

	/*
	 * Initialize the fields of the vm_page. If adding any new fields to vm_page,
	 * try to use initial values which match 0. This minimizes the number of writes
	 * needed for boot-time initialization.
	 *
	 * Kernel bzero() isn't an inline yet, so do it by hand for performance.
	 */
	assert(VM_PAGE_NOT_ON_Q == 0);	/* word-zeroing below relies on this */
	assert(sizeof(*mem) % sizeof(uintptr_t) == 0);
	for (p = (uintptr_t *)(void *)mem, i = sizeof(*mem) / sizeof(uintptr_t); i != 0; --i) {
		*p++ = 0;
	}
	/* the few fields whose initial value is not 0 */
	mem->vmp_offset = (vm_object_offset_t)-1;
	mem->vmp_busy = TRUE;
	mem->vmp_lopage = lopage;

	VM_PAGE_SET_PHYS_PAGE(mem, phys_page);
#if 0
	/*
	 * we're leaving this turned off for now... currently pages
	 * come off the free list and are either immediately dirtied/referenced
	 * due to zero-fill or COW faults, or are used to read or write files...
	 * in the file I/O case, the UPL mechanism takes care of clearing
	 * the state of the HW ref/mod bits in a somewhat fragile way.
	 * Since we may change the way this works in the future (to toughen it up),
	 * I'm leaving this as a reminder of where these bits could get cleared
	 */

	/*
	 * make sure both the h/w referenced and modified bits are
	 * clear at this point... we are especially dependent on
	 * not finding a 'stale' h/w modified in a number of spots
	 * once this page goes back into use
	 */
	pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
#endif
}
2699
2700 /*
2701 * vm_page_grab_fictitious:
2702 *
2703 * Remove a fictitious page from the free list.
2704 * Returns VM_PAGE_NULL if there are no free pages.
2705 */
2706
2707 static vm_page_t
vm_page_grab_fictitious_common(ppnum_t phys_addr,boolean_t canwait)2708 vm_page_grab_fictitious_common(ppnum_t phys_addr, boolean_t canwait)
2709 {
2710 vm_page_t m;
2711
2712 m = zalloc_flags(vm_page_zone, canwait ? Z_WAITOK : Z_NOWAIT);
2713 if (m) {
2714 vm_page_init(m, phys_addr, FALSE);
2715 m->vmp_fictitious = TRUE;
2716 }
2717 return m;
2718 }
2719
2720 vm_page_t
vm_page_grab_fictitious(boolean_t canwait)2721 vm_page_grab_fictitious(boolean_t canwait)
2722 {
2723 return vm_page_grab_fictitious_common(vm_page_fictitious_addr, canwait);
2724 }
2725
2726 int vm_guard_count;
2727
2728
2729 vm_page_t
vm_page_grab_guard(boolean_t canwait)2730 vm_page_grab_guard(boolean_t canwait)
2731 {
2732 vm_page_t page;
2733 page = vm_page_grab_fictitious_common(vm_page_guard_addr, canwait);
2734 if (page) {
2735 OSAddAtomic(1, &vm_guard_count);
2736 }
2737 return page;
2738 }
2739
2740
2741 /*
2742 * vm_page_release_fictitious:
2743 *
2744 * Release a fictitious page to the zone pool
2745 */
2746 void
vm_page_release_fictitious(vm_page_t m)2747 vm_page_release_fictitious(
2748 vm_page_t m)
2749 {
2750 assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || (m->vmp_q_state == VM_PAGE_IS_WIRED));
2751 assert(m->vmp_fictitious);
2752 assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr ||
2753 VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr);
2754
2755
2756 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2757 OSAddAtomic(-1, &vm_guard_count);
2758 }
2759
2760 zfree(vm_page_zone, m);
2761 }
2762
2763 /*
2764 * vm_pool_low():
2765 *
2766 * Return true if it is not likely that a non-vm_privileged thread
2767 * can get memory without blocking. Advisory only, since the
2768 * situation may change under us.
2769 */
2770 bool
vm_pool_low(void)2771 vm_pool_low(void)
2772 {
2773 /* No locking, at worst we will fib. */
2774 return vm_page_free_count <= vm_page_free_reserved;
2775 }
2776
2777 boolean_t vm_darkwake_mode = FALSE;
2778
2779 /*
2780 * vm_update_darkwake_mode():
2781 *
2782 * Tells the VM that the system is in / out of darkwake.
2783 *
2784 * Today, the VM only lowers/raises the background queue target
2785 * so as to favor consuming more/less background pages when
2786 * darwake is ON/OFF.
2787 *
2788 * We might need to do more things in the future.
2789 */
2790
void
vm_update_darkwake_mode(boolean_t darkwake_mode)
{
	/* must not already hold the queues lock: we take it below */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	vm_page_lockspin_queues();

	if (vm_darkwake_mode == darkwake_mode) {
		/*
		 * No change.
		 */
		vm_page_unlock_queues();
		return;
	}

	vm_darkwake_mode = darkwake_mode;

	if (vm_darkwake_mode == TRUE) {
#if CONFIG_BACKGROUND_QUEUE

		/* save background target to restore later */
		vm_page_background_target_snapshot = vm_page_background_target;

		/* target is set to 0...no protection for background pages */
		vm_page_background_target = 0;

#endif /* CONFIG_BACKGROUND_QUEUE */
	} else if (vm_darkwake_mode == FALSE) {
#if CONFIG_BACKGROUND_QUEUE

		if (vm_page_background_target_snapshot) {
			/* leaving darkwake: restore the saved target */
			vm_page_background_target = vm_page_background_target_snapshot;
		}
#endif /* CONFIG_BACKGROUND_QUEUE */
	}
	vm_page_unlock_queues();
}
2828
2829 #if CONFIG_BACKGROUND_QUEUE
2830
/*
 * Called when a "background" page is touched by a non-background
 * context: promote it off the background queue.  Bails out early
 * when the page isn't background, or when the current task's policy
 * (darkwake / QoS / DARWIN_BG, depending on configuration) says the
 * access itself counts as background.
 */
void
vm_page_update_background_state(vm_page_t mem)
{
	if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
		return;
	}

	if (mem->vmp_in_background == FALSE) {
		return;
	}

	/* may run before tasks exist at boot, hence the _early variant */
	task_t  my_task = current_task_early();

	if (my_task) {
		if (task_get_darkwake_mode(my_task)) {
			/* darkwake accesses don't promote background pages */
			return;
		}
	}

#if BACKGROUNDQ_BASED_ON_QOS
	if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) {
		return;
	}
#else
	if (my_task) {
		if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) {
			return;
		}
	}
#endif
	vm_page_lockspin_queues();

	mem->vmp_in_background = FALSE;
	vm_page_background_promoted_count++;

	vm_page_remove_from_backgroundq(mem);

	vm_page_unlock_queues();
}
2870
2871
/*
 * Decide whether a newly grabbed page should be considered
 * "background" based on the current task/thread policy
 * (darkwake, or QoS / DARWIN_BG depending on configuration).
 * Only sets mem->vmp_in_background; the page is enqueued later
 * via vm_page_add_to_backgroundq().
 */
void
vm_page_assign_background_state(vm_page_t mem)
{
	if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
		return;
	}

	/* may run before tasks exist at boot, hence the _early variant */
	task_t  my_task = current_task_early();

	if (my_task) {
		if (task_get_darkwake_mode(my_task)) {
			/* darkwake allocations are always background */
			mem->vmp_in_background = TRUE;
			return;
		}
	}

#if BACKGROUNDQ_BASED_ON_QOS
	if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) {
		mem->vmp_in_background = TRUE;
	} else {
		mem->vmp_in_background = FALSE;
	}
#else
	if (my_task) {
		mem->vmp_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG);
	}
#endif
}
2900
2901
2902 void
vm_page_remove_from_backgroundq(vm_page_t mem)2903 vm_page_remove_from_backgroundq(
2904 vm_page_t mem)
2905 {
2906 vm_object_t m_object;
2907
2908 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2909
2910 if (mem->vmp_on_backgroundq) {
2911 vm_page_queue_remove(&vm_page_queue_background, mem, vmp_backgroundq);
2912
2913 mem->vmp_backgroundq.next = 0;
2914 mem->vmp_backgroundq.prev = 0;
2915 mem->vmp_on_backgroundq = FALSE;
2916
2917 vm_page_background_count--;
2918
2919 m_object = VM_PAGE_OBJECT(mem);
2920
2921 if (m_object->internal) {
2922 vm_page_background_internal_count--;
2923 } else {
2924 vm_page_background_external_count--;
2925 }
2926 } else {
2927 assert(VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.next) == (uintptr_t)NULL &&
2928 VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.prev) == (uintptr_t)NULL);
2929 }
2930 }
2931
2932
2933 void
vm_page_add_to_backgroundq(vm_page_t mem,boolean_t first)2934 vm_page_add_to_backgroundq(
2935 vm_page_t mem,
2936 boolean_t first)
2937 {
2938 vm_object_t m_object;
2939
2940 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2941
2942 if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
2943 return;
2944 }
2945
2946 if (mem->vmp_on_backgroundq == FALSE) {
2947 m_object = VM_PAGE_OBJECT(mem);
2948
2949 if (vm_page_background_exclude_external && !m_object->internal) {
2950 return;
2951 }
2952
2953 if (first == TRUE) {
2954 vm_page_queue_enter_first(&vm_page_queue_background, mem, vmp_backgroundq);
2955 } else {
2956 vm_page_queue_enter(&vm_page_queue_background, mem, vmp_backgroundq);
2957 }
2958 mem->vmp_on_backgroundq = TRUE;
2959
2960 vm_page_background_count++;
2961
2962 if (m_object->internal) {
2963 vm_page_background_internal_count++;
2964 } else {
2965 vm_page_background_external_count++;
2966 }
2967 }
2968 }
2969
2970 #endif /* CONFIG_BACKGROUND_QUEUE */
2971
2972 /*
2973 * This can be switched to FALSE to help debug drivers
2974 * that are having problems with memory > 4G.
2975 */
2976 boolean_t vm_himemory_mode = TRUE;
2977
2978 /*
2979 * this interface exists to support hardware controllers
2980 * incapable of generating DMAs with more than 32 bits
2981 * of address on platforms with physical memory > 4G...
2982 */
unsigned int	vm_lopages_allocated_q = 0;			/* low pages served from the lopage free queue */
unsigned int	vm_lopages_allocated_cpm_success = 0;		/* low pages served via cpm_allocate() fallback */
unsigned int	vm_lopages_allocated_cpm_failed = 0;		/* cpm_allocate() fallback failures */
vm_page_queue_head_t	vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED;	/* free list of low (<4G DMA-able) pages */
2987
/*
 * Grab a page suitable for 32-bit DMA.  Tries the dedicated lopage
 * free queue first; when that is empty, falls back to cpm_allocate()
 * with KMA_LOMEM.  Degenerates to vm_page_grab() on systems that
 * don't need low pages.  Returns VM_PAGE_NULL on failure.
 */
vm_page_t
vm_page_grablo(void)
{
	vm_page_t	mem;

	if (vm_lopage_needed == FALSE) {
		/* no low-memory constraint on this system: any page will do */
		return vm_page_grab();
	}

	lck_mtx_lock_spin(&vm_page_queue_free_lock);

	if (!vm_page_queue_empty(&vm_lopage_queue_free)) {
		vm_page_queue_remove_first(&vm_lopage_queue_free, mem, vmp_pageq);
		assert(vm_lopage_free_count);
		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

		vm_lopage_free_count--;
		vm_lopages_allocated_q++;

		if (vm_lopage_free_count < vm_lopage_lowater) {
			/* running low: ask for the lopage pool to be refilled */
			vm_lopage_refill = TRUE;
		}

		lck_mtx_unlock(&vm_page_queue_free_lock);

#if CONFIG_BACKGROUND_QUEUE
		vm_page_assign_background_state(mem);
#endif
	} else {
		lck_mtx_unlock(&vm_page_queue_free_lock);

		/* lopage queue empty: try the contiguous allocator, constrained to low memory */
		if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
			/* free lock retaken only to serialize the failure counter */
			lck_mtx_lock_spin(&vm_page_queue_free_lock);
			vm_lopages_allocated_cpm_failed++;
			lck_mtx_unlock(&vm_page_queue_free_lock);

			return VM_PAGE_NULL;
		}
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);

		mem->vmp_busy = TRUE;

		vm_page_lockspin_queues();

		/*
		 * Undo the gobbled/wired accounting — cpm_allocate() presumably
		 * hands the page back gobbled and wired; TODO(review): confirm
		 * against cpm_allocate()'s contract.
		 */
		mem->vmp_gobbled = FALSE;
		vm_page_gobble_count--;
		vm_page_wire_count--;

		vm_lopages_allocated_cpm_success++;
		vm_page_unlock_queues();
	}
	assert(mem->vmp_busy);
	assert(!mem->vmp_pmapped);
	assert(!mem->vmp_wpmapped);
	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));

	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

	counter_inc(&vm_page_grab_count);
	VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0);

	return mem;
}
3052
3053 /*
3054 * vm_page_grab:
3055 *
3056 * first try to grab a page from the per-cpu free list...
3057 * this must be done while pre-emption is disabled... if
3058 * a page is available, we're done...
3059 * if no page is available, grab the vm_page_queue_free_lock
3060 * and see if current number of free pages would allow us
3061 * to grab at least 1... if not, return VM_PAGE_NULL as before...
3062 * if there are pages available, disable preemption and
3063 * recheck the state of the per-cpu free list... we could
3064 * have been preempted and moved to a different cpu, or
3065 * some other thread could have re-filled it... if still
3066 * empty, figure out how many pages we can steal from the
3067 * global free queue and move to the per-cpu queue...
3068 * return 1 of these pages when done... only wakeup the
3069 * pageout_scan thread if we moved pages from the global
3070 * list... no need for the wakeup if we've satisfied the
3071 * request from the per-cpu queue.
3072 */
3073
#if CONFIG_SECLUDED_MEMORY
vm_page_t vm_page_grab_secluded(void);
#endif /* CONFIG_SECLUDED_MEMORY */

/* forward declaration: per-grab diagnostics hook (defined below) */
static inline void
vm_page_grab_diags(void);
3080
/*
 * vm_page_grab:
 *
 * Convenience wrapper: grab a page with no special options.
 */
vm_page_t
vm_page_grab(void)
{
	return vm_page_grab_options(VM_PAGE_GRAB_OPTIONS_NONE);
}
3086
#if HIBERNATION
/*
 * When TRUE, the per-cpu free lists must not be touched
 * (enforced by the panics below) -- hibernation is rebuilding them.
 */
boolean_t       hibernate_rebuild_needed = FALSE;
#endif /* HIBERNATION */

/*
 * vm_page_grab_options:
 *
 * Remove a page from the free lists and return it, or VM_PAGE_NULL
 * if none is available to this thread.  See the block comment above
 * for the overall per-cpu free list strategy.
 *
 * grab_options may include VM_PAGE_GRAB_SECLUDED to allow dipping
 * into the secluded pool (CONFIG_SECLUDED_MEMORY only).
 */
vm_page_t
vm_page_grab_options(
	int grab_options)
{
	vm_page_t       mem;

	disable_preemption();

	/* fast path: satisfy the request from this cpu's free list */
	if ((mem = *PERCPU_GET(free_pages))) {
return_page_from_cpu_list:
		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);

#if HIBERNATION
		if (hibernate_rebuild_needed) {
			panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__);
		}
#endif /* HIBERNATION */

		vm_page_grab_diags();

		vm_offset_t pcpu_base = current_percpu_base();
		counter_inc_preemption_disabled(&vm_page_grab_count);
		*PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem->vmp_snext;
		VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);

		enable_preemption();
		VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

		/* a free page must be clean, untabled and unmapped */
		assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
		assert(mem->vmp_tabled == FALSE);
		assert(mem->vmp_object == 0);
		assert(!mem->vmp_laundry);
		ASSERT_PMAP_FREE(mem);
		assert(mem->vmp_busy);
		assert(!mem->vmp_pmapped);
		assert(!mem->vmp_wpmapped);
		assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));

#if CONFIG_BACKGROUND_QUEUE
		vm_page_assign_background_state(mem);
#endif
		return mem;
	}
	enable_preemption();


	/*
	 * Optionally produce warnings if the wire or gobble
	 * counts exceed some threshold.
	 */
#if VM_PAGE_WIRE_COUNT_WARNING
	if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) {
		printf("mk: vm_page_grab(): high wired page count of %d\n",
		    vm_page_wire_count);
	}
#endif
#if VM_PAGE_GOBBLE_COUNT_WARNING
	if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) {
		printf("mk: vm_page_grab(): high gobbled page count of %d\n",
		    vm_page_gobble_count);
	}
#endif

	/*
	 * If free count is low and we have delayed pages from early boot,
	 * get one of those instead.
	 */
	if (__improbable(vm_delayed_count > 0 &&
	    vm_page_free_count <= vm_page_free_target &&
	    (mem = vm_get_delayed_page(grab_options)) != NULL)) {
		return mem;
	}

	lck_mtx_lock_spin(&vm_page_queue_free_lock);

	/*
	 * Only let privileged threads (involved in pageout)
	 * dip into the reserved pool.
	 */
	if ((vm_page_free_count < vm_page_free_reserved) &&
	    !(current_thread()->options & TH_OPT_VMPRIV)) {
		/* no page for us in the free queue... */
		lck_mtx_unlock(&vm_page_queue_free_lock);
		mem = VM_PAGE_NULL;

#if CONFIG_SECLUDED_MEMORY
		/* ... but can we try and grab from the secluded queue? */
		if (vm_page_secluded_count > 0 &&
		    ((grab_options & VM_PAGE_GRAB_SECLUDED) ||
		    task_can_use_secluded_mem(current_task(), TRUE))) {
			mem = vm_page_grab_secluded();
			if (grab_options & VM_PAGE_GRAB_SECLUDED) {
				vm_page_secluded.grab_for_iokit++;
				if (mem) {
					vm_page_secluded.grab_for_iokit_success++;
				}
			}
			if (mem) {
				VM_CHECK_MEMORYSTATUS;

				vm_page_grab_diags();
				counter_inc(&vm_page_grab_count);
				VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);

				return mem;
			}
		}
#else /* CONFIG_SECLUDED_MEMORY */
		(void) grab_options;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		vm_page_t       head;
		vm_page_t       tail;
		unsigned int    pages_to_steal;
		unsigned int    color;
		unsigned int    clump_end, sub_count;

		while (vm_page_free_count == 0) {
			lck_mtx_unlock(&vm_page_queue_free_lock);
			/*
			 * must be a privileged thread to be
			 * in this state since a non-privileged
			 * thread would have bailed if we were
			 * under the vm_page_free_reserved mark
			 */
			VM_PAGE_WAIT();
			lck_mtx_lock_spin(&vm_page_queue_free_lock);
		}

		disable_preemption();

		if ((mem = *PERCPU_GET(free_pages))) {
			lck_mtx_unlock(&vm_page_queue_free_lock);

			/*
			 * we got preempted and moved to another processor
			 * or we got preempted and someone else ran and filled the cache
			 */
			goto return_page_from_cpu_list;
		}
		if (vm_page_free_count <= vm_page_free_reserved) {
			pages_to_steal = 1;
		} else {
			if (vm_free_magazine_refill_limit <= (vm_page_free_count - vm_page_free_reserved)) {
				pages_to_steal = vm_free_magazine_refill_limit;
			} else {
				pages_to_steal = (vm_page_free_count - vm_page_free_reserved);
			}
		}
		color = *PERCPU_GET(start_color);
		head = tail = NULL;

		vm_page_free_count -= pages_to_steal;
		clump_end = sub_count = 0;

		while (pages_to_steal--) {
			/* skip over colors with no free pages */
			while (vm_page_queue_empty(&vm_page_queue_free[color].qhead)) {
				color = (color + 1) & vm_color_mask;
			}
#if defined(__x86_64__)
			vm_page_queue_remove_first_with_clump(&vm_page_queue_free[color].qhead,
			    mem, clump_end);
#else
			vm_page_queue_remove_first(&vm_page_queue_free[color].qhead,
			    mem, vmp_pageq);
#endif

			assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);

			VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

#if defined(__arm__) || defined(__arm64__)
			color = (color + 1) & vm_color_mask;
#else

#if DEVELOPMENT || DEBUG

			sub_count++;
			if (clump_end) {
				vm_clump_update_stats(sub_count);
				sub_count = 0;
				color = (color + 1) & vm_color_mask;
			}
#else
			if (clump_end) {
				color = (color + 1) & vm_color_mask;
			}

#endif /* if DEVELOPMENT || DEBUG */

#endif  /* if defined(__arm__) || defined(__arm64__) */

			/* append the stolen page to the local singly-linked list */
			if (head == NULL) {
				head = mem;
			} else {
				tail->vmp_snext = mem;
			}
			tail = mem;

			assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
			assert(mem->vmp_tabled == FALSE);
			assert(mem->vmp_object == 0);
			assert(!mem->vmp_laundry);

			mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;

			ASSERT_PMAP_FREE(mem);
			assert(mem->vmp_busy);
			assert(!mem->vmp_pmapped);
			assert(!mem->vmp_wpmapped);
			assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
		}
#if defined (__x86_64__) && (DEVELOPMENT || DEBUG)
		vm_clump_update_stats(sub_count);
#endif
		lck_mtx_unlock(&vm_page_queue_free_lock);

#if HIBERNATION
		if (hibernate_rebuild_needed) {
			panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__);
		}
#endif /* HIBERNATION */
		/* hand the remainder of the stolen pages to this cpu's free list */
		vm_offset_t pcpu_base = current_percpu_base();
		*PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = head->vmp_snext;
		*PERCPU_GET_WITH_BASE(pcpu_base, start_color) = color;

		/*
		 * satisfy this request
		 */
		vm_page_grab_diags();
		counter_inc_preemption_disabled(&vm_page_grab_count);
		VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
		mem = head;
		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);

		VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

		enable_preemption();
	}
	/*
	 * Decide if we should poke the pageout daemon.
	 * We do this if the free count is less than the low
	 * water mark. VM Pageout Scan will keep running till
	 * the free_count > free_target (& hence above free_min).
	 * This wakeup is to catch the possibility of the counts
	 * dropping between VM Pageout Scan parking and this check.
	 *
	 * We don't have the counts locked ... if they change a little,
	 * it doesn't really matter.
	 */
	if (vm_page_free_count < vm_page_free_min) {
		lck_mtx_lock(&vm_page_queue_free_lock);
		if (vm_pageout_running == FALSE) {
			lck_mtx_unlock(&vm_page_queue_free_lock);
			thread_wakeup((event_t) &vm_page_free_wanted);
		} else {
			lck_mtx_unlock(&vm_page_queue_free_lock);
		}
	}

	VM_CHECK_MEMORYSTATUS;

	if (mem) {
//		dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 4);	/* (TEST/DEBUG) */

#if CONFIG_BACKGROUND_QUEUE
		vm_page_assign_background_state(mem);
#endif
	}
	return mem;
}
3364
3365 #if CONFIG_SECLUDED_MEMORY
/*
 * vm_page_grab_secluded:
 *
 * Try to steal the first page off the secluded queue.  Returns
 * VM_PAGE_NULL if the queue is empty, or if the candidate page
 * can't safely be taken (object lock contention, page busy /
 * cleaning / dirty...) -- in those cases the page is reactivated.
 */
vm_page_t
vm_page_grab_secluded(void)
{
	vm_page_t       mem;
	vm_object_t     object;
	int             refmod_state;

	if (vm_page_secluded_count == 0) {
		/* no secluded pages to grab... */
		return VM_PAGE_NULL;
	}

	/* secluded queue is protected by the VM page queue lock */
	vm_page_lock_queues();

	/* re-check now that we hold the lock */
	if (vm_page_secluded_count == 0) {
		/* no secluded pages to grab... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}

#if 00
	/* can we grab from the secluded queue? */
	if (vm_page_secluded_count > vm_page_secluded_target ||
	    (vm_page_secluded_count > 0 &&
	    task_can_use_secluded_mem(current_task(), TRUE))) {
		/* OK */
	} else {
		/* can't grab from secluded queue... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
#endif

	/* we can grab a page from secluded queue! */
	assert((vm_page_secluded_count_free +
	    vm_page_secluded_count_inuse) ==
	    vm_page_secluded_count);
	if (current_task()->task_can_use_secluded_mem) {
		assert(num_tasks_can_use_secluded_mem > 0);
	}
	assert(!vm_page_queue_empty(&vm_page_queue_secluded));
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
	assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
	vm_page_queues_remove(mem, TRUE);

	object = VM_PAGE_OBJECT(mem);

	assert(!mem->vmp_fictitious);
	assert(!VM_PAGE_WIRED(mem));
	if (object == VM_OBJECT_NULL) {
		/* free for grab! */
		vm_page_unlock_queues();
		vm_page_secluded.grab_success_free++;

		assert(mem->vmp_busy);
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
		assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
		assert(mem->vmp_pageq.next == 0);
		assert(mem->vmp_pageq.prev == 0);
		assert(mem->vmp_listq.next == 0);
		assert(mem->vmp_listq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
		assert(mem->vmp_on_backgroundq == 0);
		assert(mem->vmp_backgroundq.next == 0);
		assert(mem->vmp_backgroundq.prev == 0);
#endif /* CONFIG_BACKGROUND_QUEUE */
		return mem;
	}

	assert(!object->internal);
//	vm_page_pageable_external_count--;

	if (!vm_object_lock_try(object)) {
//		printf("SECLUDED: page %p: object %p locked\n", mem, object);
		vm_page_secluded.grab_failure_locked++;
reactivate_secluded_page:
		/* give the page back to the pageable queues */
		vm_page_activate(mem);
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
	if (mem->vmp_busy ||
	    mem->vmp_cleaning ||
	    mem->vmp_laundry) {
		/* can't steal page in this state... */
		vm_object_unlock(object);
		vm_page_secluded.grab_failure_state++;
		goto reactivate_secluded_page;
	}

	mem->vmp_busy = TRUE;
	/* tear down all mappings and pick up the ref/mod state */
	refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
	if (refmod_state & VM_MEM_REFERENCED) {
		mem->vmp_reference = TRUE;
	}
	if (refmod_state & VM_MEM_MODIFIED) {
		SET_PAGE_DIRTY(mem, FALSE);
	}
	if (mem->vmp_dirty || mem->vmp_precious) {
		/* can't grab a dirty page; re-activate */
//		printf("SECLUDED: dirty page %p\n", mem);
		PAGE_WAKEUP_DONE(mem);
		vm_page_secluded.grab_failure_dirty++;
		vm_object_unlock(object);
		goto reactivate_secluded_page;
	}
	if (mem->vmp_reference) {
		/* it's been used but we do need to grab a page... */
	}

	vm_page_unlock_queues();

	/* finish what vm_page_free() would have done... */
	vm_page_free_prepare_object(mem, TRUE);
	vm_object_unlock(object);
	object = VM_OBJECT_NULL;
	if (vm_page_free_verify) {
		ASSERT_PMAP_FREE(mem);
	}
	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
	vm_page_secluded.grab_success_other++;

	assert(mem->vmp_busy);
	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
	assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
	assert(mem->vmp_pageq.next == 0);
	assert(mem->vmp_pageq.prev == 0);
	assert(mem->vmp_listq.next == 0);
	assert(mem->vmp_listq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
	assert(mem->vmp_on_backgroundq == 0);
	assert(mem->vmp_backgroundq.next == 0);
	assert(mem->vmp_backgroundq.prev == 0);
#endif /* CONFIG_BACKGROUND_QUEUE */

	return mem;
}
3504
/*
 * vm_page_secluded_drain:
 *
 * Empty the secluded queue: object-less pages are batched onto a
 * local list and freed, in-use pages are moved to the head of the
 * active queue.  Returns the number of pages drained.
 */
uint64_t
vm_page_secluded_drain(void)
{
	vm_page_t       local_freeq;
	int             local_freed;
	uint64_t        num_reclaimed;
	unsigned int    saved_secluded_count, saved_secluded_target;

	num_reclaimed = 0;
	local_freeq = NULL;
	local_freed = 0;

	vm_page_lock_queues();

	saved_secluded_count = vm_page_secluded_count;
	saved_secluded_target = vm_page_secluded_target;
	/* drop the target to 0 while we drain, restore it afterwards */
	vm_page_secluded_target = 0;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	while (vm_page_secluded_count) {
		vm_page_t secluded_page;

		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = local_freeq;
			local_freeq = secluded_page;
			local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
		num_reclaimed++;
	}
	vm_page_secluded_target = saved_secluded_target;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();

//	printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);

	vm_page_unlock_queues();

	if (local_freed) {
		/* free the batched pages outside the page queue lock */
		vm_page_free_list(local_freeq, TRUE);
		local_freeq = NULL;
		local_freed = 0;
	}

	return num_reclaimed;
}
3564 #endif /* CONFIG_SECLUDED_MEMORY */
3565
3566
/*
 * vm_page_grab_diags:
 *
 * Per-grab diagnostics hook.  On DEVELOPMENT/DEBUG kernels, bill
 * one grabbed page to the current task's "pages_grabbed" ledger;
 * a no-op on RELEASE kernels.
 *
 * Fixed: definition now uses "(void)" to match the forward
 * declaration above -- a bare "()" declares an unprototyped
 * function in C prior to C23.
 */
static inline void
vm_page_grab_diags(void)
{
#if DEVELOPMENT || DEBUG
	task_t task = current_task_early();
	/* may run before the first task exists in early boot */
	if (task == NULL) {
		return;
	}

	ledger_credit(task->ledger, task_ledgers.pages_grabbed, 1);
#endif /* DEVELOPMENT || DEBUG */
}
3579
/*
 * vm_page_release:
 *
 * Return a page to the free list.  The page is routed to the
 * lopage pool, the secluded pool, or a color bucket of the regular
 * free queue, and at most one waiter is woken (privileged waiters
 * take precedence).
 *
 * "page_queues_locked" states whether the caller already holds
 * the vm_page_queue_lock.
 */

void
vm_page_release(
	vm_page_t       mem,
	boolean_t       page_queues_locked)
{
	unsigned int    color;
	int     need_wakeup = 0;
	int     need_priv_wakeup = 0;
#if CONFIG_SECLUDED_MEMORY
	int     need_secluded_wakeup = 0;
#endif /* CONFIG_SECLUDED_MEMORY */
	event_t wakeup_event = NULL;

	if (page_queues_locked) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
	}

	assert(!mem->vmp_private && !mem->vmp_fictitious);
	if (vm_page_free_verify) {
		ASSERT_PMAP_FREE(mem);
	}
//	dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 5);	/* (TEST/DEBUG) */

	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));

	lck_mtx_lock_spin(&vm_page_queue_free_lock);

	/* page must arrive busy, unmapped, and off all queues */
	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
	assert(mem->vmp_busy);
	assert(!mem->vmp_laundry);
	assert(mem->vmp_object == 0);
	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
	assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
	assert(mem->vmp_backgroundq.next == 0 &&
	    mem->vmp_backgroundq.prev == 0 &&
	    mem->vmp_on_backgroundq == FALSE);
#endif
	if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
	    vm_lopage_free_count < vm_lopage_free_limit &&
	    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
		/*
		 * this exists to support hardware controllers
		 * incapable of generating DMAs with more than 32 bits
		 * of address on platforms with physical memory > 4G...
		 */
		vm_page_queue_enter_first(&vm_lopage_queue_free, mem, vmp_pageq);
		vm_lopage_free_count++;

		if (vm_lopage_free_count >= vm_lopage_free_limit) {
			/* pool replenished... stop diverting pages into it */
			vm_lopage_refill = FALSE;
		}

		mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
		mem->vmp_lopage = TRUE;
#if CONFIG_SECLUDED_MEMORY
	} else if (vm_page_free_count > vm_page_free_reserved &&
	    vm_page_secluded_count < vm_page_secluded_target &&
	    num_tasks_can_use_secluded_mem == 0) {
		/*
		 * XXX FBDP TODO: also avoid refilling secluded queue
		 * when some IOKit objects are already grabbing from it...
		 */
		if (!page_queues_locked) {
			if (!vm_page_trylock_queues()) {
				/* take locks in right order */
				lck_mtx_unlock(&vm_page_queue_free_lock);
				vm_page_lock_queues();
				lck_mtx_lock_spin(&vm_page_queue_free_lock);
			}
		}
		mem->vmp_lopage = FALSE;
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
		vm_page_queue_enter_first(&vm_page_queue_secluded, mem, vmp_pageq);
		mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
		vm_page_secluded_count++;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		vm_page_secluded_count_free++;
		if (!page_queues_locked) {
			vm_page_unlock_queues();
		}
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
		if (vm_page_free_wanted_secluded > 0) {
			vm_page_free_wanted_secluded--;
			need_secluded_wakeup = 1;
		}
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

		color = VM_PAGE_GET_COLOR(mem);
#if defined(__x86_64__)
		vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
#else
		vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vmp_pageq);
#endif
		vm_page_free_count++;
		/*
		 * Check if we should wake up someone waiting for page.
		 * But don't bother waking them unless they can allocate.
		 *
		 * We wakeup only one thread, to prevent starvation.
		 * Because the scheduling system handles wait queues FIFO,
		 * if we wakeup all waiting threads, one greedy thread
		 * can starve multiple niceguy threads.  When the threads
		 * all wakeup, the greedy threads runs first, grabs the page,
		 * and waits for another page.  It will be the first to run
		 * when the next page is freed.
		 *
		 * However, there is a slight danger here.
		 * The thread we wake might not use the free page.
		 * Then the other threads could wait indefinitely
		 * while the page goes unused.  To forestall this,
		 * the pageout daemon will keep making free pages
		 * as long as vm_page_free_wanted is non-zero.
		 */

		assert(vm_page_free_count > 0);
		if (vm_page_free_wanted_privileged > 0) {
			vm_page_free_wanted_privileged--;
			need_priv_wakeup = 1;
#if CONFIG_SECLUDED_MEMORY
		} else if (vm_page_free_wanted_secluded > 0 &&
		    vm_page_free_count > vm_page_free_reserved) {
			vm_page_free_wanted_secluded--;
			need_secluded_wakeup = 1;
#endif /* CONFIG_SECLUDED_MEMORY */
		} else if (vm_page_free_wanted > 0 &&
		    vm_page_free_count > vm_page_free_reserved) {
			vm_page_free_wanted--;
			need_wakeup = 1;
		}
	}
	vm_pageout_vminfo.vm_page_pages_freed++;

	VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, 1, 0, 0, 0);

	lck_mtx_unlock(&vm_page_queue_free_lock);

	/* privileged waiters get first crack at the page */
	if (need_priv_wakeup) {
		wakeup_event = &vm_page_free_wanted_privileged;
	}
#if CONFIG_SECLUDED_MEMORY
	else if (need_secluded_wakeup) {
		wakeup_event = &vm_page_free_wanted_secluded;
	}
#endif /* CONFIG_SECLUDED_MEMORY */
	else if (need_wakeup) {
		wakeup_event = &vm_page_free_count;
	}

	if (wakeup_event) {
		if (vps_dynamic_priority_enabled == TRUE) {
			thread_t thread_woken = NULL;
			wakeup_one_with_inheritor((event_t) wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
			/*
			 * (80947592) if this is the last reference on this
			 * thread, calling thread_deallocate() here
			 * might take the tasks_threads_lock,
			 * sadly thread_create_internal is doing several
			 * allocations under this lock, which can result in
			 * deadlocks with the pageout scan daemon.
			 *
			 * FIXME: we should disallow allocations under the
			 * task_thread_locks, but that is a larger fix to make.
			 */
			thread_deallocate_safe(thread_woken);
		} else {
			thread_wakeup_one((event_t) wakeup_event);
		}
	}

	VM_CHECK_MEMORYSTATUS;
}
3763
3764 /*
3765 * This version of vm_page_release() is used only at startup
3766 * when we are single-threaded and pages are being released
3767 * for the first time. Hence, no locking or unnecessary checks are made.
3768 * Note: VM_CHECK_MEMORYSTATUS invoked by the caller.
3769 */
/*
 * vm_page_release_startup:
 *
 * Early-boot, single-threaded variant of vm_page_release():
 * no locking, no wakeups.  Routes the page to the lopage,
 * secluded or regular free pool.
 */
void
vm_page_release_startup(
	vm_page_t mem)
{
	vm_page_queue_t queue_free;

	if (vm_lopage_free_count < vm_lopage_free_limit &&
	    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
		mem->vmp_lopage = TRUE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
		vm_lopage_free_count++;
		queue_free = &vm_lopage_queue_free;
#if CONFIG_SECLUDED_MEMORY
	} else if (vm_page_secluded_count < vm_page_secluded_target) {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
		vm_page_secluded_count++;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		vm_page_secluded_count_free++;
		queue_free = &vm_page_queue_secluded;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
		vm_page_free_count++;
		queue_free = &vm_page_queue_free[VM_PAGE_GET_COLOR(mem)].qhead;
	}
	if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
#if defined(__x86_64__)
		vm_page_queue_enter_clump(queue_free, mem);
#else
		vm_page_queue_enter(queue_free, mem, vmp_pageq);
#endif
	} else {
		vm_page_queue_enter_first(queue_free, mem, vmp_pageq);
	}
}
3807
3808 /*
3809 * vm_page_wait:
3810 *
3811 * Wait for a page to become available.
3812 * If there are plenty of free pages, then we don't sleep.
3813 *
3814 * Returns:
3815 * TRUE: There may be another page, try again
3816 * FALSE: We were interrupted out of our wait, don't try again
3817 */
3818
/*
 * vm_page_wait:
 *
 * Block until a page may be available (or return immediately if
 * plenty are free).  Returns TRUE if the caller should retry the
 * allocation, FALSE if the wait was interrupted.
 */
boolean_t
vm_page_wait(
	int     interruptible )
{
	/*
	 * We can't use vm_page_free_reserved to make this
	 * determination.  Consider: some thread might
	 * need to allocate two pages.  The first allocation
	 * succeeds, the second fails.  After the first page is freed,
	 * a call to vm_page_wait must really block.
	 */
	kern_return_t   wait_result;
	int             need_wakeup = 0;
	int             is_privileged = current_thread()->options & TH_OPT_VMPRIV;
	event_t         wait_event = NULL;

	lck_mtx_lock_spin(&vm_page_queue_free_lock);

	/* privileged threads don't wait as long as anything is free */
	if (is_privileged && vm_page_free_count) {
		lck_mtx_unlock(&vm_page_queue_free_lock);
		return TRUE;
	}

	if (vm_page_free_count >= vm_page_free_target) {
		/* plenty of free pages... no need to sleep */
		lck_mtx_unlock(&vm_page_queue_free_lock);
		return TRUE;
	}

	/* pick the waiter counter / wait event matching this thread's class */
	if (is_privileged) {
		if (vm_page_free_wanted_privileged++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_wanted_privileged;
#if CONFIG_SECLUDED_MEMORY
	} else if (secluded_for_apps &&
	    task_can_use_secluded_mem(current_task(), FALSE)) {
#if 00
		/* XXX FBDP: need pageq lock for this... */
		/* XXX FBDP: might wait even if pages available, */
		/* XXX FBDP: hopefully not for too long... */
		if (vm_page_secluded_count > 0) {
			lck_mtx_unlock(&vm_page_queue_free_lock);
			return TRUE;
		}
#endif
		if (vm_page_free_wanted_secluded++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_wanted_secluded;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		if (vm_page_free_wanted++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_count;
	}

	/*
	 * We don't do a vm_pageout_scan wakeup if we already have
	 * some waiters because vm_pageout_scan checks for waiters
	 * before it returns and does so behind the vm_page_queue_free_lock,
	 * which we own when we bump the waiter counts.
	 */

	if (vps_dynamic_priority_enabled == TRUE) {
		/*
		 * We are waking up vm_pageout_scan here. If it needs
		 * the vm_page_queue_free_lock before we unlock it
		 * we'll end up just blocking and incur an extra
		 * context switch. Could be a perf. issue.
		 */

		if (need_wakeup) {
			thread_wakeup((event_t)&vm_page_free_wanted);
		}

		/*
		 * LD: This event is going to get recorded every time because
		 * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
		 * We just block in that routine.
		 */
		VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
		    vm_page_free_wanted_privileged,
		    vm_page_free_wanted,
#if CONFIG_SECLUDED_MEMORY
		    vm_page_free_wanted_secluded,
#else /* CONFIG_SECLUDED_MEMORY */
		    0,
#endif /* CONFIG_SECLUDED_MEMORY */
		    0);
		wait_result = lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
		    LCK_SLEEP_UNLOCK,
		    wait_event,
		    vm_pageout_scan_thread,
		    interruptible,
		    0);
	} else {
		wait_result = assert_wait(wait_event, interruptible);

		lck_mtx_unlock(&vm_page_queue_free_lock);

		if (need_wakeup) {
			/* first waiter... poke the pageout daemon */
			thread_wakeup((event_t)&vm_page_free_wanted);
		}

		if (wait_result == THREAD_WAITING) {
			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
			    vm_page_free_wanted_privileged,
			    vm_page_free_wanted,
#if CONFIG_SECLUDED_MEMORY
			    vm_page_free_wanted_secluded,
#else /* CONFIG_SECLUDED_MEMORY */
			    0,
#endif /* CONFIG_SECLUDED_MEMORY */
			    0);
			wait_result = thread_block(THREAD_CONTINUE_NULL);
			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
			    VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
		}
	}

	return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
}
3942
3943 /*
3944 * vm_page_alloc:
3945 *
3946 * Allocate and return a memory cell associated
3947 * with this VM object/offset pair.
3948 *
3949 * Object must be locked.
3950 */
3951
3952 vm_page_t
vm_page_alloc(vm_object_t object,vm_object_offset_t offset)3953 vm_page_alloc(
3954 vm_object_t object,
3955 vm_object_offset_t offset)
3956 {
3957 vm_page_t mem;
3958 int grab_options;
3959
3960 vm_object_lock_assert_exclusive(object);
3961 grab_options = 0;
3962 #if CONFIG_SECLUDED_MEMORY
3963 if (object->can_grab_secluded) {
3964 grab_options |= VM_PAGE_GRAB_SECLUDED;
3965 }
3966 #endif /* CONFIG_SECLUDED_MEMORY */
3967 mem = vm_page_grab_options(grab_options);
3968 if (mem == VM_PAGE_NULL) {
3969 return VM_PAGE_NULL;
3970 }
3971
3972 vm_page_insert(mem, object, offset);
3973
3974 return mem;
3975 }
3976
3977 /*
3978 * vm_page_free_prepare:
3979 *
3980 * Removes page from any queue it may be on
3981 * and disassociates it from its VM object.
3982 *
3983 * Object and page queues must be locked prior to entry.
3984 */
static void
vm_page_free_prepare(
	vm_page_t       mem)
{
	/* pull the page off any paging queues first... */
	vm_page_free_prepare_queues(mem);
	/* ... then disassociate it from its VM object */
	vm_page_free_prepare_object(mem, TRUE);
}
3992
3993
/*
 * vm_page_free_prepare_queues:
 *
 * Queue-side half of freeing a page: pull it off whatever paging
 * queue it is on and undo any wired / gobbled accounting, including
 * the purgeable-ledger adjustments for owned volatile objects.
 *
 * The page queues lock must be held; if the page belongs to an
 * object, that object must be locked exclusively as well.
 */
void
vm_page_free_prepare_queues(
	vm_page_t       mem)
{
	vm_object_t     m_object;

	VM_PAGE_CHECK(mem);

	/* must not already be on the free queue, nor mid-clean */
	assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
	assert(!mem->vmp_cleaning);
	m_object = VM_PAGE_OBJECT(mem);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	if (m_object) {
		vm_object_lock_assert_exclusive(m_object);
	}
	if (mem->vmp_laundry) {
		/*
		 * We may have to free a page while it's being laundered
		 * if we lost its pager (due to a forced unmount, for example).
		 * We need to call vm_pageout_steal_laundry() before removing
		 * the page from its VM object, so that we can remove it
		 * from its pageout queue and adjust the laundry accounting
		 */
		vm_pageout_steal_laundry(mem, TRUE);
	}

	vm_page_queues_remove(mem, TRUE);

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);

		if (m_object) {
			/* drop this page from the object's wired-page accounting */
			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
			VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);

			assert(m_object->resident_page_count >=
			    m_object->wired_page_count);

			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
				/* un-wiring moves the page from "purgeable wired" to "purgeable" */
				OSAddAtomic(+1, &vm_page_purgeable_count);
				assert(vm_page_purgeable_wired_count > 0);
				OSAddAtomic(-1, &vm_page_purgeable_wired_count);
			}
			if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
			    m_object->purgable == VM_PURGABLE_EMPTY) &&
			    m_object->vo_owner != TASK_NULL) {
				task_t          owner;
				int             ledger_idx_volatile;
				int             ledger_idx_nonvolatile;
				int             ledger_idx_volatile_compressed;
				int             ledger_idx_nonvolatile_compressed;
				boolean_t       do_footprint;

				owner = VM_OBJECT_OWNER(m_object);
				vm_object_ledger_tag_ledgers(
					m_object,
					&ledger_idx_volatile,
					&ledger_idx_nonvolatile,
					&ledger_idx_volatile_compressed,
					&ledger_idx_nonvolatile_compressed,
					&do_footprint);
				/*
				 * While wired, this page was accounted
				 * as "non-volatile" but it should now
				 * be accounted as "volatile".
				 */
				/* one less "non-volatile"... */
				ledger_debit(owner->ledger,
				    ledger_idx_nonvolatile,
				    PAGE_SIZE);
				if (do_footprint) {
					/* ... and "phys_footprint" */
					ledger_debit(owner->ledger,
					    task_ledgers.phys_footprint,
					    PAGE_SIZE);
				}
				/* one more "volatile" */
				ledger_credit(owner->ledger,
				    ledger_idx_volatile,
				    PAGE_SIZE);
			}
		}
		/* private/fictitious pages are not counted in vm_page_wire_count */
		if (!mem->vmp_private && !mem->vmp_fictitious) {
			vm_page_wire_count--;
		}

		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_wire_count = 0;
		assert(!mem->vmp_gobbled);
	} else if (mem->vmp_gobbled) {
		/* gobbled pages count as wired for accounting purposes */
		if (!mem->vmp_private && !mem->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
	}
}
4092
4093
/*
 * vm_page_free_prepare_object:
 *
 * Object-side half of freeing a page: detach the page from its
 * VM object (and optionally the VP hash), wake any waiters, and
 * re-initialize the page structure for re-use.
 *
 * The page's object (if any) must be locked.
 */
void
vm_page_free_prepare_object(
	vm_page_t       mem,
	boolean_t       remove_from_hash)
{
	if (mem->vmp_tabled) {
		vm_page_remove(mem, remove_from_hash);  /* clears tabled, object, offset */
	}
	PAGE_WAKEUP(mem);               /* clears wanted */

	if (mem->vmp_private) {
		/*
		 * Private pages are turned back into fictitious pages
		 * with the fictitious physical address.
		 */
		mem->vmp_private = FALSE;
		mem->vmp_fictitious = TRUE;
		VM_PAGE_SET_PHYS_PAGE(mem, vm_page_fictitious_addr);
	}
	if (!mem->vmp_fictitious) {
		/* the page must already be off every queue at this point */
		assert(mem->vmp_pageq.next == 0);
		assert(mem->vmp_pageq.prev == 0);
		assert(mem->vmp_listq.next == 0);
		assert(mem->vmp_listq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
		assert(mem->vmp_backgroundq.next == 0);
		assert(mem->vmp_backgroundq.prev == 0);
#endif /* CONFIG_BACKGROUND_QUEUE */
		assert(mem->vmp_next_m == 0);
		ASSERT_PMAP_FREE(mem);
		/* reset the page struct, preserving its physical page and lopage flag */
		vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->vmp_lopage);
	}
}
4123
4124
4125 /*
4126 * vm_page_free:
4127 *
4128 * Returns the given page to the free list,
4129 * disassociating it with any VM object.
4130 *
4131 * Object and page queues must be locked prior to entry.
4132 */
4133 void
vm_page_free(vm_page_t mem)4134 vm_page_free(
4135 vm_page_t mem)
4136 {
4137 vm_page_free_prepare(mem);
4138
4139 if (mem->vmp_fictitious) {
4140 vm_page_release_fictitious(mem);
4141 } else {
4142 vm_page_release(mem,
4143 TRUE); /* page queues are locked */
4144 }
4145 }
4146
4147
4148 void
vm_page_free_unlocked(vm_page_t mem,boolean_t remove_from_hash)4149 vm_page_free_unlocked(
4150 vm_page_t mem,
4151 boolean_t remove_from_hash)
4152 {
4153 vm_page_lockspin_queues();
4154 vm_page_free_prepare_queues(mem);
4155 vm_page_unlock_queues();
4156
4157 vm_page_free_prepare_object(mem, remove_from_hash);
4158
4159 if (mem->vmp_fictitious) {
4160 vm_page_release_fictitious(mem);
4161 } else {
4162 vm_page_release(mem, FALSE); /* page queues are not locked */
4163 }
4164 }
4165
4166
4167 /*
4168 * Free a list of pages. The list can be up to several hundred pages,
4169 * as blocked up by vm_pageout_scan().
4170 * The big win is not having to take the free list lock once
4171 * per page.
4172 *
4173 * The VM page queues lock (vm_page_queue_lock) should NOT be held.
4174 * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held.
4175 */
4176 void
vm_page_free_list(vm_page_t freeq,boolean_t prepare_object)4177 vm_page_free_list(
4178 vm_page_t freeq,
4179 boolean_t prepare_object)
4180 {
4181 vm_page_t mem;
4182 vm_page_t nxt;
4183 vm_page_t local_freeq;
4184 int pg_count;
4185
4186 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
4187 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
4188
4189 while (freeq) {
4190 pg_count = 0;
4191 local_freeq = VM_PAGE_NULL;
4192 mem = freeq;
4193
4194 /*
4195 * break up the processing into smaller chunks so
4196 * that we can 'pipeline' the pages onto the
4197 * free list w/o introducing too much
4198 * contention on the global free queue lock
4199 */
4200 while (mem && pg_count < 64) {
4201 assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
4202 (mem->vmp_q_state == VM_PAGE_IS_WIRED));
4203 #if CONFIG_BACKGROUND_QUEUE
4204 assert(mem->vmp_backgroundq.next == 0 &&
4205 mem->vmp_backgroundq.prev == 0 &&
4206 mem->vmp_on_backgroundq == FALSE);
4207 #endif
4208 nxt = mem->vmp_snext;
4209 mem->vmp_snext = NULL;
4210 assert(mem->vmp_pageq.prev == 0);
4211
4212 if (vm_page_free_verify && !mem->vmp_fictitious && !mem->vmp_private) {
4213 ASSERT_PMAP_FREE(mem);
4214 }
4215 if (prepare_object == TRUE) {
4216 vm_page_free_prepare_object(mem, TRUE);
4217 }
4218
4219 if (!mem->vmp_fictitious) {
4220 assert(mem->vmp_busy);
4221
4222 if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
4223 vm_lopage_free_count < vm_lopage_free_limit &&
4224 VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
4225 vm_page_release(mem, FALSE); /* page queues are not locked */
4226 #if CONFIG_SECLUDED_MEMORY
4227 } else if (vm_page_secluded_count < vm_page_secluded_target &&
4228 num_tasks_can_use_secluded_mem == 0) {
4229 vm_page_release(mem,
4230 FALSE); /* page queues are not locked */
4231 #endif /* CONFIG_SECLUDED_MEMORY */
4232 } else {
4233 /*
4234 * IMPORTANT: we can't set the page "free" here
4235 * because that would make the page eligible for
4236 * a physically-contiguous allocation (see
4237 * vm_page_find_contiguous()) right away (we don't
4238 * hold the vm_page_queue_free lock). That would
4239 * cause trouble because the page is not actually
4240 * in the free queue yet...
4241 */
4242 mem->vmp_snext = local_freeq;
4243 local_freeq = mem;
4244 pg_count++;
4245
4246 pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
4247 }
4248 } else {
4249 assert(VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_fictitious_addr ||
4250 VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_guard_addr);
4251 vm_page_release_fictitious(mem);
4252 }
4253 mem = nxt;
4254 }
4255 freeq = mem;
4256
4257 if ((mem = local_freeq)) {
4258 unsigned int avail_free_count;
4259 unsigned int need_wakeup = 0;
4260 unsigned int need_priv_wakeup = 0;
4261 #if CONFIG_SECLUDED_MEMORY
4262 unsigned int need_wakeup_secluded = 0;
4263 #endif /* CONFIG_SECLUDED_MEMORY */
4264 event_t priv_wakeup_event, secluded_wakeup_event, normal_wakeup_event;
4265 boolean_t priv_wakeup_all, secluded_wakeup_all, normal_wakeup_all;
4266
4267 lck_mtx_lock_spin(&vm_page_queue_free_lock);
4268
4269 while (mem) {
4270 int color;
4271
4272 nxt = mem->vmp_snext;
4273
4274 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
4275 assert(mem->vmp_busy);
4276 mem->vmp_lopage = FALSE;
4277 mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
4278
4279 color = VM_PAGE_GET_COLOR(mem);
4280 #if defined(__x86_64__)
4281 vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
4282 #else
4283 vm_page_queue_enter(&vm_page_queue_free[color].qhead,
4284 mem, vmp_pageq);
4285 #endif
4286 mem = nxt;
4287 }
4288 vm_pageout_vminfo.vm_page_pages_freed += pg_count;
4289 vm_page_free_count += pg_count;
4290 avail_free_count = vm_page_free_count;
4291
4292 VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, pg_count, 0, 0, 0);
4293
4294 if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) {
4295 if (avail_free_count < vm_page_free_wanted_privileged) {
4296 need_priv_wakeup = avail_free_count;
4297 vm_page_free_wanted_privileged -= avail_free_count;
4298 avail_free_count = 0;
4299 } else {
4300 need_priv_wakeup = vm_page_free_wanted_privileged;
4301 avail_free_count -= vm_page_free_wanted_privileged;
4302 vm_page_free_wanted_privileged = 0;
4303 }
4304 }
4305 #if CONFIG_SECLUDED_MEMORY
4306 if (vm_page_free_wanted_secluded > 0 &&
4307 avail_free_count > vm_page_free_reserved) {
4308 unsigned int available_pages;
4309 available_pages = (avail_free_count -
4310 vm_page_free_reserved);
4311 if (available_pages <
4312 vm_page_free_wanted_secluded) {
4313 need_wakeup_secluded = available_pages;
4314 vm_page_free_wanted_secluded -=
4315 available_pages;
4316 avail_free_count -= available_pages;
4317 } else {
4318 need_wakeup_secluded =
4319 vm_page_free_wanted_secluded;
4320 avail_free_count -=
4321 vm_page_free_wanted_secluded;
4322 vm_page_free_wanted_secluded = 0;
4323 }
4324 }
4325 #endif /* CONFIG_SECLUDED_MEMORY */
4326 if (vm_page_free_wanted > 0 && avail_free_count > vm_page_free_reserved) {
4327 unsigned int available_pages;
4328
4329 available_pages = avail_free_count - vm_page_free_reserved;
4330
4331 if (available_pages >= vm_page_free_wanted) {
4332 need_wakeup = vm_page_free_wanted;
4333 vm_page_free_wanted = 0;
4334 } else {
4335 need_wakeup = available_pages;
4336 vm_page_free_wanted -= available_pages;
4337 }
4338 }
4339 lck_mtx_unlock(&vm_page_queue_free_lock);
4340
4341 priv_wakeup_event = NULL;
4342 secluded_wakeup_event = NULL;
4343 normal_wakeup_event = NULL;
4344
4345 priv_wakeup_all = FALSE;
4346 secluded_wakeup_all = FALSE;
4347 normal_wakeup_all = FALSE;
4348
4349
4350 if (need_priv_wakeup != 0) {
4351 /*
4352 * There shouldn't be that many VM-privileged threads,
4353 * so let's wake them all up, even if we don't quite
4354 * have enough pages to satisfy them all.
4355 */
4356 priv_wakeup_event = (event_t)&vm_page_free_wanted_privileged;
4357 priv_wakeup_all = TRUE;
4358 }
4359 #if CONFIG_SECLUDED_MEMORY
4360 if (need_wakeup_secluded != 0 &&
4361 vm_page_free_wanted_secluded == 0) {
4362 secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
4363 secluded_wakeup_all = TRUE;
4364 need_wakeup_secluded = 0;
4365 } else {
4366 secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
4367 }
4368 #endif /* CONFIG_SECLUDED_MEMORY */
4369 if (need_wakeup != 0 && vm_page_free_wanted == 0) {
4370 /*
4371 * We don't expect to have any more waiters
4372 * after this, so let's wake them all up at
4373 * once.
4374 */
4375 normal_wakeup_event = (event_t) &vm_page_free_count;
4376 normal_wakeup_all = TRUE;
4377 need_wakeup = 0;
4378 } else {
4379 normal_wakeup_event = (event_t) &vm_page_free_count;
4380 }
4381
4382 if (priv_wakeup_event ||
4383 #if CONFIG_SECLUDED_MEMORY
4384 secluded_wakeup_event ||
4385 #endif /* CONFIG_SECLUDED_MEMORY */
4386 normal_wakeup_event) {
4387 if (vps_dynamic_priority_enabled == TRUE) {
4388 thread_t thread_woken = NULL;
4389
4390 if (priv_wakeup_all == TRUE) {
4391 wakeup_all_with_inheritor(priv_wakeup_event, THREAD_AWAKENED);
4392 }
4393
4394 #if CONFIG_SECLUDED_MEMORY
4395 if (secluded_wakeup_all == TRUE) {
4396 wakeup_all_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED);
4397 }
4398
4399 while (need_wakeup_secluded-- != 0) {
4400 /*
4401 * Wake up one waiter per page we just released.
4402 */
4403 wakeup_one_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
4404 thread_deallocate(thread_woken);
4405 }
4406 #endif /* CONFIG_SECLUDED_MEMORY */
4407
4408 if (normal_wakeup_all == TRUE) {
4409 wakeup_all_with_inheritor(normal_wakeup_event, THREAD_AWAKENED);
4410 }
4411
4412 while (need_wakeup-- != 0) {
4413 /*
4414 * Wake up one waiter per page we just released.
4415 */
4416 wakeup_one_with_inheritor(normal_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
4417 thread_deallocate(thread_woken);
4418 }
4419 } else {
4420 /*
4421 * Non-priority-aware wakeups.
4422 */
4423
4424 if (priv_wakeup_all == TRUE) {
4425 thread_wakeup(priv_wakeup_event);
4426 }
4427
4428 #if CONFIG_SECLUDED_MEMORY
4429 if (secluded_wakeup_all == TRUE) {
4430 thread_wakeup(secluded_wakeup_event);
4431 }
4432
4433 while (need_wakeup_secluded-- != 0) {
4434 /*
4435 * Wake up one waiter per page we just released.
4436 */
4437 thread_wakeup_one(secluded_wakeup_event);
4438 }
4439
4440 #endif /* CONFIG_SECLUDED_MEMORY */
4441 if (normal_wakeup_all == TRUE) {
4442 thread_wakeup(normal_wakeup_event);
4443 }
4444
4445 while (need_wakeup-- != 0) {
4446 /*
4447 * Wake up one waiter per page we just released.
4448 */
4449 thread_wakeup_one(normal_wakeup_event);
4450 }
4451 }
4452 }
4453
4454 VM_CHECK_MEMORYSTATUS;
4455 }
4456 }
4457 }
4458
4459
4460 /*
4461 * vm_page_wire:
4462 *
4463 * Mark this page as wired down by yet
4464 * another map, removing it from paging queues
4465 * as necessary.
4466 *
4467 * The page's object and the page queues must be locked.
4468 */
4469
4470
4471 void
vm_page_wire(vm_page_t mem,vm_tag_t tag,boolean_t check_memorystatus)4472 vm_page_wire(
4473 vm_page_t mem,
4474 vm_tag_t tag,
4475 boolean_t check_memorystatus)
4476 {
4477 vm_object_t m_object;
4478
4479 m_object = VM_PAGE_OBJECT(mem);
4480
4481 // dbgLog(current_thread(), mem->vmp_offset, m_object, 1); /* (TEST/DEBUG) */
4482
4483 VM_PAGE_CHECK(mem);
4484 if (m_object) {
4485 vm_object_lock_assert_exclusive(m_object);
4486 } else {
4487 /*
4488 * In theory, the page should be in an object before it
4489 * gets wired, since we need to hold the object lock
4490 * to update some fields in the page structure.
4491 * However, some code (i386 pmap, for example) might want
4492 * to wire a page before it gets inserted into an object.
4493 * That's somewhat OK, as long as nobody else can get to
4494 * that page and update it at the same time.
4495 */
4496 }
4497 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4498 if (!VM_PAGE_WIRED(mem)) {
4499 if (mem->vmp_laundry) {
4500 vm_pageout_steal_laundry(mem, TRUE);
4501 }
4502
4503 vm_page_queues_remove(mem, TRUE);
4504
4505 assert(mem->vmp_wire_count == 0);
4506 mem->vmp_q_state = VM_PAGE_IS_WIRED;
4507
4508 if (m_object) {
4509 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
4510 VM_OBJECT_WIRED_PAGE_ADD(m_object, mem);
4511 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, tag);
4512
4513 assert(m_object->resident_page_count >=
4514 m_object->wired_page_count);
4515 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
4516 assert(vm_page_purgeable_count > 0);
4517 OSAddAtomic(-1, &vm_page_purgeable_count);
4518 OSAddAtomic(1, &vm_page_purgeable_wired_count);
4519 }
4520 if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
4521 m_object->purgable == VM_PURGABLE_EMPTY) &&
4522 m_object->vo_owner != TASK_NULL) {
4523 task_t owner;
4524 int ledger_idx_volatile;
4525 int ledger_idx_nonvolatile;
4526 int ledger_idx_volatile_compressed;
4527 int ledger_idx_nonvolatile_compressed;
4528 boolean_t do_footprint;
4529
4530 owner = VM_OBJECT_OWNER(m_object);
4531 vm_object_ledger_tag_ledgers(
4532 m_object,
4533 &ledger_idx_volatile,
4534 &ledger_idx_nonvolatile,
4535 &ledger_idx_volatile_compressed,
4536 &ledger_idx_nonvolatile_compressed,
4537 &do_footprint);
4538 /* less volatile bytes */
4539 ledger_debit(owner->ledger,
4540 ledger_idx_volatile,
4541 PAGE_SIZE);
4542 /* more not-quite-volatile bytes */
4543 ledger_credit(owner->ledger,
4544 ledger_idx_nonvolatile,
4545 PAGE_SIZE);
4546 if (do_footprint) {
4547 /* more footprint */
4548 ledger_credit(owner->ledger,
4549 task_ledgers.phys_footprint,
4550 PAGE_SIZE);
4551 }
4552 }
4553 if (m_object->all_reusable) {
4554 /*
4555 * Wired pages are not counted as "re-usable"
4556 * in "all_reusable" VM objects, so nothing
4557 * to do here.
4558 */
4559 } else if (mem->vmp_reusable) {
4560 /*
4561 * This page is not "re-usable" when it's
4562 * wired, so adjust its state and the
4563 * accounting.
4564 */
4565 vm_object_reuse_pages(m_object,
4566 mem->vmp_offset,
4567 mem->vmp_offset + PAGE_SIZE_64,
4568 FALSE);
4569 }
4570 }
4571 assert(!mem->vmp_reusable);
4572
4573 if (!mem->vmp_private && !mem->vmp_fictitious && !mem->vmp_gobbled) {
4574 vm_page_wire_count++;
4575 }
4576 if (mem->vmp_gobbled) {
4577 vm_page_gobble_count--;
4578 }
4579 mem->vmp_gobbled = FALSE;
4580
4581 if (check_memorystatus == TRUE) {
4582 VM_CHECK_MEMORYSTATUS;
4583 }
4584 }
4585 assert(!mem->vmp_gobbled);
4586 assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
4587 mem->vmp_wire_count++;
4588 if (__improbable(mem->vmp_wire_count == 0)) {
4589 panic("vm_page_wire(%p): wire_count overflow", mem);
4590 }
4591 VM_PAGE_CHECK(mem);
4592 }
4593
4594 /*
4595 * vm_page_unwire:
4596 *
4597 * Release one wiring of this page, potentially
4598 * enabling it to be paged again.
4599 *
4600 * The page's object and the page queues must be locked.
4601 */
4602 void
vm_page_unwire(vm_page_t mem,boolean_t queueit)4603 vm_page_unwire(
4604 vm_page_t mem,
4605 boolean_t queueit)
4606 {
4607 vm_object_t m_object;
4608
4609 m_object = VM_PAGE_OBJECT(mem);
4610
4611 // dbgLog(current_thread(), mem->vmp_offset, m_object, 0); /* (TEST/DEBUG) */
4612
4613 VM_PAGE_CHECK(mem);
4614 assert(VM_PAGE_WIRED(mem));
4615 assert(mem->vmp_wire_count > 0);
4616 assert(!mem->vmp_gobbled);
4617 assert(m_object != VM_OBJECT_NULL);
4618 vm_object_lock_assert_exclusive(m_object);
4619 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4620 if (--mem->vmp_wire_count == 0) {
4621 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
4622
4623 VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
4624 VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
4625 VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
4626 if (!mem->vmp_private && !mem->vmp_fictitious) {
4627 vm_page_wire_count--;
4628 }
4629
4630 assert(m_object->resident_page_count >=
4631 m_object->wired_page_count);
4632 if (m_object->purgable == VM_PURGABLE_VOLATILE) {
4633 OSAddAtomic(+1, &vm_page_purgeable_count);
4634 assert(vm_page_purgeable_wired_count > 0);
4635 OSAddAtomic(-1, &vm_page_purgeable_wired_count);
4636 }
4637 if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
4638 m_object->purgable == VM_PURGABLE_EMPTY) &&
4639 m_object->vo_owner != TASK_NULL) {
4640 task_t owner;
4641 int ledger_idx_volatile;
4642 int ledger_idx_nonvolatile;
4643 int ledger_idx_volatile_compressed;
4644 int ledger_idx_nonvolatile_compressed;
4645 boolean_t do_footprint;
4646
4647 owner = VM_OBJECT_OWNER(m_object);
4648 vm_object_ledger_tag_ledgers(
4649 m_object,
4650 &ledger_idx_volatile,
4651 &ledger_idx_nonvolatile,
4652 &ledger_idx_volatile_compressed,
4653 &ledger_idx_nonvolatile_compressed,
4654 &do_footprint);
4655 /* more volatile bytes */
4656 ledger_credit(owner->ledger,
4657 ledger_idx_volatile,
4658 PAGE_SIZE);
4659 /* less not-quite-volatile bytes */
4660 ledger_debit(owner->ledger,
4661 ledger_idx_nonvolatile,
4662 PAGE_SIZE);
4663 if (do_footprint) {
4664 /* less footprint */
4665 ledger_debit(owner->ledger,
4666 task_ledgers.phys_footprint,
4667 PAGE_SIZE);
4668 }
4669 }
4670 assert(m_object != kernel_object);
4671 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
4672
4673 if (queueit == TRUE) {
4674 if (m_object->purgable == VM_PURGABLE_EMPTY) {
4675 vm_page_deactivate(mem);
4676 } else {
4677 vm_page_activate(mem);
4678 }
4679 }
4680
4681 VM_CHECK_MEMORYSTATUS;
4682 }
4683 VM_PAGE_CHECK(mem);
4684 }
4685
4686 /*
4687 * vm_page_deactivate:
4688 *
4689 * Returns the given page to the inactive list,
4690 * indicating that no physical maps have access
4691 * to this page. [Used by the physical mapping system.]
4692 *
4693 * The page queues must be locked.
4694 */
4695 void
vm_page_deactivate(vm_page_t m)4696 vm_page_deactivate(
4697 vm_page_t m)
4698 {
4699 vm_page_deactivate_internal(m, TRUE);
4700 }
4701
4702
/*
 * vm_page_deactivate_internal:
 *
 * Worker for vm_page_deactivate(): move the page toward the
 * inactive queue (or the throttled / speculative queue when
 * appropriate).  When "clear_hw_reference" is TRUE the pmap-level
 * reference bit is cleared as well.
 *
 * The page queues must be locked.
 */
void
vm_page_deactivate_internal(
	vm_page_t       m,
	boolean_t       clear_hw_reference)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	assert(m_object != kernel_object);
	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

//	dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6);	/* (TEST/DEBUG) */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * This page is no longer very interesting.  If it was
	 * interesting (active or inactive/referenced), then we
	 * clear the reference bit and (re)enter it in the
	 * inactive queue.  Note wired pages should not have
	 * their reference bit cleared.
	 */
	assert( !(m->vmp_absent && !m->vmp_unusual));

	if (m->vmp_gobbled) {           /* can this happen? */
		assert( !VM_PAGE_WIRED(m));

		if (!m->vmp_private && !m->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
		m->vmp_gobbled = FALSE;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
	    VM_PAGE_WIRED(m)) {
		return;
	}
	if (!m->vmp_absent && clear_hw_reference == TRUE) {
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
	}

	m->vmp_reference = FALSE;
	m->vmp_no_cache = FALSE;

	if (!VM_PAGE_INACTIVE(m)) {
		vm_page_queues_remove(m, FALSE);

		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    m->vmp_dirty && m_object->internal &&
		    (m_object->purgable == VM_PURGABLE_DENY ||
		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
			/* no dynamic paging: dirty internal pages go to the throttled queue */
			vm_page_check_pageable_safe(m);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;
		} else {
			if (m_object->named && m_object->ref_count == 1) {
				/* sole-referenced named object: age the page speculatively */
				vm_page_speculate(m, FALSE);
#if DEVELOPMENT || DEBUG
				vm_page_speculative_recreated++;
#endif
			} else {
				vm_page_enqueue_inactive(m, FALSE);
			}
		}
	}
}
4781
4782 /*
4783 * vm_page_enqueue_cleaned
4784 *
4785 * Put the page on the cleaned queue, mark it cleaned, etc.
4786 * Being on the cleaned queue (and having m->clean_queue set)
4787 * does ** NOT ** guarantee that the page is clean!
4788 *
4789 * Call with the queues lock held.
4790 */
4791
4792 void
vm_page_enqueue_cleaned(vm_page_t m)4793 vm_page_enqueue_cleaned(vm_page_t m)
4794 {
4795 vm_object_t m_object;
4796
4797 m_object = VM_PAGE_OBJECT(m);
4798
4799 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
4800 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4801 assert( !(m->vmp_absent && !m->vmp_unusual));
4802
4803 if (VM_PAGE_WIRED(m)) {
4804 return;
4805 }
4806
4807 if (m->vmp_gobbled) {
4808 if (!m->vmp_private && !m->vmp_fictitious) {
4809 vm_page_wire_count--;
4810 }
4811 vm_page_gobble_count--;
4812 m->vmp_gobbled = FALSE;
4813 }
4814 /*
4815 * if this page is currently on the pageout queue, we can't do the
4816 * vm_page_queues_remove (which doesn't handle the pageout queue case)
4817 * and we can't remove it manually since we would need the object lock
4818 * (which is not required here) to decrement the activity_in_progress
4819 * reference which is held on the object while the page is in the pageout queue...
4820 * just let the normal laundry processing proceed
4821 */
4822 if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
4823 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
4824 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
4825 return;
4826 }
4827 vm_page_queues_remove(m, FALSE);
4828
4829 vm_page_check_pageable_safe(m);
4830 vm_page_queue_enter(&vm_page_queue_cleaned, m, vmp_pageq);
4831 m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
4832 vm_page_cleaned_count++;
4833
4834 vm_page_inactive_count++;
4835 if (m_object->internal) {
4836 vm_page_pageable_internal_count++;
4837 } else {
4838 vm_page_pageable_external_count++;
4839 }
4840 #if CONFIG_BACKGROUND_QUEUE
4841 if (m->vmp_in_background) {
4842 vm_page_add_to_backgroundq(m, TRUE);
4843 }
4844 #endif
4845 VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
4846 }
4847
4848 /*
4849 * vm_page_activate:
4850 *
4851 * Put the specified page on the active list (if appropriate).
4852 *
4853 * The page queues must be locked.
4854 */
4855
4856 void
vm_page_activate(vm_page_t m)4857 vm_page_activate(
4858 vm_page_t m)
4859 {
4860 vm_object_t m_object;
4861
4862 m_object = VM_PAGE_OBJECT(m);
4863
4864 VM_PAGE_CHECK(m);
4865 #ifdef FIXME_4778297
4866 assert(m_object != kernel_object);
4867 #endif
4868 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
4869 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4870 assert( !(m->vmp_absent && !m->vmp_unusual));
4871
4872 if (m->vmp_gobbled) {
4873 assert( !VM_PAGE_WIRED(m));
4874 if (!m->vmp_private && !m->vmp_fictitious) {
4875 vm_page_wire_count--;
4876 }
4877 vm_page_gobble_count--;
4878 m->vmp_gobbled = FALSE;
4879 }
4880 /*
4881 * if this page is currently on the pageout queue, we can't do the
4882 * vm_page_queues_remove (which doesn't handle the pageout queue case)
4883 * and we can't remove it manually since we would need the object lock
4884 * (which is not required here) to decrement the activity_in_progress
4885 * reference which is held on the object while the page is in the pageout queue...
4886 * just let the normal laundry processing proceed
4887 */
4888 if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
4889 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
4890 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
4891 return;
4892 }
4893
4894 #if DEBUG
4895 if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) {
4896 panic("vm_page_activate: already active");
4897 }
4898 #endif
4899
4900 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
4901 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4902 DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
4903 }
4904
4905 vm_page_queues_remove(m, FALSE);
4906
4907 if (!VM_PAGE_WIRED(m)) {
4908 vm_page_check_pageable_safe(m);
4909 if (!VM_DYNAMIC_PAGING_ENABLED() &&
4910 m->vmp_dirty && m_object->internal &&
4911 (m_object->purgable == VM_PURGABLE_DENY ||
4912 m_object->purgable == VM_PURGABLE_NONVOLATILE ||
4913 m_object->purgable == VM_PURGABLE_VOLATILE)) {
4914 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
4915 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
4916 vm_page_throttled_count++;
4917 } else {
4918 #if CONFIG_SECLUDED_MEMORY
4919 if (secluded_for_filecache &&
4920 vm_page_secluded_target != 0 &&
4921 num_tasks_can_use_secluded_mem == 0 &&
4922 m_object->eligible_for_secluded) {
4923 vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
4924 m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
4925 vm_page_secluded_count++;
4926 VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
4927 vm_page_secluded_count_inuse++;
4928 assert(!m_object->internal);
4929 // vm_page_pageable_external_count++;
4930 } else
4931 #endif /* CONFIG_SECLUDED_MEMORY */
4932 vm_page_enqueue_active(m, FALSE);
4933 }
4934 m->vmp_reference = TRUE;
4935 m->vmp_no_cache = FALSE;
4936 }
4937 VM_PAGE_CHECK(m);
4938 }
4939
4940
4941 /*
4942 * vm_page_speculate:
4943 *
4944 * Put the specified page on the speculative list (if appropriate).
4945 *
4946 * The page queues must be locked.
4947 */
4948 void
vm_page_speculate(vm_page_t m,boolean_t new)4949 vm_page_speculate(
4950 vm_page_t m,
4951 boolean_t new)
4952 {
4953 struct vm_speculative_age_q *aq;
4954 vm_object_t m_object;
4955
4956 m_object = VM_PAGE_OBJECT(m);
4957
4958 VM_PAGE_CHECK(m);
4959 vm_page_check_pageable_safe(m);
4960
4961 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
4962 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
4963 assert( !(m->vmp_absent && !m->vmp_unusual));
4964 assert(m_object->internal == FALSE);
4965
4966 /*
4967 * if this page is currently on the pageout queue, we can't do the
4968 * vm_page_queues_remove (which doesn't handle the pageout queue case)
4969 * and we can't remove it manually since we would need the object lock
4970 * (which is not required here) to decrement the activity_in_progress
4971 * reference which is held on the object while the page is in the pageout queue...
4972 * just let the normal laundry processing proceed
4973 */
4974 if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
4975 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
4976 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
4977 return;
4978 }
4979
4980 vm_page_queues_remove(m, FALSE);
4981
4982 if (!VM_PAGE_WIRED(m)) {
4983 mach_timespec_t ts;
4984 clock_sec_t sec;
4985 clock_nsec_t nsec;
4986
4987 clock_get_system_nanotime(&sec, &nsec);
4988 ts.tv_sec = (unsigned int) sec;
4989 ts.tv_nsec = nsec;
4990
4991 if (vm_page_speculative_count == 0) {
4992 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
4993 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
4994
4995 aq = &vm_page_queue_speculative[speculative_age_index];
4996
4997 /*
4998 * set the timer to begin a new group
4999 */
5000 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
5001 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
5002
5003 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
5004 } else {
5005 aq = &vm_page_queue_speculative[speculative_age_index];
5006
5007 if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
5008 speculative_age_index++;
5009
5010 if (speculative_age_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
5011 speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
5012 }
5013 if (speculative_age_index == speculative_steal_index) {
5014 speculative_steal_index = speculative_age_index + 1;
5015
5016 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
5017 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
5018 }
5019 }
5020 aq = &vm_page_queue_speculative[speculative_age_index];
5021
5022 if (!vm_page_queue_empty(&aq->age_q)) {
5023 vm_page_speculate_ageit(aq);
5024 }
5025
5026 aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
5027 aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
5028
5029 ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
5030 }
5031 }
5032 vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
5033 m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
5034 vm_page_speculative_count++;
5035 vm_page_pageable_external_count++;
5036
5037 if (new == TRUE) {
5038 vm_object_lock_assert_exclusive(m_object);
5039
5040 m_object->pages_created++;
5041 #if DEVELOPMENT || DEBUG
5042 vm_page_speculative_created++;
5043 #endif
5044 }
5045 }
5046 VM_PAGE_CHECK(m);
5047 }
5048
5049
/*
 * move pages from the specified aging bin to
 * the speculative bin that pageout_scan claims from
 *
 * The page queues must be locked.
 */
void
vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
{
	struct vm_speculative_age_q *sq;
	vm_page_t t;

	/* sq is the terminal "aged" bin that pageout_scan steals from */
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	if (vm_page_queue_empty(&sq->age_q)) {
		/*
		 * aged bin is empty... take over aq's whole list by copying
		 * its head's next/prev, then re-point the first and last
		 * pages' neighbor links back at sq's queue head (page queue
		 * links are stored packed, hence the UNPACK/PACK pairs)
		 */
		sq->age_q.next = aq->age_q.next;
		sq->age_q.prev = aq->age_q.prev;

		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
		t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);

		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
	} else {
		/*
		 * aged bin already has pages... append aq's list after sq's
		 * current tail: old tail -> aq's first page, aq's last page
		 * points back at sq's head and becomes the new tail
		 */
		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
		t->vmp_pageq.next = aq->age_q.next;

		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
		t->vmp_pageq.prev = sq->age_q.prev;

		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);

		sq->age_q.prev = aq->age_q.prev;
	}
	/* aq has been fully drained... leave it as a valid empty queue */
	vm_page_queue_init(&aq->age_q);
}
5087
5088
5089 void
vm_page_lru(vm_page_t m)5090 vm_page_lru(
5091 vm_page_t m)
5092 {
5093 VM_PAGE_CHECK(m);
5094 assert(VM_PAGE_OBJECT(m) != kernel_object);
5095 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
5096
5097 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
5098
5099 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
5100 /*
5101 * we don't need to do all the other work that
5102 * vm_page_queues_remove and vm_page_enqueue_inactive
5103 * bring along for the ride
5104 */
5105 assert(!m->vmp_laundry);
5106 assert(!m->vmp_private);
5107
5108 m->vmp_no_cache = FALSE;
5109
5110 vm_page_queue_remove(&vm_page_queue_inactive, m, vmp_pageq);
5111 vm_page_queue_enter(&vm_page_queue_inactive, m, vmp_pageq);
5112
5113 return;
5114 }
5115 /*
5116 * if this page is currently on the pageout queue, we can't do the
5117 * vm_page_queues_remove (which doesn't handle the pageout queue case)
5118 * and we can't remove it manually since we would need the object lock
5119 * (which is not required here) to decrement the activity_in_progress
5120 * reference which is held on the object while the page is in the pageout queue...
5121 * just let the normal laundry processing proceed
5122 */
5123 if (m->vmp_laundry || m->vmp_private ||
5124 (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
5125 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
5126 VM_PAGE_WIRED(m)) {
5127 return;
5128 }
5129
5130 m->vmp_no_cache = FALSE;
5131
5132 vm_page_queues_remove(m, FALSE);
5133
5134 vm_page_enqueue_inactive(m, FALSE);
5135 }
5136
5137
/*
 * Move every page on the throttled queue to the head of the active
 * queue in one splice, fixing up the global page counts to match.
 * Only meaningful when dynamic paging is disabled (the throttled
 * queue is otherwise unused).
 */
void
vm_page_reactivate_all_throttled(void)
{
	vm_page_t first_throttled, last_throttled;
	vm_page_t first_active;
	vm_page_t m;
	int extra_active_count;
	int extra_internal_count, extra_external_count;
	vm_object_t m_object;

	if (!VM_DYNAMIC_PAGING_ENABLED()) {
		return;
	}

	extra_active_count = 0;
	extra_internal_count = 0;
	extra_external_count = 0;
	vm_page_lock_queues();
	if (!vm_page_queue_empty(&vm_page_queue_throttled)) {
		/*
		 * Switch "throttled" pages to "active".
		 * First pass: flip each page's q_state and tally
		 * internal/external counts for the fixups below.
		 */
		vm_page_queue_iterate(&vm_page_queue_throttled, m, vmp_pageq) {
			VM_PAGE_CHECK(m);
			assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

			m_object = VM_PAGE_OBJECT(m);

			extra_active_count++;
			if (m_object->internal) {
				extra_internal_count++;
			} else {
				extra_external_count++;
			}

			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
			VM_PAGE_CHECK(m);
#if CONFIG_BACKGROUND_QUEUE
			if (m->vmp_in_background) {
				vm_page_add_to_backgroundq(m, FALSE);
			}
#endif
		}

		/*
		 * Transfer the entire throttled queue to a regular LRU page queues.
		 * We insert it at the head of the active queue, so that these pages
		 * get re-evaluated by the LRU algorithm first, since they've been
		 * completely out of it until now.
		 * (the splice is done by hand: link the throttled chain in front
		 * of first_active and point the active queue head at it)
		 */
		first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
		last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled);
		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
		if (vm_page_queue_empty(&vm_page_queue_active)) {
			/* empty active queue: the spliced chain becomes the whole queue */
			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		} else {
			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		}
		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
		first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
		last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);

#if DEBUG
		printf("reactivated %d throttled pages\n", vm_page_throttled_count);
#endif
		/* throttled queue is now empty; reinitialize its head */
		vm_page_queue_init(&vm_page_queue_throttled);
		/*
		 * Adjust the global page counts.
		 */
		vm_page_active_count += extra_active_count;
		vm_page_pageable_internal_count += extra_internal_count;
		vm_page_pageable_external_count += extra_external_count;
		vm_page_throttled_count = 0;
	}
	assert(vm_page_throttled_count == 0);
	assert(vm_page_queue_empty(&vm_page_queue_throttled));
	vm_page_unlock_queues();
}
5216
5217
/*
 * move pages from the indicated local queue to the global active queue
 * its ok to fail if we're below the hard limit and force == FALSE
 * the nolocks == TRUE case is to allow this function to be run on
 * the hibernate path
 */

void
vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
{
	struct vpl      *lq;
	vm_page_t       first_local, last_local;
	vm_page_t       first_active;
	vm_page_t       m;
	uint32_t        count = 0;

	/* local queues may not be configured at all */
	if (vm_page_local_q == NULL) {
		return;
	}

	lq = zpercpu_get_cpu(vm_page_local_q, lid);

	if (nolocks == FALSE) {
		/*
		 * below the hard limit and not forced: this drain is
		 * opportunistic, so only a trylock is attempted on the
		 * page queues; otherwise block until we have them
		 */
		if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
			if (!vm_page_trylockspin_queues()) {
				return;
			}
		} else {
			vm_page_lockspin_queues();
		}

		VPL_LOCK(&lq->vpl_lock);
	}
	if (lq->vpl_count) {
		/*
		 * Switch "local" pages to "active".
		 */
		assert(!vm_page_queue_empty(&lq->vpl_queue));

		vm_page_queue_iterate(&lq->vpl_queue, m, vmp_pageq) {
			VM_PAGE_CHECK(m);
			vm_page_check_pageable_safe(m);
			assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
			assert(!m->vmp_fictitious);

			/* every page on this queue must have been queued by this cpu */
			if (m->vmp_local_id != lid) {
				panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
			}

			m->vmp_local_id = 0;
			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
			VM_PAGE_CHECK(m);
#if CONFIG_BACKGROUND_QUEUE
			if (m->vmp_in_background) {
				vm_page_add_to_backgroundq(m, FALSE);
			}
#endif
			count++;
		}
		/* cross-check the walk against the queue's own bookkeeping */
		if (count != lq->vpl_count) {
			panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d", count, lq->vpl_count);
		}

		/*
		 * Transfer the entire local queue to a regular LRU page queues.
		 * (spliced by hand at the head of the active queue)
		 */
		first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue);
		last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue);
		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

		if (vm_page_queue_empty(&vm_page_queue_active)) {
			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
		} else {
			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
		}
		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
		first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
		last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);

		vm_page_queue_init(&lq->vpl_queue);
		/*
		 * Adjust the global page counts.
		 */
		vm_page_active_count += lq->vpl_count;
		vm_page_pageable_internal_count += lq->vpl_internal_count;
		vm_page_pageable_external_count += lq->vpl_external_count;
		lq->vpl_count = 0;
		lq->vpl_internal_count = 0;
		lq->vpl_external_count = 0;
	}
	assert(vm_page_queue_empty(&lq->vpl_queue));

	if (nolocks == FALSE) {
		VPL_UNLOCK(&lq->vpl_lock);

		/* top up the inactive queue in proportion to what we just activated */
		vm_page_balance_inactive(count / 4);
		vm_page_unlock_queues();
	}
}
5317
/*
 * vm_page_part_zero_fill:
 *
 *	Zero-fill a part of the page: bytes [m_pa, m_pa + len) of 'm'.
 */
#define PMAP_ZERO_PART_PAGE_IMPLEMENTED
void
vm_page_part_zero_fill(
	vm_page_t       m,
	vm_offset_t     m_pa,
	vm_size_t       len)
{
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(m);
#endif

#ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
	/* the pmap layer zeroes the range directly by physical page number */
	pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len);
#else
	/*
	 * fallback for pmaps without partial-page zeroing:
	 * grab a scratch page (blocking until one is available),
	 * zero it, copy the pieces of 'm' that lie outside
	 * [m_pa, m_pa + len) into it, then copy the whole scratch
	 * page back over 'm'
	 */
	vm_page_t       tmp;
	while (1) {
		tmp = vm_page_grab();
		if (tmp == VM_PAGE_NULL) {
			vm_page_wait(THREAD_UNINT);
			continue;
		}
		break;
	}
	vm_page_zero_fill(tmp);
	if (m_pa != 0) {
		vm_page_part_copy(m, 0, tmp, 0, m_pa);
	}
	if ((m_pa + len) < PAGE_SIZE) {
		vm_page_part_copy(m, m_pa + len, tmp,
		    m_pa + len, PAGE_SIZE - (m_pa + len));
	}
	vm_page_copy(tmp, m);
	VM_PAGE_FREE(tmp);
#endif
}
5362
/*
 * vm_page_zero_fill:
 *
 *	Zero-fill the specified page.
 *	Works on the physical page number, so no kernel mapping of
 *	'm' is required here.
 */
void
vm_page_zero_fill(
	vm_page_t       m)
{
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(m);
#endif

//	dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0);	/* (BRINGUP) */
	pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
}
5383
/*
 * vm_page_part_copy:
 *
 *	copy part of one page to another:
 *	'len' bytes from offset src_pa of src_m to offset dst_pa of dst_m.
 *	Delegates to the pmap layer, which works on physical page numbers.
 */

void
vm_page_part_copy(
	vm_page_t       src_m,
	vm_offset_t     src_pa,
	vm_page_t       dst_m,
	vm_offset_t     dst_pa,
	vm_size_t       len)
{
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(src_m);
	VM_PAGE_CHECK(dst_m);
#endif
	pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa,
	    VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len);
}
5409
/*
 * vm_page_copy:
 *
 *	Copy one page to another, validating code-signed source pages
 *	first and propagating the tainted/nx bits to the destination.
 */

/* counters for code-signing work done on behalf of page copies (observability) */
int vm_page_copy_cs_validations = 0;
int vm_page_copy_cs_tainted = 0;

void
vm_page_copy(
	vm_page_t       src_m,
	vm_page_t       dest_m)
{
	vm_object_t     src_m_object;

	src_m_object = VM_PAGE_OBJECT(src_m);

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(src_m);
	VM_PAGE_CHECK(dest_m);
#endif
	/*
	 * NOTE(review): the lock assertion runs before the VM_OBJECT_NULL
	 * check below — presumably src_m_object can never actually be NULL
	 * here; confirm, or the assertion macro must tolerate NULL.
	 */
	vm_object_lock_assert_held(src_m_object);

	if (src_m_object != VM_OBJECT_NULL &&
	    src_m_object->code_signed) {
		/*
		 * We're copying a page from a code-signed object.
		 * Whoever ends up mapping the copy page might care about
		 * the original page's integrity, so let's validate the
		 * source page now.
		 */
		vm_page_copy_cs_validations++;
		vm_page_validate_cs(src_m, PAGE_SIZE, 0);
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(codesigned_copy,
		    vm_object_t, src_m_object,
		    vm_object_offset_t, src_m->vmp_offset,
		    int, src_m->vmp_cs_validated,
		    int, src_m->vmp_cs_tainted);
#endif /* DEVELOPMENT || DEBUG */
	}

	/*
	 * Propagate the cs_tainted bit to the copy page. Do not propagate
	 * the cs_validated bit.
	 */
	dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
	dest_m->vmp_cs_nx = src_m->vmp_cs_nx;
	if (dest_m->vmp_cs_tainted) {
		vm_page_copy_cs_tainted++;
	}
	dest_m->vmp_error = src_m->vmp_error; /* sliding src_m might have failed... */
	pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m));
}
5469
5470 #if MACH_ASSERT
/*
 * Debug dump of a vm_page_t's interesting fields.
 * In the flag lines below a "!" prefix means the bit is clear.
 */
static void
_vm_page_print(
	vm_page_t       p)
{
	printf("vm_page %p: \n", p);
	printf("  pageq: next=%p prev=%p\n",
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
	printf("  listq: next=%p prev=%p\n",
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
	printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
	printf("  object=%p offset=0x%llx\n", VM_PAGE_OBJECT(p), p->vmp_offset);
	printf("  wire_count=%u\n", p->vmp_wire_count);
	printf("  q_state=%u\n", p->vmp_q_state);

	printf("  %slaundry, %sref, %sgobbled, %sprivate\n",
	    (p->vmp_laundry ? "" : "!"),
	    (p->vmp_reference ? "" : "!"),
	    (p->vmp_gobbled ? "" : "!"),
	    (p->vmp_private ? "" : "!"));
	printf("  %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
	    (p->vmp_busy ? "" : "!"),
	    (p->vmp_wanted ? "" : "!"),
	    (p->vmp_tabled ? "" : "!"),
	    (p->vmp_fictitious ? "" : "!"),
	    (p->vmp_pmapped ? "" : "!"),
	    (p->vmp_wpmapped ? "" : "!"));
	printf("  %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
	    (p->vmp_free_when_done ? "" : "!"),
	    (p->vmp_absent ? "" : "!"),
	    (p->vmp_error ? "" : "!"),
	    (p->vmp_dirty ? "" : "!"),
	    (p->vmp_cleaning ? "" : "!"),
	    (p->vmp_precious ? "" : "!"),
	    (p->vmp_clustered ? "" : "!"));
	printf("  %soverwriting, %srestart, %sunusual\n",
	    (p->vmp_overwriting ? "" : "!"),
	    (p->vmp_restart ? "" : "!"),
	    (p->vmp_unusual ? "" : "!"));
	printf("  cs_validated=%d, cs_tainted=%d, cs_nx=%d, %sno_cache\n",
	    p->vmp_cs_validated,
	    p->vmp_cs_tainted,
	    p->vmp_cs_nx,
	    (p->vmp_no_cache ? "" : "!"));

	printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
}
5519
5520 /*
5521 * Check that the list of pages is ordered by
5522 * ascending physical address and has no holes.
5523 */
5524 static int
vm_page_verify_contiguous(vm_page_t pages,unsigned int npages)5525 vm_page_verify_contiguous(
5526 vm_page_t pages,
5527 unsigned int npages)
5528 {
5529 vm_page_t m;
5530 unsigned int page_count;
5531 vm_offset_t prev_addr;
5532
5533 prev_addr = VM_PAGE_GET_PHYS_PAGE(pages);
5534 page_count = 1;
5535 for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
5536 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5537 printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
5538 m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m));
5539 printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
5540 panic("vm_page_verify_contiguous: not contiguous!");
5541 }
5542 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5543 ++page_count;
5544 }
5545 if (page_count != npages) {
5546 printf("pages %p actual count 0x%x but requested 0x%x\n",
5547 pages, page_count, npages);
5548 panic("vm_page_verify_contiguous: count error");
5549 }
5550 return 1;
5551 }
5552
5553
/*
 * Check the free lists for proper length etc.
 */
/* runtime switch: when FALSE the per-list verifier is a no-op */
static boolean_t vm_page_verify_this_free_list_enabled = FALSE;
/*
 * Walk one free list and sanity-check every page on it:
 * back-pointer integrity, busy bit, color and q_state.
 * color == (unsigned int) -1 selects the "no color" variant used for
 * the local/lopage free queues (q_state VM_PAGE_ON_FREE_LOCAL_Q).
 * If look_for_page != VM_PAGE_NULL, also verify that its presence on
 * this list matches 'expect_page' (panicking / searching the other
 * color queues on a mismatch).
 * Returns the number of pages found on the list (0 when disabled).
 */
static unsigned int
vm_page_verify_free_list(
	vm_page_queue_head_t *vm_page_queue,
	unsigned int color,
	vm_page_t look_for_page,
	boolean_t expect_page)
{
	unsigned int npages;
	vm_page_t m;
	vm_page_t prev_m;
	boolean_t found_page;

	if (!vm_page_verify_this_free_list_enabled) {
		return 0;
	}

	found_page = FALSE;
	npages = 0;
	/* the queue head itself acts as the "previous" element of the first page */
	prev_m = (vm_page_t)((uintptr_t)vm_page_queue);

	vm_page_queue_iterate(vm_page_queue, m, vmp_pageq) {
		if (m == look_for_page) {
			found_page = TRUE;
		}
		/* each page's (packed) prev pointer must reference its predecessor */
		if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) {
			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p",
			    color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
		}
		/* free pages are always marked busy */
		if (!m->vmp_busy) {
			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy",
			    color, npages, m);
		}
		if (color != (unsigned int) -1) {
			if (VM_PAGE_GET_COLOR(m) != color) {
				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u",
				    color, npages, m, VM_PAGE_GET_COLOR(m), color);
			}
			if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) {
				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d",
				    color, npages, m, m->vmp_q_state);
			}
		} else {
			if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) {
				panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d",
				    npages, m, m->vmp_q_state);
			}
		}
		++npages;
		prev_m = m;
	}
	if (look_for_page != VM_PAGE_NULL) {
		unsigned int other_color;

		if (expect_page && !found_page) {
			/*
			 * the page should have been on this list but wasn't...
			 * dump it and search the other color queues (and the
			 * lopage queue) to help locate where it actually is
			 * before panicking
			 */
			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
			_vm_page_print(look_for_page);
			for (other_color = 0;
			    other_color < vm_colors;
			    other_color++) {
				if (other_color == color) {
					continue;
				}
				vm_page_verify_free_list(&vm_page_queue_free[other_color].qhead,
				    other_color, look_for_page, FALSE);
			}
			if (color == (unsigned int) -1) {
				vm_page_verify_free_list(&vm_lopage_queue_free,
				    (unsigned int) -1, look_for_page, FALSE);
			}
			panic("vm_page_verify_free_list(color=%u)", color);
		}
		if (!expect_page && found_page) {
			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
		}
	}
	return npages;
}
5637
/* runtime switch: when FALSE the whole-free-list verifier is a no-op */
static boolean_t vm_page_verify_all_free_lists_enabled = FALSE;
/*
 * Verify every free list (each color queue plus the lopage queue)
 * under the free-queue lock, and check that the per-list totals add
 * up to vm_page_free_count / vm_lopage_free_count.
 */
static void
vm_page_verify_free_lists( void )
{
	unsigned int    color, npages, nlopages;
	boolean_t       toggle = TRUE;

	if (!vm_page_verify_all_free_lists_enabled) {
		return;
	}

	npages = 0;

	lck_mtx_lock(&vm_page_queue_free_lock);

	if (vm_page_verify_this_free_list_enabled == TRUE) {
		/*
		 * This variable has been set globally for extra checking of
		 * each free list Q. Since we didn't set it, we don't own it
		 * and we shouldn't toggle it.
		 */
		toggle = FALSE;
	}

	if (toggle == TRUE) {
		/* temporarily arm the per-list verifier for the calls below */
		vm_page_verify_this_free_list_enabled = TRUE;
	}

	for (color = 0; color < vm_colors; color++) {
		npages += vm_page_verify_free_list(&vm_page_queue_free[color].qhead,
		    color, VM_PAGE_NULL, FALSE);
	}
	nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
	    (unsigned int) -1,
	    VM_PAGE_NULL, FALSE);
	if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) {
		panic("vm_page_verify_free_lists: "
		    "npages %u free_count %d nlopages %u lo_free_count %u",
		    npages, vm_page_free_count, nlopages, vm_lopage_free_count);
	}

	if (toggle == TRUE) {
		vm_page_verify_this_free_list_enabled = FALSE;
	}

	lck_mtx_unlock(&vm_page_queue_free_lock);
}
5685
5686 #endif /* MACH_ASSERT */
5687
5688
5689 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
5690
5691 /*
5692 * CONTIGUOUS PAGE ALLOCATION
5693 *
5694 * Find a region large enough to contain at least n pages
5695 * of contiguous physical memory.
5696 *
5697 * This is done by traversing the vm_page_t array in a linear fashion
 * we assume that the vm_page_t array has the available physical pages in an
5699 * ordered, ascending list... this is currently true of all our implementations
5700 * and must remain so... there can be 'holes' in the array... we also can
5701 * no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
 * which used to happen via 'vm_page_convert'... that function was no longer
5703 * being called and was removed...
5704 *
5705 * The basic flow consists of stabilizing some of the interesting state of
5706 * a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
 * sweep at the beginning of the array looking for pages that meet our criteria
 * for a 'stealable' page... currently we are pretty conservative... if the page
 * meets this criteria and is physically contiguous to the previous page in the 'run'
5710 * we keep developing it. If we hit a page that doesn't fit, we reset our state
5711 * and start to develop a new run... if at this point we've already considered
5712 * at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
5713 * and mutex_pause (which will yield the processor), to keep the latency low w/r
5714 * to other threads trying to acquire free pages (or move pages from q to q),
5715 * and then continue from the spot we left off... we only make 1 pass through the
 * array. Once we have a 'run' that is long enough, we'll go into the loop
 * which steals the pages from the queues they're currently on... pages on the free
5718 * queue can be stolen directly... pages that are on any of the other queues
5719 * must be removed from the object they are tabled on... this requires taking the
5720 * object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
5721 * or if the state of the page behind the vm_object lock is no longer viable, we'll
5722 * dump the pages we've currently stolen back to the free list, and pick up our
5723 * scan from the point where we aborted the 'current' run.
5724 *
5725 *
5726 * Requirements:
5727 * - neither vm_page_queue nor vm_free_list lock can be held on entry
5728 *
5729 * Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
5730 *
5731 * Algorithm:
5732 */
5733
5734 #define MAX_CONSIDERED_BEFORE_YIELD 1000
5735
5736
5737 #define RESET_STATE_OF_RUN() \
5738 MACRO_BEGIN \
5739 prevcontaddr = -2; \
5740 start_pnum = -1; \
5741 free_considered = 0; \
5742 substitute_needed = 0; \
5743 npages = 0; \
5744 MACRO_END
5745
5746 /*
5747 * Can we steal in-use (i.e. not free) pages when searching for
5748 * physically-contiguous pages ?
5749 */
5750 #define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL 1
5751
5752 static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
5753 #if DEBUG
5754 int vm_page_find_contig_debug = 0;
5755 #endif
5756
5757 static vm_page_t
vm_page_find_contiguous(unsigned int contig_pages,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)5758 vm_page_find_contiguous(
5759 unsigned int contig_pages,
5760 ppnum_t max_pnum,
5761 ppnum_t pnum_mask,
5762 boolean_t wire,
5763 int flags)
5764 {
5765 vm_page_t m = NULL;
5766 ppnum_t prevcontaddr = 0;
5767 ppnum_t start_pnum = 0;
5768 unsigned int npages = 0, considered = 0, scanned = 0;
5769 unsigned int page_idx = 0, start_idx = 0, last_idx = 0, orig_last_idx = 0;
5770 unsigned int idx_last_contig_page_found = 0;
5771 int free_considered = 0, free_available = 0;
5772 int substitute_needed = 0;
5773 int zone_gc_called = 0;
5774 boolean_t wrapped;
5775 kern_return_t kr;
5776 #if DEBUG
5777 clock_sec_t tv_start_sec = 0, tv_end_sec = 0;
5778 clock_usec_t tv_start_usec = 0, tv_end_usec = 0;
5779 #endif
5780
5781 int yielded = 0;
5782 int dumped_run = 0;
5783 int stolen_pages = 0;
5784 int compressed_pages = 0;
5785
5786
5787 if (contig_pages == 0) {
5788 return VM_PAGE_NULL;
5789 }
5790
5791 full_scan_again:
5792
5793 #if MACH_ASSERT
5794 vm_page_verify_free_lists();
5795 #endif
5796 #if DEBUG
5797 clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
5798 #endif
5799 PAGE_REPLACEMENT_ALLOWED(TRUE);
5800
5801 /*
5802 * If there are still delayed pages, try to free up some that match.
5803 */
5804 if (__improbable(vm_delayed_count != 0 && contig_pages != 0)) {
5805 vm_free_delayed_pages_contig(contig_pages, max_pnum, pnum_mask);
5806 }
5807
5808 vm_page_lock_queues();
5809 lck_mtx_lock(&vm_page_queue_free_lock);
5810
5811 RESET_STATE_OF_RUN();
5812
5813 scanned = 0;
5814 considered = 0;
5815 free_available = vm_page_free_count - vm_page_free_reserved;
5816
5817 wrapped = FALSE;
5818
5819 if (flags & KMA_LOMEM) {
5820 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
5821 } else {
5822 idx_last_contig_page_found = vm_page_find_contiguous_last_idx;
5823 }
5824
5825 orig_last_idx = idx_last_contig_page_found;
5826 last_idx = orig_last_idx;
5827
5828 for (page_idx = last_idx, start_idx = last_idx;
5829 npages < contig_pages && page_idx < vm_pages_count;
5830 page_idx++) {
5831 retry:
5832 if (wrapped &&
5833 npages == 0 &&
5834 page_idx >= orig_last_idx) {
5835 /*
5836 * We're back where we started and we haven't
5837 * found any suitable contiguous range. Let's
5838 * give up.
5839 */
5840 break;
5841 }
5842 scanned++;
5843 m = &vm_pages[page_idx];
5844
5845 assert(!m->vmp_fictitious);
5846 assert(!m->vmp_private);
5847
5848 if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
5849 /* no more low pages... */
5850 break;
5851 }
5852 if (!npages & ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) {
5853 /*
5854 * not aligned
5855 */
5856 RESET_STATE_OF_RUN();
5857 } else if (VM_PAGE_WIRED(m) || m->vmp_gobbled ||
5858 m->vmp_laundry || m->vmp_wanted ||
5859 m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
5860 /*
5861 * page is in a transient state
5862 * or a state we don't want to deal
5863 * with, so don't consider it which
5864 * means starting a new run
5865 */
5866 RESET_STATE_OF_RUN();
5867 } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
5868 (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
5869 (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
5870 (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
5871 /*
5872 * page needs to be on one of our queues (other then the pageout or special free queues)
5873 * or it needs to belong to the compressor pool (which is now indicated
5874 * by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out
5875 * from the check for VM_PAGE_NOT_ON_Q)
5876 * in order for it to be stable behind the
5877 * locks we hold at this point...
5878 * if not, don't consider it which
5879 * means starting a new run
5880 */
5881 RESET_STATE_OF_RUN();
5882 } else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) && (!m->vmp_tabled || m->vmp_busy)) {
5883 /*
5884 * pages on the free list are always 'busy'
5885 * so we couldn't test for 'busy' in the check
5886 * for the transient states... pages that are
5887 * 'free' are never 'tabled', so we also couldn't
5888 * test for 'tabled'. So we check here to make
5889 * sure that a non-free page is not busy and is
5890 * tabled on an object...
5891 * if not, don't consider it which
5892 * means starting a new run
5893 */
5894 RESET_STATE_OF_RUN();
5895 } else {
5896 if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) {
5897 if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) {
5898 RESET_STATE_OF_RUN();
5899 goto did_consider;
5900 } else {
5901 npages = 1;
5902 start_idx = page_idx;
5903 start_pnum = VM_PAGE_GET_PHYS_PAGE(m);
5904 }
5905 } else {
5906 npages++;
5907 }
5908 prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
5909
5910 VM_PAGE_CHECK(m);
5911 if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
5912 free_considered++;
5913 } else {
5914 /*
5915 * This page is not free.
5916 * If we can't steal used pages,
5917 * we have to give up this run
5918 * and keep looking.
5919 * Otherwise, we might need to
5920 * move the contents of this page
5921 * into a substitute page.
5922 */
5923 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
5924 if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
5925 substitute_needed++;
5926 }
5927 #else
5928 RESET_STATE_OF_RUN();
5929 #endif
5930 }
5931
5932 if ((free_considered + substitute_needed) > free_available) {
5933 /*
5934 * if we let this run continue
5935 * we will end up dropping the vm_page_free_count
5936 * below the reserve limit... we need to abort
5937 * this run, but we can at least re-consider this
5938 * page... thus the jump back to 'retry'
5939 */
5940 RESET_STATE_OF_RUN();
5941
5942 if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
5943 considered++;
5944 goto retry;
5945 }
5946 /*
5947 * free_available == 0
5948 * so can't consider any free pages... if
5949 * we went to retry in this case, we'd
5950 * get stuck looking at the same page
5951 * w/o making any forward progress
5952 * we also want to take this path if we've already
5953 * reached our limit that controls the lock latency
5954 */
5955 }
5956 }
5957 did_consider:
5958 if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
5959 PAGE_REPLACEMENT_ALLOWED(FALSE);
5960
5961 lck_mtx_unlock(&vm_page_queue_free_lock);
5962 vm_page_unlock_queues();
5963
5964 mutex_pause(0);
5965
5966 PAGE_REPLACEMENT_ALLOWED(TRUE);
5967
5968 vm_page_lock_queues();
5969 lck_mtx_lock(&vm_page_queue_free_lock);
5970
5971 RESET_STATE_OF_RUN();
5972 /*
5973 * reset our free page limit since we
5974 * dropped the lock protecting the vm_page_free_queue
5975 */
5976 free_available = vm_page_free_count - vm_page_free_reserved;
5977 considered = 0;
5978
5979 yielded++;
5980
5981 goto retry;
5982 }
5983 considered++;
5984 }
5985 m = VM_PAGE_NULL;
5986
5987 if (npages != contig_pages) {
5988 if (!wrapped) {
5989 /*
5990 * We didn't find a contiguous range but we didn't
5991 * start from the very first page.
5992 * Start again from the very first page.
5993 */
5994 RESET_STATE_OF_RUN();
5995 if (flags & KMA_LOMEM) {
5996 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = 0;
5997 } else {
5998 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
5999 }
6000 last_idx = 0;
6001 page_idx = last_idx;
6002 wrapped = TRUE;
6003 goto retry;
6004 }
6005 lck_mtx_unlock(&vm_page_queue_free_lock);
6006 } else {
6007 vm_page_t m1;
6008 vm_page_t m2;
6009 unsigned int cur_idx;
6010 unsigned int tmp_start_idx;
6011 vm_object_t locked_object = VM_OBJECT_NULL;
6012 boolean_t abort_run = FALSE;
6013
6014 assert(page_idx - start_idx == contig_pages);
6015
6016 tmp_start_idx = start_idx;
6017
6018 /*
6019 * first pass through to pull the free pages
6020 * off of the free queue so that in case we
6021 * need substitute pages, we won't grab any
6022 * of the free pages in the run... we'll clear
6023 * the 'free' bit in the 2nd pass, and even in
6024 * an abort_run case, we'll collect all of the
6025 * free pages in this run and return them to the free list
6026 */
6027 while (start_idx < page_idx) {
6028 m1 = &vm_pages[start_idx++];
6029
6030 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
6031 assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
6032 #endif
6033
6034 if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
6035 unsigned int color;
6036
6037 color = VM_PAGE_GET_COLOR(m1);
6038 #if MACH_ASSERT
6039 vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, m1, TRUE);
6040 #endif
6041 vm_page_queue_remove(&vm_page_queue_free[color].qhead, m1, vmp_pageq);
6042
6043 VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
6044 #if MACH_ASSERT
6045 vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, VM_PAGE_NULL, FALSE);
6046 #endif
6047 /*
6048 * Clear the "free" bit so that this page
6049 * does not get considered for another
6050 * concurrent physically-contiguous allocation.
6051 */
6052 m1->vmp_q_state = VM_PAGE_NOT_ON_Q;
6053 assert(m1->vmp_busy);
6054
6055 vm_page_free_count--;
6056 }
6057 }
6058 if (flags & KMA_LOMEM) {
6059 vm_page_lomem_find_contiguous_last_idx = page_idx;
6060 } else {
6061 vm_page_find_contiguous_last_idx = page_idx;
6062 }
6063
6064 /*
6065 * we can drop the free queue lock at this point since
6066 * we've pulled any 'free' candidates off of the list
6067 * we need it dropped so that we can do a vm_page_grab
6068 * when substituing for pmapped/dirty pages
6069 */
6070 lck_mtx_unlock(&vm_page_queue_free_lock);
6071
6072 start_idx = tmp_start_idx;
6073 cur_idx = page_idx - 1;
6074
6075 while (start_idx++ < page_idx) {
6076 /*
6077 * must go through the list from back to front
6078 * so that the page list is created in the
6079 * correct order - low -> high phys addresses
6080 */
6081 m1 = &vm_pages[cur_idx--];
6082
6083 if (m1->vmp_object == 0) {
6084 /*
6085 * page has already been removed from
6086 * the free list in the 1st pass
6087 */
6088 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
6089 assert(m1->vmp_offset == (vm_object_offset_t) -1);
6090 assert(m1->vmp_busy);
6091 assert(!m1->vmp_wanted);
6092 assert(!m1->vmp_laundry);
6093 } else {
6094 vm_object_t object;
6095 int refmod;
6096 boolean_t disconnected, reusable;
6097
6098 if (abort_run == TRUE) {
6099 continue;
6100 }
6101
6102 assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
6103
6104 object = VM_PAGE_OBJECT(m1);
6105
6106 if (object != locked_object) {
6107 if (locked_object) {
6108 vm_object_unlock(locked_object);
6109 locked_object = VM_OBJECT_NULL;
6110 }
6111 if (vm_object_lock_try(object)) {
6112 locked_object = object;
6113 }
6114 }
6115 if (locked_object == VM_OBJECT_NULL ||
6116 (VM_PAGE_WIRED(m1) || m1->vmp_gobbled ||
6117 m1->vmp_laundry || m1->vmp_wanted ||
6118 m1->vmp_cleaning || m1->vmp_overwriting || m1->vmp_free_when_done || m1->vmp_busy) ||
6119 (m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6120 if (locked_object) {
6121 vm_object_unlock(locked_object);
6122 locked_object = VM_OBJECT_NULL;
6123 }
6124 tmp_start_idx = cur_idx;
6125 abort_run = TRUE;
6126 continue;
6127 }
6128
6129 disconnected = FALSE;
6130 reusable = FALSE;
6131
6132 if ((m1->vmp_reusable ||
6133 object->all_reusable) &&
6134 (m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) &&
6135 !m1->vmp_dirty &&
6136 !m1->vmp_reference) {
6137 /* reusable page... */
6138 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
6139 disconnected = TRUE;
6140 if (refmod == 0) {
6141 /*
6142 * ... not reused: can steal
6143 * without relocating contents.
6144 */
6145 reusable = TRUE;
6146 }
6147 }
6148
6149 if ((m1->vmp_pmapped &&
6150 !reusable) ||
6151 m1->vmp_dirty ||
6152 m1->vmp_precious) {
6153 vm_object_offset_t offset;
6154
6155 m2 = vm_page_grab_options(VM_PAGE_GRAB_Q_LOCK_HELD);
6156
6157 if (m2 == VM_PAGE_NULL) {
6158 if (locked_object) {
6159 vm_object_unlock(locked_object);
6160 locked_object = VM_OBJECT_NULL;
6161 }
6162 tmp_start_idx = cur_idx;
6163 abort_run = TRUE;
6164 continue;
6165 }
6166 if (!disconnected) {
6167 if (m1->vmp_pmapped) {
6168 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
6169 } else {
6170 refmod = 0;
6171 }
6172 }
6173
6174 /* copy the page's contents */
6175 pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2));
6176 /* copy the page's state */
6177 assert(!VM_PAGE_WIRED(m1));
6178 assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
6179 assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
6180 assert(!m1->vmp_laundry);
6181 m2->vmp_reference = m1->vmp_reference;
6182 assert(!m1->vmp_gobbled);
6183 assert(!m1->vmp_private);
6184 m2->vmp_no_cache = m1->vmp_no_cache;
6185 m2->vmp_xpmapped = 0;
6186 assert(!m1->vmp_busy);
6187 assert(!m1->vmp_wanted);
6188 assert(!m1->vmp_fictitious);
6189 m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
6190 m2->vmp_wpmapped = m1->vmp_wpmapped;
6191 assert(!m1->vmp_free_when_done);
6192 m2->vmp_absent = m1->vmp_absent;
6193 m2->vmp_error = m1->vmp_error;
6194 m2->vmp_dirty = m1->vmp_dirty;
6195 assert(!m1->vmp_cleaning);
6196 m2->vmp_precious = m1->vmp_precious;
6197 m2->vmp_clustered = m1->vmp_clustered;
6198 assert(!m1->vmp_overwriting);
6199 m2->vmp_restart = m1->vmp_restart;
6200 m2->vmp_unusual = m1->vmp_unusual;
6201 m2->vmp_cs_validated = m1->vmp_cs_validated;
6202 m2->vmp_cs_tainted = m1->vmp_cs_tainted;
6203 m2->vmp_cs_nx = m1->vmp_cs_nx;
6204
6205 /*
6206 * If m1 had really been reusable,
6207 * we would have just stolen it, so
6208 * let's not propagate it's "reusable"
6209 * bit and assert that m2 is not
6210 * marked as "reusable".
6211 */
6212 // m2->vmp_reusable = m1->vmp_reusable;
6213 assert(!m2->vmp_reusable);
6214
6215 // assert(!m1->vmp_lopage);
6216
6217 if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6218 m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
6219 }
6220
6221 /*
6222 * page may need to be flushed if
6223 * it is marshalled into a UPL
6224 * that is going to be used by a device
6225 * that doesn't support coherency
6226 */
6227 m2->vmp_written_by_kernel = TRUE;
6228
6229 /*
6230 * make sure we clear the ref/mod state
6231 * from the pmap layer... else we risk
6232 * inheriting state from the last time
6233 * this page was used...
6234 */
6235 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6236
6237 if (refmod & VM_MEM_REFERENCED) {
6238 m2->vmp_reference = TRUE;
6239 }
6240 if (refmod & VM_MEM_MODIFIED) {
6241 SET_PAGE_DIRTY(m2, TRUE);
6242 }
6243 offset = m1->vmp_offset;
6244
6245 /*
6246 * completely cleans up the state
6247 * of the page so that it is ready
6248 * to be put onto the free list, or
6249 * for this purpose it looks like it
6250 * just came off of the free list
6251 */
6252 vm_page_free_prepare(m1);
6253
6254 /*
6255 * now put the substitute page
6256 * on the object
6257 */
6258 vm_page_insert_internal(m2, locked_object, offset, VM_KERN_MEMORY_NONE, TRUE, TRUE, FALSE, FALSE, NULL);
6259
6260 if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6261 m2->vmp_pmapped = TRUE;
6262 m2->vmp_wpmapped = TRUE;
6263
6264 PMAP_ENTER(kernel_pmap, (vm_map_offset_t)m2->vmp_offset, m2,
6265 VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, kr);
6266
6267 assert(kr == KERN_SUCCESS);
6268
6269 compressed_pages++;
6270 } else {
6271 if (m2->vmp_reference) {
6272 vm_page_activate(m2);
6273 } else {
6274 vm_page_deactivate(m2);
6275 }
6276 }
6277 PAGE_WAKEUP_DONE(m2);
6278 } else {
6279 assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6280
6281 /*
6282 * completely cleans up the state
6283 * of the page so that it is ready
6284 * to be put onto the free list, or
6285 * for this purpose it looks like it
6286 * just came off of the free list
6287 */
6288 vm_page_free_prepare(m1);
6289 }
6290
6291 stolen_pages++;
6292 }
6293 #if CONFIG_BACKGROUND_QUEUE
6294 vm_page_assign_background_state(m1);
6295 #endif
6296 VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
6297 m1->vmp_snext = m;
6298 m = m1;
6299 }
6300 if (locked_object) {
6301 vm_object_unlock(locked_object);
6302 locked_object = VM_OBJECT_NULL;
6303 }
6304
6305 if (abort_run == TRUE) {
6306 /*
6307 * want the index of the last
6308 * page in this run that was
6309 * successfully 'stolen', so back
6310 * it up 1 for the auto-decrement on use
6311 * and 1 more to bump back over this page
6312 */
6313 page_idx = tmp_start_idx + 2;
6314 if (page_idx >= vm_pages_count) {
6315 if (wrapped) {
6316 if (m != VM_PAGE_NULL) {
6317 vm_page_unlock_queues();
6318 vm_page_free_list(m, FALSE);
6319 vm_page_lock_queues();
6320 m = VM_PAGE_NULL;
6321 }
6322 dumped_run++;
6323 goto done_scanning;
6324 }
6325 page_idx = last_idx = 0;
6326 wrapped = TRUE;
6327 }
6328 abort_run = FALSE;
6329
6330 /*
6331 * We didn't find a contiguous range but we didn't
6332 * start from the very first page.
6333 * Start again from the very first page.
6334 */
6335 RESET_STATE_OF_RUN();
6336
6337 if (flags & KMA_LOMEM) {
6338 idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx = page_idx;
6339 } else {
6340 idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
6341 }
6342
6343 last_idx = page_idx;
6344
6345 if (m != VM_PAGE_NULL) {
6346 vm_page_unlock_queues();
6347 vm_page_free_list(m, FALSE);
6348 vm_page_lock_queues();
6349 m = VM_PAGE_NULL;
6350 }
6351 dumped_run++;
6352
6353 lck_mtx_lock(&vm_page_queue_free_lock);
6354 /*
6355 * reset our free page limit since we
6356 * dropped the lock protecting the vm_page_free_queue
6357 */
6358 free_available = vm_page_free_count - vm_page_free_reserved;
6359 goto retry;
6360 }
6361
6362 for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) {
6363 assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
6364 assert(m1->vmp_wire_count == 0);
6365
6366 if (wire == TRUE) {
6367 m1->vmp_wire_count++;
6368 m1->vmp_q_state = VM_PAGE_IS_WIRED;
6369 } else {
6370 m1->vmp_gobbled = TRUE;
6371 }
6372 }
6373 if (wire == FALSE) {
6374 vm_page_gobble_count += npages;
6375 }
6376
6377 /*
6378 * gobbled pages are also counted as wired pages
6379 */
6380 vm_page_wire_count += npages;
6381
6382 assert(vm_page_verify_contiguous(m, npages));
6383 }
6384 done_scanning:
6385 PAGE_REPLACEMENT_ALLOWED(FALSE);
6386
6387 vm_page_unlock_queues();
6388
6389 #if DEBUG
6390 clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
6391
6392 tv_end_sec -= tv_start_sec;
6393 if (tv_end_usec < tv_start_usec) {
6394 tv_end_sec--;
6395 tv_end_usec += 1000000;
6396 }
6397 tv_end_usec -= tv_start_usec;
6398 if (tv_end_usec >= 1000000) {
6399 tv_end_sec++;
6400 tv_end_sec -= 1000000;
6401 }
6402 if (vm_page_find_contig_debug) {
6403 printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds... started at %d... scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages\n",
6404 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
6405 (long)tv_end_sec, tv_end_usec, orig_last_idx,
6406 scanned, yielded, dumped_run, stolen_pages, compressed_pages);
6407 }
6408
6409 #endif
6410 #if MACH_ASSERT
6411 vm_page_verify_free_lists();
6412 #endif
6413 if (m == NULL && zone_gc_called < 2) {
6414 printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
6415 __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
6416 scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
6417
6418 if (consider_buffer_cache_collect != NULL) {
6419 (void)(*consider_buffer_cache_collect)(1);
6420 }
6421
6422 zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
6423
6424 zone_gc_called++;
6425
6426 printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
6427 goto full_scan_again;
6428 }
6429
6430 return m;
6431 }
6432
6433 /*
6434 * Allocate a list of contiguous, wired pages.
6435 */
6436 kern_return_t
cpm_allocate(vm_size_t size,vm_page_t * list,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)6437 cpm_allocate(
6438 vm_size_t size,
6439 vm_page_t *list,
6440 ppnum_t max_pnum,
6441 ppnum_t pnum_mask,
6442 boolean_t wire,
6443 int flags)
6444 {
6445 vm_page_t pages;
6446 unsigned int npages;
6447
6448 if (size % PAGE_SIZE != 0) {
6449 return KERN_INVALID_ARGUMENT;
6450 }
6451
6452 npages = (unsigned int) (size / PAGE_SIZE);
6453 if (npages != size / PAGE_SIZE) {
6454 /* 32-bit overflow */
6455 return KERN_INVALID_ARGUMENT;
6456 }
6457
6458 /*
6459 * Obtain a pointer to a subset of the free
6460 * list large enough to satisfy the request;
6461 * the region will be physically contiguous.
6462 */
6463 pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
6464
6465 if (pages == VM_PAGE_NULL) {
6466 return KERN_NO_SPACE;
6467 }
6468 /*
6469 * determine need for wakeups
6470 */
6471 if (vm_page_free_count < vm_page_free_min) {
6472 lck_mtx_lock(&vm_page_queue_free_lock);
6473 if (vm_pageout_running == FALSE) {
6474 lck_mtx_unlock(&vm_page_queue_free_lock);
6475 thread_wakeup((event_t) &vm_page_free_wanted);
6476 } else {
6477 lck_mtx_unlock(&vm_page_queue_free_lock);
6478 }
6479 }
6480
6481 VM_CHECK_MEMORYSTATUS;
6482
6483 /*
6484 * The CPM pages should now be available and
6485 * ordered by ascending physical address.
6486 */
6487 assert(vm_page_verify_contiguous(pages, npages));
6488
6489 *list = pages;
6490 return KERN_SUCCESS;
6491 }
6492
6493
6494 unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
6495
6496 /*
6497 * when working on a 'run' of pages, it is necessary to hold
6498 * the vm_page_queue_lock (a hot global lock) for certain operations
6499 * on the page... however, the majority of the work can be done
6500 * while merely holding the object lock... in fact there are certain
6501 * collections of pages that don't require any work brokered by the
6502 * vm_page_queue_lock... to mitigate the time spent behind the global
6503 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
6504 * while doing all of the work that doesn't require the vm_page_queue_lock...
6505 * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
6506 * necessary work for each page... we will grab the busy bit on the page
6507 * if it's not already held so that vm_page_do_delayed_work can drop the object lock
6508 * if it can't immediately take the vm_page_queue_lock in order to compete
6509 * for the locks in the same order that vm_pageout_scan takes them.
6510 * the operation names are modeled after the names of the routines that
6511 * need to be called in order to make the changes very obvious in the
6512 * original loop
6513 */
6514
/*
 * vm_page_do_delayed_work:
 *
 * Apply a batch of 'dw_count' queued page operations (each described by
 * the DW_* bits in dwp->dw_mask) to pages of 'object', holding the
 * vm_page queues lock across the whole batch so it is taken only once.
 *
 * Called with 'object' locked.  The object lock may be dropped and
 * retaken below in order to acquire the locks in the same order that
 * vm_pageout_scan does (queues lock first, then object lock).
 * Pages marked DW_vm_page_free are collected on a local list and freed
 * in one shot after the queues lock is dropped.
 */
void
vm_page_do_delayed_work(
	vm_object_t object,
	vm_tag_t tag,
	struct vm_page_delayed_work *dwp,
	int dw_count)
{
	int j;
	vm_page_t m;
	vm_page_t local_free_q = VM_PAGE_NULL;	/* pages reclaimed by DW_vm_page_free */

	/*
	 * pageout_scan takes the vm_page_lock_queues first
	 * then tries for the object lock... to avoid what
	 * is effectively a lock inversion, we'll go to the
	 * trouble of taking them in that same order... otherwise
	 * if this object contains the majority of the pages resident
	 * in the UBC (or a small set of large objects actively being
	 * worked on contain the majority of the pages), we could
	 * cause the pageout_scan thread to 'starve' in its attempt
	 * to find pages to move to the free queue, since it has to
	 * successfully acquire the object lock of any candidate page
	 * before it can steal/clean it.
	 */
	if (!vm_page_trylockspin_queues()) {
		vm_object_unlock(object);

		/*
		 * "Turnstile enabled vm_pageout_scan" can be runnable
		 * for a very long time without getting on a core.
		 * If this is a higher priority thread it could be
		 * waiting here for a very long time respecting the fact
		 * that pageout_scan would like its object after VPS does
		 * a mutex_pause(0).
		 * So we cap the number of yields in the vm_object_lock_avoid()
		 * case to a single mutex_pause(0) which will give vm_pageout_scan
		 * 10us to run and grab the object if needed.
		 */
		vm_page_lockspin_queues();

		/* retry loop: drop queues lock, pause, retake, until object lock is ours */
		for (j = 0;; j++) {
			if ((!vm_object_lock_avoid(object) ||
			    (vps_dynamic_priority_enabled && (j > 0))) &&
			    _vm_object_lock_try(object)) {
				break;
			}
			vm_page_unlock_queues();
			mutex_pause(j);
			vm_page_lockspin_queues();
		}
	}
	/* both locks held: apply each delayed-work entry in order */
	for (j = 0; j < dw_count; j++, dwp++) {
		m = dwp->dw_m;

		if (dwp->dw_mask & DW_vm_pageout_throttle_up) {
			vm_pageout_throttle_up(m);
		}
#if CONFIG_PHANTOM_CACHE
		if (dwp->dw_mask & DW_vm_phantom_cache_update) {
			vm_phantom_cache_update(m);
		}
#endif
		if (dwp->dw_mask & DW_vm_page_wire) {
			vm_page_wire(m, tag, FALSE);
		} else if (dwp->dw_mask & DW_vm_page_unwire) {
			boolean_t queueit;

			/* don't requeue if the page is about to be freed or deactivated */
			queueit = (dwp->dw_mask & (DW_vm_page_free | DW_vm_page_deactivate_internal)) ? FALSE : TRUE;

			vm_page_unwire(m, queueit);
		}
		if (dwp->dw_mask & DW_vm_page_free) {
			vm_page_free_prepare_queues(m);

			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			/*
			 * Add this page to our list of reclaimed pages,
			 * to be freed later.
			 */
			m->vmp_snext = local_free_q;
			local_free_q = m;
		} else {
			if (dwp->dw_mask & DW_vm_page_deactivate_internal) {
				vm_page_deactivate_internal(m, FALSE);
			} else if (dwp->dw_mask & DW_vm_page_activate) {
				if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
					vm_page_activate(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_speculate) {
				vm_page_speculate(m, TRUE);
			} else if (dwp->dw_mask & DW_enqueue_cleaned) {
				/*
				 * if we didn't hold the object lock and did this,
				 * we might disconnect the page, then someone might
				 * soft fault it back in, then we would put it on the
				 * cleaned queue, and so we would have a referenced (maybe even dirty)
				 * page on that queue, which we don't want
				 */
				int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

				if ((refmod_state & VM_MEM_REFERENCED)) {
					/*
					 * this page has been touched since it got cleaned; let's activate it
					 * if it hasn't already been
					 */
					VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);

					if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
						vm_page_activate(m);
					}
				} else {
					m->vmp_reference = FALSE;
					vm_page_enqueue_cleaned(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_lru) {
				vm_page_lru(m);
			} else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, TRUE);
				}
			}
			if (dwp->dw_mask & DW_set_reference) {
				m->vmp_reference = TRUE;
			} else if (dwp->dw_mask & DW_clear_reference) {
				m->vmp_reference = FALSE;
			}

			if (dwp->dw_mask & DW_move_page) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, FALSE);

					assert(VM_PAGE_OBJECT(m) != kernel_object);

					vm_page_enqueue_inactive(m, FALSE);
				}
			}
			if (dwp->dw_mask & DW_clear_busy) {
				m->vmp_busy = FALSE;
			}

			if (dwp->dw_mask & DW_PAGE_WAKEUP) {
				PAGE_WAKEUP(m);
			}
		}
	}
	vm_page_unlock_queues();

	/* free the collected pages now that the queues lock is dropped */
	if (local_free_q) {
		vm_page_free_list(local_free_q, TRUE);
	}

	VM_CHECK_MEMORYSTATUS;
}
6669
/*
 * Panic helper for vm_page_alloc_list(): called when an allocation that
 * was marked KMA_NOFAIL fails anyway.  __abortlike -- never returns.
 */
__abortlike
static void
__vm_page_alloc_list_failed_panic(
	vm_size_t page_count,
	kma_flags_t flags,
	kern_return_t kr)
{
	panic("vm_page_alloc_list(%zd, 0x%x) failed unexpectedly with %d",
	    (size_t)page_count, flags, kr);
}
6680
6681 kern_return_t
vm_page_alloc_list(vm_size_t page_count,kma_flags_t flags,vm_page_t * list)6682 vm_page_alloc_list(
6683 vm_size_t page_count,
6684 kma_flags_t flags,
6685 vm_page_t *list)
6686 {
6687 vm_page_t page_list = VM_PAGE_NULL;
6688 vm_page_t mem;
6689 kern_return_t kr = KERN_SUCCESS;
6690 int page_grab_count = 0;
6691 #if DEVELOPMENT || DEBUG
6692 task_t task;
6693 #endif /* DEVELOPMENT || DEBUG */
6694
6695 for (vm_size_t i = 0; i < page_count; i++) {
6696 for (;;) {
6697 if (flags & KMA_LOMEM) {
6698 mem = vm_page_grablo();
6699 } else {
6700 mem = vm_page_grab();
6701 }
6702
6703 if (mem != VM_PAGE_NULL) {
6704 break;
6705 }
6706
6707 if (flags & KMA_NOPAGEWAIT) {
6708 kr = KERN_RESOURCE_SHORTAGE;
6709 goto out;
6710 }
6711 if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
6712 kr = KERN_RESOURCE_SHORTAGE;
6713 goto out;
6714 }
6715
6716 /* VM privileged threads should have waited in vm_page_grab() and not get here. */
6717 assert(!(current_thread()->options & TH_OPT_VMPRIV));
6718
6719 if ((flags & KMA_NOFAIL) == 0) {
6720 uint64_t unavailable = ptoa_64(vm_page_wire_count + vm_page_free_target);
6721 if (unavailable > max_mem || ptoa_64(page_count) > (max_mem - unavailable)) {
6722 kr = KERN_RESOURCE_SHORTAGE;
6723 goto out;
6724 }
6725 }
6726 VM_PAGE_WAIT();
6727 }
6728
6729 page_grab_count++;
6730 mem->vmp_snext = page_list;
6731 page_list = mem;
6732 }
6733
6734 if ((KMA_ZERO | KMA_NOENCRYPT) & flags) {
6735 for (mem = page_list; mem; mem = mem->vmp_snext) {
6736 vm_page_zero_fill(mem);
6737 }
6738 }
6739
6740 out:
6741 #if DEBUG || DEVELOPMENT
6742 task = current_task_early();
6743 if (task != NULL) {
6744 ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
6745 }
6746 #endif
6747
6748 if (kr == KERN_SUCCESS) {
6749 *list = page_list;
6750 } else if (flags & KMA_NOFAIL) {
6751 __vm_page_alloc_list_failed_panic(page_count, flags, kr);
6752 } else {
6753 vm_page_free_list(page_list, FALSE);
6754 }
6755
6756 return kr;
6757 }
6758
/*
 * Raw setter for a page's offset within its object (vmp_offset).
 * No locking or validation is performed here.
 */
void
vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
{
	page->vmp_offset = offset;
}
6764
/*
 * Return the next page on a vmp_snext-chained page list
 * (VM_PAGE_NULL terminates the chain).
 */
vm_page_t
vm_page_get_next(vm_page_t page)
{
	return page->vmp_snext;
}
6770
/*
 * Return the page's offset within its object (vmp_offset).
 */
vm_object_offset_t
vm_page_get_offset(vm_page_t page)
{
	return page->vmp_offset;
}
6776
/*
 * Return the page's physical page number, via VM_PAGE_GET_PHYS_PAGE().
 */
ppnum_t
vm_page_get_phys_page(vm_page_t page)
{
	return VM_PAGE_GET_PHYS_PAGE(page);
}
6782
6783
6784 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
6785
6786 #if HIBERNATION
6787
6788 static vm_page_t hibernate_gobble_queue;
6789
6790 static int hibernate_drain_pageout_queue(struct vm_pageout_queue *);
6791 static int hibernate_flush_dirty_pages(int);
6792 static int hibernate_flush_queue(vm_page_queue_head_t *, int);
6793
6794 void hibernate_flush_wait(void);
6795 void hibernate_mark_in_progress(void);
6796 void hibernate_clear_in_progress(void);
6797
6798 void hibernate_free_range(int, int);
6799 void hibernate_hash_insert_page(vm_page_t);
6800 uint32_t hibernate_mark_as_unneeded(addr64_t, addr64_t, hibernate_page_list_t *, hibernate_page_list_t *);
6801 uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
6802 ppnum_t hibernate_lookup_paddr(unsigned int);
6803
/*
 * Counters gathered while flushing dirty pages for hibernation and while
 * building the hibernation image; cleared by hibernate_reset_stats().
 */
struct hibernate_statistics {
	int hibernate_considered;               /* pages examined by hibernate_flush_queue() */
	int hibernate_reentered_on_q;           /* pages put back on the tail of their queue */
	int hibernate_found_dirty;              /* dirty pages handed to vm_pageout_cluster() */
	int hibernate_skipped_cleaning;         /* skipped: page already being cleaned */
	int hibernate_skipped_transient;        /* skipped: busy/absent/error/laundry or dead object */
	int hibernate_skipped_precious;         /* clean precious pages left in place */
	int hibernate_skipped_external;         /* external pages skipped while hibernate_skip_external is set */
	int hibernate_queue_nolock;             /* gave up trying to lock a page's object */
	int hibernate_queue_paused;             /* mutex_pause()s taken while chasing an object lock */
	int hibernate_throttled;                /* waits because a pageout queue was throttled */
	int hibernate_throttle_timeout;         /* throttle waits that exhausted their retries */
	int hibernate_drained;                  /* successful pageout-queue drain iterations */
	int hibernate_drain_timeout;            /* pageout-queue drains that timed out */
	/*
	 * NOTE(review): the cd_* counters are updated by the image
	 * conserve/discard pass, which lives outside this region --
	 * presumed meanings only; verify against that code.
	 */
	int cd_lock_failed;
	int cd_found_precious;
	int cd_found_wired;
	int cd_found_busy;
	int cd_found_unusual;
	int cd_found_cleaning;
	int cd_found_laundry;
	int cd_found_dirty;
	int cd_found_xpmapped;
	int cd_skipped_xpmapped;
	int cd_local_free;
	int cd_total_free;
	int cd_vm_page_wire_count;
	int cd_vm_struct_pages_unneeded;
	int cd_pages;
	int cd_discarded;
	int cd_count_wire;
} hibernate_stats;
6836
6837
6838 /*
6839 * clamp the number of 'xpmapped' pages we'll sweep into the hibernation image
6840 * so that we don't overrun the estimated image size, which would
6841 * result in a hibernation failure.
6842 *
6843 * We use a size value instead of pages because we don't want to take up more space
6844 * on disk if the system has a 16K page size vs 4K. Also, we are not guaranteed
6845 * to have that additional space available.
6846 *
6847 * Since this was set at 40000 pages on X86 we are going to use 160MB as our
6848 * xpmapped size.
6849 */
6850 #define HIBERNATE_XPMAPPED_LIMIT ((160 * 1024 * 1024ULL) / PAGE_SIZE)
6851
6852
/*
 * hibernate_drain_pageout_queue:
 *
 * Wait for the given pageout queue to empty: set pgo_draining and sleep,
 * with an ~5 second timeout per iteration, until pgo_pending is empty.
 *
 * Returns 0 when the queue drained.  On timeout with work still pending,
 * returns 0 for the external queue (its pages can be skipped -- see
 * hibernate_skip_external) and 1 otherwise, treated as a hard failure.
 */
static int
hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
{
	wait_result_t wait_result;

	vm_page_lock_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_draining = TRUE;

		/* 5000 x 1ms = ~5s timeout per wait */
		assert_wait_timeout((event_t) (&q->pgo_laundry + 1), THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);

		vm_page_unlock_queues();

		wait_result = thread_block(THREAD_CONTINUE_NULL);

		if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) {
			hibernate_stats.hibernate_drain_timeout++;

			if (q == &vm_pageout_queue_external) {
				return 0;
			}

			return 1;
		}
		vm_page_lock_queues();

		hibernate_stats.hibernate_drained++;
	}
	vm_page_unlock_queues();

	return 0;
}
6886
6887
6888 boolean_t hibernate_skip_external = FALSE;
6889
/*
 * hibernate_flush_queue:
 *
 * Walk up to 'qcount' pages from the head of page queue 'q', pushing
 * each dirty page to its pageout queue (via vm_pageout_cluster) so it
 * is compressed/written before the hibernation image is captured.
 * Pages that cannot or need not be cleaned are re-queued at the tail.
 *
 * Returns non-zero if the flush should be aborted (hibernation
 * cancelled, or a throttled internal pageout queue failed to make
 * progress); 0 otherwise.  Statistics are accumulated in
 * hibernate_stats.
 */
static int
hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t m;
	vm_object_t l_object = NULL;    /* object we currently hold locked, if any */
	vm_object_t m_object = NULL;
	int refmod_state = 0;
	int try_failed_count = 0;
	int retval = 0;
	int current_run = 0;
	struct vm_pageout_queue *iq;
	struct vm_pageout_queue *eq;
	struct vm_pageout_queue *tq;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(q), qcount);

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		/* every 1000 pages, check whether hibernation was aborted */
		if (current_run++ == 1000) {
			if (hibernate_should_abort()) {
				retval = 1;
				break;
			}
			current_run = 0;
		}

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					hibernate_stats.hibernate_queue_nolock++;

					goto reenter_pg_on_q;
				}

				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();

				hibernate_stats.hibernate_queue_paused++;
				continue;
			} else {
				l_object = m_object;
			}
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_cleaning) {
				hibernate_stats.hibernate_skipped_cleaning++;
			} else {
				hibernate_stats.hibernate_skipped_transient++;
			}

			goto reenter_pg_on_q;
		}
		if (m_object->copy == VM_OBJECT_NULL) {
			if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
				/*
				 * let the normal hibernate image path
				 * deal with these
				 */
				goto reenter_pg_on_q;
			}
		}
		/* pick up any modified bit still held in the pmap layer */
		if (!m->vmp_dirty && m->vmp_pmapped) {
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if ((refmod_state & VM_MEM_MODIFIED)) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		} else {
			refmod_state = 0;
		}

		if (!m->vmp_dirty) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_precious) {
				hibernate_stats.hibernate_skipped_precious++;
			}

			goto reenter_pg_on_q;
		}

		if (hibernate_skip_external == TRUE && !m_object->internal) {
			hibernate_stats.hibernate_skipped_external++;

			goto reenter_pg_on_q;
		}
		/* tq is the throttled pageout queue the page would go to, if any */
		tq = NULL;

		if (m_object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				tq = iq;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			tq = eq;
		}

		if (tq != NULL) {
			wait_result_t wait_result;
			int wait_count = 5;

			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}

			/* wait (up to 5 x 1s) for the throttled queue to open up */
			while (retval == 0) {
				tq->pgo_throttled = TRUE;

				assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);

				vm_page_unlock_queues();

				wait_result = thread_block(THREAD_CONTINUE_NULL);

				vm_page_lock_queues();

				if (wait_result != THREAD_TIMED_OUT) {
					break;
				}
				if (!VM_PAGE_Q_THROTTLED(tq)) {
					break;
				}

				if (hibernate_should_abort()) {
					retval = 1;
				}

				if (--wait_count == 0) {
					hibernate_stats.hibernate_throttle_timeout++;

					if (tq == eq) {
						/* give up on external pages instead of failing */
						hibernate_skip_external = TRUE;
						break;
					}
					retval = 1;
				}
			}
			if (retval) {
				break;
			}

			hibernate_stats.hibernate_throttled++;

			continue;
		}
		/*
		 * we've already factored out pages in the laundry which
		 * means this page can't be on the pageout queue so it's
		 * safe to do the vm_page_queues_remove
		 */
		vm_page_queues_remove(m, TRUE);

		if (m_object->internal == TRUE) {
			pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL);
		}

		vm_pageout_cluster(m);

		hibernate_stats.hibernate_found_dirty++;

		goto next_pg;

reenter_pg_on_q:
		/* move the page from the head to the tail of its queue */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		hibernate_stats.hibernate_reentered_on_q++;
next_pg:
		hibernate_stats.hibernate_considered++;

		qcount--;
		try_failed_count = 0;
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);

	return retval;
}
7110
7111
/*
 * hibernate_flush_dirty_pages:
 *
 * Push dirty pages from every pageable queue (speculative, inactive,
 * anonymous, cleaned, then active) through the pageout path, draining
 * the internal pageout queue along the way, so that dirty data is
 * compressed/written before the hibernation image is captured.
 *
 * On pass 1 the flush of the active queue is bracketed with compressor
 * warm-up recording.  Returns non-zero as soon as any queue flush or
 * drain fails/aborts, 0 on full success.
 */
static int
hibernate_flush_dirty_pages(int pass)
{
	struct vm_speculative_age_q *aq;
	uint32_t i;

	/* pull any per-cpu local pages back onto the global queues first */
	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, FALSE);
		}
	}

	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		int qcount;
		vm_page_t m;

		aq = &vm_page_queue_speculative[i];

		if (vm_page_queue_empty(&aq->age_q)) {
			continue;
		}
		qcount = 0;

		vm_page_lockspin_queues();

		/* count the pages on this age queue under the lock */
		vm_page_queue_iterate(&aq->age_q, m, vmp_pageq) {
			qcount++;
		}
		vm_page_unlock_queues();

		if (qcount) {
			if (hibernate_flush_queue(&aq->age_q, qcount)) {
				return 1;
			}
		}
	}
	if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) {
		return 1;
	}
	/* XXX FBDP TODO: flush secluded queue */
	if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) {
		return 1;
	}
	if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) {
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		return 1;
	}

	if (pass == 1) {
		vm_compressor_record_warmup_start();
	}

	if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (pass == 1) {
		vm_compressor_record_warmup_end();
	}

	if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) {
		return 1;
	}

	return 0;
}
7188
7189
7190 void
hibernate_reset_stats()7191 hibernate_reset_stats()
7192 {
7193 bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
7194 }
7195
7196
/*
 * Top-level memory flush for hibernation: compress all dirty pages,
 * flush the compressor itself, and (when a buffer-cache collector is
 * registered) reclaim wired buffer-cache/zone memory.
 *
 * Sets hibernate_cleaning_in_progress around the whole operation.
 * Returns: 0 on success, non-zero if hibernate_flush_dirty_pages() failed.
 */
int
hibernate_flush_memory()
{
	int retval;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);

	hibernate_cleaning_in_progress = TRUE;
	hibernate_skip_external = FALSE;

	if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		/* Push everything the compressor is holding out to the swap/image path. */
		vm_compressor_flush();

		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		if (consider_buffer_cache_collect != NULL) {
			unsigned int orig_wire_count;

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
			orig_wire_count = vm_page_wire_count;

			/* Ask the buffer cache to give back memory, then drain zones. */
			(void)(*consider_buffer_cache_collect)(1);
			zone_gc(ZONE_GC_DRAIN);

			HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
		}
	}
	hibernate_cleaning_in_progress = FALSE;

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);

	if (retval) {
		HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
	}


	HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
	    hibernate_stats.hibernate_considered,
	    hibernate_stats.hibernate_reentered_on_q,
	    hibernate_stats.hibernate_found_dirty);
	HIBPRINT("   skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
	    hibernate_stats.hibernate_skipped_cleaning,
	    hibernate_stats.hibernate_skipped_transient,
	    hibernate_stats.hibernate_skipped_precious,
	    hibernate_stats.hibernate_skipped_external,
	    hibernate_stats.hibernate_queue_nolock);
	HIBPRINT("   queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
	    hibernate_stats.hibernate_queue_paused,
	    hibernate_stats.hibernate_throttled,
	    hibernate_stats.hibernate_throttle_timeout,
	    hibernate_stats.hibernate_drained,
	    hibernate_stats.hibernate_drain_timeout);

	return retval;
}
7258
7259
7260 static void
hibernate_page_list_zero(hibernate_page_list_t * list)7261 hibernate_page_list_zero(hibernate_page_list_t *list)
7262 {
7263 uint32_t bank;
7264 hibernate_bitmap_t * bitmap;
7265
7266 bitmap = &list->bank_bitmap[0];
7267 for (bank = 0; bank < list->bank_count; bank++) {
7268 uint32_t last_bit;
7269
7270 bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
7271 // set out-of-bound bits at end of bitmap.
7272 last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
7273 if (last_bit) {
7274 bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
7275 }
7276
7277 bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
7278 }
7279 }
7280
7281 void
hibernate_free_gobble_pages(void)7282 hibernate_free_gobble_pages(void)
7283 {
7284 vm_page_t m, next;
7285 uint32_t count = 0;
7286
7287 m = (vm_page_t) hibernate_gobble_queue;
7288 while (m) {
7289 next = m->vmp_snext;
7290 vm_page_free(m);
7291 count++;
7292 m = next;
7293 }
7294 hibernate_gobble_queue = VM_PAGE_NULL;
7295
7296 if (count) {
7297 HIBLOG("Freed %d pages\n", count);
7298 }
7299 }
7300
/*
 * Decide whether page m can be discarded on wakeup instead of being
 * written into the hibernation image.  A page is discardable when it is
 * clean (after folding in the pmap ref/mod state) or belongs to a
 * volatile/empty purgeable object, and none of the disqualifiers below
 * (wired, precious, busy, unusual, cleaning, laundry, ...) apply.
 *
 * preflight: when TRUE this is a counting-only pass — no statistics
 *            are updated.
 * Returns:   TRUE if the page may be discarded.
 */
static boolean_t
hibernate_consider_discard(vm_page_t m, boolean_t preflight)
{
	vm_object_t object = NULL;
	int refmod_state;
	boolean_t discard = FALSE;

	do{
		if (m->vmp_private) {
			panic("hibernate_consider_discard: private");
		}

		object = VM_PAGE_OBJECT(m);

		/* trylock only — we must not block here; on failure keep the page */
		if (!vm_object_lock_try(object)) {
			object = NULL;
			if (!preflight) {
				hibernate_stats.cd_lock_failed++;
			}
			break;
		}
		if (VM_PAGE_WIRED(m)) {
			if (!preflight) {
				hibernate_stats.cd_found_wired++;
			}
			break;
		}
		if (m->vmp_precious) {
			if (!preflight) {
				hibernate_stats.cd_found_precious++;
			}
			break;
		}
		if (m->vmp_busy || !object->alive) {
			/*
			 * Somebody is playing with this page.
			 */
			if (!preflight) {
				hibernate_stats.cd_found_busy++;
			}
			break;
		}
		if (m->vmp_absent || m->vmp_unusual || m->vmp_error) {
			/*
			 * If it's unusual in any way, ignore it
			 */
			if (!preflight) {
				hibernate_stats.cd_found_unusual++;
			}
			break;
		}
		if (m->vmp_cleaning) {
			if (!preflight) {
				hibernate_stats.cd_found_cleaning++;
			}
			break;
		}
		if (m->vmp_laundry) {
			if (!preflight) {
				hibernate_stats.cd_found_laundry++;
			}
			break;
		}
		if (!m->vmp_dirty) {
			/*
			 * Fold the hardware ref/mod bits into the page before
			 * judging cleanliness.
			 */
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * If it's clean or purgeable we can discard the page on wakeup.
		 */
		discard = (!m->vmp_dirty)
		    || (VM_PURGABLE_VOLATILE == object->purgable)
		    || (VM_PURGABLE_EMPTY == object->purgable);


		if (discard == FALSE) {
			if (!preflight) {
				hibernate_stats.cd_found_dirty++;
			}
		} else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
			/*
			 * Keep up to HIBERNATE_XPMAPPED_LIMIT recently-referenced,
			 * executable-mapped external pages in the image rather
			 * than discarding them.
			 */
			if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
				if (!preflight) {
					hibernate_stats.cd_found_xpmapped++;
				}
				discard = FALSE;
			} else {
				if (!preflight) {
					hibernate_stats.cd_skipped_xpmapped++;
				}
			}
		}
	}while (FALSE);

	if (object) {
		vm_object_unlock(object);
	}

	return discard;
}
7407
7408
/*
 * Actually discard a page that hibernate_consider_discard() approved:
 * disconnect its mappings and free it.  If the page belongs to a
 * VOLATILE purgeable object, the object is transitioned to EMPTY and
 * the purgeable accounting is updated for all of its resident pages.
 */
static void
hibernate_discard_page(vm_page_t m)
{
	vm_object_t m_object;

	if (m->vmp_absent || m->vmp_unusual || m->vmp_error) {
		/*
		 * If it's unusual in any way, ignore
		 */
		return;
	}

	m_object = VM_PAGE_OBJECT(m);

#if MACH_ASSERT || DEBUG
	/* Debug builds re-verify that the object lock is uncontended. */
	if (!vm_object_lock_try(m_object)) {
		panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
	}
#else
	/* No need to lock page queue for token delete, hibernate_vm_unlock()
	 *  makes sure these locks are uncontended before sleep */
#endif /* MACH_ASSERT || DEBUG */

	if (m->vmp_pmapped == TRUE) {
		__unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	}

	if (m->vmp_laundry) {
		panic("hibernate_discard_page(%p) laundry", m);
	}
	if (m->vmp_private) {
		panic("hibernate_discard_page(%p) private", m);
	}
	if (m->vmp_fictitious) {
		panic("hibernate_discard_page(%p) fictitious", m);
	}

	if (VM_PURGABLE_VOLATILE == m_object->purgable) {
		/* object should be on a queue */
		assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL));
		purgeable_q_t old_queue = vm_purgeable_object_remove(m_object);
		assert(old_queue);
		if (m_object->purgeable_when_ripe) {
			vm_purgeable_token_delete_first(old_queue);
		}
		vm_object_lock_assert_exclusive(m_object);
		m_object->purgable = VM_PURGABLE_EMPTY;

		/*
		 * Purgeable ledgers:  pages of VOLATILE and EMPTY objects are
		 * accounted in the "volatile" ledger, so no change here.
		 * We have to update vm_page_purgeable_count, though, since we're
		 * effectively purging this object.
		 */
		unsigned int delta;
		assert(m_object->resident_page_count >= m_object->wired_page_count);
		delta = (m_object->resident_page_count - m_object->wired_page_count);
		assert(vm_page_purgeable_count >= delta);
		assert(delta > 0);
		OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count);
	}

	vm_page_free(m);

#if MACH_ASSERT || DEBUG
	vm_object_unlock(m_object);
#endif  /* MACH_ASSERT || DEBUG */
}
7477
7478 /*
7479 * Grab locks for hibernate_page_list_setall()
7480 */
void
hibernate_vm_lock_queues(void)
{
	/*
	 * Lock order: compressor object, then page queues, then the free
	 * and purgeable queue mutexes, then every per-cpu local queue.
	 * hibernate_vm_unlock_queues() must release in exactly the
	 * reverse order.
	 */
	vm_object_lock(compressor_object);
	vm_page_lock_queues();
	lck_mtx_lock(&vm_page_queue_free_lock);
	lck_mtx_lock(&vm_purgeable_queue_lock);

	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
}
7495
void
hibernate_vm_unlock_queues(void)
{
	/* Release in the exact reverse order of hibernate_vm_lock_queues(). */
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	lck_mtx_unlock(&vm_purgeable_queue_lock);
	lck_mtx_unlock(&vm_page_queue_free_lock);
	vm_page_unlock_queues();
	vm_object_unlock(compressor_object);
}
7509
7510 /*
7511 * Bits zero in the bitmaps => page needs to be saved. All pages default to be saved,
7512 * pages known to VM to not need saving are subtracted.
7513 * Wired pages to be saved are present in page_list_wired, pageable in page_list.
7514 */
7515
/*
 * Build the hibernation page bitmaps.  Every physical page defaults to
 * "save"; pages the VM knows need no saving (free, gobbled, discarded)
 * get their bit set.  Pageable pages to save end up in page_list,
 * wired pages in page_list_wired.
 *
 * preflight:    counting-only pass; the page lists are ignored (NULLed)
 *               and nothing is modified or discarded.
 * will_discard: when TRUE (and not preflight) discardable pages are
 *               freed immediately and vm structs are torn down.
 * pagesOut:     receives the number of pages the image must contain.
 */
void
hibernate_page_list_setall(hibernate_page_list_t * page_list,
    hibernate_page_list_t * page_list_wired,
    hibernate_page_list_t * page_list_pal,
    boolean_t preflight,
    boolean_t will_discard,
    uint32_t * pagesOut)
{
	uint64_t start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	uint32_t pages = page_list->page_count;
	uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
	uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
	uint32_t count_wire = pages;
	uint32_t count_discard_active = 0;
	uint32_t count_discard_inactive = 0;
	uint32_t count_discard_cleaned = 0;
	uint32_t count_discard_purgeable = 0;
	uint32_t count_discard_speculative = 0;
	uint32_t count_discard_vm_struct_pages = 0;
	uint32_t i;
	uint32_t bank;
	hibernate_bitmap_t * bitmap;
	hibernate_bitmap_t * bitmap_wired;
	boolean_t discard_all;
	boolean_t discard = FALSE;

	HIBLOG("hibernate_page_list_setall(preflight %d) start\n", preflight);

	if (preflight) {
		/* preflight only counts — never touch the lists or discard */
		page_list = NULL;
		page_list_wired = NULL;
		page_list_pal = NULL;
		discard_all = FALSE;
	} else {
		discard_all = will_discard;
	}

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		assert(hibernate_vm_locks_are_safe());
		vm_page_lock_queues();
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_LOCK(&lq->vpl_lock);
			}
		}
	}
#endif  /* MACH_ASSERT || DEBUG */


	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);

	clock_get_uptime(&start);

	if (!preflight) {
		hibernate_page_list_zero(page_list);
		hibernate_page_list_zero(page_list_wired);
		hibernate_page_list_zero(page_list_pal);

		hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
		hibernate_stats.cd_pages = pages;
	}

	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, !preflight);
		}
	}

	if (preflight) {
		/* preflight takes its own locks (non-preflight callers hold them) */
		vm_object_lock(compressor_object);
		vm_page_lock_queues();
		lck_mtx_lock(&vm_page_queue_free_lock);
	}

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	hibernation_vmqueues_inspection = TRUE;

	/* Gobbled pages need no saving. */
	m = (vm_page_t) hibernate_gobble_queue;
	while (m) {
		pages--;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		m = m->vmp_snext;
	}

	/* Per-cpu local free lists need no saving. */
	if (!preflight) {
		percpu_foreach(free_pages_head, free_pages) {
			for (m = *free_pages_head; m; m = m->vmp_snext) {
				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);

				pages--;
				count_wire--;
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

				hibernate_stats.cd_local_free++;
				hibernate_stats.cd_total_free++;
			}
		}
	}

	/* Global free queues (one per color) need no saving. */
	for (i = 0; i < vm_colors; i++) {
		vm_page_queue_iterate(&vm_page_queue_free[i].qhead, m, vmp_pageq) {
			assert(m->vmp_q_state == VM_PAGE_ON_FREE_Q);

			pages--;
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

				hibernate_stats.cd_total_free++;
			}
		}
	}

	/* Free low-memory pages need no saving. */
	vm_page_queue_iterate(&vm_lopage_queue_free, m, vmp_pageq) {
		assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);

		pages--;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

			hibernate_stats.cd_total_free++;
		}
	}

	/*
	 * The queue scans below share one pattern: mark discardable pages
	 * in page_list (bit set => not saved), clear the page's bit in the
	 * wired list, and — when discard_all — free the page right away.
	 */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

		/* fetch next before m can be discarded */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
		    && hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			count_discard_inactive++;
			discard = discard_all;
		} else {
			count_throttled++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}

		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	m = (vm_page_t)vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			/* dirty+discardable means purgeable (see hibernate_consider_discard) */
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			count_anonymous++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			discard = discard_all;
		} else {
			count_cleaned++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		/* active pages are only discarded under DiscardCleanActive mode */
		if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			discard = discard_all;
		} else {
			count_active++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			count_inactive++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
			    "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
			    m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);

			next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			discard = FALSE;
			if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
			    hibernate_consider_discard(m, preflight)) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
				count_discard_speculative++;
				discard = discard_all;
			} else {
				count_speculative++;
			}
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (discard) {
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	/* Compressor-held pages are wired: saved via page_list_wired only. */
	vm_page_queue_iterate(&compressor_object->memq, m, vmp_listq) {
		assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);

		count_compressor++;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
	}

	if (preflight == FALSE && discard_all == TRUE) {
		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START);

		HIBLOG("hibernate_teardown started\n");
		count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
		HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);

		pages -= count_discard_vm_struct_pages;
		count_wire -= count_discard_vm_struct_pages;

		hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;

		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_END);
	}

	if (!preflight) {
		// pull wired from hibernate_bitmap
		bitmap = &page_list->bank_bitmap[0];
		bitmap_wired = &page_list_wired->bank_bitmap[0];
		for (bank = 0; bank < page_list->bank_count; bank++) {
			/* in page_list, mark every page NOT marked wired as "no save" */
			for (i = 0; i < bitmap->bitmapwords; i++) {
				bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
			}
			bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords];
			bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
		}
	}

	// machine dependent adjustments
	hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);

	if (!preflight) {
		hibernate_stats.cd_count_wire = count_wire;
		hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable +
		    count_discard_speculative + count_discard_cleaned + count_discard_vm_struct_pages;
	}

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);

	HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, zf %d, throt %d, compr %d, xpmapped %d\n %s discard act %d inact %d purgeable %d spec %d cleaned %d\n",
	    pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
	    discard_all ? "did" : "could",
	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);

	if (hibernate_stats.cd_skipped_xpmapped) {
		HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n", hibernate_stats.cd_skipped_xpmapped);
	}

	*pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative - count_discard_cleaned;

	if (preflight && will_discard) {
		*pagesOut -= count_compressor + count_throttled + count_anonymous + count_inactive + count_cleaned + count_speculative + count_active;
		/*
		 * We try to keep max HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image
		 * even if these are clean and so we need to size the hibernation image accordingly.
		 *
		 * NB: We have to assume all HIBERNATE_XPMAPPED_LIMIT pages might show up because 'dirty'
		 * xpmapped pages aren't distinguishable from other 'dirty' pages in preflight. So we might
		 * only see part of the xpmapped pages if we look at 'cd_found_xpmapped' which solely tracks
		 * clean xpmapped pages.
		 *
		 * Since these pages are all cleaned by the time we are in the post-preflight phase, we might
		 * see a much larger number in 'cd_found_xpmapped' now than we did in the preflight phase
		 */
		*pagesOut += HIBERNATE_XPMAPPED_LIMIT;
	}

	hibernation_vmqueues_inspection = FALSE;

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_UNLOCK(&lq->vpl_lock);
			}
		}
		vm_page_unlock_queues();
	}
#endif  /* MACH_ASSERT || DEBUG */

	if (preflight) {
		lck_mtx_unlock(&vm_page_queue_free_lock);
		vm_page_unlock_queues();
		vm_object_unlock(compressor_object);
	}

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
}
7930
/*
 * Second-phase discard: walk the anonymous, speculative, inactive,
 * active and cleaned queues and free every page whose bit is set in
 * page_list (i.e. pages hibernate_page_list_setall() marked as not
 * needing to be saved).
 */
void
hibernate_page_list_discard(hibernate_page_list_t * page_list)
{
	uint64_t start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	uint32_t i;
	uint32_t count_discard_active = 0;
	uint32_t count_discard_inactive = 0;
	uint32_t count_discard_purgeable = 0;
	uint32_t count_discard_cleaned = 0;
	uint32_t count_discard_speculative = 0;


#if MACH_ASSERT || DEBUG
	vm_page_lock_queues();
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&start);

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);

		/* fetch next before the page is freed out from under us */
		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

			next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
				count_discard_speculative++;
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

#if MACH_ASSERT || DEBUG
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	vm_page_unlock_queues();
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
	    nsec / 1000000ULL,
	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
}
8050
boolean_t       hibernate_paddr_map_inited = FALSE;
/* -1 wraps to UINT_MAX: sentinel meaning "no compacted entries yet" */
unsigned int    hibernate_teardown_last_valid_compact_indx = -1;
vm_page_t       hibernate_rebuild_hash_list = NULL;

unsigned int    hibernate_teardown_found_tabled_pages = 0;
unsigned int    hibernate_teardown_found_created_pages = 0;
unsigned int    hibernate_teardown_found_free_pages = 0;
unsigned int    hibernate_teardown_vm_page_free_count;


/*
 * Describes one run of vm_pages[] entries with consecutive physical
 * page numbers: indices [ppnm_sindx, ppnm_eindx) map to physical pages
 * ppnm_base_paddr + (index - ppnm_sindx).  Built by
 * hibernate_create_paddr_map(), consumed by hibernate_lookup_paddr().
 */
struct ppnum_mapping {
	struct ppnum_mapping    *ppnm_next;
	ppnum_t                 ppnm_base_paddr;
	unsigned int            ppnm_sindx;
	unsigned int            ppnm_eindx;
};

struct ppnum_mapping    *ppnm_head;
/* single-entry lookup cache for hibernate_lookup_paddr() */
struct ppnum_mapping    *ppnm_last_found = NULL;
8070
8071
/*
 * Build the ppnum_mapping list: one node per run of vm_pages[] entries
 * whose physical page numbers are consecutive.  Idempotent — does
 * nothing after the first successful call.
 */
void
hibernate_create_paddr_map(void)
{
	unsigned int i;
	ppnum_t next_ppnum_in_run = 0;
	struct ppnum_mapping *ppnm = NULL;

	if (hibernate_paddr_map_inited == FALSE) {
		for (i = 0; i < vm_pages_count; i++) {
			/*
			 * Keep extending the current run's end index; it is
			 * finalized when a discontinuity starts a new run.
			 */
			if (ppnm) {
				ppnm->ppnm_eindx = i;
			}

			if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) != next_ppnum_in_run) {
				/* physical discontinuity: start a new run node */
				ppnm = zalloc_permanent_type(struct ppnum_mapping);

				ppnm->ppnm_next = ppnm_head;
				ppnm_head = ppnm;

				ppnm->ppnm_sindx = i;
				ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]);
			}
			next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) + 1;
		}
		/* close out the final run */
		ppnm->ppnm_eindx = vm_pages_count;

		hibernate_paddr_map_inited = TRUE;
	}
}
8101
8102 ppnum_t
hibernate_lookup_paddr(unsigned int indx)8103 hibernate_lookup_paddr(unsigned int indx)
8104 {
8105 struct ppnum_mapping *ppnm = NULL;
8106
8107 ppnm = ppnm_last_found;
8108
8109 if (ppnm) {
8110 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
8111 goto done;
8112 }
8113 }
8114 for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
8115 if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
8116 ppnm_last_found = ppnm;
8117 break;
8118 }
8119 }
8120 if (ppnm == NULL) {
8121 panic("hibernate_lookup_paddr of %d failed", indx);
8122 }
8123 done:
8124 return ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx);
8125 }
8126
8127
8128 uint32_t
hibernate_mark_as_unneeded(addr64_t saddr,addr64_t eaddr,hibernate_page_list_t * page_list,hibernate_page_list_t * page_list_wired)8129 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
8130 {
8131 addr64_t saddr_aligned;
8132 addr64_t eaddr_aligned;
8133 addr64_t addr;
8134 ppnum_t paddr;
8135 unsigned int mark_as_unneeded_pages = 0;
8136
8137 saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
8138 eaddr_aligned = eaddr & ~PAGE_MASK_64;
8139
8140 for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
8141 paddr = pmap_find_phys(kernel_pmap, addr);
8142
8143 assert(paddr);
8144
8145 hibernate_page_bitset(page_list, TRUE, paddr);
8146 hibernate_page_bitset(page_list_wired, TRUE, paddr);
8147
8148 mark_as_unneeded_pages++;
8149 }
8150 return mark_as_unneeded_pages;
8151 }
8152
8153
8154 void
hibernate_hash_insert_page(vm_page_t mem)8155 hibernate_hash_insert_page(vm_page_t mem)
8156 {
8157 vm_page_bucket_t *bucket;
8158 int hash_id;
8159 vm_object_t m_object;
8160
8161 m_object = VM_PAGE_OBJECT(mem);
8162
8163 assert(mem->vmp_hashed);
8164 assert(m_object);
8165 assert(mem->vmp_offset != (vm_object_offset_t) -1);
8166
8167 /*
8168 * Insert it into the object_object/offset hash table
8169 */
8170 hash_id = vm_page_hash(m_object, mem->vmp_offset);
8171 bucket = &vm_page_buckets[hash_id];
8172
8173 mem->vmp_next_m = bucket->page_list;
8174 bucket->page_list = VM_PAGE_PACK_PTR(mem);
8175 }
8176
8177
8178 void
hibernate_free_range(int sindx,int eindx)8179 hibernate_free_range(int sindx, int eindx)
8180 {
8181 vm_page_t mem;
8182 unsigned int color;
8183
8184 while (sindx < eindx) {
8185 mem = &vm_pages[sindx];
8186
8187 vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE);
8188
8189 mem->vmp_lopage = FALSE;
8190 mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
8191
8192 color = VM_PAGE_GET_COLOR(mem);
8193 #if defined(__x86_64__)
8194 vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
8195 #else
8196 vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vmp_pageq);
8197 #endif
8198 vm_page_free_count++;
8199
8200 sindx++;
8201 }
8202 }
8203
/*
 * Undo the compaction performed by hibernate_teardown_vm_structs():
 * rebuild the page hash, move each displaced vm_page_t back to its
 * original vm_pages[] slot, and re-create the free ranges between
 * them.  Runs on wake; no-op unless a teardown actually happened.
 */
void
hibernate_rebuild_vm_structs(void)
{
	int             i, cindx, sindx, eindx;
	vm_page_t       mem, tmem, mem_next;
	AbsoluteTime    startTime, endTime;
	uint64_t        nsec;

	if (hibernate_rebuild_needed == FALSE) {
		return;
	}

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START);
	HIBLOG("hibernate_rebuild started\n");

	clock_get_uptime(&startTime);

	/* let the platform layer restore its pmap structures first */
	pal_hib_rebuild_pmap_structs();

	/* the hash is rebuilt from scratch below */
	bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
	eindx = vm_pages_count;

	/*
	 * Mark all the vm_pages[] that have not been initialized yet as being
	 * transient. This is needed to ensure that buddy page search is correct.
	 * Without this random data in these vm_pages[] can trip the buddy search
	 */
	for (i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) {
		vm_pages[i].vmp_q_state = VM_PAGE_NOT_ON_Q;
	}

	/*
	 * Walk the compacted entries from highest to lowest, restoring each
	 * one to its original slot and freeing the hole above it.
	 */
	for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		mem = &vm_pages[cindx];
		assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
		/*
		 * hibernate_teardown_vm_structs leaves the location where
		 * this vm_page_t must be located in "next".
		 */
		tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);

		sindx = (int)(tmem - &vm_pages[0]);

		if (mem != tmem) {
			/*
			 * this vm_page_t was moved by hibernate_teardown_vm_structs,
			 * so move it back to its real location
			 */
			*tmem = *mem;
			mem = tmem;
		}
		if (mem->vmp_hashed) {
			hibernate_hash_insert_page(mem);
		}
		/*
		 * the 'hole' between this vm_page_t and the previous
		 * vm_page_t we moved needs to be initialized as
		 * a range of free vm_page_t's
		 */
		hibernate_free_range(sindx + 1, eindx);

		eindx = sindx;
	}
	/*
	 * NOTE(review): 'sindx' is only assigned inside the loop above, so this
	 * relies on hibernate_teardown_last_valid_compact_indx being >= 0
	 * whenever hibernate_rebuild_needed is set -- TODO confirm.
	 */
	if (sindx) {
		hibernate_free_range(0, sindx);
	}

	assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);

	/*
	 * process the list of vm_page_t's that were entered in the hash,
	 * but were not located in the vm_pages array... these are
	 * vm_page_t's that were created on the fly (i.e. fictitious)
	 */
	for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
		mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

		mem->vmp_next_m = 0;
		hibernate_hash_insert_page(mem);
	}
	hibernate_rebuild_hash_list = NULL;

	clock_get_uptime(&endTime);
	SUB_ABSOLUTETIME(&endTime, &startTime);
	absolutetime_to_nanoseconds(endTime, &nsec);

	HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);

	hibernate_rebuild_needed = FALSE;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END);
}
8296
/*
 * Compact the VM's page metadata before the hibernation image is
 * written: take every free page off the free queues, slide in-use
 * vm_page_t entries down over the freed slots, and mark the now
 * unused tails of vm_page_buckets[], vm_pages[] and the pmap data
 * as not needing to be preserved in the image.  Returns the number
 * of pages marked unneeded.  hibernate_rebuild_vm_structs() undoes
 * all of this on wake.
 */
uint32_t
hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
{
	unsigned int i;
	unsigned int compact_target_indx;
	vm_page_t mem, mem_next;
	vm_page_bucket_t *bucket;
	unsigned int mark_as_unneeded_pages = 0;
	unsigned int unneeded_vm_page_bucket_pages = 0;
	unsigned int unneeded_vm_pages_pages = 0;
	unsigned int unneeded_pmap_pages = 0;
	addr64_t start_of_unneeded = 0;
	addr64_t end_of_unneeded = 0;


	if (hibernate_should_abort()) {
		return 0;
	}

	hibernate_rebuild_needed = TRUE;

	HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, active_pages %d, inactive_pages %d, speculative_pages %d, cleaned_pages %d, compressor_pages %d\n",
	    vm_page_wire_count, vm_page_free_count, vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
	    vm_page_cleaned_count, compressor_object->resident_page_count);

	/*
	 * Save any hashed pages that live outside vm_pages[] (fictitious
	 * pages) on hibernate_rebuild_hash_list; pages inside vm_pages[]
	 * are re-hashed from the array itself at rebuild time.
	 */
	for (i = 0; i < vm_page_bucket_count; i++) {
		bucket = &vm_page_buckets[i];

		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
			assert(mem->vmp_hashed);

			mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

			if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) {
				mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
				hibernate_rebuild_hash_list = mem;
			}
		}
	}
	/* the hash buckets themselves are rebuilt on wake, so don't save them */
	unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0], (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;

	/* remembered so the rebuild can verify it restored every free page */
	hibernate_teardown_vm_page_free_count = vm_page_free_count;

	compact_target_indx = 0;

	for (i = 0; i < vm_pages_count; i++) {
		mem = &vm_pages[i];

		if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
			unsigned int color;

			assert(mem->vmp_busy);
			assert(!mem->vmp_lopage);

			color = VM_PAGE_GET_COLOR(mem);

			vm_page_queue_remove(&vm_page_queue_free[color].qhead, mem, vmp_pageq);

			VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

			vm_page_free_count--;

			hibernate_teardown_found_free_pages++;

			/* remember the lowest free slot as the next compaction target */
			if (vm_pages[compact_target_indx].vmp_q_state != VM_PAGE_ON_FREE_Q) {
				compact_target_indx = i;
			}
		} else {
			/*
			 * record this vm_page_t's original location
			 * we need this even if it doesn't get moved
			 * as an indicator to the rebuild function that
			 * we don't have to move it
			 */
			mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);

			if (vm_pages[compact_target_indx].vmp_q_state == VM_PAGE_ON_FREE_Q) {
				/*
				 * we've got a hole to fill, so
				 * move this vm_page_t to it's new home
				 */
				vm_pages[compact_target_indx] = *mem;
				/* the vacated slot is now free space */
				mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

				hibernate_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				hibernate_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* everything above the last compacted entry need not be saved */
	unneeded_vm_pages_pages = hibernate_mark_as_unneeded((addr64_t)&vm_pages[hibernate_teardown_last_valid_compact_indx + 1],
	    (addr64_t)&vm_pages[vm_pages_count - 1], page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_pages_pages;

	/* ask the platform layer which of its pmap data can also be dropped */
	pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);

	if (start_of_unneeded) {
		unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded, end_of_unneeded, page_list, page_list_wired);
		mark_as_unneeded_pages += unneeded_pmap_pages;
	}
	HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n", unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);

	return mark_as_unneeded_pages;
}
8403
8404
8405 #endif /* HIBERNATION */
8406
8407 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8408
8409 #include <mach_vm_debug.h>
8410 #if MACH_VM_DEBUG
8411
8412 #include <mach_debug/hash_info.h>
8413 #include <vm/vm_debug.h>
8414
8415 /*
8416 * Routine: vm_page_info
8417 * Purpose:
8418 * Return information about the global VP table.
8419 * Fills the buffer with as much information as possible
8420 * and returns the desired size of the buffer.
8421 * Conditions:
8422 * Nothing locked. The caller should provide
8423 * possibly-pageable memory.
8424 */
8425
8426 unsigned int
vm_page_info(hash_info_bucket_t * info,unsigned int count)8427 vm_page_info(
8428 hash_info_bucket_t *info,
8429 unsigned int count)
8430 {
8431 unsigned int i;
8432 lck_spin_t *bucket_lock;
8433
8434 if (vm_page_bucket_count < count) {
8435 count = vm_page_bucket_count;
8436 }
8437
8438 for (i = 0; i < count; i++) {
8439 vm_page_bucket_t *bucket = &vm_page_buckets[i];
8440 unsigned int bucket_count = 0;
8441 vm_page_t m;
8442
8443 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
8444 lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);
8445
8446 for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
8447 m != VM_PAGE_NULL;
8448 m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) {
8449 bucket_count++;
8450 }
8451
8452 lck_spin_unlock(bucket_lock);
8453
8454 /* don't touch pageable memory while holding locks */
8455 info[i].hib_count = bucket_count;
8456 }
8457
8458 return vm_page_bucket_count;
8459 }
8460 #endif /* MACH_VM_DEBUG */
8461
8462 #if VM_PAGE_BUCKETS_CHECK
8463 void
vm_page_buckets_check(void)8464 vm_page_buckets_check(void)
8465 {
8466 unsigned int i;
8467 vm_page_t p;
8468 unsigned int p_hash;
8469 vm_page_bucket_t *bucket;
8470 lck_spin_t *bucket_lock;
8471
8472 if (!vm_page_buckets_check_ready) {
8473 return;
8474 }
8475
8476 #if HIBERNATION
8477 if (hibernate_rebuild_needed ||
8478 hibernate_rebuild_hash_list) {
8479 panic("BUCKET_CHECK: hibernation in progress: "
8480 "rebuild_needed=%d rebuild_hash_list=%p\n",
8481 hibernate_rebuild_needed,
8482 hibernate_rebuild_hash_list);
8483 }
8484 #endif /* HIBERNATION */
8485
8486 #if VM_PAGE_FAKE_BUCKETS
8487 char *cp;
8488 for (cp = (char *) vm_page_fake_buckets_start;
8489 cp < (char *) vm_page_fake_buckets_end;
8490 cp++) {
8491 if (*cp != 0x5a) {
8492 panic("BUCKET_CHECK: corruption at %p in fake buckets "
8493 "[0x%llx:0x%llx]\n",
8494 cp,
8495 (uint64_t) vm_page_fake_buckets_start,
8496 (uint64_t) vm_page_fake_buckets_end);
8497 }
8498 }
8499 #endif /* VM_PAGE_FAKE_BUCKETS */
8500
8501 for (i = 0; i < vm_page_bucket_count; i++) {
8502 vm_object_t p_object;
8503
8504 bucket = &vm_page_buckets[i];
8505 if (!bucket->page_list) {
8506 continue;
8507 }
8508
8509 bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
8510 lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);
8511 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
8512
8513 while (p != VM_PAGE_NULL) {
8514 p_object = VM_PAGE_OBJECT(p);
8515
8516 if (!p->vmp_hashed) {
8517 panic("BUCKET_CHECK: page %p (%p,0x%llx) "
8518 "hash %d in bucket %d at %p "
8519 "is not hashed\n",
8520 p, p_object, p->vmp_offset,
8521 p_hash, i, bucket);
8522 }
8523 p_hash = vm_page_hash(p_object, p->vmp_offset);
8524 if (p_hash != i) {
8525 panic("BUCKET_CHECK: corruption in bucket %d "
8526 "at %p: page %p object %p offset 0x%llx "
8527 "hash %d\n",
8528 i, bucket, p, p_object, p->vmp_offset,
8529 p_hash);
8530 }
8531 p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
8532 }
8533 lck_spin_unlock(bucket_lock);
8534 }
8535
8536 // printf("BUCKET_CHECK: checked buckets\n");
8537 }
8538 #endif /* VM_PAGE_BUCKETS_CHECK */
8539
/*
 * 'vm_fault_enter' will place newly created pages (zero-fill and COW) onto the
 * local queues if they exist... it's the only spot in the system where we add pages
 * to those queues... once on those queues, those pages can only move to one of the
 * global page queues or the free queues... they NEVER move from local q to local q.
 * the 'local' state is stable when vm_page_queues_remove is called since we're behind
 * the global vm_page_queue_lock at this point... we still need to take the local lock
 * in case this operation is being run on a different CPU than the local queue's identity,
 * but we don't have to worry about the page moving to a global queue or becoming wired
 * while we're grabbing the local lock since those operations would require the global
 * vm_page_queue_lock to be held, and we already own it.
 *
 * this is why it's safe to utilize the wire_count field in the vm_page_t as the local_id...
 * 'wired' and local are ALWAYS mutually exclusive conditions.
 */
8555
#if CONFIG_BACKGROUND_QUEUE
void
vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_backgroundq)
#else
void
vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
#endif
{
	/*
	 * Remove 'mem' from whatever paging queue it is currently on and
	 * leave it in the VM_PAGE_NOT_ON_Q state, fixing up the queue's
	 * counters.  Caller must hold the global vm_page_queue_lock.
	 * Pages on the pageout queue and pages on the free queues are NOT
	 * handled here (see the default: case below).
	 */
	boolean_t was_pageable = TRUE;
	vm_object_t m_object;

	m_object = VM_PAGE_OBJECT(mem);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) {
		/* not on any queue: only the background queue may need cleanup */
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
		if (remove_from_backgroundq == TRUE) {
			vm_page_remove_from_backgroundq(mem);
		}
		if (mem->vmp_on_backgroundq) {
			assert(mem->vmp_backgroundq.next != 0);
			assert(mem->vmp_backgroundq.prev != 0);
		} else {
			assert(mem->vmp_backgroundq.next == 0);
			assert(mem->vmp_backgroundq.prev == 0);
		}
#endif /* CONFIG_BACKGROUND_QUEUE */
		return;
	}

	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
		/* compressor pages are not on any paging queue */
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
		assert(mem->vmp_backgroundq.next == 0 &&
		    mem->vmp_backgroundq.prev == 0 &&
		    mem->vmp_on_backgroundq == FALSE);
#endif
		return;
	}
	if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
		/*
		 * might put these guys on a list for debugging purposes
		 * if we do, we'll need to remove this assert
		 */
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
#if CONFIG_BACKGROUND_QUEUE
		assert(mem->vmp_backgroundq.next == 0 &&
		    mem->vmp_backgroundq.prev == 0 &&
		    mem->vmp_on_backgroundq == FALSE);
#endif
		return;
	}

	assert(m_object != compressor_object);
	assert(m_object != kernel_object);
	assert(!mem->vmp_fictitious);

	/* dispatch on the specific queue the page is on */
	switch (mem->vmp_q_state) {
	case VM_PAGE_ON_ACTIVE_LOCAL_Q:
	{
		struct vpl *lq;

		/* see the comment block above this function re: local queues */
		lq = zpercpu_get_cpu(vm_page_local_q, mem->vmp_local_id);
		VPL_LOCK(&lq->vpl_lock);
		vm_page_queue_remove(&lq->vpl_queue, mem, vmp_pageq);
		mem->vmp_local_id = 0;
		lq->vpl_count--;
		if (m_object->internal) {
			lq->vpl_internal_count--;
		} else {
			lq->vpl_external_count--;
		}
		VPL_UNLOCK(&lq->vpl_lock);
		was_pageable = FALSE;
		break;
	}
	case VM_PAGE_ON_ACTIVE_Q:
	{
		vm_page_queue_remove(&vm_page_queue_active, mem, vmp_pageq);
		vm_page_active_count--;
		break;
	}

	case VM_PAGE_ON_INACTIVE_INTERNAL_Q:
	{
		assert(m_object->internal == TRUE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_anonymous, mem, vmp_pageq);
		vm_page_anonymous_count--;

		vm_purgeable_q_advance_all();
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_INACTIVE_EXTERNAL_Q:
	{
		assert(m_object->internal == FALSE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_inactive, mem, vmp_pageq);
		vm_purgeable_q_advance_all();
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_INACTIVE_CLEANED_Q:
	{
		assert(m_object->internal == FALSE);

		vm_page_inactive_count--;
		vm_page_queue_remove(&vm_page_queue_cleaned, mem, vmp_pageq);
		vm_page_cleaned_count--;
		vm_page_balance_inactive(3);
		break;
	}

	case VM_PAGE_ON_THROTTLED_Q:
	{
		assert(m_object->internal == TRUE);

		vm_page_queue_remove(&vm_page_queue_throttled, mem, vmp_pageq);
		vm_page_throttled_count--;
		was_pageable = FALSE;
		break;
	}

	case VM_PAGE_ON_SPECULATIVE_Q:
	{
		assert(m_object->internal == FALSE);

		/* speculative pages live on per-age-group queues; remque handles that */
		vm_page_remque(&mem->vmp_pageq);
		vm_page_speculative_count--;
		vm_page_balance_inactive(3);
		break;
	}

#if CONFIG_SECLUDED_MEMORY
	case VM_PAGE_ON_SECLUDED_Q:
	{
		vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
		vm_page_secluded_count--;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		if (m_object == VM_OBJECT_NULL) {
			vm_page_secluded_count_free--;
			was_pageable = FALSE;
		} else {
			assert(!m_object->internal);
			vm_page_secluded_count_inuse--;
			was_pageable = FALSE;
			// was_pageable = TRUE;
		}
		break;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	default:
	{
		/*
		 * if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
		 * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
		 * the caller is responsible for determining if the page is on that queue, and if so, must
		 * either first remove it (it needs both the page queues lock and the object lock to do
		 * this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove
		 *
		 * we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
		 * or any of the undefined states
		 */
		panic("vm_page_queues_remove - bad page q_state (%p, %d)", mem, mem->vmp_q_state);
		break;
	}
	}
	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

#if CONFIG_BACKGROUND_QUEUE
	if (remove_from_backgroundq == TRUE) {
		vm_page_remove_from_backgroundq(mem);
	}
#endif
	if (was_pageable) {
		/* page left a globally pageable queue: fix the pageable counters */
		if (m_object->internal) {
			vm_page_pageable_internal_count--;
		} else {
			vm_page_pageable_external_count--;
		}
	}
}
8747
8748 void
vm_page_remove_internal(vm_page_t page)8749 vm_page_remove_internal(vm_page_t page)
8750 {
8751 vm_object_t __object = VM_PAGE_OBJECT(page);
8752 if (page == __object->memq_hint) {
8753 vm_page_t __new_hint;
8754 vm_page_queue_entry_t __qe;
8755 __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
8756 if (vm_page_queue_end(&__object->memq, __qe)) {
8757 __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
8758 if (vm_page_queue_end(&__object->memq, __qe)) {
8759 __qe = NULL;
8760 }
8761 }
8762 __new_hint = (vm_page_t)((uintptr_t) __qe);
8763 __object->memq_hint = __new_hint;
8764 }
8765 vm_page_queue_remove(&__object->memq, page, vmp_listq);
8766 #if CONFIG_SECLUDED_MEMORY
8767 if (__object->eligible_for_secluded) {
8768 vm_page_secluded.eligible_for_secluded--;
8769 }
8770 #endif /* CONFIG_SECLUDED_MEMORY */
8771 }
8772
8773 void
vm_page_enqueue_inactive(vm_page_t mem,boolean_t first)8774 vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
8775 {
8776 vm_object_t m_object;
8777
8778 m_object = VM_PAGE_OBJECT(mem);
8779
8780 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
8781 assert(!mem->vmp_fictitious);
8782 assert(!mem->vmp_laundry);
8783 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
8784 vm_page_check_pageable_safe(mem);
8785
8786 if (m_object->internal) {
8787 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
8788
8789 if (first == TRUE) {
8790 vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vmp_pageq);
8791 } else {
8792 vm_page_queue_enter(&vm_page_queue_anonymous, mem, vmp_pageq);
8793 }
8794
8795 vm_page_anonymous_count++;
8796 vm_page_pageable_internal_count++;
8797 } else {
8798 mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
8799
8800 if (first == TRUE) {
8801 vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vmp_pageq);
8802 } else {
8803 vm_page_queue_enter(&vm_page_queue_inactive, mem, vmp_pageq);
8804 }
8805
8806 vm_page_pageable_external_count++;
8807 }
8808 vm_page_inactive_count++;
8809 token_new_pagecount++;
8810
8811 #if CONFIG_BACKGROUND_QUEUE
8812 if (mem->vmp_in_background) {
8813 vm_page_add_to_backgroundq(mem, FALSE);
8814 }
8815 #endif
8816 }
8817
8818 void
vm_page_enqueue_active(vm_page_t mem,boolean_t first)8819 vm_page_enqueue_active(vm_page_t mem, boolean_t first)
8820 {
8821 vm_object_t m_object;
8822
8823 m_object = VM_PAGE_OBJECT(mem);
8824
8825 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
8826 assert(!mem->vmp_fictitious);
8827 assert(!mem->vmp_laundry);
8828 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
8829 vm_page_check_pageable_safe(mem);
8830
8831 mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
8832 if (first == TRUE) {
8833 vm_page_queue_enter_first(&vm_page_queue_active, mem, vmp_pageq);
8834 } else {
8835 vm_page_queue_enter(&vm_page_queue_active, mem, vmp_pageq);
8836 }
8837 vm_page_active_count++;
8838
8839 if (m_object->internal) {
8840 vm_page_pageable_internal_count++;
8841 } else {
8842 vm_page_pageable_external_count++;
8843 }
8844
8845 #if CONFIG_BACKGROUND_QUEUE
8846 if (mem->vmp_in_background) {
8847 vm_page_add_to_backgroundq(mem, FALSE);
8848 }
8849 #endif
8850 vm_page_balance_inactive(3);
8851 }
8852
8853 /*
8854 * Pages from special kernel objects shouldn't
8855 * be placed on pageable queues.
8856 */
8857 void
vm_page_check_pageable_safe(vm_page_t page)8858 vm_page_check_pageable_safe(vm_page_t page)
8859 {
8860 vm_object_t page_object;
8861
8862 page_object = VM_PAGE_OBJECT(page);
8863
8864 if (page_object == kernel_object) {
8865 panic("vm_page_check_pageable_safe: trying to add page"
8866 "from kernel object (%p) to pageable queue", kernel_object);
8867 }
8868
8869 if (page_object == compressor_object) {
8870 panic("vm_page_check_pageable_safe: trying to add page"
8871 "from compressor object (%p) to pageable queue", compressor_object);
8872 }
8873 }
8874
8875 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
8876 * wired page diagnose
8877 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8878
8879 #include <libkern/OSKextLibPrivate.h>
8880
/*
 * KA_SIZE: bytes needed for a vm_allocation_site carrying
 * 'subtotalscount' subtotal slots followed inline by a name of
 * 'namelen' characters plus its NUL terminator.
 */
#define KA_SIZE(namelen, subtotalscount)        \
	(sizeof(struct vm_allocation_site) + (namelen) + 1 + ((subtotalscount) * sizeof(struct vm_allocation_total)))

/* KA_NAME: the name string stored immediately after the subtotals array. */
#define KA_NAME(alloc)  \
	((char *)(&(alloc)->subtotals[(alloc->subtotalscount)]))

/* KA_NAME_LEN: the name length encoded in the upper bits of 'flags'. */
#define KA_NAME_LEN(alloc)      \
	(VM_TAG_NAME_LEN_MAX & (alloc->flags >> VM_TAG_NAME_LEN_SHIFT))
8889
/*
 * Derive a VM tag for the current allocation by examining the caller:
 * if the current thread has an active allocation name, use its tag;
 * otherwise walk the kernel stack's frame pointers until a return
 * address outside the core kernel text (i.e. inside a kext) is found
 * and ask OSKext for that caller's allocation site.  Returns
 * VM_KERN_MEMORY_NONE when no site matches, and VM_KERN_MEMORY_OSFMK
 * when there is no current thread.
 */
vm_tag_t
vm_tag_bt(void)
{
	uintptr_t* frameptr;
	uintptr_t* frameptr_next;
	uintptr_t retaddr;
	uintptr_t kstackb, kstackt;
	const vm_allocation_site_t * site;
	thread_t cthread;
	kern_allocation_name_t name;

	cthread = current_thread();
	if (__improbable(cthread == NULL)) {
		return VM_KERN_MEMORY_OSFMK;
	}

	if ((name = thread_get_kernel_state(cthread)->allocation_name)) {
		/* a thread-scoped allocation name overrides the backtrace */
		if (!name->tag) {
			vm_tag_alloc(name);
		}
		return name->tag;
	}

	kstackb = cthread->kernel_stack;
	kstackt = kstackb + kernel_stack_size;

	/* Load stack frame pointer (EBP on x86) into frameptr */
	frameptr = __builtin_frame_address(0);
	site = NULL;
	while (frameptr != NULL) {
		/* Verify thread stack bounds */
		if (((uintptr_t)(frameptr + 2) > kstackt) || ((uintptr_t)frameptr < kstackb)) {
			break;
		}

		/* Next frame pointer is pointed to by the previous one */
		frameptr_next = (uintptr_t*) *frameptr;

		/* Pull return address from one spot above the frame pointer */
		retaddr = *(frameptr + 1);

#if defined(HAS_APPLE_PAC)
		/* strip the pointer-authentication bits before range checks */
		retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
#endif

		/* first frame whose return address lies outside core kernel text */
		if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
		    || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
			site = OSKextGetAllocationSiteForCaller(retaddr);
			break;
		}
		frameptr = frameptr_next;
	}

	return site ? site->tag : VM_KERN_MEMORY_NONE;
}
8945
/* Bitmap of unallocated tags: tag N is free iff bit (63 - N % 64) of word N / 64 is set. */
static uint64_t free_tag_bits[VM_MAX_TAG_VALUE / 64];
8947
/*
 * Assign a free tag to 'site'; caller holds vm_allocation_sites_lock.
 * Scans the free_tag_bits bitmap for the lowest free tag.  If none is
 * free, tries to reclaim the tag of an idle named site (refcount 1,
 * zero total); that site is returned via *releasesiteP so the caller
 * can release it after dropping the lock.  Falls back to the shared
 * VM_KERN_MEMORY_ANY tag when nothing can be reclaimed.
 */
void
vm_tag_alloc_locked(vm_allocation_site_t * site, vm_allocation_site_t ** releasesiteP)
{
	vm_tag_t tag;
	uint64_t avail;
	uint32_t idx;
	vm_allocation_site_t * prev;

	if (site->tag) {
		/* already has a tag */
		return;
	}

	idx = 0;
	while (TRUE) {
		avail = free_tag_bits[idx];
		if (avail) {
			/* tag N is bit (63 - N % 64), so clz yields the lowest free tag in this word */
			tag = (vm_tag_t)__builtin_clzll(avail);
			avail &= ~(1ULL << (63 - tag));
			free_tag_bits[idx] = avail;
			tag += (idx << 6);
			break;
		}
		idx++;
		if (idx >= ARRAY_COUNT(free_tag_bits)) {
			/* bitmap exhausted: look for a reclaimable named site */
			for (idx = 0; idx < ARRAY_COUNT(vm_allocation_sites); idx++) {
				prev = vm_allocation_sites[idx];
				if (!prev) {
					continue;
				}
				/* only named (kern_allocation_name) sites may be recycled */
				if (!KA_NAME_LEN(prev)) {
					continue;
				}
				if (!prev->tag) {
					continue;
				}
				/* still has bytes accounted against it */
				if (prev->total) {
					continue;
				}
				if (1 != prev->refcount) {
					continue;
				}

				assert(idx == prev->tag);
				tag = (vm_tag_t)idx;
				prev->tag = VM_KERN_MEMORY_NONE;
				/* hand the displaced site back for release outside the lock */
				*releasesiteP = prev;
				break;
			}
			if (idx >= ARRAY_COUNT(vm_allocation_sites)) {
				/* nothing reclaimable: share the catch-all tag */
				tag = VM_KERN_MEMORY_ANY;
			}
			break;
		}
	}
	site->tag = tag;

	OSAddAtomic16(1, &site->refcount);

	if (VM_KERN_MEMORY_ANY != tag) {
		vm_allocation_sites[tag] = site;
	}

	if (tag > vm_allocation_tag_highest) {
		vm_allocation_tag_highest = tag;
	}
}
9014
9015 static void
vm_tag_free_locked(vm_tag_t tag)9016 vm_tag_free_locked(vm_tag_t tag)
9017 {
9018 uint64_t avail;
9019 uint32_t idx;
9020 uint64_t bit;
9021
9022 if (VM_KERN_MEMORY_ANY == tag) {
9023 return;
9024 }
9025
9026 idx = (tag >> 6);
9027 avail = free_tag_bits[idx];
9028 tag &= 63;
9029 bit = (1ULL << (63 - tag));
9030 assert(!(avail & bit));
9031 free_tag_bits[idx] = (avail | bit);
9032 }
9033
9034 static void
vm_tag_init(void)9035 vm_tag_init(void)
9036 {
9037 vm_tag_t tag;
9038 for (tag = VM_KERN_MEMORY_FIRST_DYNAMIC; tag < VM_KERN_MEMORY_ANY; tag++) {
9039 vm_tag_free_locked(tag);
9040 }
9041
9042 for (tag = VM_KERN_MEMORY_ANY + 1; tag < VM_MAX_TAG_VALUE; tag++) {
9043 vm_tag_free_locked(tag);
9044 }
9045 }
9046
9047 vm_tag_t
vm_tag_alloc(vm_allocation_site_t * site)9048 vm_tag_alloc(vm_allocation_site_t * site)
9049 {
9050 vm_allocation_site_t * releasesite;
9051
9052 if (!site->tag) {
9053 releasesite = NULL;
9054 lck_spin_lock(&vm_allocation_sites_lock);
9055 vm_tag_alloc_locked(site, &releasesite);
9056 lck_spin_unlock(&vm_allocation_sites_lock);
9057 if (releasesite) {
9058 kern_allocation_name_release(releasesite);
9059 }
9060 }
9061
9062 return site->tag;
9063 }
9064
/*
 * Adjust the running byte total for 'tag' by 'delta' and, on
 * DEBUG/DEVELOPMENT kernels, maintain its high-water mark.  A site
 * must already be registered for the tag.
 */
void
vm_tag_update_size(vm_tag_t tag, int64_t delta)
{
	vm_allocation_site_t * allocation;
	uint64_t prior;

	assert(VM_KERN_MEMORY_NONE != tag);
	assert(tag < VM_MAX_TAG_VALUE);

	allocation = vm_allocation_sites[tag];
	assert(allocation);

	if (delta < 0) {
		/* the total must never underflow */
		assertf(allocation->total >= ((uint64_t)-delta), "tag %d, site %p", tag, allocation);
	}
	prior = OSAddAtomic64(delta, &allocation->total);

#if DEBUG || DEVELOPMENT

	uint64_t new, peak;
	new = prior + delta;
	/* lock-free peak update: retry until we observe a peak >= new or publish new */
	do{
		peak = allocation->peak;
		if (new <= peak) {
			break;
		}
	}while (!OSCompareAndSwap64(peak, new, &allocation->peak));

#endif /* DEBUG || DEVELOPMENT */

	if (tag < VM_KERN_MEMORY_FIRST_DYNAMIC) {
		return;
	}

	/* first bytes accounted against a dynamic tag: make sure it owns a tag */
	if (!prior && !allocation->tag) {
		vm_tag_alloc(allocation);
	}
}
9103
/*
 * Adjust a named allocation record's byte total by 'delta' and, on
 * DEBUG/DEVELOPMENT kernels, maintain its high-water mark.  Lazily
 * assigns a tag the first time the total rises from zero.
 */
void
kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta)
{
	uint64_t prior;

	if (delta < 0) {
		/* the total must never underflow */
		assertf(allocation->total >= ((uint64_t)-delta), "name %p", allocation);
	}
	prior = OSAddAtomic64(delta, &allocation->total);

#if DEBUG || DEVELOPMENT

	uint64_t new, peak;
	new = prior + delta;
	/* lock-free peak update: retry until we observe a peak >= new or publish new */
	do{
		peak = allocation->peak;
		if (new <= peak) {
			break;
		}
	}while (!OSCompareAndSwap64(peak, new, &allocation->peak));

#endif /* DEBUG || DEVELOPMENT */

	if (!prior && !allocation->tag) {
		vm_tag_alloc(allocation);
	}
}
9131
9132 #if VM_TAG_SIZECLASSES
9133
9134 void
vm_allocation_zones_init(void)9135 vm_allocation_zones_init(void)
9136 {
9137 vm_offset_t addr;
9138 vm_size_t size;
9139
9140 const vm_tag_t early_tags[] = {
9141 VM_KERN_MEMORY_DIAG,
9142 VM_KERN_MEMORY_KALLOC,
9143 VM_KERN_MEMORY_KALLOC_DATA,
9144 VM_KERN_MEMORY_KALLOC_TYPE,
9145 VM_KERN_MEMORY_LIBKERN,
9146 VM_KERN_MEMORY_OSFMK,
9147 };
9148
9149 size = VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *)
9150 + ARRAY_COUNT(early_tags) * VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
9151
9152 kmem_alloc(kernel_map, &addr, round_page(size),
9153 KMA_NOFAIL | KMA_KOBJECT | KMA_ZERO | KMA_PERMANENT,
9154 VM_KERN_MEMORY_DIAG);
9155
9156 vm_allocation_zone_totals = (vm_allocation_zone_total_t **) addr;
9157 addr += VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *);
9158
9159 // prepopulate early tag ranges so allocations
9160 // in vm_tag_update_zone_size() and early boot won't recurse
9161 for (size_t i = 0; i < ARRAY_COUNT(early_tags); i++) {
9162 vm_allocation_zone_totals[early_tags[i]] = (vm_allocation_zone_total_t *)addr;
9163 addr += VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
9164 }
9165 }
9166
9167 __attribute__((noinline))
9168 static vm_tag_t
vm_tag_zone_stats_alloc(vm_tag_t tag,zalloc_flags_t flags)9169 vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
9170 {
9171 vm_allocation_zone_total_t *stats;
9172 vm_size_t size = sizeof(*stats) * VM_TAG_SIZECLASSES;
9173
9174 flags = Z_VM_TAG(Z_ZERO | flags, VM_KERN_MEMORY_DIAG);
9175 stats = kalloc_data(size, flags);
9176 if (!stats) {
9177 return VM_KERN_MEMORY_NONE;
9178 }
9179 if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
9180 kfree_data(stats, size);
9181 }
9182 return tag;
9183 }
9184
9185 vm_tag_t
vm_tag_will_update_zone(vm_tag_t tag,uint32_t zidx,uint32_t zflags)9186 vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags)
9187 {
9188 assert(VM_KERN_MEMORY_NONE != tag);
9189 assert(tag < VM_MAX_TAG_VALUE);
9190
9191 if (zidx >= VM_TAG_SIZECLASSES) {
9192 return VM_KERN_MEMORY_NONE;
9193 }
9194
9195 if (__probable(vm_allocation_zone_totals[tag])) {
9196 return tag;
9197 }
9198 return vm_tag_zone_stats_alloc(tag, zflags);
9199 }
9200
9201 void
vm_tag_update_zone_size(vm_tag_t tag,uint32_t zidx,long delta)9202 vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
9203 {
9204 vm_allocation_zone_total_t *stats;
9205 vm_size_t value;
9206
9207 assert(VM_KERN_MEMORY_NONE != tag);
9208 assert(tag < VM_MAX_TAG_VALUE);
9209
9210 if (zidx >= VM_TAG_SIZECLASSES) {
9211 return;
9212 }
9213
9214 stats = vm_allocation_zone_totals[tag];
9215 assert(stats);
9216 stats += zidx;
9217
9218 value = os_atomic_add(&stats->vazt_total, delta, relaxed);
9219 if (delta < 0) {
9220 assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
9221 return;
9222 } else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
9223 os_atomic_max(&stats->vazt_peak, value, relaxed);
9224 }
9225 }
9226
9227 #endif /* VM_TAG_SIZECLASSES */
9228
/*
 * Account 'delta' bytes of 'subtag' usage against a named allocation.
 * Finds (or claims) the subtotal slot for 'subtag' under the sites
 * lock, then updates both the per-name subtotal and the subtag site's
 * 'mapped' counter.  Asserts -- and on non-assert kernels silently
 * drops the update -- if the name has no free subtotal slot.
 */
void
kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subtag, int64_t delta)
{
	kern_allocation_name_t other;
	struct vm_allocation_total * total;
	uint32_t subidx;

	subidx = 0;
	assert(VM_KERN_MEMORY_NONE != subtag);
	lck_spin_lock(&vm_allocation_sites_lock);
	/* find an existing slot for subtag, or claim the first empty one */
	for (; subidx < allocation->subtotalscount; subidx++) {
		if (VM_KERN_MEMORY_NONE == allocation->subtotals[subidx].tag) {
			allocation->subtotals[subidx].tag = (vm_tag_t)subtag;
			break;
		}
		if (subtag == allocation->subtotals[subidx].tag) {
			break;
		}
	}
	lck_spin_unlock(&vm_allocation_sites_lock);
	assert(subidx < allocation->subtotalscount);
	if (subidx >= allocation->subtotalscount) {
		/* no slot available: drop the update rather than write out of bounds */
		return;
	}

	total = &allocation->subtotals[subidx];
	other = vm_allocation_sites[subtag];
	assert(other);

	if (delta < 0) {
		/* neither counter may underflow */
		assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
		assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
	}
	OSAddAtomic64(delta, &other->mapped);
	OSAddAtomic64(delta, &total->total);
}
9265
/*
 * Return the human-readable name of an allocation site.  The name is
 * stored inline after the structure; KA_NAME locates it.
 */
const char *
kern_allocation_get_name(kern_allocation_name_t allocation)
{
	return KA_NAME(allocation);
}
9271
/*
 * Create a named allocation site with room for `subtotalscount`
 * subtotal slots and an initial reference count of 1, then register
 * it as a VM tag.  The name is truncated to
 * MACH_MEMORY_INFO_NAME_MAX_LEN - 1 characters and stored inline
 * after the structure (see KA_SIZE / KA_NAME).
 */
kern_allocation_name_t
kern_allocation_name_allocate(const char * name, uint16_t subtotalscount)
{
	kern_allocation_name_t allocation;
	uint16_t namelen;

	namelen = (uint16_t)strnlen(name, MACH_MEMORY_INFO_NAME_MAX_LEN - 1);

	/*
	 * NOTE(review): the kalloc_data() result is dereferenced without a
	 * NULL check; presumably a small Z_WAITOK allocation cannot fail
	 * here — confirm.
	 */
	allocation = kalloc_data(KA_SIZE(namelen, subtotalscount), Z_WAITOK | Z_ZERO);
	allocation->refcount = 1;
	allocation->subtotalscount = subtotalscount;
	/* The name length is packed into the flags field. */
	allocation->flags = (uint16_t)(namelen << VM_TAG_NAME_LEN_SHIFT);
	strlcpy(KA_NAME(allocation), name, namelen + 1);

	vm_tag_alloc(allocation);
	return allocation;
}
9289
9290 void
kern_allocation_name_release(kern_allocation_name_t allocation)9291 kern_allocation_name_release(kern_allocation_name_t allocation)
9292 {
9293 assert(allocation->refcount > 0);
9294 if (1 == OSAddAtomic16(-1, &allocation->refcount)) {
9295 kfree_data(allocation,
9296 KA_SIZE(KA_NAME_LEN(allocation), allocation->subtotalscount));
9297 }
9298 }
9299
/*
 * Return (allocating if needed) the VM tag associated with a named
 * allocation site.
 */
vm_tag_t
kern_allocation_name_get_vm_tag(kern_allocation_name_t allocation)
{
	return vm_tag_alloc(allocation);
}
9305
9306 #if !VM_TAG_ACTIVE_UPDATE
9307 static void
vm_page_count_object(mach_memory_info_t * info,unsigned int __unused num_info,vm_object_t object)9308 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
9309 {
9310 if (!object->wired_page_count) {
9311 return;
9312 }
9313 if (object != kernel_object) {
9314 assert(object->wire_tag < num_info);
9315 info[object->wire_tag].size += ptoa_64(object->wired_page_count);
9316 }
9317 }
9318
/* Callback applied to each VM object visited by the iterators below. */
typedef void (*vm_page_iterate_proc)(mach_memory_info_t * info,
    unsigned int num_info, vm_object_t object);
9321
/*
 * Invoke `proc` on every object in one group of a purgeable queue.
 * NOTE(review): no lock is taken here; presumably the caller
 * serializes access to the queue — confirm at the call sites.
 */
static void
vm_page_iterate_purgeable_objects(mach_memory_info_t * info, unsigned int num_info,
    vm_page_iterate_proc proc, purgeable_q_t queue,
    int group)
{
	vm_object_t object;

	for (object = (vm_object_t) queue_first(&queue->objq[group]);
	    !queue_end(&queue->objq[group], (queue_entry_t) object);
	    object = (vm_object_t) queue_next(&object->objq)) {
		proc(info, num_info, object);
	}
}
9335
/*
 * Invoke `proc` on every object on the wired-objects queue, holding
 * vm_objects_wired_lock for the duration of the walk.
 */
static void
vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
    vm_page_iterate_proc proc)
{
	vm_object_t object;

	lck_spin_lock_grp(&vm_objects_wired_lock, &vm_page_lck_grp_bucket);
	queue_iterate(&vm_objects_wired,
	    object,
	    vm_object_t,
	    wired_objq)
	{
		proc(info, num_info, object);
	}
	lck_spin_unlock(&vm_objects_wired_lock);
}
9352 #endif /* ! VM_TAG_ACTIVE_UPDATE */
9353
/*
 * Fill info[] entries from the registered allocation sites, under the
 * sites lock.
 *
 * Pass 1 copies each site's totals/identity (static tag, named site,
 * kext, or kernel return address) into its info slot.  When `iterated`
 * is set, info[].size was pre-filled by a page walk and is only
 * cross-checked against the site totals.  Pass 2 appends per-sizeclass
 * zone entries after the tag range and redistributes named sites'
 * subtotals: bytes a named site accounted against other tags are
 * subtracted from those tags and reported as the named site's own cost.
 * Always returns 0.
 */
static uint64_t
process_account(mach_memory_info_t * info, unsigned int num_info,
    uint64_t zones_collectable_bytes, boolean_t iterated)
{
	size_t namelen;
	unsigned int idx, count, nextinfo;
	vm_allocation_site_t * site;
	lck_spin_lock(&vm_allocation_sites_lock);

	for (idx = 0; idx <= vm_allocation_tag_highest; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
		info[idx].mapped = site->mapped;
		info[idx].tag = site->tag;
		if (!iterated) {
			info[idx].size = site->total;
#if DEBUG || DEVELOPMENT
			info[idx].peak = site->peak;
#endif /* DEBUG || DEVELOPMENT */
		} else {
			/* Page walk already filled size; flag disagreements. */
			if (!site->subtotalscount && (site->total != info[idx].size)) {
				printf("tag mismatch[%d] 0x%qx, iter 0x%qx\n", idx, site->total, info[idx].size);
				info[idx].size = site->total;
			}
		}
		info[idx].flags |= VM_KERN_SITE_WIRED;
		if (idx < VM_KERN_MEMORY_FIRST_DYNAMIC) {
			/* Fixed, well-known tag. */
			info[idx].site = idx;
			info[idx].flags |= VM_KERN_SITE_TAG;
			if (VM_KERN_MEMORY_ZONE == idx) {
				/* Zone totals are reported via the zone entries. */
				info[idx].flags |= VM_KERN_SITE_HIDE;
				info[idx].flags &= ~VM_KERN_SITE_WIRED;
				info[idx].collectable_bytes = zones_collectable_bytes;
			}
		} else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) {
			/* Named site: copy the (possibly truncated) name. */
			info[idx].site = 0;
			info[idx].flags |= VM_KERN_SITE_NAMED;
			if (namelen > sizeof(info[idx].name)) {
				namelen = sizeof(info[idx].name);
			}
			strncpy(&info[idx].name[0], KA_NAME(site), namelen);
		} else if (VM_TAG_KMOD & site->flags) {
			/* Site belongs to a kext: report its kmod id. */
			info[idx].site = OSKextGetKmodIDForSite(site, NULL, 0);
			info[idx].flags |= VM_KERN_SITE_KMOD;
		} else {
			/* Plain kernel call site: report its unslid address. */
			info[idx].site = VM_KERNEL_UNSLIDE(site);
			info[idx].flags |= VM_KERN_SITE_KERNEL;
		}
	}

	/* Extra zone entries are appended after the last tag slot. */
	nextinfo = (vm_allocation_tag_highest + 1);
	count = nextinfo;
	if (count >= num_info) {
		count = num_info;
	}

	for (idx = 0; idx < count; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
#if VM_TAG_SIZECLASSES
		vm_allocation_zone_total_t * zone;
		unsigned int zidx;

		if (vm_allocation_zone_totals
		    && (zone = vm_allocation_zone_totals[idx])
		    && (nextinfo < num_info)) {
			/* One appended entry per size class this tag ever used. */
			for (zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
				if (!zone[zidx].vazt_peak) {
					continue;
				}
				info[nextinfo] = info[idx];
				info[nextinfo].zone = (uint16_t)zone_index_from_tag_index(zidx);
				info[nextinfo].flags &= ~VM_KERN_SITE_WIRED;
				info[nextinfo].flags |= VM_KERN_SITE_ZONE;
				info[nextinfo].flags |= VM_KERN_SITE_KALLOC;
				info[nextinfo].size = zone[zidx].vazt_total;
				info[nextinfo].peak = zone[zidx].vazt_peak;
				info[nextinfo].mapped = 0;
				nextinfo++;
			}
		}
#endif /* VM_TAG_SIZECLASSES */
		if (site->subtotalscount) {
			/*
			 * Named site with subtotals: move the bytes it charged
			 * to other tags out of those tags and into this entry,
			 * clamping each transfer to what is actually available.
			 */
			uint64_t mapped, mapcost, take;
			uint32_t sub;
			vm_tag_t alloctag;

			info[idx].size = site->total;
			mapped = info[idx].size;
			info[idx].mapped = mapped;
			mapcost = 0;
			for (sub = 0; sub < site->subtotalscount; sub++) {
				alloctag = site->subtotals[sub].tag;
				assert(alloctag < num_info);
				/* Don't take from other named sites. */
				if (info[alloctag].name[0]) {
					continue;
				}
				take = site->subtotals[sub].total;
				if (take > info[alloctag].size) {
					take = info[alloctag].size;
				}
				if (take > mapped) {
					take = mapped;
				}
				info[alloctag].mapped -= take;
				info[alloctag].size -= take;
				mapped -= take;
				mapcost += take;
			}
			info[idx].size = mapcost;
		}
	}
	lck_spin_unlock(&vm_allocation_sites_lock);

	return 0;
}
9474
/*
 * Upper-bound estimate of the number of mach_memory_info_t entries
 * vm_page_diagnose() will need, so callers can size their buffer:
 * one per live site, one per size class a tag ever used, plus the
 * zone views, the global counters, and some slop for new tags.
 */
uint32_t
vm_page_diagnose_estimate(void)
{
	vm_allocation_site_t * site;
	uint32_t count = zone_view_count;
	uint32_t idx;

	lck_spin_lock(&vm_allocation_sites_lock);
	for (idx = 0; idx < VM_MAX_TAG_VALUE; idx++) {
		site = vm_allocation_sites[idx];
		if (!site) {
			continue;
		}
		count++;
#if VM_TAG_SIZECLASSES
		if (vm_allocation_zone_totals) {
			vm_allocation_zone_total_t * zone;
			zone = vm_allocation_zone_totals[idx];
			if (!zone) {
				continue;
			}
			/* One extra entry per size class that ever saw allocations. */
			for (uint32_t zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
				count += (zone[zidx].vazt_peak != 0);
			}
		}
#endif
	}
	lck_spin_unlock(&vm_allocation_sites_lock);

	/* some slop for new tags created */
	count += 8;
	count += VM_KERN_COUNTER_COUNT;

	return count;
}
9510
9511 static void
vm_page_diagnose_zone_stats(mach_memory_info_t * info,zone_stats_t zstats,bool percpu)9512 vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
9513 bool percpu)
9514 {
9515 zpercpu_foreach(zs, zstats) {
9516 info->size += zs->zs_mem_allocated - zs->zs_mem_freed;
9517 }
9518 if (percpu) {
9519 info->size *= zpercpu_count();
9520 }
9521 info->flags |= VM_KERN_SITE_NAMED | VM_KERN_SITE_ZONE_VIEW;
9522 }
9523
9524 static void
vm_page_diagnose_zone(mach_memory_info_t * info,zone_t z)9525 vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
9526 {
9527 vm_page_diagnose_zone_stats(info, z->z_stats, z->z_percpu);
9528 snprintf(info->name, sizeof(info->name),
9529 "%s%s[raw]", zone_heap_name(z), z->z_name);
9530 }
9531
9532 static int
vm_page_diagnose_heap(mach_memory_info_t * info,kalloc_heap_t kheap)9533 vm_page_diagnose_heap(mach_memory_info_t *info, kalloc_heap_t kheap)
9534 {
9535 struct kheap_zones *zones = kheap->kh_zones;
9536 int i = 0;
9537
9538 for (; i < zones->max_k_zone; i++) {
9539 vm_page_diagnose_zone(info + i, zones->k_zone[i]);
9540 }
9541
9542 for (kalloc_heap_t kh = zones->views; kh; kh = kh->kh_next, i++) {
9543 vm_page_diagnose_zone_stats(info + i, kh->kh_stats, false);
9544 snprintf(info[i].name, sizeof(info[i].name),
9545 "%skalloc[%s]", kheap->kh_name, kh->kh_name);
9546 }
9547
9548 return i;
9549 }
9550
9551 static int
vm_page_diagnose_kt_heaps(mach_memory_info_t * info)9552 vm_page_diagnose_kt_heaps(mach_memory_info_t *info)
9553 {
9554 uint32_t idx = 0;
9555 vm_page_diagnose_zone_stats(info + idx, KHEAP_KT_VAR->kh_stats, false);
9556 snprintf(info[idx].name, sizeof(info[idx].name),
9557 "%s[raw]", KHEAP_KT_VAR->kh_name);
9558 idx++;
9559
9560 for (uint32_t i = 0; i < KT_VAR_MAX_HEAPS; i++) {
9561 struct kt_heap_zones heap = kalloc_type_heap_array[i];
9562
9563 for (kalloc_type_var_view_t ktv = heap.views; ktv;
9564 ktv = (kalloc_type_var_view_t) ktv->kt_next) {
9565 if (ktv->kt_stats && ktv->kt_stats != KHEAP_KT_VAR->kh_stats) {
9566 vm_page_diagnose_zone_stats(info + idx, ktv->kt_stats, false);
9567 snprintf(info[i].name, sizeof(info[i].name),
9568 "%s[%s]", KHEAP_KT_VAR->kh_name, ktv->kt_name);
9569 idx++;
9570 }
9571 }
9572 }
9573
9574 return idx;
9575 }
9576
/*
 * Produce a full kernel-memory diagnostic snapshot into info[]
 * (num_info entries): global wired/managed/reserved counters, kernel
 * and zone map sizes, per-zone and per-heap-view usage, and per-tag
 * wired totals.  `zones_collectable_bytes` is surfaced on the
 * VM_KERN_MEMORY_ZONE entry by process_account().
 *
 * Returns KERN_ABORTED if called before VM bootstrap completes,
 * otherwise KERN_SUCCESS.
 */
kern_return_t
vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes)
{
	uint64_t wired_size;
	uint64_t wired_managed_size;
	uint64_t wired_reserved_size;
	boolean_t iterate;
	mach_memory_info_t * counts;
	uint32_t i;

	bzero(info, num_info * sizeof(mach_memory_info_t));

	/* VM not initialized yet: nothing meaningful to report. */
	if (!vm_page_wire_count_initial) {
		return KERN_ABORTED;
	}

#if !XNU_TARGET_OS_OSX
	wired_size = ptoa_64(vm_page_wire_count);
	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
#else /* !XNU_TARGET_OS_OSX */
	wired_size = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count);
	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count + vm_page_throttled_count);
#endif /* !XNU_TARGET_OS_OSX */
	wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);

	wired_size += booter_size;

	/* The final VM_KERN_COUNTER_COUNT slots hold the global counters. */
	assert(num_info >= VM_KERN_COUNTER_COUNT);
	num_info -= VM_KERN_COUNTER_COUNT;
	counts = &info[num_info];

	/* Fill one synthetic counter entry (tag/site/size/mapped/flags). */
#define SET_COUNT(xcount, xsize, xflags)                        \
	counts[xcount].tag = VM_MAX_TAG_VALUE + xcount;         \
	counts[xcount].site = (xcount);                         \
	counts[xcount].size = (xsize);                          \
	counts[xcount].mapped = (xsize);                        \
	counts[xcount].flags = VM_KERN_SITE_COUNTER | xflags;

	SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0);
	SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0);
	SET_COUNT(VM_KERN_COUNT_WIRED_MANAGED, wired_managed_size, 0);
	SET_COUNT(VM_KERN_COUNT_RESERVED, wired_reserved_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_STOLEN, ptoa_64(vm_page_stolen_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_LOPAGE, ptoa_64(vm_lopage_free_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_BOOT, ptoa_64(vm_page_wire_count_on_boot), 0);
	SET_COUNT(VM_KERN_COUNT_BOOT_STOLEN, booter_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_STATIC_KERNELCACHE, ptoa_64(vm_page_kernelcache_count), 0);

	/* Fill one map-size counter entry (size/free/largest). */
#define SET_MAP(xcount, xsize, xfree, xlargest) \
	counts[xcount].site = (xcount);         \
	counts[xcount].size = (xsize);          \
	counts[xcount].mapped = (xsize);        \
	counts[xcount].free = (xfree);          \
	counts[xcount].largest = (xlargest);    \
	counts[xcount].flags = VM_KERN_SITE_COUNTER;

	vm_map_size_t map_size, map_free, map_largest;

	vm_map_sizes(kernel_map, &map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_KERNEL, map_size, map_free, map_largest);

	zone_map_sizes(&map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_ZONE, map_size, map_free, map_largest);

	/* Next, zone_view_count slots for zone/heap view entries. */
	assert(num_info >= zone_view_count);
	num_info -= zone_view_count;
	counts = &info[num_info];
	i = 0;

	i += vm_page_diagnose_heap(counts + i, KHEAP_DEFAULT);
	if (KHEAP_DATA_BUFFERS->kh_heap_id == KHEAP_ID_DATA_BUFFERS) {
		i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS);
	}
	if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) {
		i += vm_page_diagnose_kt_heaps(counts + i);
	}
	assert(i <= zone_view_count);

	zone_index_foreach(zidx) {
		zone_t z = &zone_array[zidx];
		zone_security_flags_t zsflags = zone_security_array[zidx];
		zone_view_t zv = z->z_views;

		if (zv == NULL) {
			continue;
		}

		zone_stats_t zv_stats_head = z->z_stats;
		bool has_raw_view = false;

		for (; zv; zv = zv->zv_next) {
			/*
			 * kalloc_types that allocate from the same zone are linked
			 * as views. Only print the ones that have their own stats.
			 */
			if (zv->zv_stats == zv_stats_head) {
				continue;
			}
			has_raw_view = true;
			vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
			    z->z_percpu);
			snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
			    zone_heap_name(z), z->z_name, zv->zv_name);
			i++;
			assert(i <= zone_view_count);
		}

		/*
		 * Print raw views for non kalloc or kalloc_type zones
		 */
		bool kalloc_type = zsflags.z_kalloc_type;
		if ((zsflags.z_kheap_id == KHEAP_ID_NONE && !kalloc_type) ||
		    (kalloc_type && has_raw_view)) {
			vm_page_diagnose_zone(counts + i, z);
			i++;
			assert(i <= zone_view_count);
		}
	}

	/*
	 * Without active tag updates, per-tag sizes must be computed by
	 * walking wired objects and the kernel map here.
	 */
	iterate = !VM_TAG_ACTIVE_UPDATE;
	if (iterate) {
		enum { kMaxKernelDepth = 1 };
		vm_map_t maps[kMaxKernelDepth];
		vm_map_entry_t entries[kMaxKernelDepth];
		vm_map_t map;
		vm_map_entry_t entry;
		vm_object_offset_t offset;
		vm_page_t page;
		int stackIdx, count;

#if !VM_TAG_ACTIVE_UPDATE
		/* Charge wired pages of non-kernel objects to their wire tag. */
		vm_page_iterate_objects(info, num_info, &vm_page_count_object);
#endif /* ! VM_TAG_ACTIVE_UPDATE */

		/*
		 * Walk kernel_map (descending one level into submaps via the
		 * maps[]/entries[] stack) counting wired kernel_object pages
		 * per map-entry alias tag.  Note the for-loop condition is
		 * `map`, not `entry`: descending clears `entry` and breaks,
		 * and the unwind loop below clears `map` when the walk is done.
		 */
		map = kernel_map;
		stackIdx = 0;
		while (map) {
			vm_map_lock(map);
			for (entry = map->hdr.links.next; map; entry = entry->links.next) {
				if (entry->is_sub_map) {
					/* Push current position; descend into the submap. */
					assert(stackIdx < kMaxKernelDepth);
					maps[stackIdx] = map;
					entries[stackIdx] = entry;
					stackIdx++;
					map = VME_SUBMAP(entry);
					entry = NULL;
					break;
				}
				if (VME_OBJECT(entry) == kernel_object) {
					/* Count wired pages backing this entry. */
					count = 0;
					vm_object_lock(VME_OBJECT(entry));
					for (offset = entry->links.start; offset < entry->links.end; offset += page_size) {
						page = vm_page_lookup(VME_OBJECT(entry), offset);
						if (page && VM_PAGE_WIRED(page)) {
							count++;
						}
					}
					vm_object_unlock(VME_OBJECT(entry));

					if (count) {
						assert(VME_ALIAS(entry) != VM_KERN_MEMORY_NONE);
						assert(VME_ALIAS(entry) < num_info);
						info[VME_ALIAS(entry)].size += ptoa_64(count);
					}
				}
				/* At the end of a map: pop back to the parent, or finish. */
				while (map && (entry == vm_map_last_entry(map))) {
					vm_map_unlock(map);
					if (!stackIdx) {
						map = NULL;
					} else {
						--stackIdx;
						map = maps[stackIdx];
						entry = entries[stackIdx];
					}
				}
			}
		}
	}

	/* Merge in the per-site accounting and redistribute subtotals. */
	process_account(info, num_info, zones_collectable_bytes, iterate);

	return KERN_SUCCESS;
}
9760
9761 #if DEBUG || DEVELOPMENT
9762
/*
 * DEBUG/DEVELOPMENT helper: describe the kernel allocation at `addr`.
 * Zone-backed addresses report the element size (also in *zone_size)
 * and tag from zone_element_info().  Otherwise the address is looked
 * up in kernel_map (descending one submap level); it must be the exact
 * start of a map entry, whose alias tag and span are returned.
 * Returns KERN_INVALID_ADDRESS when nothing matches.
 */
kern_return_t
vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_size_t * zone_size)
{
	kern_return_t ret;
	vm_size_t zsize;
	vm_map_t map;
	vm_map_entry_t entry;

	/* Zone element?  zone_element_info also fills in *tag. */
	zsize = zone_element_info((void *) addr, tag);
	if (zsize) {
		*zone_size = *size = zsize;
		return KERN_SUCCESS;
	}

	*zone_size = 0;
	ret = KERN_INVALID_ADDRESS;
	/*
	 * kernel_map remains locked while a submap is examined; both
	 * locks are dropped after the loop.
	 */
	for (map = kernel_map; map;) {
		vm_map_lock(map);
		if (!vm_map_lookup_entry_allow_pgz(map, addr, &entry)) {
			break;
		}
		if (entry->is_sub_map) {
			/* Only descend one level, from the top-level map. */
			if (map != kernel_map) {
				break;
			}
			map = VME_SUBMAP(entry);
			continue;
		}
		/* Only report allocations that begin exactly at addr. */
		if (entry->vme_start != addr) {
			break;
		}
		*tag = (vm_tag_t)VME_ALIAS(entry);
		*size = (entry->vme_end - addr);
		ret = KERN_SUCCESS;
		break;
	}
	/* Unlock the submap (if we descended), then the kernel map. */
	if (map != kernel_map) {
		vm_map_unlock(map);
	}
	vm_map_unlock(kernel_map);

	return ret;
}
9806
9807 #endif /* DEBUG || DEVELOPMENT */
9808
9809 uint32_t
vm_tag_get_kext(vm_tag_t tag,char * name,vm_size_t namelen)9810 vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
9811 {
9812 vm_allocation_site_t * site;
9813 uint32_t kmodId;
9814
9815 kmodId = 0;
9816 lck_spin_lock(&vm_allocation_sites_lock);
9817 if ((site = vm_allocation_sites[tag])) {
9818 if (VM_TAG_KMOD & site->flags) {
9819 kmodId = OSKextGetKmodIDForSite(site, name, namelen);
9820 }
9821 }
9822 lck_spin_unlock(&vm_allocation_sites_lock);
9823
9824 return kmodId;
9825 }
9826
9827
9828 #if CONFIG_SECLUDED_MEMORY
/*
 * Note that there's no locking around other accesses to vm_page_secluded_target.
 * That should be OK, since these are the only places where it can be changed after
 * initialization. Other users (like vm_pageout) may see the wrong value briefly,
 * but will eventually get the correct value. This brief mismatch is OK as pageout
 * and page freeing will auto-adjust the vm_page_secluded_count to match the target
 * over time.
 */
/* Number of tasks currently suppressing use of secluded memory. */
unsigned int vm_page_secluded_suppress_cnt = 0;
/* Saved vm_page_secluded_target while suppression is active. */
unsigned int vm_page_secluded_save_target;

/* Spinlock serializing the suppress count / target swaps below. */
LCK_GRP_DECLARE(secluded_suppress_slock_grp, "secluded_suppress_slock");
LCK_SPIN_DECLARE(secluded_suppress_slock, &secluded_suppress_slock_grp);
9842
/*
 * Mark `task` as suppressing secluded memory.  The first suppressor
 * saves vm_page_secluded_target and forces it to 0 so the secluded
 * pool drains.
 */
void
start_secluded_suppression(task_t task)
{
	/* Unlocked fast path; re-checked under the lock below. */
	if (task->task_suppressed_secluded) {
		return;
	}
	lck_spin_lock(&secluded_suppress_slock);
	if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
		task->task_suppressed_secluded = TRUE;
		vm_page_secluded_save_target = vm_page_secluded_target;
		vm_page_secluded_target = 0;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	}
	lck_spin_unlock(&secluded_suppress_slock);
}
9858
/*
 * Undo start_secluded_suppression() for `task`.  The last suppressor
 * restores the saved vm_page_secluded_target.
 */
void
stop_secluded_suppression(task_t task)
{
	lck_spin_lock(&secluded_suppress_slock);
	if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
		task->task_suppressed_secluded = FALSE;
		vm_page_secluded_target = vm_page_secluded_save_target;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	}
	lck_spin_unlock(&secluded_suppress_slock);
}
9870
9871 #endif /* CONFIG_SECLUDED_MEMORY */
9872
/*
 * Move the list of retired pages on the vm_page_queue_retired to
 * their final resting place on retired_pages_object.
 */
void
vm_retire_boot_pages(void)
{
#if defined(__arm64__)
	vm_page_t p;

	vm_object_lock(retired_pages_object);
	while (!vm_page_queue_empty(&vm_page_queue_retired)) {
		vm_page_queue_remove_first(&vm_page_queue_retired, p, vmp_pageq);
		assert(p != NULL);
		/* Wire the page so it can never be reused. */
		vm_page_lock_queues();
		p->vmp_q_state = VM_PAGE_IS_WIRED;
		p->vmp_wire_count++;
		vm_page_unlock_queues();
		/* Insert at the offset matching its physical address. */
		vm_page_insert_wired(p, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(p)), VM_KERN_MEMORY_RETIRED);
		/* Drop the object lock around the pmap call, then retake it. */
		vm_object_unlock(retired_pages_object);
		pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(p));
		vm_object_lock(retired_pages_object);
	}
	vm_object_unlock(retired_pages_object);
#endif /* defined(__arm64__) */
}
9899
/*
 * Returns the current number of retired pages, used for sysctl.
 */
uint32_t
vm_retired_pages_count(void)
{
	/* Every retired page is resident in retired_pages_object. */
	return retired_pages_object->resident_page_count;
}
9908
9909