xref: /xnu-8796.121.2/osfmk/vm/vm_resident.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_page.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *
62  *	Resident memory management module.
63  */
64 
65 #include <debug.h>
66 #include <libkern/OSAtomic.h>
67 #include <libkern/OSDebug.h>
68 
69 #include <mach/clock_types.h>
70 #include <mach/vm_prot.h>
71 #include <mach/vm_statistics.h>
72 #include <mach/sdt.h>
73 #include <kern/counter.h>
74 #include <kern/host_statistics.h>
75 #include <kern/sched_prim.h>
76 #include <kern/policy_internal.h>
77 #include <kern/task.h>
78 #include <kern/thread.h>
79 #include <kern/kalloc.h>
80 #include <kern/zalloc_internal.h>
81 #include <kern/ledger.h>
82 #include <kern/ecc.h>
83 #include <vm/pmap.h>
84 #include <vm/vm_init.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_page.h>
87 #include <vm/vm_pageout.h>
88 #include <vm/vm_kern.h>                 /* kmem_alloc() */
89 #include <kern/misc_protos.h>
90 #include <mach_debug/zone_info.h>
91 #include <vm/cpm.h>
92 #include <pexpert/pexpert.h>
93 #include <pexpert/device_tree.h>
94 #include <san/kasan.h>
95 
96 #include <vm/vm_protos.h>
97 #include <vm/memory_object.h>
98 #include <vm/vm_purgeable_internal.h>
99 #include <vm/vm_compressor.h>
100 #if defined (__x86_64__)
101 #include <i386/misc_protos.h>
102 #endif
103 
104 #if CONFIG_PHANTOM_CACHE
105 #include <vm/vm_phantom_cache.h>
106 #endif
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #include <machine/pal_hibernate.h>
111 #endif /* HIBERNATION */
112 
113 #include <sys/kdebug.h>
114 
115 #if defined(HAS_APPLE_PAC)
116 #include <ptrauth.h>
117 #endif
118 #if defined(__arm64__)
119 #include <arm/cpu_internal.h>
120 #endif /* defined(__arm64__) */
121 
122 #if MACH_ASSERT
123 
124 #define ASSERT_PMAP_FREE(mem) pmap_assert_free(VM_PAGE_GET_PHYS_PAGE(mem))
125 
126 #else /* MACH_ASSERT */
127 
128 #define ASSERT_PMAP_FREE(mem) /* nothing */
129 
130 #endif /* MACH_ASSERT */
131 
132 extern boolean_t vm_pageout_running;
133 extern thread_t  vm_pageout_scan_thread;
134 extern boolean_t vps_dynamic_priority_enabled;
135 
136 char    vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
137 char    vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
138 char    vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
139 char    vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
140 
141 #if CONFIG_SECLUDED_MEMORY
142 struct vm_page_secluded_data vm_page_secluded;
143 #endif /* CONFIG_SECLUDED_MEMORY */
144 
145 #if DEVELOPMENT || DEBUG
146 extern struct memory_object_pager_ops shared_region_pager_ops;
147 unsigned int shared_region_pagers_resident_count = 0;
148 unsigned int shared_region_pagers_resident_peak = 0;
149 #endif /* DEVELOPMENT || DEBUG */
150 
151 
152 
153 int             PERCPU_DATA(start_color);
154 vm_page_t       PERCPU_DATA(free_pages);
155 boolean_t       hibernate_cleaning_in_progress = FALSE;
156 boolean_t       vm_page_free_verify = TRUE;
157 
158 uint32_t        vm_lopage_free_count = 0;
159 uint32_t        vm_lopage_free_limit = 0;
160 uint32_t        vm_lopage_lowater    = 0;
161 boolean_t       vm_lopage_refill = FALSE;
162 boolean_t       vm_lopage_needed = FALSE;
163 
164 int             speculative_age_index = 0;
165 int             speculative_steal_index = 0;
166 struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1];
167 
168 boolean_t       hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibernation code is looking at the VM queues.
169                                                           * Updated and checked behind the vm_page_queues_lock. */
170 
171 static void             vm_page_free_prepare(vm_page_t  page);
172 static vm_page_t        vm_page_grab_fictitious_common(ppnum_t, boolean_t);
173 
174 static void vm_tag_init(void);
175 
176 /* for debugging purposes */
177 SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
178     VM_PAGE_PACKED_FROM_ARRAY;
179 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
180     VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
181 
182 /*
183  *	Associated with page of user-allocatable memory is a
184  *	page structure.
185  */
186 
187 /*
188  *	These variables record the values returned by vm_page_bootstrap,
189  *	for debugging purposes.  The implementation of pmap_steal_memory
190  *	and pmap_startup here also uses them internally.
191  */
192 
193 vm_offset_t virtual_space_start;
194 vm_offset_t virtual_space_end;
195 uint32_t        vm_page_pages;
196 
197 /*
198  *	The vm_page_lookup() routine, which provides for fast
199  *	(virtual memory object, offset) to page lookup, employs
200  *	the following hash table.  The vm_page_{insert,remove}
201  *	routines install and remove associations in the table.
202  *	[This table is often called the virtual-to-physical,
203  *	or VP, table.]
204  */
205 typedef struct {
206 	vm_page_packed_t page_list;
207 #if     MACH_PAGE_HASH_STATS
208 	int             cur_count;              /* current count */
209 	int             hi_count;               /* high water mark */
210 #endif /* MACH_PAGE_HASH_STATS */
211 } vm_page_bucket_t;
212 
213 
214 #define BUCKETS_PER_LOCK        16
215 
216 SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets;                /* Array of buckets */
217 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_count = 0;       /* How big is array? */
218 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_mask;              /* Mask for hash function */
219 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_shift;             /* Shift for hash function */
220 SECURITY_READ_ONLY_LATE(uint32_t)           vm_page_bucket_hash;            /* Basic bucket hash */
221 SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_lock_count = 0;  /* How big is array of locks? */
222 
223 #ifndef VM_TAG_ACTIVE_UPDATE
224 #error VM_TAG_ACTIVE_UPDATE
225 #endif
226 #ifndef VM_TAG_SIZECLASSES
227 #error VM_TAG_SIZECLASSES
228 #endif
229 
230 /* for debugging */
231 SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
232 SECURITY_READ_ONLY_LATE(lck_spin_t *) vm_page_bucket_locks;
233 
234 vm_allocation_site_t            vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
235 vm_allocation_site_t *          vm_allocation_sites[VM_MAX_TAG_VALUE];
236 #if VM_TAG_SIZECLASSES
237 static vm_allocation_zone_total_t **vm_allocation_zone_totals;
238 #endif /* VM_TAG_SIZECLASSES */
239 
240 vm_tag_t vm_allocation_tag_highest;
241 
242 #if VM_PAGE_BUCKETS_CHECK
243 boolean_t vm_page_buckets_check_ready = FALSE;
244 #if VM_PAGE_FAKE_BUCKETS
245 vm_page_bucket_t *vm_page_fake_buckets; /* decoy buckets */
246 vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
247 #endif /* VM_PAGE_FAKE_BUCKETS */
248 #endif /* VM_PAGE_BUCKETS_CHECK */
249 
250 #if     MACH_PAGE_HASH_STATS
251 /* This routine is only for debug.  It is intended to be called by
252  * hand by a developer using a kernel debugger.  This routine prints
253  * out vm_page_hash table statistics to the kernel debug console.
254  */
255 void
hash_debug(void)256 hash_debug(void)
257 {
258 	int     i;
259 	int     numbuckets = 0;
260 	int     highsum = 0;
261 	int     maxdepth = 0;
262 
263 	for (i = 0; i < vm_page_bucket_count; i++) {
264 		if (vm_page_buckets[i].hi_count) {
265 			numbuckets++;
266 			highsum += vm_page_buckets[i].hi_count;
267 			if (vm_page_buckets[i].hi_count > maxdepth) {
268 				maxdepth = vm_page_buckets[i].hi_count;
269 			}
270 		}
271 	}
272 	printf("Total number of buckets: %d\n", vm_page_bucket_count);
273 	printf("Number used buckets:     %d = %d%%\n",
274 	    numbuckets, 100 * numbuckets / vm_page_bucket_count);
275 	printf("Number unused buckets:   %d = %d%%\n",
276 	    vm_page_bucket_count - numbuckets,
277 	    100 * (vm_page_bucket_count - numbuckets) / vm_page_bucket_count);
278 	printf("Sum of bucket max depth: %d\n", highsum);
279 	printf("Average bucket depth:    %d.%2d\n",
280 	    highsum / vm_page_bucket_count,
281 	    highsum % vm_page_bucket_count);
282 	printf("Maximum bucket depth:    %d\n", maxdepth);
283 }
284 #endif /* MACH_PAGE_HASH_STATS */
285 
286 /*
287  *	The virtual page size is currently implemented as a runtime
288  *	variable, but is constant once initialized using vm_set_page_size.
289  *	This initialization must be done in the machine-dependent
290  *	bootstrap sequence, before calling other machine-independent
291  *	initializations.
292  *
293  *	All references to the virtual page size outside this
294  *	module must use the PAGE_SIZE, PAGE_MASK and PAGE_SHIFT
295  *	constants.
296  */
297 #if defined(__arm64__)
298 vm_size_t       page_size;
299 vm_size_t       page_mask;
300 int             page_shift;
301 #else
302 vm_size_t       page_size  = PAGE_SIZE;
303 vm_size_t       page_mask  = PAGE_MASK;
304 int             page_shift = PAGE_SHIFT;
305 #endif
306 
307 SECURITY_READ_ONLY_LATE(vm_page_t) vm_pages = VM_PAGE_NULL;
308 SECURITY_READ_ONLY_LATE(vm_page_t) vm_page_array_beginning_addr;
309 vm_page_t                          vm_page_array_ending_addr;
310 
311 unsigned int    vm_pages_count = 0;
312 
313 /*
314  *	Resident pages that represent real memory
315  *	are allocated from a set of free lists,
316  *	one per color.
317  */
318 unsigned int    vm_colors;
319 unsigned int    vm_color_mask;                  /* mask is == (vm_colors-1) */
320 unsigned int    vm_cache_geometry_colors = 0;   /* set by hw dependent code during startup */
321 unsigned int    vm_free_magazine_refill_limit = 0;
322 
323 
324 struct vm_page_queue_free_head {
325 	vm_page_queue_head_t    qhead;
326 } VM_PAGE_PACKED_ALIGNED;
327 
328 struct vm_page_queue_free_head  vm_page_queue_free[MAX_COLORS];
329 
330 
331 unsigned int    vm_page_free_wanted;
332 unsigned int    vm_page_free_wanted_privileged;
333 #if CONFIG_SECLUDED_MEMORY
334 unsigned int    vm_page_free_wanted_secluded;
335 #endif /* CONFIG_SECLUDED_MEMORY */
336 unsigned int    vm_page_free_count;
337 
338 unsigned int    vm_page_realtime_count;
339 
340 /*
341  *	Occasionally, the virtual memory system uses
342  *	resident page structures that do not refer to
343  *	real pages, for example to leave a page with
344  *	important state information in the VP table.
345  *
346  *	These page structures are allocated the way
347  *	most other kernel structures are.
348  */
349 SECURITY_READ_ONLY_LATE(zone_t) vm_page_zone;
350 vm_locks_array_t vm_page_locks;
351 
352 LCK_ATTR_DECLARE(vm_page_lck_attr, 0, 0);
353 LCK_GRP_DECLARE(vm_page_lck_grp_free, "vm_page_free");
354 LCK_GRP_DECLARE(vm_page_lck_grp_queue, "vm_page_queue");
355 LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
356 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
357 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
358 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
359 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
360 LCK_TICKET_DECLARE(vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
361 
362 unsigned int    vm_page_local_q_soft_limit = 250;
363 unsigned int    vm_page_local_q_hard_limit = 500;
364 struct vpl     *__zpercpu vm_page_local_q;
365 
366 /* N.B. Guard and fictitious pages must not
367  * be assigned a zero phys_page value.
368  */
369 /*
370  *	Fictitious pages don't have a physical address,
371  *	but we must initialize phys_page to something.
372  *	For debugging, this should be a strange value
373  *	that the pmap module can recognize in assertions.
374  */
375 const ppnum_t vm_page_fictitious_addr = (ppnum_t) -1;
376 
377 /*
378  *	Guard pages are not accessible so they don't
379  *      need a physical address, but we need to enter
380  *	one in the pmap.
381  *	Let's make it recognizable and make sure that
382  *	we don't use a real physical page with that
383  *	physical address.
384  */
385 const ppnum_t vm_page_guard_addr = (ppnum_t) -2;
386 
387 /*
388  *	Resident page structures are also chained on
389  *	queues that are used by the page replacement
390  *	system (pageout daemon).  These queues are
391  *	defined here, but are shared by the pageout
392  *	module.  The inactive queue is broken into
393  *	file backed and anonymous for convenience as the
 *	pageout daemon often assigns a higher
395  *	importance to anonymous pages (less likely to pick)
396  */
397 vm_page_queue_head_t    vm_page_queue_active VM_PAGE_PACKED_ALIGNED;
398 vm_page_queue_head_t    vm_page_queue_inactive VM_PAGE_PACKED_ALIGNED;
399 #if CONFIG_SECLUDED_MEMORY
400 vm_page_queue_head_t    vm_page_queue_secluded VM_PAGE_PACKED_ALIGNED;
401 #endif /* CONFIG_SECLUDED_MEMORY */
402 vm_page_queue_head_t    vm_page_queue_anonymous VM_PAGE_PACKED_ALIGNED;  /* inactive memory queue for anonymous pages */
403 vm_page_queue_head_t    vm_page_queue_throttled VM_PAGE_PACKED_ALIGNED;
404 
405 queue_head_t    vm_objects_wired;
406 
407 void vm_update_darkwake_mode(boolean_t);
408 
409 vm_page_queue_head_t    vm_page_queue_donate VM_PAGE_PACKED_ALIGNED;
410 uint32_t        vm_page_donate_mode;
411 uint32_t        vm_page_donate_target, vm_page_donate_target_high, vm_page_donate_target_low;
412 uint32_t        vm_page_donate_count;
413 bool            vm_page_donate_queue_ripe;
414 
415 
416 vm_page_queue_head_t    vm_page_queue_background VM_PAGE_PACKED_ALIGNED;
417 uint32_t        vm_page_background_target;
418 uint32_t        vm_page_background_target_snapshot;
419 uint32_t        vm_page_background_count;
420 uint64_t        vm_page_background_promoted_count;
421 
422 uint32_t        vm_page_background_internal_count;
423 uint32_t        vm_page_background_external_count;
424 
425 uint32_t        vm_page_background_mode;
426 uint32_t        vm_page_background_exclude_external;
427 
428 unsigned int    vm_page_active_count;
429 unsigned int    vm_page_inactive_count;
430 unsigned int    vm_page_kernelcache_count;
431 #if CONFIG_SECLUDED_MEMORY
432 unsigned int    vm_page_secluded_count;
433 unsigned int    vm_page_secluded_count_free;
434 unsigned int    vm_page_secluded_count_inuse;
435 unsigned int    vm_page_secluded_count_over_target;
436 #endif /* CONFIG_SECLUDED_MEMORY */
437 unsigned int    vm_page_anonymous_count;
438 unsigned int    vm_page_throttled_count;
439 unsigned int    vm_page_speculative_count;
440 
441 unsigned int    vm_page_wire_count;
442 unsigned int    vm_page_wire_count_on_boot = 0;
443 unsigned int    vm_page_stolen_count = 0;
444 unsigned int    vm_page_wire_count_initial;
445 unsigned int    vm_page_gobble_count = 0;
446 unsigned int    vm_page_kern_lpage_count = 0;
447 
448 uint64_t        booter_size;  /* external so it can be found in core dumps */
449 
450 #define VM_PAGE_WIRE_COUNT_WARNING      0
451 #define VM_PAGE_GOBBLE_COUNT_WARNING    0
452 
453 unsigned int    vm_page_purgeable_count = 0; /* # of pages purgeable now */
454 unsigned int    vm_page_purgeable_wired_count = 0; /* # of purgeable pages that are wired now */
455 uint64_t        vm_page_purged_count = 0;    /* total count of purged pages */
456 
457 unsigned int    vm_page_xpmapped_external_count = 0;
458 unsigned int    vm_page_external_count = 0;
459 unsigned int    vm_page_internal_count = 0;
460 unsigned int    vm_page_pageable_external_count = 0;
461 unsigned int    vm_page_pageable_internal_count = 0;
462 
463 #if DEVELOPMENT || DEBUG
464 unsigned int    vm_page_speculative_recreated = 0;
465 unsigned int    vm_page_speculative_created = 0;
466 unsigned int    vm_page_speculative_used = 0;
467 #endif
468 
469 vm_page_queue_head_t    vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED;
470 
471 unsigned int    vm_page_cleaned_count = 0;
472 
473 uint64_t        max_valid_dma_address = 0xffffffffffffffffULL;
474 ppnum_t         max_valid_low_ppnum = PPNUM_MAX;
475 
476 
477 /*
478  *	Several page replacement parameters are also
479  *	shared with this module, so that page allocation
480  *	(done here in vm_page_alloc) can trigger the
481  *	pageout daemon.
482  */
483 unsigned int    vm_page_free_target = 0;
484 unsigned int    vm_page_free_min = 0;
485 unsigned int    vm_page_throttle_limit = 0;
486 unsigned int    vm_page_inactive_target = 0;
487 #if CONFIG_SECLUDED_MEMORY
488 unsigned int    vm_page_secluded_target = 0;
489 #endif /* CONFIG_SECLUDED_MEMORY */
490 unsigned int    vm_page_anonymous_min = 0;
491 unsigned int    vm_page_free_reserved = 0;
492 
493 
494 /*
495  *	The VM system has a couple of heuristics for deciding
496  *	that pages are "uninteresting" and should be placed
497  *	on the inactive queue as likely candidates for replacement.
498  *	These variables let the heuristics be controlled at run-time
499  *	to make experimentation easier.
500  */
501 
502 boolean_t vm_page_deactivate_hint = TRUE;
503 
504 struct vm_page_stats_reusable vm_page_stats_reusable;
505 
506 /*
507  *	vm_set_page_size:
508  *
509  *	Sets the page size, perhaps based upon the memory
510  *	size.  Must be called before any use of page-size
511  *	dependent functions.
512  *
513  *	Sets page_shift and page_mask from page_size.
514  */
515 void
vm_set_page_size(void)516 vm_set_page_size(void)
517 {
518 	page_size  = PAGE_SIZE;
519 	page_mask  = PAGE_MASK;
520 	page_shift = PAGE_SHIFT;
521 
522 	if ((page_mask & page_size) != 0) {
523 		panic("vm_set_page_size: page size not a power of two");
524 	}
525 
526 	for (page_shift = 0;; page_shift++) {
527 		if ((1U << page_shift) == page_size) {
528 			break;
529 		}
530 	}
531 }
532 
533 #if defined (__x86_64__)
534 
535 #define MAX_CLUMP_SIZE      16
536 #define DEFAULT_CLUMP_SIZE  4
537 
538 unsigned int vm_clump_size, vm_clump_mask, vm_clump_shift, vm_clump_promote_threshold;
539 
540 #if DEVELOPMENT || DEBUG
541 unsigned long vm_clump_stats[MAX_CLUMP_SIZE + 1];
542 unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
543 
544 static inline void
vm_clump_update_stats(unsigned int c)545 vm_clump_update_stats(unsigned int c)
546 {
547 	assert(c <= vm_clump_size);
548 	if (c > 0 && c <= vm_clump_size) {
549 		vm_clump_stats[c] += c;
550 	}
551 	vm_clump_allocs += c;
552 }
553 #endif  /*  if DEVELOPMENT || DEBUG */
554 
555 /* Called once to setup the VM clump knobs */
556 static void
vm_page_setup_clump(void)557 vm_page_setup_clump( void )
558 {
559 	unsigned int override, n;
560 
561 	vm_clump_size = DEFAULT_CLUMP_SIZE;
562 	if (PE_parse_boot_argn("clump_size", &override, sizeof(override))) {
563 		vm_clump_size = override;
564 	}
565 
566 	if (vm_clump_size > MAX_CLUMP_SIZE) {
567 		panic("vm_page_setup_clump:: clump_size is too large!");
568 	}
569 	if (vm_clump_size < 1) {
570 		panic("vm_page_setup_clump:: clump_size must be >= 1");
571 	}
572 	if ((vm_clump_size & (vm_clump_size - 1)) != 0) {
573 		panic("vm_page_setup_clump:: clump_size must be a power of 2");
574 	}
575 
576 	vm_clump_promote_threshold = vm_clump_size;
577 	vm_clump_mask = vm_clump_size - 1;
578 	for (vm_clump_shift = 0, n = vm_clump_size; n > 1; n >>= 1, vm_clump_shift++) {
579 		;
580 	}
581 
582 #if DEVELOPMENT || DEBUG
583 	bzero(vm_clump_stats, sizeof(vm_clump_stats));
584 	vm_clump_allocs = vm_clump_inserts = vm_clump_inrange = vm_clump_promotes = 0;
585 #endif  /*  if DEVELOPMENT || DEBUG */
586 }
587 
588 #endif  /* #if defined (__x86_64__) */
589 
590 #define COLOR_GROUPS_TO_STEAL   4
591 
/* Called once during startup, once the cache geometry is known.
 */
594 static void
vm_page_set_colors(void)595 vm_page_set_colors( void )
596 {
597 	unsigned int    n, override;
598 
599 #if defined (__x86_64__)
600 	/* adjust #colors because we need to color outside the clump boundary */
601 	vm_cache_geometry_colors >>= vm_clump_shift;
602 #endif
603 	if (PE_parse_boot_argn("colors", &override, sizeof(override))) {                /* colors specified as a boot-arg? */
604 		n = override;
605 	} else if (vm_cache_geometry_colors) {                  /* do we know what the cache geometry is? */
606 		n = vm_cache_geometry_colors;
607 	} else {
608 		n = DEFAULT_COLORS;                             /* use default if all else fails */
609 	}
610 	if (n == 0) {
611 		n = 1;
612 	}
613 	if (n > MAX_COLORS) {
614 		n = MAX_COLORS;
615 	}
616 
617 	/* the count must be a power of 2  */
618 	if ((n & (n - 1)) != 0) {
619 		n = DEFAULT_COLORS;                             /* use default if all else fails */
620 	}
621 	vm_colors = n;
622 	vm_color_mask = n - 1;
623 
624 	vm_free_magazine_refill_limit = vm_colors * COLOR_GROUPS_TO_STEAL;
625 
626 #if defined (__x86_64__)
627 	/* adjust for reduction in colors due to clumping and multiple cores */
628 	if (real_ncpus) {
629 		vm_free_magazine_refill_limit *= (vm_clump_size * real_ncpus);
630 	}
631 #endif
632 }
633 
634 /*
635  * During single threaded early boot we don't initialize all pages.
636  * This avoids some delay during boot. They'll be initialized and
637  * added to the free list as needed or after we are multithreaded by
638  * what becomes the pageout thread.
639  */
640 static boolean_t fill = FALSE;
641 static unsigned int fillval;
642 uint_t vm_delayed_count = 0;    /* when non-zero, indicates we may have more pages to init */
643 ppnum_t delay_above_pnum = PPNUM_MAX;
644 
645 /*
646  * For x86 first 8 Gig initializes quickly and gives us lots of lowmem + mem above to start off with.
647  * If ARM ever uses delayed page initialization, this value may need to be quite different.
648  */
649 #define DEFAULT_DELAY_ABOVE_PHYS_GB (8)
650 
651 /*
652  * When we have to dip into more delayed pages due to low memory, free up
653  * a large chunk to get things back to normal. This avoids contention on the
654  * delayed code allocating page by page.
655  */
656 #define VM_DELAY_PAGE_CHUNK ((1024 * 1024 * 1024) / PAGE_SIZE)
657 
658 /*
659  * Get and initialize the next delayed page.
660  */
static vm_page_t
vm_get_delayed_page(int grab_options)
{
	vm_page_t p;
	ppnum_t   pnum;

	/*
	 * Get a new page if we have one.
	 * The free-page lock serializes both vm_delayed_count and the
	 * growth of vm_pages[] below.
	 */
	vm_free_page_lock();
	if (vm_delayed_count == 0) {
		vm_free_page_unlock();
		return NULL;
	}

	/* ask the pmap layer for the next not-yet-initialized physical page */
	if (!pmap_next_page(&pnum)) {
		/* pmap ran dry: no more delayed pages will ever show up */
		vm_delayed_count = 0;
		vm_free_page_unlock();
		return NULL;
	}


	assert(vm_delayed_count > 0);
	--vm_delayed_count;

#if defined(__x86_64__)
	/* x86 cluster code requires increasing phys_page in vm_pages[] */
	if (vm_pages_count > 0) {
		assert(pnum > vm_pages[vm_pages_count - 1].vmp_phys_page);
	}
#endif
	/* the new page occupies the next unused slot in vm_pages[] */
	p = &vm_pages[vm_pages_count];
	assert(p < vm_page_array_ending_addr);
	vm_page_init(p, pnum, FALSE);
	++vm_pages_count;
	++vm_page_pages;
	vm_free_page_unlock();

	/*
	 * These pages were initially counted as wired, undo that now.
	 * Take the page queues lock ourselves unless the caller told us
	 * (via VM_PAGE_GRAB_Q_LOCK_HELD) that it already holds it.
	 */
	if (grab_options & VM_PAGE_GRAB_Q_LOCK_HELD) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
		vm_page_lockspin_queues();
	}
	--vm_page_wire_count;
	--vm_page_wire_count_initial;
	if (vm_page_wire_count_on_boot != 0) {
		--vm_page_wire_count_on_boot;
	}
	if (!(grab_options & VM_PAGE_GRAB_Q_LOCK_HELD)) {
		vm_page_unlock_queues();
	}


	if (fill) {
		/* debugging aid: stamp the page with a known fill pattern */
		fillPage(pnum, fillval);
	}
	return p;
}
723 
724 static void vm_page_module_init_delayed(void);
725 
726 /*
727  * Free all remaining delayed pages to the free lists.
728  */
void
vm_free_delayed_pages(void)
{
	vm_page_t   p;
	vm_page_t   list = NULL;       /* LIFO chain of pages to release */
	uint_t      cnt = 0;
	vm_offset_t start_free_va;
	int64_t     free_size;

	/*
	 * Drain every remaining delayed page. In himemory mode release
	 * immediately; otherwise collect them on a local list first so
	 * they can be released in reverse order below.
	 */
	while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) {
		if (vm_himemory_mode) {
			vm_page_release(p, FALSE);
		} else {
			p->vmp_snext = list;
			list = p;
		}
		++cnt;
	}

	/*
	 * Free the pages in reverse order if not himemory mode.
	 * Hence the low memory pages will be first on free lists. (LIFO)
	 */
	while (list != NULL) {
		p = list;
		list = p->vmp_snext;
		p->vmp_snext = NULL;
		vm_page_release(p, FALSE);
	}
#if DEVELOPMENT || DEBUG
	kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt);
#endif

	/*
	 * Free up any unused full pages at the end of the vm_pages[] array
	 */
	start_free_va = round_page((vm_offset_t)&vm_pages[vm_pages_count]);

#if defined(__x86_64__)
	/*
	 * Since x86 might have used large pages for vm_pages[], we can't
	 * free starting in the middle of a partially used large page.
	 */
	if (pmap_query_pagesize(kernel_pmap, start_free_va) == I386_LPGBYTES) {
		start_free_va = ((start_free_va + I386_LPGMASK) & ~I386_LPGMASK);
	}
#endif
	if (start_free_va < (vm_offset_t)vm_page_array_ending_addr) {
		free_size = trunc_page((vm_offset_t)vm_page_array_ending_addr - start_free_va);
		if (free_size > 0) {
			/* hand the tail of the array back to the system */
			ml_static_mfree(start_free_va, (vm_offset_t)free_size);
			vm_page_array_ending_addr = (void *)start_free_va;

			/*
			 * Note there's no locking here, as only this thread will ever change this value.
			 * The reader, vm_page_diagnose, doesn't grab any locks for the counts it looks at.
			 */
			vm_page_stolen_count -= (free_size >> PAGE_SHIFT);

#if DEVELOPMENT || DEBUG
			kprintf("Freeing final unused %ld bytes from vm_pages[] at 0x%lx\n",
			    (long)free_size, (long)start_free_va);
#endif
		}
	}


	/*
	 * now we can create the VM page array zone
	 */
	vm_page_module_init_delayed();
}
801 
802 /*
803  * Try and free up enough delayed pages to match a contig memory allocation.
804  */
805 static void
vm_free_delayed_pages_contig(uint_t npages,ppnum_t max_pnum,ppnum_t pnum_mask)806 vm_free_delayed_pages_contig(
807 	uint_t    npages,
808 	ppnum_t   max_pnum,
809 	ppnum_t   pnum_mask)
810 {
811 	vm_page_t p;
812 	ppnum_t   pnum;
813 	uint_t    cnt = 0;
814 
815 	/*
816 	 * Treat 0 as the absolute max page number.
817 	 */
818 	if (max_pnum == 0) {
819 		max_pnum = PPNUM_MAX;
820 	}
821 
822 	/*
823 	 * Free till we get a properly aligned start page
824 	 */
825 	for (;;) {
826 		p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
827 		if (p == NULL) {
828 			return;
829 		}
830 		pnum = VM_PAGE_GET_PHYS_PAGE(p);
831 		vm_page_release(p, FALSE);
832 		if (pnum >= max_pnum) {
833 			return;
834 		}
835 		if ((pnum & pnum_mask) == 0) {
836 			break;
837 		}
838 	}
839 
840 	/*
841 	 * Having a healthy pool of free pages will help performance. We don't
842 	 * want to fall back to the delayed code for every page allocation.
843 	 */
844 	if (vm_page_free_count < VM_DELAY_PAGE_CHUNK) {
845 		npages += VM_DELAY_PAGE_CHUNK;
846 	}
847 
848 	/*
849 	 * Now free up the pages
850 	 */
851 	for (cnt = 1; cnt < npages; ++cnt) {
852 		p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE);
853 		if (p == NULL) {
854 			return;
855 		}
856 		vm_page_release(p, FALSE);
857 	}
858 }
859 
860 #define ROUNDUP_NEXTP2(X) (1U << (32 - __builtin_clz((X) - 1)))
861 
862 void
vm_page_init_local_q(unsigned int num_cpus)863 vm_page_init_local_q(unsigned int num_cpus)
864 {
865 	struct vpl *t_local_q;
866 
867 	/*
868 	 * no point in this for a uni-processor system
869 	 */
870 	if (num_cpus >= 2) {
871 		ml_cpu_info_t cpu_info;
872 
873 		/*
874 		 * Force the allocation alignment to a cacheline,
875 		 * because the `vpl` struct has a lock and will be taken
876 		 * cross CPU so we want to isolate the rest of the per-CPU
877 		 * data to avoid false sharing due to this lock being taken.
878 		 */
879 
880 		ml_cpu_get_info(&cpu_info);
881 
882 		t_local_q = zalloc_percpu_permanent(sizeof(struct vpl),
883 		    cpu_info.cache_line_size - 1);
884 
885 		zpercpu_foreach(lq, t_local_q) {
886 			VPL_LOCK_INIT(lq, &vm_page_lck_grp_local, &vm_page_lck_attr);
887 			vm_page_queue_init(&lq->vpl_queue);
888 		}
889 
890 		/* make the initialization visible to all cores */
891 		os_atomic_store(&vm_page_local_q, t_local_q, release);
892 	}
893 }
894 
895 /*
896  * vm_init_before_launchd
897  *
898  * This should be called right before launchd is loaded.
899  */
void
vm_init_before_launchd()
{
	/*
	 * Snapshot the current wired-page count under the page queues
	 * lock; this records how much memory the kernel itself had
	 * wired before the first user process (launchd) is loaded.
	 */
	vm_page_lockspin_queues();
	vm_page_wire_count_on_boot = vm_page_wire_count;
	vm_page_unlock_queues();
}
907 
908 
909 /*
910  *	vm_page_bootstrap:
911  *
912  *	Initializes the resident memory module.
913  *
914  *	Allocates memory for the page cells, and
915  *	for the object/offset-to-page hash table headers.
916  *	Each page cell is initialized and placed on the free list.
917  *	Returns the range of available kernel virtual memory.
918  */
919 __startup_func
920 void
vm_page_bootstrap(vm_offset_t * startp,vm_offset_t * endp)921 vm_page_bootstrap(
922 	vm_offset_t             *startp,
923 	vm_offset_t             *endp)
924 {
925 	unsigned int            i;
926 	unsigned int            log1;
927 	unsigned int            log2;
928 	unsigned int            size;
929 
930 	/*
931 	 *	Initialize the page queues.
932 	 */
933 
934 	lck_mtx_init(&vm_page_queue_free_lock, &vm_page_lck_grp_free, &vm_page_lck_attr);
935 	lck_mtx_init(&vm_page_queue_lock, &vm_page_lck_grp_queue, &vm_page_lck_attr);
936 	lck_mtx_init(&vm_purgeable_queue_lock, &vm_page_lck_grp_purge, &vm_page_lck_attr);
937 
938 	for (i = 0; i < PURGEABLE_Q_TYPE_MAX; i++) {
939 		int group;
940 
941 		purgeable_queues[i].token_q_head = 0;
942 		purgeable_queues[i].token_q_tail = 0;
943 		for (group = 0; group < NUM_VOLATILE_GROUPS; group++) {
944 			queue_init(&purgeable_queues[i].objq[group]);
945 		}
946 
947 		purgeable_queues[i].type = i;
948 		purgeable_queues[i].new_pages = 0;
949 #if MACH_ASSERT
950 		purgeable_queues[i].debug_count_tokens = 0;
951 		purgeable_queues[i].debug_count_objects = 0;
952 #endif
953 	}
954 	;
955 	purgeable_nonvolatile_count = 0;
956 	queue_init(&purgeable_nonvolatile_queue);
957 
958 	for (i = 0; i < MAX_COLORS; i++) {
959 		vm_page_queue_init(&vm_page_queue_free[i].qhead);
960 	}
961 
962 	vm_page_queue_init(&vm_lopage_queue_free);
963 	vm_page_queue_init(&vm_page_queue_active);
964 	vm_page_queue_init(&vm_page_queue_inactive);
965 #if CONFIG_SECLUDED_MEMORY
966 	vm_page_queue_init(&vm_page_queue_secluded);
967 #endif /* CONFIG_SECLUDED_MEMORY */
968 	vm_page_queue_init(&vm_page_queue_cleaned);
969 	vm_page_queue_init(&vm_page_queue_throttled);
970 	vm_page_queue_init(&vm_page_queue_anonymous);
971 	queue_init(&vm_objects_wired);
972 
973 	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
974 		vm_page_queue_init(&vm_page_queue_speculative[i].age_q);
975 
976 		vm_page_queue_speculative[i].age_ts.tv_sec = 0;
977 		vm_page_queue_speculative[i].age_ts.tv_nsec = 0;
978 	}
979 
980 	vm_page_queue_init(&vm_page_queue_donate);
981 	vm_page_queue_init(&vm_page_queue_background);
982 
983 	vm_page_background_count = 0;
984 	vm_page_background_internal_count = 0;
985 	vm_page_background_external_count = 0;
986 	vm_page_background_promoted_count = 0;
987 
988 	vm_page_background_target = (unsigned int)(atop_64(max_mem) / 25);
989 
990 	if (vm_page_background_target > VM_PAGE_BACKGROUND_TARGET_MAX) {
991 		vm_page_background_target = VM_PAGE_BACKGROUND_TARGET_MAX;
992 	}
993 
994 #if    defined(__LP64__)
995 	vm_page_background_mode = VM_PAGE_BG_ENABLED;
996 	vm_page_donate_mode = VM_PAGE_DONATE_ENABLED;
997 #else
998 	vm_page_background_mode = VM_PAGE_BG_DISABLED;
999 	vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
1000 #endif
1001 	vm_page_background_exclude_external = 0;
1002 
1003 	PE_parse_boot_argn("vm_page_bg_mode", &vm_page_background_mode, sizeof(vm_page_background_mode));
1004 	PE_parse_boot_argn("vm_page_bg_exclude_external", &vm_page_background_exclude_external, sizeof(vm_page_background_exclude_external));
1005 	PE_parse_boot_argn("vm_page_bg_target", &vm_page_background_target, sizeof(vm_page_background_target));
1006 
1007 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && vm_page_background_mode != VM_PAGE_BG_ENABLED) {
1008 		vm_page_background_mode = VM_PAGE_BG_DISABLED;
1009 	}
1010 
1011 	PE_parse_boot_argn("vm_page_donate_mode", &vm_page_donate_mode, sizeof(vm_page_donate_mode));
1012 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED && vm_page_donate_mode != VM_PAGE_DONATE_ENABLED) {
1013 		vm_page_donate_mode = VM_PAGE_DONATE_DISABLED;
1014 	}
1015 
1016 	vm_page_donate_target_high = VM_PAGE_DONATE_TARGET_HIGHWATER;
1017 	vm_page_donate_target_low = VM_PAGE_DONATE_TARGET_LOWWATER;
1018 	vm_page_donate_target = vm_page_donate_target_high;
1019 	vm_page_donate_count = 0;
1020 
1021 	vm_page_free_wanted = 0;
1022 	vm_page_free_wanted_privileged = 0;
1023 #if CONFIG_SECLUDED_MEMORY
1024 	vm_page_free_wanted_secluded = 0;
1025 #endif /* CONFIG_SECLUDED_MEMORY */
1026 
1027 #if defined (__x86_64__)
1028 	/* this must be called before vm_page_set_colors() */
1029 	vm_page_setup_clump();
1030 #endif
1031 
1032 	vm_page_set_colors();
1033 
1034 	bzero(vm_page_inactive_states, sizeof(vm_page_inactive_states));
1035 	vm_page_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1036 	vm_page_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1037 	vm_page_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1038 
1039 	bzero(vm_page_pageable_states, sizeof(vm_page_pageable_states));
1040 	vm_page_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1041 	vm_page_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1042 	vm_page_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1043 	vm_page_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1044 	vm_page_pageable_states[VM_PAGE_ON_SPECULATIVE_Q] = 1;
1045 	vm_page_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1;
1046 #if CONFIG_SECLUDED_MEMORY
1047 	vm_page_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1048 #endif /* CONFIG_SECLUDED_MEMORY */
1049 
1050 	bzero(vm_page_non_speculative_pageable_states, sizeof(vm_page_non_speculative_pageable_states));
1051 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1052 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1053 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1054 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1055 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_THROTTLED_Q] = 1;
1056 #if CONFIG_SECLUDED_MEMORY
1057 	vm_page_non_speculative_pageable_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1058 #endif /* CONFIG_SECLUDED_MEMORY */
1059 
1060 	bzero(vm_page_active_or_inactive_states, sizeof(vm_page_active_or_inactive_states));
1061 	vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_INTERNAL_Q] = 1;
1062 	vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = 1;
1063 	vm_page_active_or_inactive_states[VM_PAGE_ON_INACTIVE_CLEANED_Q] = 1;
1064 	vm_page_active_or_inactive_states[VM_PAGE_ON_ACTIVE_Q] = 1;
1065 #if CONFIG_SECLUDED_MEMORY
1066 	vm_page_active_or_inactive_states[VM_PAGE_ON_SECLUDED_Q] = 1;
1067 #endif /* CONFIG_SECLUDED_MEMORY */
1068 
1069 	for (vm_tag_t t = 0; t < VM_KERN_MEMORY_FIRST_DYNAMIC; t++) {
1070 		vm_allocation_sites_static[t].refcount = 2;
1071 		vm_allocation_sites_static[t].tag = t;
1072 		vm_allocation_sites[t] = &vm_allocation_sites_static[t];
1073 	}
1074 	vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].refcount = 2;
1075 	vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC].tag = VM_KERN_MEMORY_ANY;
1076 	vm_allocation_sites[VM_KERN_MEMORY_ANY] = &vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC];
1077 
1078 	/*
1079 	 *	Steal memory for the map and zone subsystems.
1080 	 */
1081 	kernel_startup_initialize_upto(STARTUP_SUB_PMAP_STEAL);
1082 
1083 	/*
1084 	 *	Allocate (and initialize) the virtual-to-physical
1085 	 *	table hash buckets.
1086 	 *
1087 	 *	The number of buckets should be a power of two to
1088 	 *	get a good hash function.  The following computation
1089 	 *	chooses the first power of two that is greater
1090 	 *	than the number of physical pages in the system.
1091 	 */
1092 
1093 	if (vm_page_bucket_count == 0) {
1094 		unsigned int npages = pmap_free_pages();
1095 
1096 		vm_page_bucket_count = 1;
1097 		while (vm_page_bucket_count < npages) {
1098 			vm_page_bucket_count <<= 1;
1099 		}
1100 	}
1101 	vm_page_bucket_lock_count = (vm_page_bucket_count + BUCKETS_PER_LOCK - 1) / BUCKETS_PER_LOCK;
1102 
1103 	vm_page_hash_mask = vm_page_bucket_count - 1;
1104 
1105 	/*
1106 	 *	Calculate object shift value for hashing algorithm:
1107 	 *		O = log2(sizeof(struct vm_object))
1108 	 *		B = log2(vm_page_bucket_count)
1109 	 *	        hash shifts the object left by
1110 	 *		B/2 - O
1111 	 */
1112 	size = vm_page_bucket_count;
1113 	for (log1 = 0; size > 1; log1++) {
1114 		size /= 2;
1115 	}
1116 	size = sizeof(struct vm_object);
1117 	for (log2 = 0; size > 1; log2++) {
1118 		size /= 2;
1119 	}
1120 	vm_page_hash_shift = log1 / 2 - log2 + 1;
1121 
1122 	vm_page_bucket_hash = 1 << ((log1 + 1) >> 1);           /* Get (ceiling of sqrt of table size) */
1123 	vm_page_bucket_hash |= 1 << ((log1 + 1) >> 2);          /* Get (ceiling of quadroot of table size) */
1124 	vm_page_bucket_hash |= 1;                                                       /* Set bit and add 1 - always must be 1 to insure unique series */
1125 
1126 	if (vm_page_hash_mask & vm_page_bucket_count) {
1127 		printf("vm_page_bootstrap: WARNING -- strange page hash\n");
1128 	}
1129 
1130 #if VM_PAGE_BUCKETS_CHECK
1131 #if VM_PAGE_FAKE_BUCKETS
1132 	/*
1133 	 * Allocate a decoy set of page buckets, to detect
1134 	 * any stomping there.
1135 	 */
1136 	vm_page_fake_buckets = (vm_page_bucket_t *)
1137 	    pmap_steal_memory(vm_page_bucket_count *
1138 	    sizeof(vm_page_bucket_t), 0);
1139 	vm_page_fake_buckets_start = (vm_map_offset_t) vm_page_fake_buckets;
1140 	vm_page_fake_buckets_end =
1141 	    vm_map_round_page((vm_page_fake_buckets_start +
1142 	    (vm_page_bucket_count *
1143 	    sizeof(vm_page_bucket_t))),
1144 	    PAGE_MASK);
1145 	char *cp;
1146 	for (cp = (char *)vm_page_fake_buckets_start;
1147 	    cp < (char *)vm_page_fake_buckets_end;
1148 	    cp++) {
1149 		*cp = 0x5a;
1150 	}
1151 #endif /* VM_PAGE_FAKE_BUCKETS */
1152 #endif /* VM_PAGE_BUCKETS_CHECK */
1153 
1154 	kernel_debug_string_early("vm_page_buckets");
1155 	vm_page_buckets = (vm_page_bucket_t *)
1156 	    pmap_steal_memory(vm_page_bucket_count *
1157 	    sizeof(vm_page_bucket_t), 0);
1158 
1159 	kernel_debug_string_early("vm_page_bucket_locks");
1160 	vm_page_bucket_locks = (lck_spin_t *)
1161 	    pmap_steal_memory(vm_page_bucket_lock_count *
1162 	    sizeof(lck_spin_t), 0);
1163 
1164 	for (i = 0; i < vm_page_bucket_count; i++) {
1165 		vm_page_bucket_t *bucket = &vm_page_buckets[i];
1166 
1167 		bucket->page_list = VM_PAGE_PACK_PTR(VM_PAGE_NULL);
1168 #if     MACH_PAGE_HASH_STATS
1169 		bucket->cur_count = 0;
1170 		bucket->hi_count = 0;
1171 #endif /* MACH_PAGE_HASH_STATS */
1172 	}
1173 
1174 	for (i = 0; i < vm_page_bucket_lock_count; i++) {
1175 		lck_spin_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket, &vm_page_lck_attr);
1176 	}
1177 
1178 	vm_tag_init();
1179 
1180 #if VM_PAGE_BUCKETS_CHECK
1181 	vm_page_buckets_check_ready = TRUE;
1182 #endif /* VM_PAGE_BUCKETS_CHECK */
1183 
1184 	/*
1185 	 *	Machine-dependent code allocates the resident page table.
1186 	 *	It uses vm_page_init to initialize the page frames.
1187 	 *	The code also returns to us the virtual space available
1188 	 *	to the kernel.  We don't trust the pmap module
1189 	 *	to get the alignment right.
1190 	 */
1191 
1192 	kernel_debug_string_early("pmap_startup");
1193 	pmap_startup(&virtual_space_start, &virtual_space_end);
1194 	virtual_space_start = round_page(virtual_space_start);
1195 	virtual_space_end = trunc_page(virtual_space_end);
1196 
1197 	*startp = virtual_space_start;
1198 	*endp = virtual_space_end;
1199 
1200 	/*
1201 	 *	Compute the initial "wire" count.
1202 	 *	Up until now, the pages which have been set aside are not under
1203 	 *	the VM system's control, so although they aren't explicitly
1204 	 *	wired, they nonetheless can't be moved. At this moment,
1205 	 *	all VM managed pages are "free", courtesy of pmap_startup.
1206 	 */
1207 	assert((unsigned int) atop_64(max_mem) == atop_64(max_mem));
1208 	vm_page_wire_count = ((unsigned int) atop_64(max_mem)) -
1209 	    vm_page_free_count - vm_lopage_free_count;
1210 #if CONFIG_SECLUDED_MEMORY
1211 	vm_page_wire_count -= vm_page_secluded_count;
1212 #endif
1213 	vm_page_wire_count_initial = vm_page_wire_count;
1214 
1215 	/* capture this for later use */
1216 	booter_size = ml_get_booter_memory_size();
1217 
1218 	printf("vm_page_bootstrap: %d free pages, %d wired pages, (up to %d of which are delayed free)\n",
1219 	    vm_page_free_count, vm_page_wire_count, vm_delayed_count);
1220 
1221 	kernel_debug_string_early("vm_page_bootstrap complete");
1222 }
1223 
1224 #ifndef MACHINE_PAGES
1225 /*
1226  * This is the early boot time allocator for data structures needed to bootstrap the VM system.
1227  * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this
1228  * on ARM yet, due to the combination of a large base page size and smaller RAM devices.
1229  */
/*
 * pmap_steal_memory_internal:
 *
 *   size       - bytes requested; rounded up here to a word multiple.
 *   alignment  - power of two <= PAGE_SIZE; 0 means word alignment.
 *   might_free - TRUE when the caller may later return this memory to
 *                the VM system (see pmap_steal_freeable_memory); the
 *                pmap layer is told so it can pick suitable pages.
 *
 * Returns the virtual address of the stolen range.  Pages mapped here
 * are counted as wired/stolen, never panic-free: any failure to get a
 * physical page or enter a mapping panics, since this runs before the
 * VM system can recover from allocation failure.
 */
static void *
pmap_steal_memory_internal(
	vm_size_t size,
	vm_size_t alignment,
	boolean_t might_free)
{
	kern_return_t kr;
	vm_offset_t addr;
	vm_offset_t map_addr;
	ppnum_t phys_page;

	/*
	 * Size needs to be aligned to word size.
	 */
	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

	/*
	 * Alignment defaults to word size if not specified.
	 */
	if (alignment == 0) {
		alignment = sizeof(void*);
	}

	/*
	 * Alignment must be no greater than a page and must be a power of two.
	 */
	assert(alignment <= PAGE_SIZE);
	assert((alignment & (alignment - 1)) == 0);

	/*
	 * On the first call, get the initial values for virtual address space
	 * and page align them.
	 */
	if (virtual_space_start == virtual_space_end) {
		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
		virtual_space_start = round_page(virtual_space_start);
		virtual_space_end = trunc_page(virtual_space_end);

#if defined(__x86_64__)
		/*
		 * Release remaining unused section of preallocated KVA and the 4K page tables
		 * that map it. This makes the VA available for large page mappings.
		 */
		Idle_PTs_release(virtual_space_start, virtual_space_end);
#endif
	}

	/*
	 * Allocate the virtual space for this request. On x86, we'll align to a large page
	 * address if the size is big enough to back with at least 1 large page.
	 */
#if defined(__x86_64__)
	if (size >= I386_LPGBYTES) {
		virtual_space_start = ((virtual_space_start + I386_LPGMASK) & ~I386_LPGMASK);
	}
#endif
	virtual_space_start = (virtual_space_start + (alignment - 1)) & ~(alignment - 1);
	addr = virtual_space_start;
	virtual_space_start += size;

	//kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size);	/* (TEST/DEBUG) */

	/*
	 * Allocate and map physical pages to back the new virtual space.
	 * Note: addr may not be page aligned; the first page may already
	 * be mapped from a previous call, hence the round_page() here.
	 */
	map_addr = round_page(addr);
	while (map_addr < addr + size) {
#if defined(__x86_64__)
		/*
		 * Back with a large page if properly aligned on x86
		 */
		if ((map_addr & I386_LPGMASK) == 0 &&
		    map_addr + I386_LPGBYTES <= addr + size &&
		    pmap_pre_expand_large(kernel_pmap, map_addr) == KERN_SUCCESS &&
		    pmap_next_page_large(&phys_page) == KERN_SUCCESS) {
			kr = pmap_enter(kernel_pmap, map_addr, phys_page,
			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
			    VM_WIMG_USE_DEFAULT | VM_MEM_SUPERPAGE, FALSE);

			if (kr != KERN_SUCCESS) {
				panic("pmap_steal_memory: pmap_enter() large failed, new_addr=%#lx, phys_page=%u",
				    (unsigned long)map_addr, phys_page);
			}
			map_addr += I386_LPGBYTES;
			vm_page_wire_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_stolen_count += I386_LPGBYTES >> PAGE_SHIFT;
			vm_page_kern_lpage_count++;
			continue;
		}
#endif

		if (!pmap_next_page_hi(&phys_page, might_free)) {
			panic("pmap_steal_memory() size: 0x%llx", (uint64_t)size);
		}

#if defined(__x86_64__)
		pmap_pre_expand(kernel_pmap, map_addr);
#endif

		kr = pmap_enter(kernel_pmap, map_addr, phys_page,
		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
		    VM_WIMG_USE_DEFAULT, FALSE);

		if (kr != KERN_SUCCESS) {
			panic("pmap_steal_memory() pmap_enter failed, map_addr=%#lx, phys_page=%u",
			    (unsigned long)map_addr, phys_page);
		}
		map_addr += PAGE_SIZE;

		/*
		 * Account for newly stolen memory
		 */
		vm_page_wire_count++;
		vm_page_stolen_count++;
	}

#if defined(__x86_64__)
	/*
	 * The call with might_free is currently the last use of pmap_steal_memory*().
	 * Notify the pmap layer to record which high pages were allocated so far.
	 */
	if (might_free) {
		pmap_hi_pages_done();
	}
#endif
#if KASAN
	kasan_notify_address(round_page(addr), size);
#endif
	return (void *) addr;
}
1360 
1361 void *
pmap_steal_memory(vm_size_t size,vm_size_t alignment)1362 pmap_steal_memory(
1363 	vm_size_t size,
1364 	vm_size_t alignment)
1365 {
1366 	return pmap_steal_memory_internal(size, alignment, FALSE);
1367 }
1368 
1369 void *
pmap_steal_freeable_memory(vm_size_t size)1370 pmap_steal_freeable_memory(
1371 	vm_size_t size)
1372 {
1373 	return pmap_steal_memory_internal(size, 0, TRUE);
1374 }
1375 
1376 
#if CONFIG_SECLUDED_MEMORY
/* boot-args to control secluded memory (size in MB, from boot-args or device tree) */
TUNABLE_DT(unsigned int, secluded_mem_mb, "/defaults", "kern.secluded_mem_mb", "secluded_mem_mb", 0, TUNABLE_DT_NONE);
/* IOKit can use secluded memory */
TUNABLE(bool, secluded_for_iokit, "secluded_for_iokit", true);
/* apps can use secluded memory */
TUNABLE(bool, secluded_for_apps, "secluded_for_apps", true);
/* filecache can use secluded memory */
TUNABLE(secluded_filecache_mode_t, secluded_for_filecache, "secluded_for_filecache", SECLUDED_FILECACHE_RDONLY);
/* memory-use level at which secluded mode shuts off; 0 = never (set in pmap_startup) */
uint64_t secluded_shutoff_trigger = 0;
uint64_t secluded_shutoff_headroom = 150 * 1024 * 1024; /* original value from N56 */
#endif /* CONFIG_SECLUDED_MEMORY */
1389 
1390 
#if defined(__arm64__)
/* patches the low-memory globals with the vm_pages[] bounds for debuggers */
extern void patch_low_glo_vm_page_info(void *, void *, uint32_t);
/* first physical page number handed out by pmap_next_page() (set in pmap_startup) */
unsigned int vm_first_phys_ppnum = 0;
#endif

void vm_page_release_startup(vm_page_t mem);
/*
 * pmap_startup:
 *
 * Allocates the vm_pages[] array sized for the remaining physical
 * pages, initializes each vm_page and releases it to the free lists,
 * then returns the remaining kernel virtual space through
 * startp/endp.  On x86, pages above delay_above_pnum may be left
 * uninitialized at boot and freed later (vm_delayed_count).
 */
void
pmap_startup(
	vm_offset_t     *startp,
	vm_offset_t     *endp)
{
	unsigned int    i, npages;
	ppnum_t         phys_page;
	uint64_t        mem_sz;
	uint64_t        start_ns;
	uint64_t        now_ns;
	uint_t          low_page_count = 0;

#if    defined(__LP64__)
	/*
	 * make sure we are aligned on a 64 byte boundary
	 * for VM_PAGE_PACK_PTR (it clips off the low-order
	 * 6 bits of the pointer)
	 */
	if (virtual_space_start != virtual_space_end) {
		virtual_space_start = round_page(virtual_space_start);
	}
#endif

	/*
	 * We calculate how many page frames we will have
	 * and then allocate the page structures in one chunk.
	 *
	 * Note that the calculation here doesn't take into account
	 * the memory needed to map what's being allocated, i.e. the page
	 * table entries. So the actual number of pages we get will be
	 * less than this. To do someday: include that in the computation.
	 *
	 * Also for ARM, we don't use the count of free_pages, but rather the
	 * range from last page to first page (ignore holes due to retired pages).
	 */
#if defined(__arm64__)
	mem_sz = pmap_free_pages_span() * (uint64_t)PAGE_SIZE;
#else /* defined(__arm64__) */
	mem_sz = pmap_free_pages() * (uint64_t)PAGE_SIZE;
#endif /* defined(__arm64__) */
	mem_sz += round_page(virtual_space_start) - virtual_space_start;        /* Account for any slop */
	npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages)));    /* scaled to include the vm_page_ts */


	vm_pages = (vm_page_t) pmap_steal_freeable_memory(npages * sizeof *vm_pages);

	/*
	 * Check if we want to initialize pages to a known value
	 * ("fill" boot-arg supplies the 32-bit fill pattern).
	 */
	if (PE_parse_boot_argn("fill", &fillval, sizeof(fillval))) {
		fill = TRUE;
	}
#if     DEBUG
	/* This slows down booting the DEBUG kernel, particularly on
	 * large memory systems, but is worthwhile in deterministically
	 * trapping uninitialized memory usage.
	 */
	if (!fill) {
		fill = TRUE;
		fillval = 0xDEB8F177;
	}
#endif
	if (fill) {
		kprintf("Filling vm_pages with pattern: 0x%x\n", fillval);
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Figure out how much secluded memory to have before we start
	 * release pages to free lists.
	 * The default, if specified nowhere else, is no secluded mem.
	 */
	vm_page_secluded_target = (unsigned int)atop_64(secluded_mem_mb * 1024ULL * 1024ULL);

	/*
	 * Allow a really large app to effectively use secluded memory until it exits.
	 */
	if (vm_page_secluded_target != 0) {
		/*
		 * Get an amount from boot-args, else use 1/2 of max_mem.
		 * 1/2 max_mem was chosen from a Peace daemon tentpole test which
		 * used munch to induce jetsam thrashing of false idle daemons on N56.
		 */
		int secluded_shutoff_mb;
		if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
		    sizeof(secluded_shutoff_mb))) {
			secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
		} else {
			secluded_shutoff_trigger = max_mem / 2;
		}

		/* ensure the headroom value is sensible and avoid underflows */
		assert(secluded_shutoff_trigger == 0 || secluded_shutoff_trigger > secluded_shutoff_headroom);
	}

#endif /* CONFIG_SECLUDED_MEMORY */

#if defined(__x86_64__)

	/*
	 * Decide how much memory we delay freeing at boot time.
	 */
	uint32_t delay_above_gb;
	if (!PE_parse_boot_argn("delay_above_gb", &delay_above_gb, sizeof(delay_above_gb))) {
		delay_above_gb = DEFAULT_DELAY_ABOVE_PHYS_GB;
	}

	if (delay_above_gb == 0) {
		/* 0 disables delayed freeing entirely */
		delay_above_pnum = PPNUM_MAX;
	} else {
		delay_above_pnum = delay_above_gb * (1024 * 1024 * 1024 / PAGE_SIZE);
	}

	/* make sure we have sane breathing room: 1G above low memory */
	if (delay_above_pnum <= max_valid_low_ppnum) {
		delay_above_pnum = max_valid_low_ppnum + ((1024 * 1024 * 1024) >> PAGE_SHIFT);
	}

	if (delay_above_pnum < PPNUM_MAX) {
		printf("pmap_startup() delaying init/free of page nums > 0x%x\n", delay_above_pnum);
	}

#endif /* defined(__x86_64__) */

	/*
	 * Initialize and release the page frames.
	 */
	kernel_debug_string_early("page_frame_init");

	vm_page_array_beginning_addr = &vm_pages[0];
	vm_page_array_ending_addr = &vm_pages[npages];  /* used by ptr packing/unpacking code */
#if VM_PAGE_PACKED_FROM_ARRAY
	if (npages >= VM_PAGE_PACKED_FROM_ARRAY) {
		panic("pmap_startup(): too many pages to support vm_page packing");
	}
#endif

	vm_delayed_count = 0;

	absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
	vm_pages_count = 0;
	for (i = 0; i < npages; i++) {
		/* Did we run out of pages? */
		if (!pmap_next_page(&phys_page)) {
			break;
		}

		if (phys_page < max_valid_low_ppnum) {
			++low_page_count;
		}

		/* Are we at high enough pages to delay the rest? */
		if (low_page_count > vm_lopage_free_limit && phys_page > delay_above_pnum) {
			vm_delayed_count = pmap_free_pages();
			break;
		}

#if defined(__arm64__)
		if (i == 0) {
			/* publish the array bounds to the low globals for debuggers */
			vm_first_phys_ppnum = phys_page;
			patch_low_glo_vm_page_info((void *)vm_page_array_beginning_addr,
			    (void *)vm_page_array_ending_addr, vm_first_phys_ppnum);
		}
#endif /* defined(__arm64__) */

#if defined(__x86_64__)
		/* The x86 clump freeing code requires increasing ppn's to work correctly */
		if (i > 0) {
			assert(phys_page > vm_pages[i - 1].vmp_phys_page);
		}
#endif
		++vm_pages_count;
		vm_page_init(&vm_pages[i], phys_page, FALSE);
		if (fill) {
			fillPage(phys_page, fillval);
		}
		if (vm_himemory_mode) {
			vm_page_release_startup(&vm_pages[i]);
		}
	}
	vm_page_pages = vm_pages_count; /* used to report to user space */

	/*
	 * When not in himemory mode, release pages in reverse order so the
	 * highest page numbers land deepest in the free lists.
	 */
	if (!vm_himemory_mode) {
		do {
			if (!VMP_ERROR_GET(&vm_pages[--i])) {               /* skip retired pages */
				vm_page_release_startup(&vm_pages[i]);
			}
		} while (i != 0);
	}

	absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns);
	printf("pmap_startup() init/release time: %lld microsec\n", (now_ns - start_ns) / NSEC_PER_USEC);
	printf("pmap_startup() delayed init/release of %d pages\n", vm_delayed_count);

#if defined(__LP64__)
	/* sanity-check pointer packing against both ends of the array */
	if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[0]))) != &vm_pages[0]) {
		panic("VM_PAGE_PACK_PTR failed on &vm_pages[0] - %p", (void *)&vm_pages[0]);
	}

	if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(&vm_pages[vm_pages_count - 1]))) != &vm_pages[vm_pages_count - 1]) {
		panic("VM_PAGE_PACK_PTR failed on &vm_pages[vm_pages_count-1] - %p", (void *)&vm_pages[vm_pages_count - 1]);
	}
#endif

	VM_CHECK_MEMORYSTATUS;

	/*
	 * We have to re-align virtual_space_start,
	 * because pmap_steal_memory has been using it.
	 */
	virtual_space_start = round_page(virtual_space_start);
	*startp = virtual_space_start;
	*endp = virtual_space_end;
}
1611 #endif  /* MACHINE_PAGES */
1612 
1613 /*
1614  * Create the zone that represents the vm_pages[] array. Nothing ever allocates
1615  * or frees to this zone. It's just here for reporting purposes via zprint command.
1616  * This needs to be done after all initially delayed pages are put on the free lists.
1617  */
static void
vm_page_module_init_delayed(void)
{
	/*
	 * Report-only zone: nothing allocates from or frees to it; its
	 * counters are back-filled below from the vm_pages[] bookkeeping
	 * so tools like zprint see the array's footprint.
	 */
	(void)zone_create_ext("vm pages array", sizeof(struct vm_page),
	    ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, ZONE_ID_VM_PAGES, ^(zone_t z) {
		uint64_t vm_page_zone_pages, vm_page_array_zone_data_size;

		zone_set_exhaustible(z, 0);
		/*
		 * Reflect size and usage information for vm_pages[].
		 */

		z->z_elems_avail = (uint32_t)(vm_page_array_ending_addr - vm_pages);
		z->z_elems_free = z->z_elems_avail - vm_pages_count;
		zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
		vm_pages_count * sizeof(struct vm_page);
		vm_page_array_zone_data_size = (uint64_t)vm_page_array_ending_addr - (uint64_t)vm_pages;
		vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
		z->z_wired_cur += vm_page_zone_pages;
		z->z_wired_hwm = z->z_wired_cur;
		z->z_va_cur = z->z_wired_cur;
		/* since zone accounts for these, take them out of stolen */
		VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
	});
}
1643 
1644 /*
1645  * Create the vm_pages zone. This is used for the vm_page structures for the pages
1646  * that are scavanged from other boot time usages by ml_static_mfree(). As such,
1647  * this needs to happen in early VM bootstrap.
1648  */
1649 
__startup_func
static void
vm_page_module_init(void)
{
	vm_size_t vm_page_with_ppnum_size;

	/*
	 * Since the pointers to elements in this zone will be packed, they
	 * must have appropriate size. Not strictly what sizeof() reports.
	 */
	vm_page_with_ppnum_size =
	    (sizeof(struct vm_page_with_ppnum) + (VM_PAGE_PACKED_PTR_ALIGNMENT - 1)) &
	    ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);

	vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
	    ZC_ALIGNMENT_REQUIRED | ZC_VM | ZC_NOTBITAG,
	    ZONE_ID_ANY, ^(zone_t z) {
		/*
		 * The number "10" is a small number that is larger than the number
		 * of fictitious pages that any single caller will attempt to allocate
		 * without blocking.
		 *
		 * The largest such number at the moment is kmem_alloc()
		 * when 2 guard pages are asked. 10 is simply a somewhat larger number,
		 * taking into account the 50% hysteresis the zone allocator uses.
		 *
		 * Note: this works at all because the zone allocator
		 *       doesn't ever allocate fictitious pages.
		 */
		zone_raise_reserve(z, 10);
	});
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
1683 
1684 /*
1685  *	Routine:	vm_page_create
1686  *	Purpose:
1687  *		After the VM system is up, machine-dependent code
1688  *		may stumble across more physical memory.  For example,
1689  *		memory that it was reserving for a frame buffer.
1690  *		vm_page_create turns this memory into available pages.
1691  */
1692 
1693 void
vm_page_create(ppnum_t start,ppnum_t end)1694 vm_page_create(
1695 	ppnum_t start,
1696 	ppnum_t end)
1697 {
1698 	ppnum_t         phys_page;
1699 	vm_page_t       m;
1700 
1701 	for (phys_page = start;
1702 	    phys_page < end;
1703 	    phys_page++) {
1704 		m = vm_page_grab_fictitious_common(phys_page, TRUE);
1705 		m->vmp_fictitious = FALSE;
1706 		pmap_clear_noencrypt(phys_page);
1707 
1708 
1709 		vm_free_page_lock();
1710 		vm_page_pages++;
1711 		vm_free_page_unlock();
1712 		vm_page_release(m, FALSE);
1713 	}
1714 }
1715 
1716 
/*
 *	vm_page_hash:
 *
 *	Distributes the object/offset key pair among hash buckets.
 *	Mixes the object pointer (multiplied by vm_page_bucket_hash) with
 *	the page-granular offset, then masks down to the bucket range.
 *
 *	NOTE:	The bucket count must be a power of 2
 *		(vm_page_hash_mask == vm_page_bucket_count - 1).
 */
#define vm_page_hash(object, offset) (\
	( (natural_t)((uintptr_t)object * vm_page_bucket_hash) + ((uint32_t)atop_64(offset) ^ vm_page_bucket_hash))\
	 & vm_page_hash_mask)
1727 
1728 
1729 /*
1730  *	vm_page_insert:		[ internal use only ]
1731  *
1732  *	Inserts the given mem entry into the object/object-page
1733  *	table and object list.
1734  *
1735  *	The object must be locked.
1736  */
void
vm_page_insert(
	vm_page_t               mem,
	vm_object_t             object,
	vm_object_offset_t      offset)
{
	/*
	 * Defaults: no wire tag (VM_KERN_MEMORY_NONE), queues lock not
	 * held, insert into the hash, no batched pmap op, no batched
	 * accounting, no delayed ledger update.
	 */
	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, FALSE, FALSE, NULL);
}
1745 
void
vm_page_insert_wired(
	vm_page_t               mem,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_tag_t                tag)
{
	/*
	 * Same as vm_page_insert() but carries the caller's wire tag for
	 * allocation-site accounting of the wired page.
	 */
	vm_page_insert_internal(mem, object, offset, tag, FALSE, TRUE, FALSE, FALSE, NULL);
}
1755 
/*
 *	vm_page_insert_internal:
 *
 *	Insert page "mem" into "object" at "offset" and perform all
 *	associated bookkeeping: the object/offset hash table (optional),
 *	the object's memq, resident/wired page counts, internal/external
 *	counters, purgeable counters, and owner ledgers.
 *
 *	mem:			the page being inserted; must not already be tabled.
 *	object:			target object; must be locked exclusively.
 *	offset:			page-aligned offset within "object".
 *	tag:			wire tag charged if the page is already wired.
 *	queues_lock_held:	caller holds vm_page_queue_lock (asserted below).
 *	insert_in_hash:		if TRUE, also link the page into the
 *				object/offset hash buckets (FALSE when the
 *				caller, e.g. vm_page_replace(), did that itself).
 *	batch_pmap_op:		passed through to PMAP_SET_CACHE_ATTR so the
 *				pmap can defer per-page work for batched inserts.
 *	batch_accounting:	if TRUE, skip the internal/external page count
 *				updates (caller will account for a whole batch).
 *	delayed_ledger_update:	if non-NULL, accumulate non-volatile ledger
 *				bytes here instead of crediting immediately.
 */
void
vm_page_insert_internal(
	vm_page_t               mem,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_tag_t                tag,
	boolean_t               queues_lock_held,
	boolean_t               insert_in_hash,
	boolean_t               batch_pmap_op,
	boolean_t               batch_accounting,
	uint64_t                *delayed_ledger_update)
{
	vm_page_bucket_t        *bucket;
	lck_spin_t              *bucket_lock;
	int                     hash_id;
	task_t                  owner;
	int                     ledger_idx_volatile;
	int                     ledger_idx_nonvolatile;
	int                     ledger_idx_volatile_compressed;
	int                     ledger_idx_nonvolatile_compressed;
	boolean_t               do_footprint;

#if 0
	/*
	 * we may not hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif

	assertf(page_aligned(offset), "0x%llx\n", offset);

	/* a wired page being inserted must carry a real tag to charge */
	assert(!VM_PAGE_WIRED(mem) || mem->vmp_private || mem->vmp_fictitious || (tag != VM_KERN_MEMORY_NONE));

	vm_object_lock_assert_exclusive(object);
	/* the queues lock state must match what the caller claims */
	LCK_MTX_ASSERT(&vm_page_queue_lock,
	    queues_lock_held ? LCK_MTX_ASSERT_OWNED
	    : LCK_MTX_ASSERT_NOTOWNED);

	if (queues_lock_held == FALSE) {
		assert(!VM_PAGE_PAGEABLE(mem));
	}

	if (insert_in_hash == TRUE) {
#if DEBUG || VM_PAGE_BUCKETS_CHECK
		if (mem->vmp_tabled || mem->vmp_object) {
			panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
			    "already in (obj=%p,off=0x%llx)",
			    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
		}
#endif
		if (object->internal && (offset >= object->vo_size)) {
			panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
			    mem, object, offset, object->vo_size);
		}

		assert(vm_page_lookup(object, offset) == VM_PAGE_NULL);

		/*
		 *	Record the object/offset pair in this page
		 */

		mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
		mem->vmp_offset = offset;

#if CONFIG_SECLUDED_MEMORY
		if (object->eligible_for_secluded) {
			vm_page_secluded.eligible_for_secluded++;
		}
#endif /* CONFIG_SECLUDED_MEMORY */

		/*
		 *	Insert it into the object_object/offset hash table
		 *	(push onto the head of the bucket's singly-linked list,
		 *	under the bucket's spin lock).
		 */
		hash_id = vm_page_hash(object, offset);
		bucket = &vm_page_buckets[hash_id];
		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

		mem->vmp_next_m = bucket->page_list;
		bucket->page_list = VM_PAGE_PACK_PTR(mem);
		/* verify the packed pointer round-trips back to "mem" */
		assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));

#if     MACH_PAGE_HASH_STATS
		if (++bucket->cur_count > bucket->hi_count) {
			bucket->hi_count = bucket->cur_count;
		}
#endif /* MACH_PAGE_HASH_STATS */
		mem->vmp_hashed = TRUE;
		lck_spin_unlock(bucket_lock);
	}

	{
		unsigned int    cache_attr;

		/* propagate the object's non-default WIMG bits to the pmap */
		cache_attr = object->wimg_bits & VM_WIMG_MASK;

		if (cache_attr != VM_WIMG_USE_DEFAULT) {
			PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op);
		}
	}
	/*
	 *	Now link into the object's list of backed pages.
	 */
	vm_page_queue_enter(&object->memq, mem, vmp_listq);
	object->memq_hint = mem;
	mem->vmp_tabled = TRUE;

	/*
	 *	Show that the object has one more resident page.
	 */

	object->resident_page_count++;
	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);
		VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
		VM_OBJECT_WIRED_PAGE_ADD(object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	}
	assert(object->resident_page_count >= object->wired_page_count);

#if DEVELOPMENT || DEBUG
	if (object->object_is_shared_cache &&
	    object->pager != NULL &&
	    object->pager->mo_pager_ops == &shared_region_pager_ops) {
		int new, old;
		assert(!object->internal);
		new = OSAddAtomic(+1, &shared_region_pagers_resident_count);
		/* lock-free update of the high-water mark via CAS retry loop */
		do {
			old = shared_region_pagers_resident_peak;
		} while (old < new &&
		    !OSCompareAndSwap(old, new, &shared_region_pagers_resident_peak));
	}
#endif /* DEVELOPMENT || DEBUG */

	if (batch_accounting == FALSE) {
		if (object->internal) {
			OSAddAtomic(1, &vm_page_internal_count);
		} else {
			OSAddAtomic(1, &vm_page_external_count);
		}
	}

	/*
	 * It wouldn't make sense to insert a "reusable" page in
	 * an object (the page would have been marked "reusable" only
	 * at the time of a madvise(MADV_FREE_REUSABLE) if it was already
	 * in the object at that time).
	 * But a page could be inserted in a "all_reusable" object, if
	 * something faults it in (a vm_read() from another task or a
	 * "use-after-free" issue in user space, for example).  It can
	 * also happen if we're relocating a page from that object to
	 * a different physical page during a physically-contiguous
	 * allocation.
	 */
	assert(!mem->vmp_reusable);
	if (object->all_reusable) {
		OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
	}

	/*
	 * Determine the owning task (if any) whose ledgers should be
	 * charged for this page.  Non-purgeable objects with no ledger
	 * tag have no owner to charge.
	 */
	if (object->purgable == VM_PURGABLE_DENY &&
	    !object->vo_ledger_tag) {
		owner = TASK_NULL;
	} else {
		owner = VM_OBJECT_OWNER(object);
		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);
	}
	if (owner &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    object->purgable == VM_PURGABLE_DENY ||
	    VM_PAGE_WIRED(mem))) {
		if (delayed_ledger_update) {
			/* caller will credit the owner's ledger in one batch */
			*delayed_ledger_update += PAGE_SIZE;
		} else {
			/* more non-volatile bytes */
			ledger_credit(owner->ledger,
			    ledger_idx_nonvolatile,
			    PAGE_SIZE);
			if (do_footprint) {
				/* more footprint */
				ledger_credit(owner->ledger,
				    task_ledgers.phys_footprint,
				    PAGE_SIZE);
			}
		}
	} else if (owner &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY)) {
		assert(!VM_PAGE_WIRED(mem));
		/* more volatile bytes */
		ledger_credit(owner->ledger,
		    ledger_idx_volatile,
		    PAGE_SIZE);
	}

	if (object->purgable == VM_PURGABLE_VOLATILE) {
		if (VM_PAGE_WIRED(mem)) {
			OSAddAtomic(+1, &vm_page_purgeable_wired_count);
		} else {
			OSAddAtomic(+1, &vm_page_purgeable_count);
		}
	} else if (object->purgable == VM_PURGABLE_EMPTY &&
	    mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
		/*
		 * This page belongs to a purged VM object but hasn't
		 * been purged (because it was "busy").
		 * It's in the "throttled" queue and hence not
		 * visible to vm_pageout_scan().  Move it to a pageable
		 * queue, so that it can eventually be reclaimed, instead
		 * of lingering in the "empty" object.
		 */
		if (queues_lock_held == FALSE) {
			vm_page_lockspin_queues();
		}
		vm_page_deactivate(mem);
		if (queues_lock_held == FALSE) {
			vm_page_unlock_queues();
		}
	}

#if VM_OBJECT_TRACKING_OP_MODIFIED
	if (vm_object_tracking_btlog &&
	    object->internal &&
	    object->resident_page_count == 0 &&
	    object->pager == NULL &&
	    object->shadow != NULL &&
	    object->shadow->copy == object) {
		/* record the backtrace of the first modification of this copy object */
		btlog_record(vm_object_tracking_btlog, object,
		    VM_OBJECT_TRACKING_OP_MODIFIED,
		    btref_get(__builtin_frame_address(0), 0));
	}
#endif /* VM_OBJECT_TRACKING_OP_MODIFIED */
}
1995 
1996 /*
1997  *	vm_page_replace:
1998  *
1999  *	Exactly like vm_page_insert, except that we first
2000  *	remove any existing page at the given offset in object.
2001  *
2002  *	The object must be locked.
2003  */
/*
 * (See block comment above.)  Hash-table handling is done inline here —
 * under a single bucket-lock hold we unlink any displaced page and link
 * in the new one — so vm_page_insert_internal() is called at the end
 * with insert_in_hash == FALSE.
 */
void
vm_page_replace(
	vm_page_t               mem,
	vm_object_t             object,
	vm_object_offset_t      offset)
{
	vm_page_bucket_t *bucket;
	vm_page_t        found_m = VM_PAGE_NULL;  /* displaced page, if any */
	lck_spin_t      *bucket_lock;
	int             hash_id;

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
	vm_object_lock_assert_exclusive(object);
#if DEBUG || VM_PAGE_BUCKETS_CHECK
	if (mem->vmp_tabled || mem->vmp_object) {
		panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
		    "already in (obj=%p,off=0x%llx)",
		    mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
	}
#endif
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	assert(!VM_PAGE_PAGEABLE(mem));

	/*
	 *	Record the object/offset pair in this page
	 */
	mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
	mem->vmp_offset = offset;

	/*
	 *	Insert it into the object_object/offset hash table,
	 *	replacing any page that might have been there.
	 */

	hash_id = vm_page_hash(object, offset);
	bucket = &vm_page_buckets[hash_id];
	bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

	lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

	if (bucket->page_list) {
		/* walk the bucket chain via a pointer-to-link so unlinking is O(1) */
		vm_page_packed_t *mp = &bucket->page_list;
		vm_page_t m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp));

		do {
			/*
			 * compare packed object pointers
			 */
			if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
				/*
				 * Remove old page from hash list
				 */
				*mp = m->vmp_next_m;
				m->vmp_hashed = FALSE;
				m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);

				found_m = m;
				break;
			}
			mp = &m->vmp_next_m;
		} while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));

		mem->vmp_next_m = bucket->page_list;
	} else {
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
	}
	/*
	 * insert new page at head of hash list
	 */
	bucket->page_list = VM_PAGE_PACK_PTR(mem);
	mem->vmp_hashed = TRUE;

	lck_spin_unlock(bucket_lock);

	if (found_m) {
		/*
		 * there was already a page at the specified
		 * offset for this object... remove it from
		 * the object and free it back to the free list
		 */
		vm_page_free_unlocked(found_m, FALSE);
	}
	/* insert_in_hash == FALSE: the hash linkage was done above */
	vm_page_insert_internal(mem, object, offset, VM_KERN_MEMORY_NONE, FALSE, FALSE, FALSE, FALSE, NULL);
}
2095 
2096 /*
2097  *	vm_page_remove:		[ internal use only ]
2098  *
2099  *	Removes the given mem entry from the object/offset-page
2100  *	table and the object page list.
2101  *
2102  *	The object must be locked.
2103  */
2104 
/*
 * (See block comment above.)  Undoes everything vm_page_insert_internal()
 * did: hash linkage (optional), object memq membership, resident/wired
 * counts, internal/external and reusable counters, purgeable counters,
 * and owner ledger charges.  The page itself is NOT freed.
 *
 * mem:			the page to remove; must be tabled, not cleaning/laundry.
 * remove_from_hash:	if FALSE, the caller has already unlinked the page
 *			from the object/offset hash buckets.
 */
void
vm_page_remove(
	vm_page_t       mem,
	boolean_t       remove_from_hash)
{
	vm_page_bucket_t *bucket;
	vm_page_t       this;
	lck_spin_t      *bucket_lock;
	int             hash_id;
	task_t          owner;
	vm_object_t     m_object;
	int             ledger_idx_volatile;
	int             ledger_idx_nonvolatile;
	int             ledger_idx_volatile_compressed;
	int             ledger_idx_nonvolatile_compressed;
	/*
	 * NOTE(review): declared "int" here but "boolean_t" in
	 * vm_page_insert_internal() — presumably interchangeable on the
	 * platforms this builds for; confirm against the prototype of
	 * vm_object_ledger_tag_ledgers().
	 */
	int             do_footprint;

	m_object = VM_PAGE_OBJECT(mem);

	vm_object_lock_assert_exclusive(m_object);
	assert(mem->vmp_tabled);
	assert(!mem->vmp_cleaning);
	assert(!mem->vmp_laundry);

	if (VM_PAGE_PAGEABLE(mem)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	}
#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(mem);
#endif
	if (remove_from_hash == TRUE) {
		/*
		 *	Remove from the object_object/offset hash table
		 */
		hash_id = vm_page_hash(m_object, mem->vmp_offset);
		bucket = &vm_page_buckets[hash_id];
		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

		if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
			/* optimize for common case */

			bucket->page_list = mem->vmp_next_m;
		} else {
			vm_page_packed_t        *prev;

			for (prev = &this->vmp_next_m;
			    (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
			    prev = &this->vmp_next_m) {
				continue;
			}
			*prev = this->vmp_next_m;
		}
#if     MACH_PAGE_HASH_STATS
		bucket->cur_count--;
#endif /* MACH_PAGE_HASH_STATS */
		mem->vmp_hashed = FALSE;
		/* both branches above end with this == mem: clear the removed page's link */
		this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
		lck_spin_unlock(bucket_lock);
	}
	/*
	 *	Now remove from the object's list of backed pages.
	 */

	vm_page_remove_internal(mem);

	/*
	 *	And show that the object has one fewer resident
	 *	page.
	 */

	assert(m_object->resident_page_count > 0);
	m_object->resident_page_count--;

#if DEVELOPMENT || DEBUG
	if (m_object->object_is_shared_cache &&
	    m_object->pager != NULL &&
	    m_object->pager->mo_pager_ops == &shared_region_pager_ops) {
		assert(!m_object->internal);
		OSAddAtomic(-1, &shared_region_pagers_resident_count);
	}
#endif /* DEVELOPMENT || DEBUG */

	if (m_object->internal) {
#if DEBUG
		assert(vm_page_internal_count);
#endif /* DEBUG */

		OSAddAtomic(-1, &vm_page_internal_count);
	} else {
		assert(vm_page_external_count);
		OSAddAtomic(-1, &vm_page_external_count);

		if (mem->vmp_xpmapped) {
			assert(vm_page_xpmapped_external_count);
			OSAddAtomic(-1, &vm_page_xpmapped_external_count);
		}
	}
	/*
	 * External object that just lost its last resident page and is on
	 * the object cache (live cached_list linkage): take it off the cache.
	 */
	if (!m_object->internal &&
	    m_object->cached_list.next &&
	    m_object->cached_list.prev) {
		if (m_object->resident_page_count == 0) {
			vm_object_cache_remove(m_object);
		}
	}

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);
		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
	}
	assert(m_object->resident_page_count >=
	    m_object->wired_page_count);
	if (mem->vmp_reusable) {
		assert(m_object->reusable_page_count > 0);
		m_object->reusable_page_count--;
		assert(m_object->reusable_page_count <=
		    m_object->resident_page_count);
		mem->vmp_reusable = FALSE;
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	} else if (m_object->all_reusable) {
		OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
		vm_page_stats_reusable.reused_remove++;
	}

	/* mirror of the ledger charging done in vm_page_insert_internal() */
	if (m_object->purgable == VM_PURGABLE_DENY &&
	    !m_object->vo_ledger_tag) {
		owner = TASK_NULL;
	} else {
		owner = VM_OBJECT_OWNER(m_object);
		vm_object_ledger_tag_ledgers(m_object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);
	}
	if (owner &&
	    (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
	    m_object->purgable == VM_PURGABLE_DENY ||
	    VM_PAGE_WIRED(mem))) {
		/* less non-volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_nonvolatile,
		    PAGE_SIZE);
		if (do_footprint) {
			/* less footprint */
			ledger_debit(owner->ledger,
			    task_ledgers.phys_footprint,
			    PAGE_SIZE);
		}
	} else if (owner &&
	    (m_object->purgable == VM_PURGABLE_VOLATILE ||
	    m_object->purgable == VM_PURGABLE_EMPTY)) {
		assert(!VM_PAGE_WIRED(mem));
		/* less volatile bytes */
		ledger_debit(owner->ledger,
		    ledger_idx_volatile,
		    PAGE_SIZE);
	}
	if (m_object->purgable == VM_PURGABLE_VOLATILE) {
		if (VM_PAGE_WIRED(mem)) {
			assert(vm_page_purgeable_wired_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
		} else {
			assert(vm_page_purgeable_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_count);
		}
	}

	if (m_object->set_cache_attr == TRUE) {
		/* restore default cache attributes now the page leaves this object */
		pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
	}

	/* sever the page's association with the object */
	mem->vmp_tabled = FALSE;
	mem->vmp_object = 0;
	mem->vmp_offset = (vm_object_offset_t) -1;
}
2290 
2291 
2292 /*
2293  *	vm_page_lookup:
2294  *
2295  *	Returns the page associated with the object/offset
2296  *	pair specified; if none is found, VM_PAGE_NULL is returned.
2297  *
2298  *	The object must be locked.  No side effects.
2299  */
2300 
/*
 * Objects with at most this many resident pages are searched by walking
 * their memq directly instead of taking the hash-bucket spin lock
 * (see the comment in vm_page_lookup()).
 */
#define VM_PAGE_HASH_LOOKUP_THRESHOLD   10

#if DEBUG_VM_PAGE_LOOKUP

/* Counters instrumenting the various vm_page_lookup() code paths. */
struct {
	uint64_t        vpl_total;          /* all lookups */
	uint64_t        vpl_empty_obj;      /* object had no resident pages */
	uint64_t        vpl_bucket_NULL;    /* hash bucket was empty */
	uint64_t        vpl_hit_hint;       /* memq_hint matched directly */
	uint64_t        vpl_hit_hint_next;  /* page after the hint matched */
	uint64_t        vpl_hit_hint_prev;  /* page before the hint matched */
	uint64_t        vpl_fast;           /* short memq walk taken */
	uint64_t        vpl_slow;           /* hash-bucket walk taken */
	uint64_t        vpl_hit;
	uint64_t        vpl_miss;

	uint64_t        vpl_fast_elapsed;   /* cumulative time in fast path */
	uint64_t        vpl_slow_elapsed;   /* cumulative time in slow path */
} vm_page_lookup_stats __attribute__((aligned(8)));

#endif

/* Cap on memq traversal in kdp_vm_page_lookup() (debugger context). */
#define KDP_VM_PAGE_WALK_MAX    1000
2324 
2325 vm_page_t
kdp_vm_page_lookup(vm_object_t object,vm_object_offset_t offset)2326 kdp_vm_page_lookup(
2327 	vm_object_t             object,
2328 	vm_object_offset_t      offset)
2329 {
2330 	vm_page_t cur_page;
2331 	int num_traversed = 0;
2332 
2333 	if (not_in_kdp) {
2334 		panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
2335 	}
2336 
2337 	vm_page_queue_iterate(&object->memq, cur_page, vmp_listq) {
2338 		if (cur_page->vmp_offset == offset) {
2339 			return cur_page;
2340 		}
2341 		num_traversed++;
2342 
2343 		if (num_traversed >= KDP_VM_PAGE_WALK_MAX) {
2344 			return VM_PAGE_NULL;
2345 		}
2346 	}
2347 
2348 	return VM_PAGE_NULL;
2349 }
2350 
/*
 * (See block comment above.)  Lookup strategy, in order:
 *   1. the object's memq_hint and its immediate neighbors (cheap, no lock);
 *   2. for small objects, a linear memq walk (no bucket lock);
 *   3. otherwise, the object/offset hash bucket under its spin lock.
 * On a hit, memq_hint is updated to the found page.
 */
vm_page_t
vm_page_lookup(
	vm_object_t             object,
	vm_object_offset_t      offset)
{
	vm_page_t       mem;
	vm_page_bucket_t *bucket;
	vm_page_queue_entry_t   qe;
	lck_spin_t      *bucket_lock = NULL;    /* non-NULL iff slow path taken */
	int             hash_id;
#if DEBUG_VM_PAGE_LOOKUP
	uint64_t        start, elapsed;

	OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total);
#endif

#if CONFIG_KERNEL_TBI
	/* strip any tag bits so hashing/comparison sees the canonical offset */
	if (VM_KERNEL_ADDRESS(offset)) {
		offset = VM_KERNEL_STRIP_UPTR(offset);
	}
#endif /* CONFIG_KERNEL_TBI */

	vm_object_lock_assert_held(object);
	assertf(page_aligned(offset), "offset 0x%llx\n", offset);

	if (object->resident_page_count == 0) {
#if DEBUG_VM_PAGE_LOOKUP
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_empty_obj);
#endif
		return VM_PAGE_NULL;
	}

	/*
	 * Check the hint left by the previous lookup/insert, and the pages
	 * immediately after and before it (sequential access pattern).
	 */
	mem = object->memq_hint;

	if (mem != VM_PAGE_NULL) {
		assert(VM_PAGE_OBJECT(mem) == object);

		if (mem->vmp_offset == offset) {
#if DEBUG_VM_PAGE_LOOKUP
			OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
#endif
			return mem;
		}
		qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);

		if (!vm_page_queue_end(&object->memq, qe)) {
			vm_page_t       next_page;

			next_page = (vm_page_t)((uintptr_t)qe);
			assert(VM_PAGE_OBJECT(next_page) == object);

			if (next_page->vmp_offset == offset) {
				object->memq_hint = next_page; /* new hint */
#if DEBUG_VM_PAGE_LOOKUP
				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
#endif
				return next_page;
			}
		}
		qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);

		if (!vm_page_queue_end(&object->memq, qe)) {
			vm_page_t prev_page;

			prev_page = (vm_page_t)((uintptr_t)qe);
			assert(VM_PAGE_OBJECT(prev_page) == object);

			if (prev_page->vmp_offset == offset) {
				object->memq_hint = prev_page; /* new hint */
#if DEBUG_VM_PAGE_LOOKUP
				OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
#endif
				return prev_page;
			}
		}
	}
	/*
	 * Search the hash table for this object/offset pair
	 */
	hash_id = vm_page_hash(object, offset);
	bucket = &vm_page_buckets[hash_id];

	/*
	 * since we hold the object lock, we are guaranteed that no
	 * new pages can be inserted into this object... this in turn
	 * guarantees that the page we're looking for can't exist
	 * if the bucket it hashes to is currently NULL even when looked
	 * at outside the scope of the hash bucket lock... this is a
	 * really cheap optimization to avoid taking the lock
	 */
	if (!bucket->page_list) {
#if DEBUG_VM_PAGE_LOOKUP
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_bucket_NULL);
#endif
		return VM_PAGE_NULL;
	}

#if DEBUG_VM_PAGE_LOOKUP
	start = mach_absolute_time();
#endif
	if (object->resident_page_count <= VM_PAGE_HASH_LOOKUP_THRESHOLD) {
		/*
		 * on average, it's roughly 3 times faster to run a short memq list
		 * than to take the spin lock and go through the hash list
		 */
		mem = (vm_page_t)vm_page_queue_first(&object->memq);

		while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
			if (mem->vmp_offset == offset) {
				break;
			}

			mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
		}
		if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
			mem = NULL;
		}
	} else {
		vm_page_object_t        packed_object;

		/* pack once so the inner loop compares packed values directly */
		packed_object = VM_PAGE_PACK_OBJECT(object);

		bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];

		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);

		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
		    mem != VM_PAGE_NULL;
		    mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
#if 0
			/*
			 * we don't hold the page queue lock
			 * so this check isn't safe to make
			 */
			VM_PAGE_CHECK(mem);
#endif
			if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) {
				break;
			}
		}
		lck_spin_unlock(bucket_lock);
	}

#if DEBUG_VM_PAGE_LOOKUP
	elapsed = mach_absolute_time() - start;

	if (bucket_lock) {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_slow);
		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_slow_elapsed);
	} else {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_fast);
		OSAddAtomic64(elapsed, &vm_page_lookup_stats.vpl_fast_elapsed);
	}
	if (mem != VM_PAGE_NULL) {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit);
	} else {
		OSAddAtomic64(1, &vm_page_lookup_stats.vpl_miss);
	}
#endif
	if (mem != VM_PAGE_NULL) {
		assert(VM_PAGE_OBJECT(mem) == object);

		object->memq_hint = mem;
	}
	return mem;
}
2517 
2518 
2519 /*
2520  *	vm_page_rename:
2521  *
2522  *	Move the given memory entry from its
2523  *	current object to the specified target object/offset.
2524  *
2525  *	The object must be locked.
2526  */
/*
 * (See block comment above.)  Implemented as a remove + re-insert under
 * the page queues lock, carrying the old object's wire tag over and
 * adjusting the pageable internal/external counters when the page moves
 * between internal and external objects.  Both objects must be locked.
 */
void
vm_page_rename(
	vm_page_t               mem,
	vm_object_t             new_object,
	vm_object_offset_t      new_offset)
{
	boolean_t       internal_to_external, external_to_internal;
	vm_tag_t        tag;
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(mem);

	assert(m_object != new_object);
	assert(m_object);

	/*
	 *	Changes to mem->vmp_object require the page lock because
	 *	the pageout daemon uses that lock to get the object.
	 */
	vm_page_lockspin_queues();

	internal_to_external = FALSE;
	external_to_internal = FALSE;

	if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
		/*
		 * it's much easier to get the vm_page_pageable_xxx accounting correct
		 * if we first move the page to the active queue... it's going to end
		 * up there anyway, and we don't do vm_page_rename's frequently enough
		 * for this to matter.
		 */
		vm_page_queues_remove(mem, FALSE);
		vm_page_activate(mem);
	}
	if (VM_PAGE_PAGEABLE(mem)) {
		if (m_object->internal && !new_object->internal) {
			internal_to_external = TRUE;
		}
		if (!m_object->internal && new_object->internal) {
			external_to_internal = TRUE;
		}
	}

	/* preserve the old object's wire tag across the move */
	tag = m_object->wire_tag;
	vm_page_remove(mem, TRUE);
	/* queues_lock_held == TRUE: we hold it across the whole move */
	vm_page_insert_internal(mem, new_object, new_offset, tag, TRUE, TRUE, FALSE, FALSE, NULL);

	if (internal_to_external) {
		vm_page_pageable_internal_count--;
		vm_page_pageable_external_count++;
	} else if (external_to_internal) {
		vm_page_pageable_external_count--;
		vm_page_pageable_internal_count++;
	}

	vm_page_unlock_queues();
}
2584 
2585 /*
2586  *	vm_page_init:
2587  *
2588  *	Initialize the fields in a new page.
2589  *	This takes a structure with random values and initializes it
2590  *	so that it can be given to vm_page_release or vm_page_insert.
2591  */
2592 void
vm_page_init(vm_page_t mem,ppnum_t phys_page,boolean_t lopage)2593 vm_page_init(
2594 	vm_page_t mem,
2595 	ppnum_t   phys_page,
2596 	boolean_t lopage)
2597 {
2598 	uint_t    i;
2599 	uintptr_t *p;
2600 
2601 	assert(phys_page);
2602 
2603 #if DEBUG
2604 	if ((phys_page != vm_page_fictitious_addr) && (phys_page != vm_page_guard_addr)) {
2605 		if (!(pmap_valid_page(phys_page))) {
2606 			panic("vm_page_init: non-DRAM phys_page 0x%x", phys_page);
2607 		}
2608 	}
2609 #endif /* DEBUG */
2610 
2611 	/*
2612 	 * Initialize the fields of the vm_page. If adding any new fields to vm_page,
2613 	 * try to use initial values which match 0. This minimizes the number of writes
2614 	 * needed for boot-time initialization.
2615 	 *
2616 	 * Kernel bzero() isn't an inline yet, so do it by hand for performance.
2617 	 */
2618 	assert(VM_PAGE_NOT_ON_Q == 0);
2619 	assert(sizeof(*mem) % sizeof(uintptr_t) == 0);
2620 	for (p = (uintptr_t *)(void *)mem, i = sizeof(*mem) / sizeof(uintptr_t); i != 0; --i) {
2621 		*p++ = 0;
2622 	}
2623 	mem->vmp_offset = (vm_object_offset_t)-1;
2624 	mem->vmp_busy = TRUE;
2625 	mem->vmp_lopage = lopage;
2626 
2627 	VM_PAGE_SET_PHYS_PAGE(mem, phys_page);
2628 #if 0
2629 	/*
2630 	 * we're leaving this turned off for now... currently pages
2631 	 * come off the free list and are either immediately dirtied/referenced
2632 	 * due to zero-fill or COW faults, or are used to read or write files...
2633 	 * in the file I/O case, the UPL mechanism takes care of clearing
2634 	 * the state of the HW ref/mod bits in a somewhat fragile way.
2635 	 * Since we may change the way this works in the future (to toughen it up),
2636 	 * I'm leaving this as a reminder of where these bits could get cleared
2637 	 */
2638 
2639 	/*
2640 	 * make sure both the h/w referenced and modified bits are
2641 	 * clear at this point... we are especially dependent on
2642 	 * not finding a 'stale' h/w modified in a number of spots
2643 	 * once this page goes back into use
2644 	 */
2645 	pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
2646 #endif
2647 }
2648 
2649 /*
2650  *	vm_page_grab_fictitious:
2651  *
2652  *	Remove a fictitious page from the free list.
2653  *	Returns VM_PAGE_NULL if there are no free pages.
2654  */
2655 
2656 static vm_page_t
vm_page_grab_fictitious_common(ppnum_t phys_addr,boolean_t canwait)2657 vm_page_grab_fictitious_common(ppnum_t phys_addr, boolean_t canwait)
2658 {
2659 	vm_page_t m;
2660 
2661 	m = zalloc_flags(vm_page_zone, canwait ? Z_WAITOK : Z_NOWAIT);
2662 	if (m) {
2663 		vm_page_init(m, phys_addr, FALSE);
2664 		m->vmp_fictitious = TRUE;
2665 	}
2666 	return m;
2667 }
2668 
2669 vm_page_t
vm_page_grab_fictitious(boolean_t canwait)2670 vm_page_grab_fictitious(boolean_t canwait)
2671 {
2672 	return vm_page_grab_fictitious_common(vm_page_fictitious_addr, canwait);
2673 }
2674 
/*
 * Count of outstanding guard pages: incremented by vm_page_grab_guard(),
 * decremented when vm_page_release_fictitious() frees a guard page.
 */
int vm_guard_count;
2676 
2677 
2678 vm_page_t
vm_page_grab_guard(boolean_t canwait)2679 vm_page_grab_guard(boolean_t canwait)
2680 {
2681 	vm_page_t page;
2682 	page = vm_page_grab_fictitious_common(vm_page_guard_addr, canwait);
2683 	if (page) {
2684 		OSAddAtomic(1, &vm_guard_count);
2685 	}
2686 	return page;
2687 }
2688 
2689 
2690 /*
2691  *	vm_page_release_fictitious:
2692  *
2693  *	Release a fictitious page to the zone pool
2694  */
2695 void
vm_page_release_fictitious(vm_page_t m)2696 vm_page_release_fictitious(
2697 	vm_page_t m)
2698 {
2699 	assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || (m->vmp_q_state == VM_PAGE_IS_WIRED));
2700 	assert(m->vmp_fictitious);
2701 	assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr ||
2702 	    VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr);
2703 	assert(!m->vmp_realtime);
2704 
2705 	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2706 		OSAddAtomic(-1, &vm_guard_count);
2707 	}
2708 
2709 	zfree(vm_page_zone, m);
2710 }
2711 
2712 /*
2713  *	vm_pool_low():
2714  *
2715  *	Return true if it is not likely that a non-vm_privileged thread
2716  *	can get memory without blocking.  Advisory only, since the
2717  *	situation may change under us.
2718  */
2719 bool
vm_pool_low(void)2720 vm_pool_low(void)
2721 {
2722 	/* No locking, at worst we will fib. */
2723 	return vm_page_free_count <= vm_page_free_reserved;
2724 }
2725 
/* Current darkwake state; updated only by vm_update_darkwake_mode(). */
boolean_t vm_darkwake_mode = FALSE;
2727 
2728 /*
2729  * vm_update_darkwake_mode():
2730  *
2731  * Tells the VM that the system is in / out of darkwake.
2732  *
2733  * Today, the VM only lowers/raises the background queue target
2734  * so as to favor consuming more/less background pages when
 * darkwake is ON/OFF.
2736  *
2737  * We might need to do more things in the future.
2738  */
2739 
/*
 * (See block comment above.)  On Apple Silicon macOS this is a no-op;
 * elsewhere it snapshots/zeroes the background queue target under the
 * page queues lock so darkwake consumes background pages first.
 */
void
vm_update_darkwake_mode(boolean_t darkwake_mode)
{
#if XNU_TARGET_OS_OSX && defined(__arm64__)
#pragma unused(darkwake_mode)
	assert(vm_darkwake_mode == FALSE);
	/*
	 * Darkwake mode isn't supported for AS macOS.
	 */
	return;
#else /* XNU_TARGET_OS_OSX && __arm64__ */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);

	vm_page_lockspin_queues();

	if (vm_darkwake_mode == darkwake_mode) {
		/*
		 * No change.
		 */
		vm_page_unlock_queues();
		return;
	}

	vm_darkwake_mode = darkwake_mode;

	if (vm_darkwake_mode == TRUE) {
		/* save background target to restore later */
		vm_page_background_target_snapshot = vm_page_background_target;

		/* target is set to 0...no protection for background pages */
		vm_page_background_target = 0;
	} else if (vm_darkwake_mode == FALSE) {
		if (vm_page_background_target_snapshot) {
			vm_page_background_target = vm_page_background_target_snapshot;
		}
	}
	vm_page_unlock_queues();
#endif
}
2779 
2780 void
vm_page_update_special_state(vm_page_t mem)2781 vm_page_update_special_state(vm_page_t mem)
2782 {
2783 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR || mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
2784 		return;
2785 	}
2786 
2787 	int mode = mem->vmp_on_specialq;
2788 
2789 	switch (mode) {
2790 	case VM_PAGE_SPECIAL_Q_BG:
2791 	{
2792 		task_t  my_task = current_task_early();
2793 
2794 		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
2795 			return;
2796 		}
2797 
2798 		if (my_task) {
2799 			if (task_get_darkwake_mode(my_task)) {
2800 				return;
2801 			}
2802 		}
2803 
2804 		if (my_task) {
2805 			if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) {
2806 				return;
2807 			}
2808 		}
2809 		vm_page_lockspin_queues();
2810 
2811 		vm_page_background_promoted_count++;
2812 
2813 		vm_page_remove_from_specialq(mem);
2814 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
2815 
2816 		vm_page_unlock_queues();
2817 		break;
2818 	}
2819 
2820 	case VM_PAGE_SPECIAL_Q_DONATE:
2821 	{
2822 		task_t  my_task = current_task_early();
2823 
2824 		if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
2825 			return;
2826 		}
2827 
2828 		if (my_task->donates_own_pages == false) {
2829 			vm_page_lockspin_queues();
2830 
2831 			vm_page_remove_from_specialq(mem);
2832 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
2833 
2834 			vm_page_unlock_queues();
2835 		}
2836 		break;
2837 	}
2838 
2839 	default:
2840 	{
2841 		assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
2842 		    VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
2843 		break;
2844 	}
2845 	}
2846 }
2847 
2848 
2849 void
vm_page_assign_special_state(vm_page_t mem,int mode)2850 vm_page_assign_special_state(vm_page_t mem, int mode)
2851 {
2852 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2853 		return;
2854 	}
2855 
2856 	switch (mode) {
2857 	case VM_PAGE_SPECIAL_Q_BG:
2858 	{
2859 		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
2860 			return;
2861 		}
2862 
2863 		task_t  my_task = current_task_early();
2864 
2865 		if (my_task) {
2866 			if (task_get_darkwake_mode(my_task)) {
2867 				mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;
2868 				return;
2869 			}
2870 		}
2871 
2872 		if (my_task) {
2873 			mem->vmp_on_specialq = (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG) ? VM_PAGE_SPECIAL_Q_BG : VM_PAGE_SPECIAL_Q_EMPTY);
2874 		}
2875 		break;
2876 	}
2877 
2878 	case VM_PAGE_SPECIAL_Q_DONATE:
2879 	{
2880 		if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
2881 			return;
2882 		}
2883 		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
2884 		break;
2885 	}
2886 
2887 	default:
2888 		break;
2889 	}
2890 }
2891 
2892 
/*
 *	vm_page_remove_from_specialq:
 *
 *	Unlink "mem" from whichever special queue (background or donate)
 *	its vmp_on_specialq tag says it is on, updating the associated
 *	counters.  A page whose specialq links are already NULL is left
 *	alone.  Note: the vmp_on_specialq tag itself is NOT cleared here;
 *	callers do that.
 *
 *	Caller must hold the page queue lock.
 */
void
vm_page_remove_from_specialq(
	vm_page_t       mem)
{
	vm_object_t     m_object;
	unsigned short  mode;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	mode = mem->vmp_on_specialq;

	switch (mode) {
	case VM_PAGE_SPECIAL_Q_BG:
	{
		/* only unlink if the page is actually on the queue */
		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
			vm_page_queue_remove(&vm_page_queue_background, mem, vmp_specialq);

			mem->vmp_specialq.next = 0;
			mem->vmp_specialq.prev = 0;

			vm_page_background_count--;

			m_object = VM_PAGE_OBJECT(mem);

			/* keep the internal/external breakdown in sync */
			if (m_object->internal) {
				vm_page_background_internal_count--;
			} else {
				vm_page_background_external_count--;
			}
		}
		break;
	}

	case VM_PAGE_SPECIAL_Q_DONATE:
	{
		if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
			vm_page_queue_remove((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
			mem->vmp_specialq.next = 0;
			mem->vmp_specialq.prev = 0;
			vm_page_donate_count--;
			/*
			 * Dropping below the (low) target makes the donate queue
			 * "unripe" again and restores the high refill target.
			 */
			if (vm_page_donate_queue_ripe && (vm_page_donate_count < vm_page_donate_target)) {
				assert(vm_page_donate_target == vm_page_donate_target_low);
				vm_page_donate_target = vm_page_donate_target_high;
				vm_page_donate_queue_ripe = false;
			}
		}

		break;
	}

	default:
	{
		/* untagged pages must not be linked into any special queue */
		assert(VM_PAGE_UNPACK_PTR(mem->vmp_specialq.next) == (uintptr_t)NULL &&
		    VM_PAGE_UNPACK_PTR(mem->vmp_specialq.prev) == (uintptr_t)NULL);
		break;
	}
	}
}
2951 
2952 
/*
 *	vm_page_add_to_specialq:
 *
 *	Link "mem" onto the special queue its vmp_on_specialq tag selects
 *	(background or donate), at the head if "first" is TRUE, else at
 *	the tail, updating the associated counters.  A page already on a
 *	queue (non-NULL links) is left alone.
 *
 *	Caller must hold the page queue lock.
 */
void
vm_page_add_to_specialq(
	vm_page_t       mem,
	boolean_t       first)
{
	vm_object_t     m_object;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	/* already enqueued: nothing to do */
	if (mem->vmp_specialq.next && mem->vmp_specialq.prev) {
		return;
	}

	int mode = mem->vmp_on_specialq;

	switch (mode) {
	case VM_PAGE_SPECIAL_Q_BG:
	{
		if (vm_page_background_mode == VM_PAGE_BG_DISABLED) {
			return;
		}

		m_object = VM_PAGE_OBJECT(mem);

		/* optionally keep file-backed (external) pages off the BG queue */
		if (vm_page_background_exclude_external && !m_object->internal) {
			return;
		}

		if (first == TRUE) {
			vm_page_queue_enter_first(&vm_page_queue_background, mem, vmp_specialq);
		} else {
			vm_page_queue_enter(&vm_page_queue_background, mem, vmp_specialq);
		}
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_BG;

		vm_page_background_count++;

		/* keep the internal/external breakdown in sync */
		if (m_object->internal) {
			vm_page_background_internal_count++;
		} else {
			vm_page_background_external_count++;
		}
		break;
	}

	case VM_PAGE_SPECIAL_Q_DONATE:
	{
		if (first == TRUE) {
			vm_page_queue_enter_first((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
		} else {
			vm_page_queue_enter((vm_page_queue_head_t*)&vm_page_queue_donate, mem, vmp_specialq);
		}
		vm_page_donate_count++;
		/*
		 * Crossing above the (high) target makes the donate queue
		 * "ripe" for consumption and switches to the low target.
		 */
		if (!vm_page_donate_queue_ripe && (vm_page_donate_count > vm_page_donate_target)) {
			assert(vm_page_donate_target == vm_page_donate_target_high);
			vm_page_donate_target = vm_page_donate_target_low;
			vm_page_donate_queue_ripe = true;
		}
		mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
		break;
	}

	default:
		break;
	}
}
3019 
3020 /*
3021  * This can be switched to FALSE to help debug drivers
3022  * that are having problems with memory > 4G.
3023  */
3024 boolean_t       vm_himemory_mode = TRUE;
3025 
3026 /*
3027  * this interface exists to support hardware controllers
3028  * incapable of generating DMAs with more than 32 bits
3029  * of address on platforms with physical memory > 4G...
3030  */
3031 unsigned int    vm_lopages_allocated_q = 0;
3032 unsigned int    vm_lopages_allocated_cpm_success = 0;
3033 unsigned int    vm_lopages_allocated_cpm_failed = 0;
3034 vm_page_queue_head_t    vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED;
3035 
/*
 *	vm_page_grablo:
 *
 *	Allocate a page from the low-address free queue, for hardware
 *	that can only DMA to 32-bit physical addresses.  Falls back to
 *	the regular allocator when lopages aren't needed, and to
 *	cpm_allocate() when the lopage free queue is empty.
 *	Returns VM_PAGE_NULL if no low page can be obtained.
 */
vm_page_t
vm_page_grablo(void)
{
	vm_page_t       mem;

	if (vm_lopage_needed == FALSE) {
		/* no 32-bit-DMA constraint on this system: normal grab */
		return vm_page_grab();
	}

	vm_free_page_lock_spin();

	if (!vm_page_queue_empty(&vm_lopage_queue_free)) {
		vm_page_queue_remove_first(&vm_lopage_queue_free, mem, vmp_pageq);
		assert(vm_lopage_free_count);
		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

		vm_lopage_free_count--;
		vm_lopages_allocated_q++;

		/* running low: ask vm_page_release() to start refilling */
		if (vm_lopage_free_count < vm_lopage_lowater) {
			vm_lopage_refill = TRUE;
		}

		vm_free_page_unlock();

		if (current_task()->donates_own_pages) {
			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
		} else {
			vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
		}
	} else {
		vm_free_page_unlock();

		/* lopage queue empty: try a contiguous allocation below 4G */
		if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) {
			vm_free_page_lock_spin();
			vm_lopages_allocated_cpm_failed++;
			vm_free_page_unlock();

			return VM_PAGE_NULL;
		}
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);

		mem->vmp_busy = TRUE;

		vm_page_lockspin_queues();

		/* undo the gobble/wire accounting cpm_allocate() applied */
		mem->vmp_gobbled = FALSE;
		vm_page_gobble_count--;
		vm_page_wire_count--;

		vm_lopages_allocated_cpm_success++;
		vm_page_unlock_queues();
	}
	assert(mem->vmp_busy);
	assert(!mem->vmp_pmapped);
	assert(!mem->vmp_wpmapped);
	assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));

	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

	counter_inc(&vm_page_grab_count);
	VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0);

	return mem;
}
3102 
3103 /*
3104  *	vm_page_grab:
3105  *
3106  *	first try to grab a page from the per-cpu free list...
3107  *	this must be done while pre-emption is disabled... if
3108  *      a page is available, we're done...
3109  *	if no page is available, grab the vm_page_queue_free_lock
3110  *	and see if current number of free pages would allow us
3111  *      to grab at least 1... if not, return VM_PAGE_NULL as before...
3112  *	if there are pages available, disable preemption and
3113  *      recheck the state of the per-cpu free list... we could
3114  *	have been preempted and moved to a different cpu, or
3115  *      some other thread could have re-filled it... if still
3116  *	empty, figure out how many pages we can steal from the
3117  *	global free queue and move to the per-cpu queue...
3118  *	return 1 of these pages when done... only wakeup the
3119  *      pageout_scan thread if we moved pages from the global
3120  *	list... no need for the wakeup if we've satisfied the
3121  *	request from the per-cpu queue.
3122  */
3123 
3124 #if CONFIG_SECLUDED_MEMORY
3125 vm_page_t vm_page_grab_secluded(void);
3126 #endif /* CONFIG_SECLUDED_MEMORY */
3127 
3128 static inline void
3129 vm_page_grab_diags(void);
3130 
/*
 *	vm_page_grab:
 *
 *	Allocate a page with no special options; see vm_page_grab_options().
 */
vm_page_t
vm_page_grab(void)
{
	return vm_page_grab_options(VM_PAGE_GRAB_OPTIONS_NONE);
}
3136 
3137 #if HIBERNATION
3138 boolean_t       hibernate_rebuild_needed = FALSE;
3139 #endif /* HIBERNATION */
3140 
/*
 *	vm_page_grab_options:
 *
 *	Allocate a page, preferring the per-CPU free list (lock-free,
 *	preemption disabled).  On a miss, refill the per-CPU list from
 *	the global color queues under the free-page lock and retry.
 *	Only TH_OPT_VMPRIV threads may dip below vm_page_free_reserved;
 *	others may fall back to the secluded queue (if configured) or
 *	get VM_PAGE_NULL.  See the block comment above for the full
 *	refill/wakeup protocol.
 */
vm_page_t
vm_page_grab_options(
	int grab_options)
{
	vm_page_t       mem;

restart:
	disable_preemption();

	/* fast path: pop a page off this CPU's free list */
	if ((mem = *PERCPU_GET(free_pages))) {
		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);

#if HIBERNATION
		if (hibernate_rebuild_needed) {
			panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__);
		}
#endif /* HIBERNATION */

		vm_page_grab_diags();

		vm_offset_t pcpu_base = current_percpu_base();
		counter_inc_preemption_disabled(&vm_page_grab_count);
		*PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem->vmp_snext;
		VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);

		VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		enable_preemption();

		/* sanity: page must be truly free and unmapped */
		assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
		assert(mem->vmp_tabled == FALSE);
		assert(mem->vmp_object == 0);
		assert(!mem->vmp_laundry);
		ASSERT_PMAP_FREE(mem);
		assert(mem->vmp_busy);
		assert(!mem->vmp_pmapped);
		assert(!mem->vmp_wpmapped);
		assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
		assert(!mem->vmp_realtime);

		/* tag the page for the donate or background special queue */
		task_t  cur_task = current_task_early();
		if (cur_task && cur_task != kernel_task) {
			if (cur_task->donates_own_pages) {
				vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
			} else {
				vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
			}
		}
		return mem;
	}
	enable_preemption();


	/*
	 *	Optionally produce warnings if the wire or gobble
	 *	counts exceed some threshold.
	 */
#if VM_PAGE_WIRE_COUNT_WARNING
	if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) {
		printf("mk: vm_page_grab(): high wired page count of %d\n",
		    vm_page_wire_count);
	}
#endif
#if VM_PAGE_GOBBLE_COUNT_WARNING
	if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) {
		printf("mk: vm_page_grab(): high gobbled page count of %d\n",
		    vm_page_gobble_count);
	}
#endif

	/*
	 * If free count is low and we have delayed pages from early boot,
	 * get one of those instead.
	 */
	if (__improbable(vm_delayed_count > 0 &&
	    vm_page_free_count <= vm_page_free_target &&
	    (mem = vm_get_delayed_page(grab_options)) != NULL)) {
		assert(!mem->vmp_realtime);
		return mem;
	}

	vm_free_page_lock_spin();

	/*
	 *	Only let privileged threads (involved in pageout)
	 *	dip into the reserved pool.
	 */
	if ((vm_page_free_count < vm_page_free_reserved) &&
	    !(current_thread()->options & TH_OPT_VMPRIV)) {
		/* no page for us in the free queue... */
		vm_free_page_unlock();
		mem = VM_PAGE_NULL;

#if CONFIG_SECLUDED_MEMORY
		/* ... but can we try and grab from the secluded queue? */
		if (vm_page_secluded_count > 0 &&
		    ((grab_options & VM_PAGE_GRAB_SECLUDED) ||
		    task_can_use_secluded_mem(current_task(), TRUE))) {
			mem = vm_page_grab_secluded();
			if (grab_options & VM_PAGE_GRAB_SECLUDED) {
				vm_page_secluded.grab_for_iokit++;
				if (mem) {
					vm_page_secluded.grab_for_iokit_success++;
				}
			}
			if (mem) {
				VM_CHECK_MEMORYSTATUS;

				vm_page_grab_diags();
				counter_inc(&vm_page_grab_count);
				VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);

				assert(!mem->vmp_realtime);
				return mem;
			}
		}
#else /* CONFIG_SECLUDED_MEMORY */
		(void) grab_options;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		vm_page_t        head;
		vm_page_t        tail;
		unsigned int     pages_to_steal;
		unsigned int     color;
		unsigned int clump_end, sub_count;

		while (vm_page_free_count == 0) {
			vm_free_page_unlock();
			/*
			 * must be a privileged thread to be
			 * in this state since a non-privileged
			 * thread would have bailed if we were
			 * under the vm_page_free_reserved mark
			 */
			VM_PAGE_WAIT();
			vm_free_page_lock_spin();
		}

		/*
		 * Need to repopulate the per-CPU free list from the global free list.
		 * Note we don't do any processing of pending retirement pages here.
		 * That'll happen in the code above when the page comes off the per-CPU list.
		 */
		disable_preemption();

		/*
		 * If we got preempted the cache might now have pages.
		 */
		if ((mem = *PERCPU_GET(free_pages))) {
			vm_free_page_unlock();
			enable_preemption();
			goto restart;
		}

		/* steal up to the magazine limit, never below the reserve */
		if (vm_page_free_count <= vm_page_free_reserved) {
			pages_to_steal = 1;
		} else {
			if (vm_free_magazine_refill_limit <= (vm_page_free_count - vm_page_free_reserved)) {
				pages_to_steal = vm_free_magazine_refill_limit;
			} else {
				pages_to_steal = (vm_page_free_count - vm_page_free_reserved);
			}
		}
		color = *PERCPU_GET(start_color);
		head = tail = NULL;

		vm_page_free_count -= pages_to_steal;
		clump_end = sub_count = 0;

		/* drain pages round-robin across the color queues */
		while (pages_to_steal--) {
			while (vm_page_queue_empty(&vm_page_queue_free[color].qhead)) {
				color = (color + 1) & vm_color_mask;
			}
#if defined(__x86_64__)
			vm_page_queue_remove_first_with_clump(&vm_page_queue_free[color].qhead,
			    mem, clump_end);
#else
			vm_page_queue_remove_first(&vm_page_queue_free[color].qhead,
			    mem, vmp_pageq);
#endif

			assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);

			VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

#if defined(__arm64__)
			color = (color + 1) & vm_color_mask;
#else

#if DEVELOPMENT || DEBUG

			sub_count++;
			if (clump_end) {
				vm_clump_update_stats(sub_count);
				sub_count = 0;
				color = (color + 1) & vm_color_mask;
			}
#else
			if (clump_end) {
				color = (color + 1) & vm_color_mask;
			}

#endif /* if DEVELOPMENT || DEBUG */

#endif  /* if defined(__arm64__) */

			/* append to the local singly-linked chain */
			if (head == NULL) {
				head = mem;
			} else {
				tail->vmp_snext = mem;
			}
			tail = mem;

			assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
			assert(mem->vmp_tabled == FALSE);
			assert(mem->vmp_object == 0);
			assert(!mem->vmp_laundry);

			mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;

			ASSERT_PMAP_FREE(mem);
			assert(mem->vmp_busy);
			assert(!mem->vmp_pmapped);
			assert(!mem->vmp_wpmapped);
			assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
			assert(!mem->vmp_realtime);
		}
#if defined (__x86_64__) && (DEVELOPMENT || DEBUG)
		vm_clump_update_stats(sub_count);
#endif

#if HIBERNATION
		if (hibernate_rebuild_needed) {
			panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__);
		}
#endif /* HIBERNATION */
		/* publish the refilled chain to this CPU and retry the fast path */
		vm_offset_t pcpu_base = current_percpu_base();
		*PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = head;
		*PERCPU_GET_WITH_BASE(pcpu_base, start_color) = color;

		vm_free_page_unlock();
		enable_preemption();
		goto restart;
	}

	/*
	 *	Decide if we should poke the pageout daemon.
	 *	We do this if the free count is less than the low
	 *	water mark. VM Pageout Scan will keep running till
	 *	the free_count > free_target (& hence above free_min).
	 *	This wakeup is to catch the possibility of the counts
	 *	dropping between VM Pageout Scan parking and this check.
	 *
	 *	We don't have the counts locked ... if they change a little,
	 *	it doesn't really matter.
	 */
	if (vm_page_free_count < vm_page_free_min) {
		vm_free_page_lock();
		if (vm_pageout_running == FALSE) {
			vm_free_page_unlock();
			thread_wakeup((event_t) &vm_page_free_wanted);
		} else {
			vm_free_page_unlock();
		}
	}

	VM_CHECK_MEMORYSTATUS;

	if (mem) {
		assert(!mem->vmp_realtime);
//		dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 4);	/* (TEST/DEBUG) */

		/* tag the page for the donate or background special queue */
		task_t  cur_task = current_task_early();
		if (cur_task && cur_task != kernel_task) {
			if (cur_task->donates_own_pages) {
				vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE);
			} else {
				vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG);
			}
		}
	}
	return mem;
}
3424 
3425 #if CONFIG_SECLUDED_MEMORY
/*
 *	vm_page_grab_secluded:
 *
 *	Try to steal the first page from the secluded queue.  A page
 *	with no object is returned immediately; a page still owned by
 *	an (external) object is reclaimed in place — disconnected from
 *	pmap and freed from its object — unless it is busy/cleaning/
 *	laundry/dirty/precious/realtime, in which case it is reactivated
 *	and VM_PAGE_NULL is returned.
 *
 *	Called with no locks held; takes the page queue lock and the
 *	page's object lock internally.
 */
vm_page_t
vm_page_grab_secluded(void)
{
	vm_page_t       mem;
	vm_object_t     object;
	int             refmod_state;

	if (vm_page_secluded_count == 0) {
		/* no secluded pages to grab... */
		return VM_PAGE_NULL;
	}

	/* secluded queue is protected by the VM page queue lock */
	vm_page_lock_queues();

	/* re-check under the lock: the count may have dropped */
	if (vm_page_secluded_count == 0) {
		/* no secluded pages to grab... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}

#if 00
	/* can we grab from the secluded queue? */
	if (vm_page_secluded_count > vm_page_secluded_target ||
	    (vm_page_secluded_count > 0 &&
	    task_can_use_secluded_mem(current_task(), TRUE))) {
		/* OK */
	} else {
		/* can't grab from secluded queue... */
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
#endif

	/* we can grab a page from secluded queue! */
	assert((vm_page_secluded_count_free +
	    vm_page_secluded_count_inuse) ==
	    vm_page_secluded_count);
	if (current_task()->task_can_use_secluded_mem) {
		assert(num_tasks_can_use_secluded_mem > 0);
	}
	assert(!vm_page_queue_empty(&vm_page_queue_secluded));
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
	assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
	vm_page_queues_remove(mem, TRUE);

	object = VM_PAGE_OBJECT(mem);

	assert(!mem->vmp_fictitious);
	assert(!VM_PAGE_WIRED(mem));
	if (object == VM_OBJECT_NULL) {
		/* free for grab! */
		vm_page_unlock_queues();
		vm_page_secluded.grab_success_free++;

		assert(mem->vmp_busy);
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
		assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
		assert(mem->vmp_pageq.next == 0);
		assert(mem->vmp_pageq.prev == 0);
		assert(mem->vmp_listq.next == 0);
		assert(mem->vmp_listq.prev == 0);
		assert(mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
		assert(mem->vmp_specialq.next == 0);
		assert(mem->vmp_specialq.prev == 0);
		return mem;
	}

	/* secluded pages are never backed by internal objects */
	assert(!object->internal);
//	vm_page_pageable_external_count--;

	if (!vm_object_lock_try(object)) {
//		printf("SECLUDED: page %p: object %p locked\n", mem, object);
		vm_page_secluded.grab_failure_locked++;
reactivate_secluded_page:
		/* give the page back to the active queue instead of stealing it */
		vm_page_activate(mem);
		vm_page_unlock_queues();
		return VM_PAGE_NULL;
	}
	if (mem->vmp_busy ||
	    mem->vmp_cleaning ||
	    mem->vmp_laundry) {
		/* can't steal page in this state... */
		vm_object_unlock(object);
		vm_page_secluded.grab_failure_state++;
		goto reactivate_secluded_page;
	}
	if (mem->vmp_realtime) {
		/* don't steal pages used by realtime threads... */
		vm_object_unlock(object);
		vm_page_secluded.grab_failure_realtime++;
		goto reactivate_secluded_page;
	}

	mem->vmp_busy = TRUE;
	/* pull all pmap mappings and harvest ref/mod state */
	refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
	if (refmod_state & VM_MEM_REFERENCED) {
		mem->vmp_reference = TRUE;
	}
	if (refmod_state & VM_MEM_MODIFIED) {
		SET_PAGE_DIRTY(mem, FALSE);
	}
	if (mem->vmp_dirty || mem->vmp_precious) {
		/* can't grab a dirty page; re-activate */
//		printf("SECLUDED: dirty page %p\n", mem);
		PAGE_WAKEUP_DONE(mem);
		vm_page_secluded.grab_failure_dirty++;
		vm_object_unlock(object);
		goto reactivate_secluded_page;
	}
	if (mem->vmp_reference) {
		/* it's been used but we do need to grab a page... */
	}

	vm_page_unlock_queues();


	/* finish what vm_page_free() would have done... */
	vm_page_free_prepare_object(mem, TRUE);
	vm_object_unlock(object);
	object = VM_OBJECT_NULL;
	if (vm_page_free_verify) {
		ASSERT_PMAP_FREE(mem);
	}
	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
	vm_page_secluded.grab_success_other++;

	assert(mem->vmp_busy);
	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
	assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
	assert(mem->vmp_pageq.next == 0);
	assert(mem->vmp_pageq.prev == 0);
	assert(mem->vmp_listq.next == 0);
	assert(mem->vmp_listq.prev == 0);
	assert(mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
	assert(mem->vmp_specialq.next == 0);
	assert(mem->vmp_specialq.prev == 0);

	return mem;
}
3567 
/*
 *	vm_page_secluded_drain:
 *
 *	Empty the secluded queue: object-less pages are batched onto a
 *	local list and freed after the queue lock is dropped; pages still
 *	owned by an object are moved to the head of the active queue.
 *	The secluded target is forced to 0 during the drain and restored
 *	afterwards.  Returns the number of pages taken off the queue.
 */
uint64_t
vm_page_secluded_drain(void)
{
	vm_page_t local_freeq;
	int local_freed;
	uint64_t num_reclaimed;
	unsigned int saved_secluded_count, saved_secluded_target;

	num_reclaimed = 0;
	local_freeq = NULL;
	local_freed = 0;

	vm_page_lock_queues();

	/* zero the target for the duration of the drain */
	saved_secluded_count = vm_page_secluded_count;
	saved_secluded_target = vm_page_secluded_target;
	vm_page_secluded_target = 0;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
	while (vm_page_secluded_count) {
		vm_page_t secluded_page;

		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!secluded_page->vmp_fictitious);
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			secluded_page->vmp_snext = local_freeq;
			local_freeq = secluded_page;
			local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
		num_reclaimed++;
	}
	vm_page_secluded_target = saved_secluded_target;
	VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();

//	printf("FBDP %s:%d secluded_count %d->%d, target %d, reclaimed %lld\n", __FUNCTION__, __LINE__, saved_secluded_count, vm_page_secluded_count, vm_page_secluded_target, num_reclaimed);

	vm_page_unlock_queues();

	/* free the batched pages outside the page queue lock */
	if (local_freed) {
		vm_page_free_list(local_freeq, TRUE);
		local_freeq = NULL;
		local_freed = 0;
	}

	return num_reclaimed;
}
3627 #endif /* CONFIG_SECLUDED_MEMORY */
3628 
3629 
/*
 *	vm_page_grab_diags:
 *
 *	Diagnostics hook for page grabs: on DEVELOPMENT/DEBUG kernels,
 *	credit the grabbed page to the current task's pages_grabbed
 *	ledger.  No-op on RELEASE kernels or before tasks exist.
 */
static inline void
vm_page_grab_diags()
{
#if DEVELOPMENT || DEBUG
	task_t grabber = current_task_early();

	if (grabber != NULL) {
		ledger_credit(grabber->ledger, task_ledgers.pages_grabbed, 1);
	}
#endif /* DEVELOPMENT || DEBUG */
}
3642 
3643 /*
3644  *	vm_page_release:
3645  *
3646  *	Return a page to the free list.
3647  */
3648 
/*
 *	vm_page_release:
 *
 *	Return a page to the free list.
 *
 *	The page is placed on the lopage queue, the secluded queue
 *	(CONFIG_SECLUDED_MEMORY), or one of the color free queues,
 *	in that order of preference based on system state.  At most
 *	one waiter is woken (privileged waiters first) to prevent
 *	starvation.  "page_queues_locked" says whether the caller
 *	already holds the page queue lock; the free-page lock is
 *	always taken here.
 */
void
vm_page_release(
	vm_page_t       mem,
	boolean_t       page_queues_locked)
{
	unsigned int    color;
	int     need_wakeup = 0;
	int     need_priv_wakeup = 0;
#if CONFIG_SECLUDED_MEMORY
	int     need_secluded_wakeup = 0;
#endif /* CONFIG_SECLUDED_MEMORY */
	event_t wakeup_event = NULL;

	if (page_queues_locked) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	} else {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
	}

	assert(!mem->vmp_private && !mem->vmp_fictitious);
	if (vm_page_free_verify) {
		ASSERT_PMAP_FREE(mem);
	}
//	dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 5);	/* (TEST/DEBUG) */

	pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));

	/* drop the page's realtime accounting, taking the queue lock if needed */
	if (__improbable(mem->vmp_realtime)) {
		if (!page_queues_locked) {
			vm_page_lock_queues();
		}
		/* re-check under the lock */
		if (mem->vmp_realtime) {
			mem->vmp_realtime = false;
			vm_page_realtime_count--;
		}
		if (!page_queues_locked) {
			vm_page_unlock_queues();
		}
	}

	vm_free_page_lock_spin();

	/* page must be fully detached before it can be freed */
	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
	assert(mem->vmp_busy);
	assert(!mem->vmp_laundry);
	assert(mem->vmp_object == 0);
	assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
	assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
	assert(mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0);

	/* Clear any specialQ hints before releasing page to the free pool*/
	mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;

	if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
	    vm_lopage_free_count < vm_lopage_free_limit &&
	    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
		/*
		 * this exists to support hardware controllers
		 * incapable of generating DMAs with more than 32 bits
		 * of address on platforms with physical memory > 4G...
		 */
		vm_page_queue_enter_first(&vm_lopage_queue_free, mem, vmp_pageq);
		vm_lopage_free_count++;

		if (vm_lopage_free_count >= vm_lopage_free_limit) {
			vm_lopage_refill = FALSE;
		}

		mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
		mem->vmp_lopage = TRUE;
#if CONFIG_SECLUDED_MEMORY
	} else if (vm_page_free_count > vm_page_free_reserved &&
	    vm_page_secluded_count < vm_page_secluded_target &&
	    num_tasks_can_use_secluded_mem == 0) {
		/*
		 * XXX FBDP TODO: also avoid refilling secluded queue
		 * when some IOKit objects are already grabbing from it...
		 */
		if (!page_queues_locked) {
			if (!vm_page_trylock_queues()) {
				/* take locks in right order */
				vm_free_page_unlock();
				vm_page_lock_queues();
				vm_free_page_lock_spin();
			}
		}
		mem->vmp_lopage = FALSE;
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
		vm_page_queue_enter_first(&vm_page_queue_secluded, mem, vmp_pageq);
		mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
		vm_page_secluded_count++;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		vm_page_secluded_count_free++;
		if (!page_queues_locked) {
			vm_page_unlock_queues();
		}
		LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
		if (vm_page_free_wanted_secluded > 0) {
			vm_page_free_wanted_secluded--;
			need_secluded_wakeup = 1;
		}
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

		color = VM_PAGE_GET_COLOR(mem);
#if defined(__x86_64__)
		vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
#else
		vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vmp_pageq);
#endif
		vm_page_free_count++;
		/*
		 *	Check if we should wake up someone waiting for page.
		 *	But don't bother waking them unless they can allocate.
		 *
		 *	We wakeup only one thread, to prevent starvation.
		 *	Because the scheduling system handles wait queues FIFO,
		 *	if we wakeup all waiting threads, one greedy thread
		 *	can starve multiple niceguy threads.  When the threads
		 *	all wakeup, the greedy threads runs first, grabs the page,
		 *	and waits for another page.  It will be the first to run
		 *	when the next page is freed.
		 *
		 *	However, there is a slight danger here.
		 *	The thread we wake might not use the free page.
		 *	Then the other threads could wait indefinitely
		 *	while the page goes unused.  To forestall this,
		 *	the pageout daemon will keep making free pages
		 *	as long as vm_page_free_wanted is non-zero.
		 */

		assert(vm_page_free_count > 0);
		if (vm_page_free_wanted_privileged > 0) {
			vm_page_free_wanted_privileged--;
			need_priv_wakeup = 1;
#if CONFIG_SECLUDED_MEMORY
		} else if (vm_page_free_wanted_secluded > 0 &&
		    vm_page_free_count > vm_page_free_reserved) {
			vm_page_free_wanted_secluded--;
			need_secluded_wakeup = 1;
#endif /* CONFIG_SECLUDED_MEMORY */
		} else if (vm_page_free_wanted > 0 &&
		    vm_page_free_count > vm_page_free_reserved) {
			vm_page_free_wanted--;
			need_wakeup = 1;
		}
	}
	vm_pageout_vminfo.vm_page_pages_freed++;

	vm_free_page_unlock();

	VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, 1, 0, 0, 0);

	/* issue the (at most one) wakeup outside the free-page lock */
	if (need_priv_wakeup) {
		wakeup_event = &vm_page_free_wanted_privileged;
	}
#if CONFIG_SECLUDED_MEMORY
	else if (need_secluded_wakeup) {
		wakeup_event = &vm_page_free_wanted_secluded;
	}
#endif /* CONFIG_SECLUDED_MEMORY */
	else if (need_wakeup) {
		wakeup_event = &vm_page_free_count;
	}

	if (wakeup_event) {
		if (vps_dynamic_priority_enabled == TRUE) {
			thread_t thread_woken = NULL;
			wakeup_one_with_inheritor((event_t) wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
			/*
			 * (80947592) if this is the last reference on this
			 * thread, calling thread_deallocate() here
			 * might take the tasks_threads_lock,
			 * sadly thread_create_internal is doing several
			 * allocations under this lock, which can result in
			 * deadlocks with the pageout scan daemon.
			 *
			 * FIXME: we should disallow allocations under the
			 * task_thread_locks, but that is a larger fix to make.
			 */
			thread_deallocate_safe(thread_woken);
		} else {
			thread_wakeup_one((event_t) wakeup_event);
		}
	}

	VM_CHECK_MEMORYSTATUS;
}
3839 
3840 /*
3841  * This version of vm_page_release() is used only at startup
3842  * when we are single-threaded and pages are being released
3843  * for the first time. Hence, no locking or unnecessary checks are made.
3844  * Note: VM_CHECK_MEMORYSTATUS invoked by the caller.
3845  */
void
vm_page_release_startup(
	vm_page_t       mem)
{
	vm_page_queue_t queue_free;

	/*
	 * Pick the free queue for this page, in priority order:
	 * the low-memory pool (if it is under its limit and the page's
	 * physical page number is below max_valid_low_ppnum), then the
	 * secluded pool (if configured and under target), otherwise the
	 * regular color-indexed free queue.
	 */
	if (vm_lopage_free_count < vm_lopage_free_limit &&
	    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
		mem->vmp_lopage = TRUE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
		vm_lopage_free_count++;
		queue_free = &vm_lopage_queue_free;
#if CONFIG_SECLUDED_MEMORY
	} else if (vm_page_secluded_count < vm_page_secluded_target) {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
		vm_page_secluded_count++;
		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
		vm_page_secluded_count_free++;
		queue_free = &vm_page_queue_secluded;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		mem->vmp_lopage = FALSE;
		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
		vm_page_free_count++;
		queue_free = &vm_page_queue_free[VM_PAGE_GET_COLOR(mem)].qhead;
	}
	if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
		/*
		 * Regular free pages: x86_64 enters the page into its
		 * clump-sorted free queue; other architectures append.
		 */
#if defined(__x86_64__)
		vm_page_queue_enter_clump(queue_free, mem);
#else
		vm_page_queue_enter(queue_free, mem, vmp_pageq);
#endif
	} else {
		/* lopage / secluded pages are simply pushed at the head */
		vm_page_queue_enter_first(queue_free, mem, vmp_pageq);
	}
}
3883 
3884 /*
3885  *	vm_page_wait:
3886  *
3887  *	Wait for a page to become available.
3888  *	If there are plenty of free pages, then we don't sleep.
3889  *
3890  *	Returns:
3891  *		TRUE:  There may be another page, try again
3892  *		FALSE: We were interrupted out of our wait, don't try again
3893  */
3894 
boolean_t
vm_page_wait(
	int     interruptible )
{
	/*
	 *	We can't use vm_page_free_reserved to make this
	 *	determination.  Consider: some thread might
	 *	need to allocate two pages.  The first allocation
	 *	succeeds, the second fails.  After the first page is freed,
	 *	a call to vm_page_wait must really block.
	 */
	kern_return_t   wait_result;
	int             need_wakeup = 0;
	/* VM-privileged threads may allocate as long as any page is free */
	int             is_privileged = current_thread()->options & TH_OPT_VMPRIV;
	event_t         wait_event = NULL;

	vm_free_page_lock_spin();

	if (is_privileged && vm_page_free_count) {
		vm_free_page_unlock();
		return TRUE;
	}

	if (vm_page_free_count >= vm_page_free_target) {
		vm_free_page_unlock();
		return TRUE;
	}

	/*
	 * Register as a waiter on the appropriate event, under the free
	 * page lock.  The first waiter of each class (wanted count going
	 * 0 -> 1) is responsible for waking the pageout daemon.
	 */
	if (is_privileged) {
		if (vm_page_free_wanted_privileged++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_wanted_privileged;
#if CONFIG_SECLUDED_MEMORY
	} else if (secluded_for_apps &&
	    task_can_use_secluded_mem(current_task(), FALSE)) {
#if 00
		/* XXX FBDP: need pageq lock for this... */
		/* XXX FBDP: might wait even if pages available, */
		/* XXX FBDP: hopefully not for too long... */
		if (vm_page_secluded_count > 0) {
			vm_free_page_unlock();
			return TRUE;
		}
#endif
		if (vm_page_free_wanted_secluded++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_wanted_secluded;
#endif /* CONFIG_SECLUDED_MEMORY */
	} else {
		if (vm_page_free_wanted++ == 0) {
			need_wakeup = 1;
		}
		wait_event = (event_t)&vm_page_free_count;
	}

	/*
	 * We don't do a vm_pageout_scan wakeup if we already have
	 * some waiters because vm_pageout_scan checks for waiters
	 * before it returns and does so behind the vm_page_queue_free_lock,
	 * which we own when we bump the waiter counts.
	 */

	if (vps_dynamic_priority_enabled == TRUE) {
		/*
		 * We are waking up vm_pageout_scan here. If it needs
		 * the vm_page_queue_free_lock before we unlock it
		 * we'll end up just blocking and incur an extra
		 * context switch. Could be a perf. issue.
		 */

		if (need_wakeup) {
			thread_wakeup((event_t)&vm_page_free_wanted);
		}

		/*
		 * LD: This event is going to get recorded every time because
		 * we don't get back THREAD_WAITING from lck_mtx_sleep_with_inheritor.
		 * We just block in that routine.
		 */
		VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
		    vm_page_free_wanted_privileged,
		    vm_page_free_wanted,
#if CONFIG_SECLUDED_MEMORY
		    vm_page_free_wanted_secluded,
#else /* CONFIG_SECLUDED_MEMORY */
		    0,
#endif /* CONFIG_SECLUDED_MEMORY */
		    0);
		/*
		 * Sleep with priority inheritance, pushing on the pageout
		 * scan thread; the free page lock is dropped by the sleep.
		 */
		wait_result =  lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
		    LCK_SLEEP_UNLOCK,
		    wait_event,
		    vm_pageout_scan_thread,
		    interruptible,
		    0);
	} else {
		/* classic path: assert the wait before dropping the lock */
		wait_result = assert_wait(wait_event, interruptible);

		vm_free_page_unlock();

		if (need_wakeup) {
			thread_wakeup((event_t)&vm_page_free_wanted);
		}

		if (wait_result == THREAD_WAITING) {
			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
			    vm_page_free_wanted_privileged,
			    vm_page_free_wanted,
#if CONFIG_SECLUDED_MEMORY
			    vm_page_free_wanted_secluded,
#else /* CONFIG_SECLUDED_MEMORY */
			    0,
#endif /* CONFIG_SECLUDED_MEMORY */
			    0);
			wait_result = thread_block(THREAD_CONTINUE_NULL);
			VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
			    VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
		}
	}

	return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING);
}
4018 
4019 /*
4020  *	vm_page_alloc:
4021  *
4022  *	Allocate and return a memory cell associated
4023  *	with this VM object/offset pair.
4024  *
4025  *	Object must be locked.
4026  */
4027 
4028 vm_page_t
vm_page_alloc(vm_object_t object,vm_object_offset_t offset)4029 vm_page_alloc(
4030 	vm_object_t             object,
4031 	vm_object_offset_t      offset)
4032 {
4033 	vm_page_t       mem;
4034 	int             grab_options;
4035 
4036 	vm_object_lock_assert_exclusive(object);
4037 	grab_options = 0;
4038 #if CONFIG_SECLUDED_MEMORY
4039 	if (object->can_grab_secluded) {
4040 		grab_options |= VM_PAGE_GRAB_SECLUDED;
4041 	}
4042 #endif /* CONFIG_SECLUDED_MEMORY */
4043 	mem = vm_page_grab_options(grab_options);
4044 	if (mem == VM_PAGE_NULL) {
4045 		return VM_PAGE_NULL;
4046 	}
4047 
4048 	vm_page_insert(mem, object, offset);
4049 
4050 	return mem;
4051 }
4052 
4053 /*
4054  *	vm_page_free_prepare:
4055  *
4056  *	Removes page from any queue it may be on
4057  *	and disassociates it from its VM object.
4058  *
4059  *	Object and page queues must be locked prior to entry.
4060  */
static void
vm_page_free_prepare(
	vm_page_t       mem)
{
	/*
	 * Order matters: detach the page from the paging queues first
	 * (which may steal it from the laundry), then disassociate it
	 * from its VM object and the object/offset hash table.
	 */
	vm_page_free_prepare_queues(mem);
	vm_page_free_prepare_object(mem, TRUE);
}
4069 
4070 
/*
 *	vm_page_free_prepare_queues:
 *
 *	First half of freeing a page: remove it from any paging queue,
 *	undo wired/gobbled accounting and, for purgeable owned objects,
 *	move the owner's ledger bytes back from "non-volatile" to
 *	"volatile".  Caller holds the page queues lock and (if any) the
 *	page's object lock, exclusively.
 */
void
vm_page_free_prepare_queues(
	vm_page_t       mem)
{
	vm_object_t     m_object;

	VM_PAGE_CHECK(mem);

	assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
	assert(!mem->vmp_cleaning);
	m_object = VM_PAGE_OBJECT(mem);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	if (m_object) {
		vm_object_lock_assert_exclusive(m_object);
	}
	if (mem->vmp_laundry) {
		/*
		 * We may have to free a page while it's being laundered
		 * if we lost its pager (due to a forced unmount, for example).
		 * We need to call vm_pageout_steal_laundry() before removing
		 * the page from its VM object, so that we can remove it
		 * from its pageout queue and adjust the laundry accounting
		 */
		vm_pageout_steal_laundry(mem, TRUE);
	}

	vm_page_queues_remove(mem, TRUE);

	/* drop this page's contribution to the realtime page count */
	if (__improbable(mem->vmp_realtime)) {
		mem->vmp_realtime = false;
		vm_page_realtime_count--;
	}

	if (VM_PAGE_WIRED(mem)) {
		assert(mem->vmp_wire_count > 0);

		if (m_object) {
			/* remove the page from the object's wired accounting */
			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
			VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);

			assert(m_object->resident_page_count >=
			    m_object->wired_page_count);

			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
				/*
				 * A wired page of a volatile object was counted
				 * as "purgeable wired"; it becomes plain
				 * purgeable again once unwired.
				 */
				OSAddAtomic(+1, &vm_page_purgeable_count);
				assert(vm_page_purgeable_wired_count > 0);
				OSAddAtomic(-1, &vm_page_purgeable_wired_count);
			}
			if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
			    m_object->purgable == VM_PURGABLE_EMPTY) &&
			    m_object->vo_owner != TASK_NULL) {
				task_t          owner;
				int             ledger_idx_volatile;
				int             ledger_idx_nonvolatile;
				int             ledger_idx_volatile_compressed;
				int             ledger_idx_nonvolatile_compressed;
				boolean_t       do_footprint;

				owner = VM_OBJECT_OWNER(m_object);
				vm_object_ledger_tag_ledgers(
					m_object,
					&ledger_idx_volatile,
					&ledger_idx_nonvolatile,
					&ledger_idx_volatile_compressed,
					&ledger_idx_nonvolatile_compressed,
					&do_footprint);
				/*
				 * While wired, this page was accounted
				 * as "non-volatile" but it should now
				 * be accounted as "volatile".
				 */
				/* one less "non-volatile"... */
				ledger_debit(owner->ledger,
				    ledger_idx_nonvolatile,
				    PAGE_SIZE);
				if (do_footprint) {
					/* ... and "phys_footprint" */
					ledger_debit(owner->ledger,
					    task_ledgers.phys_footprint,
					    PAGE_SIZE);
				}
				/* one more "volatile" */
				ledger_credit(owner->ledger,
				    ledger_idx_volatile,
				    PAGE_SIZE);
			}
		}
		/* only real (non-private, non-fictitious) pages count as wired */
		if (!mem->vmp_private && !mem->vmp_fictitious) {
			vm_page_wire_count--;
		}

		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_wire_count = 0;
		assert(!mem->vmp_gobbled);
	} else if (mem->vmp_gobbled) {
		/* gobbled pages are accounted as wired; undo that here */
		if (!mem->vmp_private && !mem->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
	}
}
4174 
4175 
/*
 *	vm_page_free_prepare_object:
 *
 *	Second half of freeing a page: detach it from its VM object
 *	(optionally also from the object/offset hash), wake any waiters,
 *	and re-initialize the page structure.  Private pages give up
 *	their borrowed physical page and revert to fictitious.
 */
void
vm_page_free_prepare_object(
	vm_page_t       mem,
	boolean_t       remove_from_hash)
{
	assert(!mem->vmp_realtime);
	if (mem->vmp_tabled) {
		vm_page_remove(mem, remove_from_hash);  /* clears tabled, object, offset */
	}
	PAGE_WAKEUP(mem);               /* clears wanted */

	if (mem->vmp_private) {
		/* the physical page was never ours; drop it */
		mem->vmp_private = FALSE;
		mem->vmp_fictitious = TRUE;
		VM_PAGE_SET_PHYS_PAGE(mem, vm_page_fictitious_addr);
	}
	if (!mem->vmp_fictitious) {
		/* the page must already be off every queue/list */
		assert(mem->vmp_pageq.next == 0);
		assert(mem->vmp_pageq.prev == 0);
		assert(mem->vmp_listq.next == 0);
		assert(mem->vmp_listq.prev == 0);
		assert(mem->vmp_specialq.next == 0);
		assert(mem->vmp_specialq.prev == 0);
		assert(mem->vmp_next_m == 0);
		ASSERT_PMAP_FREE(mem);
		{
			/* reset all fields, preserving phys page and lopage */
			vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->vmp_lopage);
		}
	}
}
4206 
4207 
4208 /*
4209  *	vm_page_free:
4210  *
4211  *	Returns the given page to the free list,
4212  *	disassociating it with any VM object.
4213  *
4214  *	Object and page queues must be locked prior to entry.
4215  */
4216 void
vm_page_free(vm_page_t mem)4217 vm_page_free(
4218 	vm_page_t       mem)
4219 {
4220 	vm_page_free_prepare(mem);
4221 
4222 	if (mem->vmp_fictitious) {
4223 		vm_page_release_fictitious(mem);
4224 	} else {
4225 		vm_page_release(mem, TRUE);  /* page queues are locked */
4226 	}
4227 }
4228 
4229 
4230 void
vm_page_free_unlocked(vm_page_t mem,boolean_t remove_from_hash)4231 vm_page_free_unlocked(
4232 	vm_page_t       mem,
4233 	boolean_t       remove_from_hash)
4234 {
4235 	vm_page_lockspin_queues();
4236 	vm_page_free_prepare_queues(mem);
4237 	vm_page_unlock_queues();
4238 
4239 	vm_page_free_prepare_object(mem, remove_from_hash);
4240 
4241 	if (mem->vmp_fictitious) {
4242 		vm_page_release_fictitious(mem);
4243 	} else {
4244 		vm_page_release(mem, FALSE); /* page queues are not locked */
4245 	}
4246 }
4247 
4248 
4249 /*
4250  * Free a list of pages.  The list can be up to several hundred pages,
4251  * as blocked up by vm_pageout_scan().
4252  * The big win is not having to take the free list lock once
4253  * per page.
4254  *
4255  * The VM page queues lock (vm_page_queue_lock) should NOT be held.
4256  * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held.
4257  */
void
vm_page_free_list(
	vm_page_t       freeq,
	boolean_t       prepare_object)
{
	vm_page_t       mem;
	vm_page_t       nxt;
	vm_page_t       local_freeq;    /* batch destined for the free queues */
	int             pg_count;       /* size of the current batch */

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);

	while (freeq) {
		pg_count = 0;
		local_freeq = VM_PAGE_NULL;
		mem = freeq;

		/*
		 * break up the processing into smaller chunks so
		 * that we can 'pipeline' the pages onto the
		 * free list w/o introducing too much
		 * contention on the global free queue lock
		 */
		while (mem && pg_count < 64) {
			assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
			    (mem->vmp_q_state == VM_PAGE_IS_WIRED));
			assert(mem->vmp_specialq.next == 0 &&
			    mem->vmp_specialq.prev == 0);
			/*
			 * &&
			 *   mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
			 */
			nxt = mem->vmp_snext;
			mem->vmp_snext = NULL;
			assert(mem->vmp_pageq.prev == 0);

			if (vm_page_free_verify && !mem->vmp_fictitious && !mem->vmp_private) {
				ASSERT_PMAP_FREE(mem);
			}

			/*
			 * Clearing vmp_realtime requires the page queues lock;
			 * re-check the flag after acquiring it.
			 */
			if (__improbable(mem->vmp_realtime)) {
				vm_page_lock_queues();
				if (mem->vmp_realtime) {
					mem->vmp_realtime = false;
					vm_page_realtime_count--;
				}
				vm_page_unlock_queues();
			}

			if (prepare_object == TRUE) {
				vm_page_free_prepare_object(mem, TRUE);
			}

			if (!mem->vmp_fictitious) {
				assert(mem->vmp_busy);

				/*
				 * lopage and (when unused) secluded pages are
				 * released individually; everything else is
				 * collected into local_freeq for one batched
				 * insertion under the free page lock.
				 */
				if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
				    vm_lopage_free_count < vm_lopage_free_limit &&
				    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
					vm_page_release(mem, FALSE); /* page queues are not locked */
#if CONFIG_SECLUDED_MEMORY
				} else if (vm_page_secluded_count < vm_page_secluded_target &&
				    num_tasks_can_use_secluded_mem == 0) {
					vm_page_release(mem,
					    FALSE);             /* page queues are not locked */
#endif /* CONFIG_SECLUDED_MEMORY */
				} else {
					/*
					 * IMPORTANT: we can't set the page "free" here
					 * because that would make the page eligible for
					 * a physically-contiguous allocation (see
					 * vm_page_find_contiguous()) right away (we don't
					 * hold the vm_page_queue_free lock).  That would
					 * cause trouble because the page is not actually
					 * in the free queue yet...
					 */
					mem->vmp_snext = local_freeq;
					local_freeq = mem;
					pg_count++;

					pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
				}
			} else {
				assert(VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_fictitious_addr ||
				    VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_guard_addr);
				vm_page_release_fictitious(mem);
			}
			mem = nxt;
		}
		freeq = mem;

		if ((mem = local_freeq)) {
			unsigned int    avail_free_count;
			unsigned int    need_wakeup = 0;
			unsigned int    need_priv_wakeup = 0;
#if CONFIG_SECLUDED_MEMORY
			unsigned int    need_wakeup_secluded = 0;
#endif /* CONFIG_SECLUDED_MEMORY */
			event_t         priv_wakeup_event, secluded_wakeup_event, normal_wakeup_event;
			boolean_t       priv_wakeup_all, secluded_wakeup_all, normal_wakeup_all;

			vm_free_page_lock_spin();

			/* enter the whole batch under one hold of the lock */
			while (mem) {
				int     color;

				nxt = mem->vmp_snext;

				assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
				assert(mem->vmp_busy);
				assert(!mem->vmp_realtime);
				mem->vmp_lopage = FALSE;
				mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

				color = VM_PAGE_GET_COLOR(mem);
#if defined(__x86_64__)
				vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
#else
				vm_page_queue_enter(&vm_page_queue_free[color].qhead,
				    mem, vmp_pageq);
#endif
				mem = nxt;
			}
			vm_pageout_vminfo.vm_page_pages_freed += pg_count;
			vm_page_free_count += pg_count;
			avail_free_count = vm_page_free_count;

			VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, pg_count, 0, 0, 0);

			/*
			 * Figure out how many waiters of each class we can
			 * satisfy, still under the free page lock.
			 * Privileged waiters are served first and may consume
			 * the reserve; the other classes only see pages above
			 * vm_page_free_reserved.
			 */
			if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) {
				if (avail_free_count < vm_page_free_wanted_privileged) {
					need_priv_wakeup = avail_free_count;
					vm_page_free_wanted_privileged -= avail_free_count;
					avail_free_count = 0;
				} else {
					need_priv_wakeup = vm_page_free_wanted_privileged;
					avail_free_count -= vm_page_free_wanted_privileged;
					vm_page_free_wanted_privileged = 0;
				}
			}
#if CONFIG_SECLUDED_MEMORY
			if (vm_page_free_wanted_secluded > 0 &&
			    avail_free_count > vm_page_free_reserved) {
				unsigned int available_pages;
				available_pages = (avail_free_count -
				    vm_page_free_reserved);
				if (available_pages <
				    vm_page_free_wanted_secluded) {
					need_wakeup_secluded = available_pages;
					vm_page_free_wanted_secluded -=
					    available_pages;
					avail_free_count -= available_pages;
				} else {
					need_wakeup_secluded =
					    vm_page_free_wanted_secluded;
					avail_free_count -=
					    vm_page_free_wanted_secluded;
					vm_page_free_wanted_secluded = 0;
				}
			}
#endif /* CONFIG_SECLUDED_MEMORY */
			if (vm_page_free_wanted > 0 && avail_free_count > vm_page_free_reserved) {
				unsigned int  available_pages;

				available_pages = avail_free_count - vm_page_free_reserved;

				if (available_pages >= vm_page_free_wanted) {
					need_wakeup = vm_page_free_wanted;
					vm_page_free_wanted = 0;
				} else {
					need_wakeup = available_pages;
					vm_page_free_wanted -= available_pages;
				}
			}
			vm_free_page_unlock();

			/* the actual wakeups happen after dropping the lock */
			priv_wakeup_event = NULL;
			secluded_wakeup_event = NULL;
			normal_wakeup_event = NULL;

			priv_wakeup_all = FALSE;
			secluded_wakeup_all = FALSE;
			normal_wakeup_all = FALSE;


			if (need_priv_wakeup != 0) {
				/*
				 * There shouldn't be that many VM-privileged threads,
				 * so let's wake them all up, even if we don't quite
				 * have enough pages to satisfy them all.
				 */
				priv_wakeup_event = (event_t)&vm_page_free_wanted_privileged;
				priv_wakeup_all = TRUE;
			}
#if CONFIG_SECLUDED_MEMORY
			if (need_wakeup_secluded != 0 &&
			    vm_page_free_wanted_secluded == 0) {
				/* no waiters left unsatisfied: wake them all */
				secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
				secluded_wakeup_all = TRUE;
				need_wakeup_secluded = 0;
			} else {
				/* otherwise wake one waiter per released page */
				secluded_wakeup_event = (event_t)&vm_page_free_wanted_secluded;
			}
#endif /* CONFIG_SECLUDED_MEMORY */
			if (need_wakeup != 0 && vm_page_free_wanted == 0) {
				/*
				 * We don't expect to have any more waiters
				 * after this, so let's wake them all up at
				 * once.
				 */
				normal_wakeup_event = (event_t) &vm_page_free_count;
				normal_wakeup_all = TRUE;
				need_wakeup = 0;
			} else {
				normal_wakeup_event = (event_t) &vm_page_free_count;
			}

			if (priv_wakeup_event ||
#if CONFIG_SECLUDED_MEMORY
			    secluded_wakeup_event ||
#endif /* CONFIG_SECLUDED_MEMORY */
			    normal_wakeup_event) {
				if (vps_dynamic_priority_enabled == TRUE) {
					/* priority-inheritance-aware wakeups */
					thread_t thread_woken = NULL;

					if (priv_wakeup_all == TRUE) {
						wakeup_all_with_inheritor(priv_wakeup_event, THREAD_AWAKENED);
					}

#if CONFIG_SECLUDED_MEMORY
					if (secluded_wakeup_all == TRUE) {
						wakeup_all_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED);
					}

					while (need_wakeup_secluded-- != 0) {
						/*
						 * Wake up one waiter per page we just released.
						 */
						wakeup_one_with_inheritor(secluded_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
						thread_deallocate(thread_woken);
					}
#endif /* CONFIG_SECLUDED_MEMORY */

					if (normal_wakeup_all == TRUE) {
						wakeup_all_with_inheritor(normal_wakeup_event, THREAD_AWAKENED);
					}

					while (need_wakeup-- != 0) {
						/*
						 * Wake up one waiter per page we just released.
						 */
						wakeup_one_with_inheritor(normal_wakeup_event, THREAD_AWAKENED, LCK_WAKE_DO_NOT_TRANSFER_PUSH, &thread_woken);
						thread_deallocate(thread_woken);
					}
				} else {
					/*
					 * Non-priority-aware wakeups.
					 */

					if (priv_wakeup_all == TRUE) {
						thread_wakeup(priv_wakeup_event);
					}

#if CONFIG_SECLUDED_MEMORY
					if (secluded_wakeup_all == TRUE) {
						thread_wakeup(secluded_wakeup_event);
					}

					while (need_wakeup_secluded-- != 0) {
						/*
						 * Wake up one waiter per page we just released.
						 */
						thread_wakeup_one(secluded_wakeup_event);
					}

#endif /* CONFIG_SECLUDED_MEMORY */
					if (normal_wakeup_all == TRUE) {
						thread_wakeup(normal_wakeup_event);
					}

					while (need_wakeup-- != 0) {
						/*
						 * Wake up one waiter per page we just released.
						 */
						thread_wakeup_one(normal_wakeup_event);
					}
				}
			}

			VM_CHECK_MEMORYSTATUS;
		}
	}
}
4552 
4553 
4554 /*
4555  *	vm_page_wire:
4556  *
4557  *	Mark this page as wired down by yet
4558  *	another map, removing it from paging queues
4559  *	as necessary.
4560  *
4561  *	The page's object and the page queues must be locked.
4562  */
4563 
4564 
void
vm_page_wire(
	vm_page_t mem,
	vm_tag_t           tag,
	boolean_t          check_memorystatus)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(mem);

//	dbgLog(current_thread(), mem->vmp_offset, m_object, 1);	/* (TEST/DEBUG) */

	VM_PAGE_CHECK(mem);
	if (m_object) {
		vm_object_lock_assert_exclusive(m_object);
	} else {
		/*
		 * In theory, the page should be in an object before it
		 * gets wired, since we need to hold the object lock
		 * to update some fields in the page structure.
		 * However, some code (i386 pmap, for example) might want
		 * to wire a page before it gets inserted into an object.
		 * That's somewhat OK, as long as nobody else can get to
		 * that page and update it at the same time.
		 */
	}
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	if (!VM_PAGE_WIRED(mem)) {
		/*
		 * First wiring of this page: take it off the paging
		 * queues and do all the per-object accounting.
		 */
		if (mem->vmp_laundry) {
			vm_pageout_steal_laundry(mem, TRUE);
		}

		vm_page_queues_remove(mem, TRUE);

		assert(mem->vmp_wire_count == 0);
		mem->vmp_q_state = VM_PAGE_IS_WIRED;

		if (m_object) {
			VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
			VM_OBJECT_WIRED_PAGE_ADD(m_object, mem);
			VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, tag);

			assert(m_object->resident_page_count >=
			    m_object->wired_page_count);
			if (m_object->purgable == VM_PURGABLE_VOLATILE) {
				/* wired pages of volatile objects can't be purged */
				assert(vm_page_purgeable_count > 0);
				OSAddAtomic(-1, &vm_page_purgeable_count);
				OSAddAtomic(1, &vm_page_purgeable_wired_count);
			}
			if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
			    m_object->purgable == VM_PURGABLE_EMPTY) &&
			    m_object->vo_owner != TASK_NULL) {
				task_t          owner;
				int             ledger_idx_volatile;
				int             ledger_idx_nonvolatile;
				int             ledger_idx_volatile_compressed;
				int             ledger_idx_nonvolatile_compressed;
				boolean_t       do_footprint;

				/*
				 * Move the owner's ledger bytes for this page
				 * from "volatile" to "non-volatile" while wired.
				 */
				owner = VM_OBJECT_OWNER(m_object);
				vm_object_ledger_tag_ledgers(
					m_object,
					&ledger_idx_volatile,
					&ledger_idx_nonvolatile,
					&ledger_idx_volatile_compressed,
					&ledger_idx_nonvolatile_compressed,
					&do_footprint);
				/* less volatile bytes */
				ledger_debit(owner->ledger,
				    ledger_idx_volatile,
				    PAGE_SIZE);
				/* more not-quite-volatile bytes */
				ledger_credit(owner->ledger,
				    ledger_idx_nonvolatile,
				    PAGE_SIZE);
				if (do_footprint) {
					/* more footprint */
					ledger_credit(owner->ledger,
					    task_ledgers.phys_footprint,
					    PAGE_SIZE);
				}
			}
			if (m_object->all_reusable) {
				/*
				 * Wired pages are not counted as "re-usable"
				 * in "all_reusable" VM objects, so nothing
				 * to do here.
				 */
			} else if (mem->vmp_reusable) {
				/*
				 * This page is not "re-usable" when it's
				 * wired, so adjust its state and the
				 * accounting.
				 */
				vm_object_reuse_pages(m_object,
				    mem->vmp_offset,
				    mem->vmp_offset + PAGE_SIZE_64,
				    FALSE);
			}
		}
		assert(!mem->vmp_reusable);

		/* gobbled pages are already counted in vm_page_wire_count */
		if (!mem->vmp_private && !mem->vmp_fictitious && !mem->vmp_gobbled) {
			vm_page_wire_count++;
		}
		if (mem->vmp_gobbled) {
			vm_page_gobble_count--;
		}
		mem->vmp_gobbled = FALSE;

		if (check_memorystatus == TRUE) {
			VM_CHECK_MEMORYSTATUS;
		}
	}
	assert(!mem->vmp_gobbled);
	assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
	mem->vmp_wire_count++;
	if (__improbable(mem->vmp_wire_count == 0)) {
		panic("vm_page_wire(%p): wire_count overflow", mem);
	}
	VM_PAGE_CHECK(mem);
}
4687 
4688 /*
4689  *	vm_page_unwire:
4690  *
4691  *	Release one wiring of this page, potentially
4692  *	enabling it to be paged again.
4693  *
4694  *	The page's object and the page queues must be locked.
4695  */
void
vm_page_unwire(
	vm_page_t       mem,
	boolean_t       queueit)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(mem);

//	dbgLog(current_thread(), mem->vmp_offset, m_object, 0);	/* (TEST/DEBUG) */

	VM_PAGE_CHECK(mem);
	assert(VM_PAGE_WIRED(mem));
	assert(mem->vmp_wire_count > 0);
	assert(!mem->vmp_gobbled);
	assert(m_object != VM_OBJECT_NULL);
	vm_object_lock_assert_exclusive(m_object);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	if (--mem->vmp_wire_count == 0) {
		/* last wiring dropped: undo everything vm_page_wire() did */
		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
		VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
		VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
		if (!mem->vmp_private && !mem->vmp_fictitious) {
			vm_page_wire_count--;
		}

		assert(m_object->resident_page_count >=
		    m_object->wired_page_count);
		if (m_object->purgable == VM_PURGABLE_VOLATILE) {
			/* the page becomes purgeable again once unwired */
			OSAddAtomic(+1, &vm_page_purgeable_count);
			assert(vm_page_purgeable_wired_count > 0);
			OSAddAtomic(-1, &vm_page_purgeable_wired_count);
		}
		if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
		    m_object->purgable == VM_PURGABLE_EMPTY) &&
		    m_object->vo_owner != TASK_NULL) {
			task_t          owner;
			int             ledger_idx_volatile;
			int             ledger_idx_nonvolatile;
			int             ledger_idx_volatile_compressed;
			int             ledger_idx_nonvolatile_compressed;
			boolean_t       do_footprint;

			/*
			 * Move the owner's ledger bytes for this page back
			 * from "non-volatile" to "volatile" (the inverse of
			 * the transfer done in vm_page_wire()).
			 */
			owner = VM_OBJECT_OWNER(m_object);
			vm_object_ledger_tag_ledgers(
				m_object,
				&ledger_idx_volatile,
				&ledger_idx_nonvolatile,
				&ledger_idx_volatile_compressed,
				&ledger_idx_nonvolatile_compressed,
				&do_footprint);
			/* more volatile bytes */
			ledger_credit(owner->ledger,
			    ledger_idx_volatile,
			    PAGE_SIZE);
			/* less not-quite-volatile bytes */
			ledger_debit(owner->ledger,
			    ledger_idx_nonvolatile,
			    PAGE_SIZE);
			if (do_footprint) {
				/* less footprint */
				ledger_debit(owner->ledger,
				    task_ledgers.phys_footprint,
				    PAGE_SIZE);
			}
		}
		assert(m_object != kernel_object);
		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);

		if (queueit == TRUE) {
			/* put the page back on a paging queue */
			if (m_object->purgable == VM_PURGABLE_EMPTY) {
				vm_page_deactivate(mem);
			} else {
				vm_page_activate(mem);
			}
		}

		VM_CHECK_MEMORYSTATUS;
	}
	VM_PAGE_CHECK(mem);
}
4779 
4780 /*
4781  *	vm_page_deactivate:
4782  *
4783  *	Returns the given page to the inactive list,
4784  *	indicating that no physical maps have access
4785  *	to this page.  [Used by the physical mapping system.]
4786  *
4787  *	The page queues must be locked.
4788  */
4789 void
vm_page_deactivate(vm_page_t m)4790 vm_page_deactivate(
4791 	vm_page_t       m)
4792 {
4793 	vm_page_deactivate_internal(m, TRUE);
4794 }
4795 
4796 
/*
 * Core of vm_page_deactivate(): move "m" toward the inactive queue.
 * "clear_hw_reference" selects whether the pmap-level reference bit is
 * also cleared (vm_page_deactivate() passes TRUE).
 *
 * The page queues must be locked.  Silently does nothing for wired,
 * laundry, private, fictitious, compressor and pageout-queue pages.
 */
void
vm_page_deactivate_internal(
	vm_page_t       m,
	boolean_t       clear_hw_reference)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	assert(m_object != kernel_object);
	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

//	dbgLog(VM_PAGE_GET_PHYS_PAGE(m), vm_page_free_count, vm_page_wire_count, 6);	/* (TEST/DEBUG) */
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 *	This page is no longer very interesting.  If it was
	 *	interesting (active or inactive/referenced), then we
	 *	clear the reference bit and (re)enter it in the
	 *	inactive queue.  Note wired pages should not have
	 *	their reference bit cleared.
	 */
	assert( !(m->vmp_absent && !m->vmp_unusual));

	if (m->vmp_gobbled) {           /* can this happen? */
		assert( !VM_PAGE_WIRED(m));

		/* undo the gobble accounting before requeueing the page */
		if (!m->vmp_private && !m->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
		m->vmp_gobbled = FALSE;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
	    VM_PAGE_WIRED(m)) {
		return;
	}
	if (!m->vmp_absent && clear_hw_reference == TRUE) {
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
	}

	m->vmp_reference = FALSE;
	m->vmp_no_cache = FALSE;

	if (!VM_PAGE_INACTIVE(m)) {
		vm_page_queues_remove(m, FALSE);

		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    m->vmp_dirty && m_object->internal &&
		    (m_object->purgable == VM_PURGABLE_DENY ||
		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
			/*
			 * with no dynamic pager available, a dirty internal
			 * page has nowhere to go: park it on the throttled
			 * queue instead of the inactive queue
			 */
			vm_page_check_pageable_safe(m);
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;
		} else {
			if (m_object->named && m_object->ref_count == 1) {
				/*
				 * sole reference to a named object: age the
				 * page out through the speculative queues
				 */
				vm_page_speculate(m, FALSE);
#if DEVELOPMENT || DEBUG
				vm_page_speculative_recreated++;
#endif
			} else {
				vm_page_enqueue_inactive(m, FALSE);
			}
		}
	}
}
4875 
4876 /*
4877  * vm_page_enqueue_cleaned
4878  *
4879  * Put the page on the cleaned queue, mark it cleaned, etc.
4880  * Being on the cleaned queue (and having m->clean_queue set)
4881  * does ** NOT ** guarantee that the page is clean!
4882  *
4883  * Call with the queues lock held.
4884  */
4885 
void
vm_page_enqueue_cleaned(vm_page_t m)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert( !(m->vmp_absent && !m->vmp_unusual));

	if (VM_PAGE_WIRED(m)) {
		/* wired pages are not subject to queue management */
		return;
	}

	if (m->vmp_gobbled) {
		/* undo the gobble accounting before requeueing the page */
		if (!m->vmp_private && !m->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
		m->vmp_gobbled = FALSE;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
	    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
		return;
	}
	vm_page_queues_remove(m, FALSE);

	vm_page_check_pageable_safe(m);
	vm_page_queue_enter(&vm_page_queue_cleaned, m, vmp_pageq);
	m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
	vm_page_cleaned_count++;

	/* cleaned pages are accounted as inactive/pageable as well */
	vm_page_inactive_count++;
	if (m_object->internal) {
		vm_page_pageable_internal_count++;
	} else {
		vm_page_pageable_external_count++;
	}
	vm_page_add_to_specialq(m, TRUE);
	VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
}
4937 
4938 /*
4939  *	vm_page_activate:
4940  *
4941  *	Put the specified page on the active list (if appropriate).
4942  *
4943  *	The page queues must be locked.
4944  */
4945 
void
vm_page_activate(
	vm_page_t       m)
{
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
#ifdef  FIXME_4778297
	assert(m_object != kernel_object);
#endif
	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert( !(m->vmp_absent && !m->vmp_unusual));

	if (m->vmp_gobbled) {
		assert( !VM_PAGE_WIRED(m));
		/* undo the gobble accounting before requeueing the page */
		if (!m->vmp_private && !m->vmp_fictitious) {
			vm_page_wire_count--;
		}
		vm_page_gobble_count--;
		m->vmp_gobbled = FALSE;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
		return;
	}

#if DEBUG
	if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) {
		panic("vm_page_activate: already active");
	}
#endif

	if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
		/* page reclaimed from a speculative queue: record the hit */
		DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
		DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
	}

	/*
	 * A freshly activated page should be promoted in the donation queue.
	 * So we remove it here while preserving its hint and we will enqueue
	 * it again in vm_page_enqueue_active.
	 */
	vm_page_queues_remove(m, ((m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) ? TRUE : FALSE));

	if (!VM_PAGE_WIRED(m)) {
		vm_page_check_pageable_safe(m);
		if (!VM_DYNAMIC_PAGING_ENABLED() &&
		    m->vmp_dirty && m_object->internal &&
		    (m_object->purgable == VM_PURGABLE_DENY ||
		    m_object->purgable == VM_PURGABLE_NONVOLATILE ||
		    m_object->purgable == VM_PURGABLE_VOLATILE)) {
			/*
			 * no dynamic pager: dirty internal pages go on the
			 * throttled queue rather than the active queue
			 */
			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
			vm_page_throttled_count++;
		} else {
#if CONFIG_SECLUDED_MEMORY
			if (secluded_for_filecache &&
			    vm_page_secluded_target != 0 &&
			    num_tasks_can_use_secluded_mem == 0 &&
			    m_object->eligible_for_secluded &&
			    !m->vmp_realtime) {
				vm_page_queue_enter(&vm_page_queue_secluded, m, vmp_pageq);
				m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
				vm_page_secluded_count++;
				VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
				vm_page_secluded_count_inuse++;
				assert(!m_object->internal);
//				vm_page_pageable_external_count++;
			} else
#endif /* CONFIG_SECLUDED_MEMORY */
			vm_page_enqueue_active(m, FALSE);
		}
		m->vmp_reference = TRUE;
		m->vmp_no_cache = FALSE;
	}
	VM_PAGE_CHECK(m);
}
5035 
5036 
5037 /*
5038  *      vm_page_speculate:
5039  *
5040  *      Put the specified page on the speculative list (if appropriate).
5041  *
5042  *      The page queues must be locked.
5043  */
void
vm_page_speculate(
	vm_page_t       m,
	boolean_t       new)
{
	struct vm_speculative_age_q     *aq;
	vm_object_t     m_object;

	m_object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	vm_page_check_pageable_safe(m);

	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert( !(m->vmp_absent && !m->vmp_unusual));
	/* speculative queues only hold external (file-backed) pages */
	assert(m_object->internal == FALSE);

	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
		return;
	}

	vm_page_queues_remove(m, FALSE);

	if (!VM_PAGE_WIRED(m)) {
		mach_timespec_t         ts;
		clock_sec_t sec;
		clock_nsec_t nsec;

		clock_get_system_nanotime(&sec, &nsec);
		ts.tv_sec = (unsigned int) sec;
		ts.tv_nsec = nsec;

		if (vm_page_speculative_count == 0) {
			/* no speculative pages anywhere: restart the aging bins */
			speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;

			aq = &vm_page_queue_speculative[speculative_age_index];

			/*
			 * set the timer to begin a new group
			 */
			aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
			aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
		} else {
			aq = &vm_page_queue_speculative[speculative_age_index];

			if (CMP_MACH_TIMESPEC(&ts, &aq->age_ts) >= 0) {
				/*
				 * the current aging bin's window has expired:
				 * advance to the next bin (wrapping), keeping
				 * the steal index out of the way of the fill
				 * index
				 */
				speculative_age_index++;

				if (speculative_age_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
					speculative_age_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
				}
				if (speculative_age_index == speculative_steal_index) {
					speculative_steal_index = speculative_age_index + 1;

					if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
						speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
					}
				}
				aq = &vm_page_queue_speculative[speculative_age_index];

				if (!vm_page_queue_empty(&aq->age_q)) {
					/* the bin we wrapped onto still has pages: age them out first */
					vm_page_speculate_ageit(aq);
				}

				aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
				aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;

				ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
			}
		}
		vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
		m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
		vm_page_speculative_count++;
		vm_page_pageable_external_count++;

		if (new == TRUE) {
			vm_object_lock_assert_exclusive(m_object);

			m_object->pages_created++;
#if DEVELOPMENT || DEBUG
			vm_page_speculative_created++;
#endif
		}
	}
	VM_PAGE_CHECK(m);
}
5144 
5145 
5146 /*
5147  * move pages from the specified aging bin to
5148  * the speculative bin that pageout_scan claims from
5149  *
5150  *      The page queues must be locked.
5151  */
void
vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
{
	struct vm_speculative_age_q     *sq;
	vm_page_t       t;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Splice the entire contents of aging bin "aq" onto the tail of
	 * the AGED bin "sq", then reinitialize "aq" to empty.  The queue
	 * links are packed pointers, so inter-page links can be copied
	 * verbatim while links to/from the queue heads must be re-packed.
	 */
	if (vm_page_queue_empty(&sq->age_q)) {
		/* AGED bin empty: adopt aq's chain wholesale */
		sq->age_q.next = aq->age_q.next;
		sq->age_q.prev = aq->age_q.prev;

		/* fix the boundary pages to point back at sq's head */
		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
		t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);

		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
	} else {
		/* link sq's current tail to aq's first page */
		t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
		t->vmp_pageq.next = aq->age_q.next;

		/* and aq's first page back to sq's old tail */
		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
		t->vmp_pageq.prev = sq->age_q.prev;

		/* aq's last page now terminates at sq's head */
		t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
		t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);

		sq->age_q.prev = aq->age_q.prev;
	}
	vm_page_queue_init(&aq->age_q);
}
5183 
5184 
/*
 * Move "m" to the tail of the inactive queue (LRU position refresh).
 * The page queues must be locked.
 */
void
vm_page_lru(
	vm_page_t       m)
{
	VM_PAGE_CHECK(m);
	assert(VM_PAGE_OBJECT(m) != kernel_object);
	assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
		/*
		 * we don't need to do all the other work that
		 * vm_page_queues_remove and vm_page_enqueue_inactive
		 * bring along for the ride
		 */
		assert(!m->vmp_laundry);
		assert(!m->vmp_private);

		m->vmp_no_cache = FALSE;

		/* fast path: unlink and relink at the tail of the same queue */
		vm_page_queue_remove(&vm_page_queue_inactive, m, vmp_pageq);
		vm_page_queue_enter(&vm_page_queue_inactive, m, vmp_pageq);

		return;
	}
	/*
	 * if this page is currently on the pageout queue, we can't do the
	 * vm_page_queues_remove (which doesn't handle the pageout queue case)
	 * and we can't remove it manually since we would need the object lock
	 * (which is not required here) to decrement the activity_in_progress
	 * reference which is held on the object while the page is in the pageout queue...
	 * just let the normal laundry processing proceed
	 */
	if (m->vmp_laundry || m->vmp_private ||
	    (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
	    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
	    VM_PAGE_WIRED(m)) {
		return;
	}

	m->vmp_no_cache = FALSE;

	vm_page_queues_remove(m, FALSE);

	vm_page_enqueue_inactive(m, FALSE);
}
5232 
5233 
/*
 * Move every page from the throttled queue onto the head of the active
 * queue in one splice, updating the global page counts to match.
 * Called when dynamic paging becomes available; no-op otherwise.
 * Takes and releases the page queues lock internally.
 */
void
vm_page_reactivate_all_throttled(void)
{
	vm_page_t       first_throttled, last_throttled;
	vm_page_t       first_active;
	vm_page_t       m;
	int             extra_active_count;
	int             extra_internal_count, extra_external_count;
	vm_object_t     m_object;

	if (!VM_DYNAMIC_PAGING_ENABLED()) {
		return;
	}

	extra_active_count = 0;
	extra_internal_count = 0;
	extra_external_count = 0;
	vm_page_lock_queues();
	if (!vm_page_queue_empty(&vm_page_queue_throttled)) {
		/*
		 * Switch "throttled" pages to "active".
		 */
		vm_page_queue_iterate(&vm_page_queue_throttled, m, vmp_pageq) {
			VM_PAGE_CHECK(m);
			assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

			m_object = VM_PAGE_OBJECT(m);

			extra_active_count++;
			if (m_object->internal) {
				extra_internal_count++;
			} else {
				extra_external_count++;
			}

			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
			VM_PAGE_CHECK(m);
			vm_page_add_to_specialq(m, FALSE);
		}

		/*
		 * Transfer the entire throttled queue to a regular LRU page queues.
		 * We insert it at the head of the active queue, so that these pages
		 * get re-evaluated by the LRU algorithm first, since they've been
		 * completely out of it until now.
		 */
		first_throttled = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
		last_throttled = (vm_page_t) vm_page_queue_last(&vm_page_queue_throttled);
		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
		if (vm_page_queue_empty(&vm_page_queue_active)) {
			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		} else {
			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
		}
		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
		first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
		last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);

#if DEBUG
		printf("reactivated %d throttled pages\n", vm_page_throttled_count);
#endif
		vm_page_queue_init(&vm_page_queue_throttled);
		/*
		 * Adjust the global page counts.
		 */
		vm_page_active_count += extra_active_count;
		vm_page_pageable_internal_count += extra_internal_count;
		vm_page_pageable_external_count += extra_external_count;
		vm_page_throttled_count = 0;
	}
	assert(vm_page_throttled_count == 0);
	assert(vm_page_queue_empty(&vm_page_queue_throttled));
	vm_page_unlock_queues();
}
5308 
5309 
5310 /*
5311  * move pages from the indicated local queue to the global active queue
5312  * its ok to fail if we're below the hard limit and force == FALSE
5313  * the nolocks == TRUE case is to allow this function to be run on
5314  * the hibernate path
5315  */
5316 
void
vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
{
	struct vpl      *lq;
	vm_page_t       first_local, last_local;
	vm_page_t       first_active;
	vm_page_t       m;
	uint32_t        count = 0;

	if (vm_page_local_q == NULL) {
		return;
	}

	lq = zpercpu_get_cpu(vm_page_local_q, lid);

	if (nolocks == FALSE) {
		if (lq->vpl_count < vm_page_local_q_hard_limit && force == FALSE) {
			/*
			 * below the hard limit and not forced: only drain if
			 * the queues lock is free (avoid lock contention)
			 */
			if (!vm_page_trylockspin_queues()) {
				return;
			}
		} else {
			vm_page_lockspin_queues();
		}

		VPL_LOCK(&lq->vpl_lock);
	}
	if (lq->vpl_count) {
		/*
		 * Switch "local" pages to "active".
		 */
		assert(!vm_page_queue_empty(&lq->vpl_queue));

		vm_page_queue_iterate(&lq->vpl_queue, m, vmp_pageq) {
			VM_PAGE_CHECK(m);
			vm_page_check_pageable_safe(m);
			assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
			assert(!m->vmp_fictitious);

			if (m->vmp_local_id != lid) {
				panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
			}

			m->vmp_local_id = 0;
			m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
			VM_PAGE_CHECK(m);
			vm_page_add_to_specialq(m, FALSE);
			count++;
		}
		if (count != lq->vpl_count) {
			panic("vm_page_reactivate_local: count = %d, vm_page_local_count = %d", count, lq->vpl_count);
		}

		/*
		 * Transfer the entire local queue to a regular LRU page queues.
		 */
		first_local = (vm_page_t) vm_page_queue_first(&lq->vpl_queue);
		last_local = (vm_page_t) vm_page_queue_last(&lq->vpl_queue);
		first_active = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

		/* splice the local chain onto the head of the active queue */
		if (vm_page_queue_empty(&vm_page_queue_active)) {
			vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
		} else {
			first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
		}
		vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
		first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
		last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);

		vm_page_queue_init(&lq->vpl_queue);
		/*
		 * Adjust the global page counts.
		 */
		vm_page_active_count += lq->vpl_count;
		vm_page_pageable_internal_count += lq->vpl_internal_count;
		vm_page_pageable_external_count += lq->vpl_external_count;
		lq->vpl_count = 0;
		lq->vpl_internal_count = 0;
		lq->vpl_external_count = 0;
	}
	assert(vm_page_queue_empty(&lq->vpl_queue));

	if (nolocks == FALSE) {
		VPL_UNLOCK(&lq->vpl_lock);

		vm_page_balance_inactive(count / 4);
		vm_page_unlock_queues();
	}
}
5405 
5406 /*
5407  *	vm_page_part_zero_fill:
5408  *
5409  *	Zero-fill a part of the page.
5410  */
5411 #define PMAP_ZERO_PART_PAGE_IMPLEMENTED
5412 void
vm_page_part_zero_fill(vm_page_t m,vm_offset_t m_pa,vm_size_t len)5413 vm_page_part_zero_fill(
5414 	vm_page_t       m,
5415 	vm_offset_t     m_pa,
5416 	vm_size_t       len)
5417 {
5418 #if 0
5419 	/*
5420 	 * we don't hold the page queue lock
5421 	 * so this check isn't safe to make
5422 	 */
5423 	VM_PAGE_CHECK(m);
5424 #endif
5425 
5426 #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED
5427 	pmap_zero_part_page(VM_PAGE_GET_PHYS_PAGE(m), m_pa, len);
5428 #else
5429 	vm_page_t       tmp;
5430 	while (1) {
5431 		tmp = vm_page_grab();
5432 		if (tmp == VM_PAGE_NULL) {
5433 			vm_page_wait(THREAD_UNINT);
5434 			continue;
5435 		}
5436 		break;
5437 	}
5438 	vm_page_zero_fill(tmp);
5439 	if (m_pa != 0) {
5440 		vm_page_part_copy(m, 0, tmp, 0, m_pa);
5441 	}
5442 	if ((m_pa + len) < PAGE_SIZE) {
5443 		vm_page_part_copy(m, m_pa + len, tmp,
5444 		    m_pa + len, PAGE_SIZE - (m_pa + len));
5445 	}
5446 	vm_page_copy(tmp, m);
5447 	VM_PAGE_FREE(tmp);
5448 #endif
5449 }
5450 
5451 /*
5452  *	vm_page_zero_fill:
5453  *
5454  *	Zero-fill the specified page.
5455  */
5456 void
vm_page_zero_fill(vm_page_t m)5457 vm_page_zero_fill(
5458 	vm_page_t       m)
5459 {
5460 #if 0
5461 	/*
5462 	 * we don't hold the page queue lock
5463 	 * so this check isn't safe to make
5464 	 */
5465 	VM_PAGE_CHECK(m);
5466 #endif
5467 
5468 //	dbgTrace(0xAEAEAEAE, VM_PAGE_GET_PHYS_PAGE(m), 0);		/* (BRINGUP) */
5469 	pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
5470 }
5471 
5472 /*
5473  *	vm_page_part_copy:
5474  *
5475  *	copy part of one page to another
5476  */
5477 
5478 void
vm_page_part_copy(vm_page_t src_m,vm_offset_t src_pa,vm_page_t dst_m,vm_offset_t dst_pa,vm_size_t len)5479 vm_page_part_copy(
5480 	vm_page_t       src_m,
5481 	vm_offset_t     src_pa,
5482 	vm_page_t       dst_m,
5483 	vm_offset_t     dst_pa,
5484 	vm_size_t       len)
5485 {
5486 #if 0
5487 	/*
5488 	 * we don't hold the page queue lock
5489 	 * so this check isn't safe to make
5490 	 */
5491 	VM_PAGE_CHECK(src_m);
5492 	VM_PAGE_CHECK(dst_m);
5493 #endif
5494 	pmap_copy_part_page(VM_PAGE_GET_PHYS_PAGE(src_m), src_pa,
5495 	    VM_PAGE_GET_PHYS_PAGE(dst_m), dst_pa, len);
5496 }
5497 
5498 /*
5499  *	vm_page_copy:
5500  *
5501  *	Copy one page to another
5502  */
5503 
/* counters: code-signing validations / tainted pages seen during page copies */
int vm_page_copy_cs_validations = 0;
int vm_page_copy_cs_tainted = 0;

void
vm_page_copy(
	vm_page_t       src_m,
	vm_page_t       dest_m)
{
	vm_object_t     src_m_object;

	src_m_object = VM_PAGE_OBJECT(src_m);

#if 0
	/*
	 * we don't hold the page queue lock
	 * so this check isn't safe to make
	 */
	VM_PAGE_CHECK(src_m);
	VM_PAGE_CHECK(dest_m);
#endif
	vm_object_lock_assert_held(src_m_object);

	if (src_m_object != VM_OBJECT_NULL &&
	    src_m_object->code_signed) {
		/*
		 * We're copying a page from a code-signed object.
		 * Whoever ends up mapping the copy page might care about
		 * the original page's integrity, so let's validate the
		 * source page now.
		 */
		vm_page_copy_cs_validations++;
		vm_page_validate_cs(src_m, PAGE_SIZE, 0);
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(codesigned_copy,
		    vm_object_t, src_m_object,
		    vm_object_offset_t, src_m->vmp_offset,
		    int, src_m->vmp_cs_validated,
		    int, src_m->vmp_cs_tainted);
#endif /* DEVELOPMENT || DEBUG */
	}

	/*
	 * Propagate the cs_tainted bit to the copy page. Do not propagate
	 * the cs_validated bit.
	 */
	dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
	dest_m->vmp_cs_nx = src_m->vmp_cs_nx;
	if (dest_m->vmp_cs_tainted) {
		vm_page_copy_cs_tainted++;
	}
	dest_m->vmp_error = VMP_ERROR_GET(src_m); /* sliding src_m might have failed... */
	pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m));
}
5557 
5558 #if MACH_ASSERT
/*
 * Debug-only (MACH_ASSERT) dump of a vm_page's queue links, object
 * binding, wire state and flag bits to the console.
 */
static void
_vm_page_print(
	vm_page_t       p)
{
	printf("vm_page %p: \n", p);
	printf("  pageq: next=%p prev=%p\n",
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
	    (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
	printf("  listq: next=%p prev=%p\n",
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
	    (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
	printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
	printf("  object=%p offset=0x%llx\n", VM_PAGE_OBJECT(p), p->vmp_offset);
	printf("  wire_count=%u\n", p->vmp_wire_count);
	printf("  q_state=%u\n", p->vmp_q_state);

	/* boolean flags print as "name" when set, "!name" when clear */
	printf("  %slaundry, %sref, %sgobbled, %sprivate\n",
	    (p->vmp_laundry ? "" : "!"),
	    (p->vmp_reference ? "" : "!"),
	    (p->vmp_gobbled ? "" : "!"),
	    (p->vmp_private ? "" : "!"));
	printf("  %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
	    (p->vmp_busy ? "" : "!"),
	    (p->vmp_wanted ? "" : "!"),
	    (p->vmp_tabled ? "" : "!"),
	    (p->vmp_fictitious ? "" : "!"),
	    (p->vmp_pmapped ? "" : "!"),
	    (p->vmp_wpmapped ? "" : "!"));
	printf("  %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
	    (p->vmp_free_when_done ? "" : "!"),
	    (p->vmp_absent ? "" : "!"),
	    (VMP_ERROR_GET(p) ? "" : "!"),
	    (p->vmp_dirty ? "" : "!"),
	    (p->vmp_cleaning ? "" : "!"),
	    (p->vmp_precious ? "" : "!"),
	    (p->vmp_clustered ? "" : "!"));
	printf("  %soverwriting, %srestart, %sunusual\n",
	    (p->vmp_overwriting ? "" : "!"),
	    (p->vmp_restart ? "" : "!"),
	    (p->vmp_unusual ? "" : "!"));
	printf("  cs_validated=%d, cs_tainted=%d, cs_nx=%d, %sno_cache\n",
	    p->vmp_cs_validated,
	    p->vmp_cs_tainted,
	    p->vmp_cs_nx,
	    (p->vmp_no_cache ? "" : "!"));

	printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
}
5607 
5608 /*
5609  *	Check that the list of pages is ordered by
5610  *	ascending physical address and has no holes.
5611  */
5612 static int
vm_page_verify_contiguous(vm_page_t pages,unsigned int npages)5613 vm_page_verify_contiguous(
5614 	vm_page_t       pages,
5615 	unsigned int    npages)
5616 {
5617 	vm_page_t               m;
5618 	unsigned int            page_count;
5619 	vm_offset_t             prev_addr;
5620 
5621 	prev_addr = VM_PAGE_GET_PHYS_PAGE(pages);
5622 	page_count = 1;
5623 	for (m = NEXT_PAGE(pages); m != VM_PAGE_NULL; m = NEXT_PAGE(m)) {
5624 		if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5625 			printf("m %p prev_addr 0x%lx, current addr 0x%x\n",
5626 			    m, (long)prev_addr, VM_PAGE_GET_PHYS_PAGE(m));
5627 			printf("pages %p page_count %d npages %d\n", pages, page_count, npages);
5628 			panic("vm_page_verify_contiguous:  not contiguous!");
5629 		}
5630 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5631 		++page_count;
5632 	}
5633 	if (page_count != npages) {
5634 		printf("pages %p actual count 0x%x but requested 0x%x\n",
5635 		    pages, page_count, npages);
5636 		panic("vm_page_verify_contiguous:  count error");
5637 	}
5638 	return 1;
5639 }
5640 
5641 
5642 /*
5643  *	Check the free lists for proper length etc.
5644  */
/* gate for the per-list verifier below; toggled by vm_page_verify_free_lists() */
static boolean_t vm_page_verify_this_free_list_enabled = FALSE;
/*
 * Walk one free-list queue and sanity-check every page on it: prev-link
 * integrity, busy bit, color and queue state.  "color" is the expected
 * free-queue color, or (unsigned int)-1 for the lopage/local lists.
 * If "look_for_page" is non-NULL, additionally verify that it is
 * present ("expect_page" TRUE) or absent (FALSE) on this list.
 * Returns the number of pages found on the queue.
 */
static unsigned int
vm_page_verify_free_list(
	vm_page_queue_head_t    *vm_page_queue,
	unsigned int    color,
	vm_page_t       look_for_page,
	boolean_t       expect_page)
{
	unsigned int    npages;
	vm_page_t       m;
	vm_page_t       prev_m;
	boolean_t       found_page;

	if (!vm_page_verify_this_free_list_enabled) {
		return 0;
	}

	found_page = FALSE;
	npages = 0;
	/* the queue head itself acts as the "previous" of the first page */
	prev_m = (vm_page_t)((uintptr_t)vm_page_queue);

	vm_page_queue_iterate(vm_page_queue, m, vmp_pageq) {
		if (m == look_for_page) {
			found_page = TRUE;
		}
		if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) {
			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p",
			    color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
		}
		if (!m->vmp_busy) {
			panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy",
			    color, npages, m);
		}
		if (color != (unsigned int) -1) {
			if (VM_PAGE_GET_COLOR(m) != color) {
				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u",
				    color, npages, m, VM_PAGE_GET_COLOR(m), color);
			}
			if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) {
				panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d",
				    color, npages, m, m->vmp_q_state);
			}
		} else {
			if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) {
				panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d",
				    npages, m, m->vmp_q_state);
			}
		}
		++npages;
		prev_m = m;
	}
	if (look_for_page != VM_PAGE_NULL) {
		unsigned int other_color;

		if (expect_page && !found_page) {
			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p not found phys=%u\n",
			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
			_vm_page_print(look_for_page);
			/* before panicking, report if the page strayed onto another color's list */
			for (other_color = 0;
			    other_color < vm_colors;
			    other_color++) {
				if (other_color == color) {
					continue;
				}
				vm_page_verify_free_list(&vm_page_queue_free[other_color].qhead,
				    other_color, look_for_page, FALSE);
			}
			if (color == (unsigned int) -1) {
				vm_page_verify_free_list(&vm_lopage_queue_free,
				    (unsigned int) -1, look_for_page, FALSE);
			}
			panic("vm_page_verify_free_list(color=%u)", color);
		}
		if (!expect_page && found_page) {
			printf("vm_page_verify_free_list(color=%u, npages=%u): page %p found phys=%u\n",
			    color, npages, look_for_page, VM_PAGE_GET_PHYS_PAGE(look_for_page));
		}
	}
	return npages;
}
5725 
5726 static boolean_t vm_page_verify_all_free_lists_enabled = FALSE;
5727 static void
vm_page_verify_free_lists(void)5728 vm_page_verify_free_lists( void )
5729 {
5730 	unsigned int    color, npages, nlopages;
5731 	boolean_t       toggle = TRUE;
5732 
5733 	if (!vm_page_verify_all_free_lists_enabled) {
5734 		return;
5735 	}
5736 
5737 	npages = 0;
5738 
5739 	vm_free_page_lock();
5740 
5741 	if (vm_page_verify_this_free_list_enabled == TRUE) {
5742 		/*
5743 		 * This variable has been set globally for extra checking of
5744 		 * each free list Q. Since we didn't set it, we don't own it
5745 		 * and we shouldn't toggle it.
5746 		 */
5747 		toggle = FALSE;
5748 	}
5749 
5750 	if (toggle == TRUE) {
5751 		vm_page_verify_this_free_list_enabled = TRUE;
5752 	}
5753 
5754 	for (color = 0; color < vm_colors; color++) {
5755 		npages += vm_page_verify_free_list(&vm_page_queue_free[color].qhead,
5756 		    color, VM_PAGE_NULL, FALSE);
5757 	}
5758 	nlopages = vm_page_verify_free_list(&vm_lopage_queue_free,
5759 	    (unsigned int) -1,
5760 	    VM_PAGE_NULL, FALSE);
5761 	if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) {
5762 		panic("vm_page_verify_free_lists:  "
5763 		    "npages %u free_count %d nlopages %u lo_free_count %u",
5764 		    npages, vm_page_free_count, nlopages, vm_lopage_free_count);
5765 	}
5766 
5767 	if (toggle == TRUE) {
5768 		vm_page_verify_this_free_list_enabled = FALSE;
5769 	}
5770 
5771 	vm_free_page_unlock();
5772 }
5773 
5774 #endif  /* MACH_ASSERT */
5775 
5776 
5777 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
5778 
5779 /*
5780  *	CONTIGUOUS PAGE ALLOCATION
5781  *
5782  *	Find a region large enough to contain at least n pages
5783  *	of contiguous physical memory.
5784  *
 *	This is done by traversing the vm_page_t array in a linear fashion
 *	we assume that the vm_page_t array has the available physical pages in an
 *	ordered, ascending list... this is currently true of all our implementations
 *      and must remain so... there can be 'holes' in the array...  we also can
 *	no longer tolerate the vm_page_t's in the list being 'freed' and reclaimed
 *      which used to happen via 'vm_page_convert'... that function was no longer
 *      being called and was removed...
5792  *
 *	The basic flow consists of stabilizing some of the interesting state of
 *	a vm_page_t behind the vm_page_queue and vm_page_free locks... we start our
 *	sweep at the beginning of the array looking for pages that meet our criteria
 *	for a 'stealable' page... currently we are pretty conservative... if the page
 *	meets this criteria and is physically contiguous to the previous page in the 'run'
 *      we keep developing it.  If we hit a page that doesn't fit, we reset our state
 *	and start to develop a new run... if at this point we've already considered
 *      at least MAX_CONSIDERED_BEFORE_YIELD pages, we'll drop the 2 locks we hold,
 *	and mutex_pause (which will yield the processor), to keep the latency low w/r
 *	to other threads trying to acquire free pages (or move pages from q to q),
 *	and then continue from the spot we left off... we only make 1 pass through the
 *	array.  Once we have a 'run' that is long enough, we'll go into the loop
 *      which steals the pages from the queues they're currently on... pages on the free
 *	queue can be stolen directly... pages that are on any of the other queues
 *	must be removed from the object they are tabled on... this requires taking the
 *      object lock... we do this as a 'try' to prevent deadlocks... if the 'try' fails
 *	or if the state of the page behind the vm_object lock is no longer viable, we'll
 *	dump the pages we've currently stolen back to the free list, and pick up our
 *	scan from the point where we aborted the 'current' run.
5812  *
5813  *
5814  *	Requirements:
5815  *		- neither vm_page_queue nor vm_free_list lock can be held on entry
5816  *
5817  *	Returns a pointer to a list of gobbled/wired pages or VM_PAGE_NULL.
5818  *
5819  * Algorithm:
5820  */
5821 
5822 #define MAX_CONSIDERED_BEFORE_YIELD     1000
5823 
5824 
/*
 * Reset the per-run scan state used by vm_page_find_contiguous():
 * forget the current candidate run so the next page considered starts
 * a brand new run.  prevcontaddr is set to -2 (an impossible
 * "previous contiguous address") so no page can appear adjacent to it.
 */
#define RESET_STATE_OF_RUN()    \
	MACRO_BEGIN             \
	prevcontaddr = -2;      \
	start_pnum = -1;        \
	free_considered = 0;    \
	substitute_needed = 0;  \
	npages = 0;             \
	MACRO_END
5833 
5834 /*
5835  * Can we steal in-use (i.e. not free) pages when searching for
5836  * physically-contiguous pages ?
5837  */
5838 #define VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL 1
5839 
5840 static unsigned int vm_page_find_contiguous_last_idx = 0, vm_page_lomem_find_contiguous_last_idx = 0;
5841 #if DEBUG
5842 int vm_page_find_contig_debug = 0;
5843 #endif
5844 
5845 static vm_page_t
vm_page_find_contiguous(unsigned int contig_pages,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)5846 vm_page_find_contiguous(
5847 	unsigned int    contig_pages,
5848 	ppnum_t         max_pnum,
5849 	ppnum_t         pnum_mask,
5850 	boolean_t       wire,
5851 	int             flags)
5852 {
5853 	vm_page_t       m = NULL;
5854 	ppnum_t         prevcontaddr = 0;
5855 	ppnum_t         start_pnum = 0;
5856 	unsigned int    npages = 0, considered = 0, scanned = 0;
5857 	unsigned int    page_idx = 0, start_idx = 0, last_idx = 0, orig_last_idx = 0;
5858 	unsigned int    idx_last_contig_page_found = 0;
5859 	int             free_considered = 0, free_available = 0;
5860 	int             substitute_needed = 0;
5861 	int             zone_gc_called = 0;
5862 	boolean_t       wrapped;
5863 	kern_return_t   kr;
5864 #if DEBUG
5865 	clock_sec_t     tv_start_sec = 0, tv_end_sec = 0;
5866 	clock_usec_t    tv_start_usec = 0, tv_end_usec = 0;
5867 #endif
5868 
5869 	int             yielded = 0;
5870 	int             dumped_run = 0;
5871 	int             stolen_pages = 0;
5872 	int             compressed_pages = 0;
5873 
5874 
5875 	if (contig_pages == 0) {
5876 		return VM_PAGE_NULL;
5877 	}
5878 
5879 full_scan_again:
5880 
5881 #if MACH_ASSERT
5882 	vm_page_verify_free_lists();
5883 #endif
5884 #if DEBUG
5885 	clock_get_system_microtime(&tv_start_sec, &tv_start_usec);
5886 #endif
5887 	PAGE_REPLACEMENT_ALLOWED(TRUE);
5888 
5889 	/*
5890 	 * If there are still delayed pages, try to free up some that match.
5891 	 */
5892 	if (__improbable(vm_delayed_count != 0 && contig_pages != 0)) {
5893 		vm_free_delayed_pages_contig(contig_pages, max_pnum, pnum_mask);
5894 	}
5895 
5896 	vm_page_lock_queues();
5897 	vm_free_page_lock();
5898 
5899 	RESET_STATE_OF_RUN();
5900 
5901 	scanned = 0;
5902 	considered = 0;
5903 	free_available = vm_page_free_count - vm_page_free_reserved;
5904 
5905 	wrapped = FALSE;
5906 
5907 	if (flags & KMA_LOMEM) {
5908 		idx_last_contig_page_found = vm_page_lomem_find_contiguous_last_idx;
5909 	} else {
5910 		idx_last_contig_page_found =  vm_page_find_contiguous_last_idx;
5911 	}
5912 
5913 	orig_last_idx = idx_last_contig_page_found;
5914 	last_idx = orig_last_idx;
5915 
5916 	for (page_idx = last_idx, start_idx = last_idx;
5917 	    npages < contig_pages && page_idx < vm_pages_count;
5918 	    page_idx++) {
5919 retry:
5920 		if (wrapped &&
5921 		    npages == 0 &&
5922 		    page_idx >= orig_last_idx) {
5923 			/*
5924 			 * We're back where we started and we haven't
5925 			 * found any suitable contiguous range.  Let's
5926 			 * give up.
5927 			 */
5928 			break;
5929 		}
5930 		scanned++;
5931 		m = &vm_pages[page_idx];
5932 
5933 		assert(!m->vmp_fictitious);
5934 		assert(!m->vmp_private);
5935 
5936 		if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
5937 			/* no more low pages... */
5938 			break;
5939 		}
5940 		if (!npages & ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0)) {
5941 			/*
5942 			 * not aligned
5943 			 */
5944 			RESET_STATE_OF_RUN();
5945 		} else if (VM_PAGE_WIRED(m) || m->vmp_gobbled ||
5946 		    m->vmp_laundry || m->vmp_wanted ||
5947 		    m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
5948 			/*
5949 			 * page is in a transient state
5950 			 * or a state we don't want to deal
5951 			 * with, so don't consider it which
5952 			 * means starting a new run
5953 			 */
5954 			RESET_STATE_OF_RUN();
5955 		} else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
5956 		    (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
5957 		    (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
5958 		    (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
5959 			/*
5960 			 * page needs to be on one of our queues (other then the pageout or special free queues)
5961 			 * or it needs to belong to the compressor pool (which is now indicated
5962 			 * by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out
5963 			 * from the check for VM_PAGE_NOT_ON_Q)
5964 			 * in order for it to be stable behind the
5965 			 * locks we hold at this point...
5966 			 * if not, don't consider it which
5967 			 * means starting a new run
5968 			 */
5969 			RESET_STATE_OF_RUN();
5970 		} else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) && (!m->vmp_tabled || m->vmp_busy)) {
5971 			/*
5972 			 * pages on the free list are always 'busy'
5973 			 * so we couldn't test for 'busy' in the check
5974 			 * for the transient states... pages that are
5975 			 * 'free' are never 'tabled', so we also couldn't
5976 			 * test for 'tabled'.  So we check here to make
5977 			 * sure that a non-free page is not busy and is
5978 			 * tabled on an object...
5979 			 * if not, don't consider it which
5980 			 * means starting a new run
5981 			 */
5982 			RESET_STATE_OF_RUN();
5983 		} else {
5984 			if (VM_PAGE_GET_PHYS_PAGE(m) != prevcontaddr + 1) {
5985 				if ((VM_PAGE_GET_PHYS_PAGE(m) & pnum_mask) != 0) {
5986 					RESET_STATE_OF_RUN();
5987 					goto did_consider;
5988 				} else {
5989 					npages = 1;
5990 					start_idx = page_idx;
5991 					start_pnum = VM_PAGE_GET_PHYS_PAGE(m);
5992 				}
5993 			} else {
5994 				npages++;
5995 			}
5996 			prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
5997 
5998 			VM_PAGE_CHECK(m);
5999 			if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
6000 				free_considered++;
6001 			} else {
6002 				/*
6003 				 * This page is not free.
6004 				 * If we can't steal used pages,
6005 				 * we have to give up this run
6006 				 * and keep looking.
6007 				 * Otherwise, we might need to
6008 				 * move the contents of this page
6009 				 * into a substitute page.
6010 				 */
6011 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
6012 				if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
6013 					substitute_needed++;
6014 				}
6015 #else
6016 				RESET_STATE_OF_RUN();
6017 #endif
6018 			}
6019 
6020 			if ((free_considered + substitute_needed) > free_available) {
6021 				/*
6022 				 * if we let this run continue
6023 				 * we will end up dropping the vm_page_free_count
6024 				 * below the reserve limit... we need to abort
6025 				 * this run, but we can at least re-consider this
6026 				 * page... thus the jump back to 'retry'
6027 				 */
6028 				RESET_STATE_OF_RUN();
6029 
6030 				if (free_available && considered <= MAX_CONSIDERED_BEFORE_YIELD) {
6031 					considered++;
6032 					goto retry;
6033 				}
6034 				/*
6035 				 * free_available == 0
6036 				 * so can't consider any free pages... if
6037 				 * we went to retry in this case, we'd
6038 				 * get stuck looking at the same page
6039 				 * w/o making any forward progress
6040 				 * we also want to take this path if we've already
6041 				 * reached our limit that controls the lock latency
6042 				 */
6043 			}
6044 		}
6045 did_consider:
6046 		if (considered > MAX_CONSIDERED_BEFORE_YIELD && npages <= 1) {
6047 			PAGE_REPLACEMENT_ALLOWED(FALSE);
6048 
6049 			vm_free_page_unlock();
6050 			vm_page_unlock_queues();
6051 
6052 			mutex_pause(0);
6053 
6054 			PAGE_REPLACEMENT_ALLOWED(TRUE);
6055 
6056 			vm_page_lock_queues();
6057 			vm_free_page_lock();
6058 
6059 			RESET_STATE_OF_RUN();
6060 			/*
6061 			 * reset our free page limit since we
6062 			 * dropped the lock protecting the vm_page_free_queue
6063 			 */
6064 			free_available = vm_page_free_count - vm_page_free_reserved;
6065 			considered = 0;
6066 
6067 			yielded++;
6068 
6069 			goto retry;
6070 		}
6071 		considered++;
6072 	}
6073 	m = VM_PAGE_NULL;
6074 
6075 	if (npages != contig_pages) {
6076 		if (!wrapped) {
6077 			/*
6078 			 * We didn't find a contiguous range but we didn't
6079 			 * start from the very first page.
6080 			 * Start again from the very first page.
6081 			 */
6082 			RESET_STATE_OF_RUN();
6083 			if (flags & KMA_LOMEM) {
6084 				idx_last_contig_page_found  = vm_page_lomem_find_contiguous_last_idx = 0;
6085 			} else {
6086 				idx_last_contig_page_found = vm_page_find_contiguous_last_idx = 0;
6087 			}
6088 			last_idx = 0;
6089 			page_idx = last_idx;
6090 			wrapped = TRUE;
6091 			goto retry;
6092 		}
6093 		vm_free_page_unlock();
6094 	} else {
6095 		vm_page_t       m1;
6096 		vm_page_t       m2;
6097 		unsigned int    cur_idx;
6098 		unsigned int    tmp_start_idx;
6099 		vm_object_t     locked_object = VM_OBJECT_NULL;
6100 		boolean_t       abort_run = FALSE;
6101 
6102 		assert(page_idx - start_idx == contig_pages);
6103 
6104 		tmp_start_idx = start_idx;
6105 
6106 		/*
6107 		 * first pass through to pull the free pages
6108 		 * off of the free queue so that in case we
6109 		 * need substitute pages, we won't grab any
6110 		 * of the free pages in the run... we'll clear
6111 		 * the 'free' bit in the 2nd pass, and even in
6112 		 * an abort_run case, we'll collect all of the
6113 		 * free pages in this run and return them to the free list
6114 		 */
6115 		while (start_idx < page_idx) {
6116 			m1 = &vm_pages[start_idx++];
6117 
6118 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
6119 			assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
6120 #endif
6121 
6122 			if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
6123 				unsigned int color;
6124 
6125 				color = VM_PAGE_GET_COLOR(m1);
6126 #if MACH_ASSERT
6127 				vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, m1, TRUE);
6128 #endif
6129 				vm_page_queue_remove(&vm_page_queue_free[color].qhead, m1, vmp_pageq);
6130 
6131 				VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
6132 #if MACH_ASSERT
6133 				vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, VM_PAGE_NULL, FALSE);
6134 #endif
6135 				/*
6136 				 * Clear the "free" bit so that this page
6137 				 * does not get considered for another
6138 				 * concurrent physically-contiguous allocation.
6139 				 */
6140 				m1->vmp_q_state = VM_PAGE_NOT_ON_Q;
6141 				assert(m1->vmp_busy);
6142 
6143 				vm_page_free_count--;
6144 			}
6145 		}
6146 		if (flags & KMA_LOMEM) {
6147 			vm_page_lomem_find_contiguous_last_idx = page_idx;
6148 		} else {
6149 			vm_page_find_contiguous_last_idx = page_idx;
6150 		}
6151 
6152 		/*
6153 		 * we can drop the free queue lock at this point since
6154 		 * we've pulled any 'free' candidates off of the list
6155 		 * we need it dropped so that we can do a vm_page_grab
6156 		 * when substituing for pmapped/dirty pages
6157 		 */
6158 		vm_free_page_unlock();
6159 
6160 		start_idx = tmp_start_idx;
6161 		cur_idx = page_idx - 1;
6162 
6163 		while (start_idx++ < page_idx) {
6164 			/*
6165 			 * must go through the list from back to front
6166 			 * so that the page list is created in the
6167 			 * correct order - low -> high phys addresses
6168 			 */
6169 			m1 = &vm_pages[cur_idx--];
6170 
6171 			if (m1->vmp_object == 0) {
6172 				/*
6173 				 * page has already been removed from
6174 				 * the free list in the 1st pass
6175 				 */
6176 				assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
6177 				assert(m1->vmp_offset == (vm_object_offset_t) -1);
6178 				assert(m1->vmp_busy);
6179 				assert(!m1->vmp_wanted);
6180 				assert(!m1->vmp_laundry);
6181 			} else {
6182 				vm_object_t object;
6183 				int refmod;
6184 				boolean_t disconnected, reusable;
6185 
6186 				if (abort_run == TRUE) {
6187 					continue;
6188 				}
6189 
6190 				assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
6191 
6192 				object = VM_PAGE_OBJECT(m1);
6193 
6194 				if (object != locked_object) {
6195 					if (locked_object) {
6196 						vm_object_unlock(locked_object);
6197 						locked_object = VM_OBJECT_NULL;
6198 					}
6199 					if (vm_object_lock_try(object)) {
6200 						locked_object = object;
6201 					}
6202 				}
6203 				if (locked_object == VM_OBJECT_NULL ||
6204 				    (VM_PAGE_WIRED(m1) || m1->vmp_gobbled ||
6205 				    m1->vmp_laundry || m1->vmp_wanted ||
6206 				    m1->vmp_cleaning || m1->vmp_overwriting || m1->vmp_free_when_done || m1->vmp_busy) ||
6207 				    (m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
6208 					if (locked_object) {
6209 						vm_object_unlock(locked_object);
6210 						locked_object = VM_OBJECT_NULL;
6211 					}
6212 					tmp_start_idx = cur_idx;
6213 					abort_run = TRUE;
6214 					continue;
6215 				}
6216 
6217 				disconnected = FALSE;
6218 				reusable = FALSE;
6219 
6220 				if ((m1->vmp_reusable ||
6221 				    object->all_reusable) &&
6222 				    (m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) &&
6223 				    !m1->vmp_dirty &&
6224 				    !m1->vmp_reference) {
6225 					/* reusable page... */
6226 					refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
6227 					disconnected = TRUE;
6228 					if (refmod == 0) {
6229 						/*
6230 						 * ... not reused: can steal
6231 						 * without relocating contents.
6232 						 */
6233 						reusable = TRUE;
6234 					}
6235 				}
6236 
6237 				if ((m1->vmp_pmapped &&
6238 				    !reusable) ||
6239 				    m1->vmp_dirty ||
6240 				    m1->vmp_precious) {
6241 					vm_object_offset_t offset;
6242 
6243 					m2 = vm_page_grab_options(VM_PAGE_GRAB_Q_LOCK_HELD);
6244 
6245 					if (m2 == VM_PAGE_NULL) {
6246 						if (locked_object) {
6247 							vm_object_unlock(locked_object);
6248 							locked_object = VM_OBJECT_NULL;
6249 						}
6250 						tmp_start_idx = cur_idx;
6251 						abort_run = TRUE;
6252 						continue;
6253 					}
6254 					if (!disconnected) {
6255 						if (m1->vmp_pmapped) {
6256 							refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
6257 						} else {
6258 							refmod = 0;
6259 						}
6260 					}
6261 
6262 					/* copy the page's contents */
6263 					pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2));
6264 					/* copy the page's state */
6265 					assert(!VM_PAGE_WIRED(m1));
6266 					assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
6267 					assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
6268 					assert(!m1->vmp_laundry);
6269 					m2->vmp_reference       = m1->vmp_reference;
6270 					assert(!m1->vmp_gobbled);
6271 					assert(!m1->vmp_private);
6272 					m2->vmp_no_cache        = m1->vmp_no_cache;
6273 					m2->vmp_xpmapped        = 0;
6274 					assert(!m1->vmp_busy);
6275 					assert(!m1->vmp_wanted);
6276 					assert(!m1->vmp_fictitious);
6277 					m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
6278 					m2->vmp_wpmapped        = m1->vmp_wpmapped;
6279 					assert(!m1->vmp_free_when_done);
6280 					m2->vmp_absent  = m1->vmp_absent;
6281 					m2->vmp_error   = VMP_ERROR_GET(m1);
6282 					m2->vmp_dirty   = m1->vmp_dirty;
6283 					assert(!m1->vmp_cleaning);
6284 					m2->vmp_precious        = m1->vmp_precious;
6285 					m2->vmp_clustered       = m1->vmp_clustered;
6286 					assert(!m1->vmp_overwriting);
6287 					m2->vmp_restart = m1->vmp_restart;
6288 					m2->vmp_unusual = m1->vmp_unusual;
6289 					m2->vmp_cs_validated = m1->vmp_cs_validated;
6290 					m2->vmp_cs_tainted      = m1->vmp_cs_tainted;
6291 					m2->vmp_cs_nx   = m1->vmp_cs_nx;
6292 
6293 					m2->vmp_realtime = m1->vmp_realtime;
6294 					m1->vmp_realtime = false;
6295 
6296 					/*
6297 					 * If m1 had really been reusable,
6298 					 * we would have just stolen it, so
6299 					 * let's not propagate it's "reusable"
6300 					 * bit and assert that m2 is not
6301 					 * marked as "reusable".
6302 					 */
6303 					// m2->vmp_reusable	= m1->vmp_reusable;
6304 					assert(!m2->vmp_reusable);
6305 
6306 					// assert(!m1->vmp_lopage);
6307 
6308 					if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6309 						m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
6310 						/*
6311 						 * We just grabbed m2 up above and so it isn't
6312 						 * going to be on any special Q as yet and so
6313 						 * we don't need to 'remove' it from the special
6314 						 * queues. Just resetting the state should be enough.
6315 						 */
6316 						m2->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
6317 					}
6318 
6319 					/*
6320 					 * page may need to be flushed if
6321 					 * it is marshalled into a UPL
6322 					 * that is going to be used by a device
6323 					 * that doesn't support coherency
6324 					 */
6325 					m2->vmp_written_by_kernel = TRUE;
6326 
6327 					/*
6328 					 * make sure we clear the ref/mod state
6329 					 * from the pmap layer... else we risk
6330 					 * inheriting state from the last time
6331 					 * this page was used...
6332 					 */
6333 					pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6334 
6335 					if (refmod & VM_MEM_REFERENCED) {
6336 						m2->vmp_reference = TRUE;
6337 					}
6338 					if (refmod & VM_MEM_MODIFIED) {
6339 						SET_PAGE_DIRTY(m2, TRUE);
6340 					}
6341 					offset = m1->vmp_offset;
6342 
6343 					/*
6344 					 * completely cleans up the state
6345 					 * of the page so that it is ready
6346 					 * to be put onto the free list, or
6347 					 * for this purpose it looks like it
6348 					 * just came off of the free list
6349 					 */
6350 					vm_page_free_prepare(m1);
6351 
6352 					/*
6353 					 * now put the substitute page
6354 					 * on the object
6355 					 */
6356 					vm_page_insert_internal(m2, locked_object, offset, VM_KERN_MEMORY_NONE, TRUE, TRUE, FALSE, FALSE, NULL);
6357 
6358 					if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6359 						m2->vmp_pmapped = TRUE;
6360 						m2->vmp_wpmapped = TRUE;
6361 
6362 						PMAP_ENTER(kernel_pmap, (vm_map_offset_t)m2->vmp_offset, m2,
6363 						    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, kr);
6364 
6365 						assert(kr == KERN_SUCCESS);
6366 
6367 						compressed_pages++;
6368 					} else {
6369 						if (m2->vmp_reference) {
6370 							vm_page_activate(m2);
6371 						} else {
6372 							vm_page_deactivate(m2);
6373 						}
6374 					}
6375 					PAGE_WAKEUP_DONE(m2);
6376 				} else {
6377 					assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6378 
6379 					/*
6380 					 * completely cleans up the state
6381 					 * of the page so that it is ready
6382 					 * to be put onto the free list, or
6383 					 * for this purpose it looks like it
6384 					 * just came off of the free list
6385 					 */
6386 					vm_page_free_prepare(m1);
6387 				}
6388 
6389 				stolen_pages++;
6390 			}
6391 			if (m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
6392 				/*
6393 				 * The Q state is preserved on m1 because vm_page_queues_remove doesn't
6394 				 * change it for pages marked as used-by-compressor.
6395 				 */
6396 				vm_page_assign_special_state(m1, VM_PAGE_SPECIAL_Q_BG);
6397 			}
6398 			VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
6399 			m1->vmp_snext = m;
6400 			m = m1;
6401 		}
6402 		if (locked_object) {
6403 			vm_object_unlock(locked_object);
6404 			locked_object = VM_OBJECT_NULL;
6405 		}
6406 
6407 		if (abort_run == TRUE) {
6408 			/*
6409 			 * want the index of the last
6410 			 * page in this run that was
6411 			 * successfully 'stolen', so back
6412 			 * it up 1 for the auto-decrement on use
6413 			 * and 1 more to bump back over this page
6414 			 */
6415 			page_idx = tmp_start_idx + 2;
6416 			if (page_idx >= vm_pages_count) {
6417 				if (wrapped) {
6418 					if (m != VM_PAGE_NULL) {
6419 						vm_page_unlock_queues();
6420 						vm_page_free_list(m, FALSE);
6421 						vm_page_lock_queues();
6422 						m = VM_PAGE_NULL;
6423 					}
6424 					dumped_run++;
6425 					goto done_scanning;
6426 				}
6427 				page_idx = last_idx = 0;
6428 				wrapped = TRUE;
6429 			}
6430 			abort_run = FALSE;
6431 
6432 			/*
6433 			 * We didn't find a contiguous range but we didn't
6434 			 * start from the very first page.
6435 			 * Start again from the very first page.
6436 			 */
6437 			RESET_STATE_OF_RUN();
6438 
6439 			if (flags & KMA_LOMEM) {
6440 				idx_last_contig_page_found  = vm_page_lomem_find_contiguous_last_idx = page_idx;
6441 			} else {
6442 				idx_last_contig_page_found = vm_page_find_contiguous_last_idx = page_idx;
6443 			}
6444 
6445 			last_idx = page_idx;
6446 
6447 			if (m != VM_PAGE_NULL) {
6448 				vm_page_unlock_queues();
6449 				vm_page_free_list(m, FALSE);
6450 				vm_page_lock_queues();
6451 				m = VM_PAGE_NULL;
6452 			}
6453 			dumped_run++;
6454 
6455 			vm_free_page_lock();
6456 			/*
6457 			 * reset our free page limit since we
6458 			 * dropped the lock protecting the vm_page_free_queue
6459 			 */
6460 			free_available = vm_page_free_count - vm_page_free_reserved;
6461 			goto retry;
6462 		}
6463 
6464 		for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) {
6465 			assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
6466 			assert(m1->vmp_wire_count == 0);
6467 
6468 			if (wire == TRUE) {
6469 				m1->vmp_wire_count++;
6470 				m1->vmp_q_state = VM_PAGE_IS_WIRED;
6471 			} else {
6472 				m1->vmp_gobbled = TRUE;
6473 			}
6474 		}
6475 		if (wire == FALSE) {
6476 			vm_page_gobble_count += npages;
6477 		}
6478 
6479 		/*
6480 		 * gobbled pages are also counted as wired pages
6481 		 */
6482 		vm_page_wire_count += npages;
6483 
6484 		assert(vm_page_verify_contiguous(m, npages));
6485 	}
6486 done_scanning:
6487 	PAGE_REPLACEMENT_ALLOWED(FALSE);
6488 
6489 	vm_page_unlock_queues();
6490 
6491 #if DEBUG
6492 	clock_get_system_microtime(&tv_end_sec, &tv_end_usec);
6493 
6494 	tv_end_sec -= tv_start_sec;
6495 	if (tv_end_usec < tv_start_usec) {
6496 		tv_end_sec--;
6497 		tv_end_usec += 1000000;
6498 	}
6499 	tv_end_usec -= tv_start_usec;
6500 	if (tv_end_usec >= 1000000) {
6501 		tv_end_sec++;
6502 		tv_end_sec -= 1000000;
6503 	}
6504 	if (vm_page_find_contig_debug) {
6505 		printf("%s(num=%d,low=%d): found %d pages at 0x%llx in %ld.%06ds...  started at %d...  scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages... stole %d compressed pages\n",
6506 		    __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
6507 		    (long)tv_end_sec, tv_end_usec, orig_last_idx,
6508 		        scanned, yielded, dumped_run, stolen_pages, compressed_pages);
6509 	}
6510 
6511 #endif
6512 #if MACH_ASSERT
6513 	vm_page_verify_free_lists();
6514 #endif
6515 	if (m == NULL && zone_gc_called < 2) {
6516 		printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
6517 		    __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
6518 		        scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
6519 
6520 		if (consider_buffer_cache_collect != NULL) {
6521 			(void)(*consider_buffer_cache_collect)(1);
6522 		}
6523 
6524 		zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
6525 
6526 		zone_gc_called++;
6527 
6528 		printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
6529 		goto full_scan_again;
6530 	}
6531 
6532 	return m;
6533 }
6534 
6535 /*
6536  *	Allocate a list of contiguous, wired pages.
6537  */
6538 kern_return_t
cpm_allocate(vm_size_t size,vm_page_t * list,ppnum_t max_pnum,ppnum_t pnum_mask,boolean_t wire,int flags)6539 cpm_allocate(
6540 	vm_size_t       size,
6541 	vm_page_t       *list,
6542 	ppnum_t         max_pnum,
6543 	ppnum_t         pnum_mask,
6544 	boolean_t       wire,
6545 	int             flags)
6546 {
6547 	vm_page_t               pages;
6548 	unsigned int            npages;
6549 
6550 	if (size % PAGE_SIZE != 0) {
6551 		return KERN_INVALID_ARGUMENT;
6552 	}
6553 
6554 	npages = (unsigned int) (size / PAGE_SIZE);
6555 	if (npages != size / PAGE_SIZE) {
6556 		/* 32-bit overflow */
6557 		return KERN_INVALID_ARGUMENT;
6558 	}
6559 
6560 	/*
6561 	 *	Obtain a pointer to a subset of the free
6562 	 *	list large enough to satisfy the request;
6563 	 *	the region will be physically contiguous.
6564 	 */
6565 	pages = vm_page_find_contiguous(npages, max_pnum, pnum_mask, wire, flags);
6566 
6567 	if (pages == VM_PAGE_NULL) {
6568 		return KERN_NO_SPACE;
6569 	}
6570 	/*
6571 	 * determine need for wakeups
6572 	 */
6573 	if (vm_page_free_count < vm_page_free_min) {
6574 		vm_free_page_lock();
6575 		if (vm_pageout_running == FALSE) {
6576 			vm_free_page_unlock();
6577 			thread_wakeup((event_t) &vm_page_free_wanted);
6578 		} else {
6579 			vm_free_page_unlock();
6580 		}
6581 	}
6582 
6583 	VM_CHECK_MEMORYSTATUS;
6584 
6585 	/*
6586 	 *	The CPM pages should now be available and
6587 	 *	ordered by ascending physical address.
6588 	 */
6589 	assert(vm_page_verify_contiguous(pages, npages));
6590 
6591 	if (flags & KMA_ZERO) {
6592 		for (vm_page_t m = pages; m; m = NEXT_PAGE(m)) {
6593 			vm_page_zero_fill(m);
6594 		}
6595 	}
6596 
6597 	*list = pages;
6598 	return KERN_SUCCESS;
6599 }
6600 
6601 
6602 unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT;
6603 
6604 /*
6605  * when working on a 'run' of pages, it is necessary to hold
6606  * the vm_page_queue_lock (a hot global lock) for certain operations
6607  * on the page... however, the majority of the work can be done
6608  * while merely holding the object lock... in fact there are certain
6609  * collections of pages that don't require any work brokered by the
6610  * vm_page_queue_lock... to mitigate the time spent behind the global
6611  * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
6612  * while doing all of the work that doesn't require the vm_page_queue_lock...
6613  * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the
6614  * necessary work for each page... we will grab the busy bit on the page
6615  * if it's not already held so that vm_page_do_delayed_work can drop the object lock
6616  * if it can't immediately take the vm_page_queue_lock in order to compete
6617  * for the locks in the same order that vm_pageout_scan takes them.
6618  * the operation names are modeled after the names of the routines that
6619  * need to be called in order to make the changes very obvious in the
6620  * original loop
6621  */
6622 
/*
 * Second pass of the two-pass "delayed work" scheme described above:
 * acquire the vm_page_queue_lock once and apply the batched per-page
 * operations recorded in dwp[0..dw_count-1].
 *
 * Caller must hold the object lock; it is held throughout (though it
 * may be briefly dropped and re-taken below to respect lock ordering).
 * Pages freed via DW_vm_page_free are collected on a local list and
 * released in one vm_page_free_list() call after the queue lock is
 * dropped.
 */
void
vm_page_do_delayed_work(
	vm_object_t     object,
	vm_tag_t        tag,
	struct vm_page_delayed_work *dwp,
	int             dw_count)
{
	int             j;
	vm_page_t       m;
	vm_page_t       local_free_q = VM_PAGE_NULL;

	/*
	 * pageout_scan takes the vm_page_lock_queues first
	 * then tries for the object lock... to avoid what
	 * is effectively a lock inversion, we'll go to the
	 * trouble of taking them in that same order... otherwise
	 * if this object contains the majority of the pages resident
	 * in the UBC (or a small set of large objects actively being
	 * worked on contain the majority of the pages), we could
	 * cause the pageout_scan thread to 'starve' in its attempt
	 * to find pages to move to the free queue, since it has to
	 * successfully acquire the object lock of any candidate page
	 * before it can steal/clean it.
	 */
	if (!vm_page_trylockspin_queues()) {
		/* drop the object lock and re-take both in pageout_scan's order */
		vm_object_unlock(object);

		/*
		 * "Turnstile enabled vm_pageout_scan" can be runnable
		 * for a very long time without getting on a core.
		 * If this is a higher priority thread it could be
		 * waiting here for a very long time respecting the fact
		 * that pageout_scan would like its object after VPS does
		 * a mutex_pause(0).
		 * So we cap the number of yields in the vm_object_lock_avoid()
		 * case to a single mutex_pause(0) which will give vm_pageout_scan
		 * 10us to run and grab the object if needed.
		 */
		vm_page_lockspin_queues();

		for (j = 0;; j++) {
			if ((!vm_object_lock_avoid(object) ||
			    (vps_dynamic_priority_enabled && (j > 0))) &&
			    _vm_object_lock_try(object)) {
				break;
			}
			/* couldn't get the object lock; back off with queues unlocked */
			vm_page_unlock_queues();
			mutex_pause(j);
			vm_page_lockspin_queues();
		}
	}
	for (j = 0; j < dw_count; j++, dwp++) {
		m = dwp->dw_m;

		if (dwp->dw_mask & DW_vm_pageout_throttle_up) {
			vm_pageout_throttle_up(m);
		}
#if CONFIG_PHANTOM_CACHE
		if (dwp->dw_mask & DW_vm_phantom_cache_update) {
			vm_phantom_cache_update(m);
		}
#endif
		if (dwp->dw_mask & DW_vm_page_wire) {
			vm_page_wire(m, tag, FALSE);
		} else if (dwp->dw_mask & DW_vm_page_unwire) {
			boolean_t       queueit;

			/* don't requeue if the page is about to be freed or deactivated anyway */
			queueit = (dwp->dw_mask & (DW_vm_page_free | DW_vm_page_deactivate_internal)) ? FALSE : TRUE;

			vm_page_unwire(m, queueit);
		}
		if (dwp->dw_mask & DW_vm_page_free) {
			vm_page_free_prepare_queues(m);

			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
			/*
			 * Add this page to our list of reclaimed pages,
			 * to be freed later.
			 */
			m->vmp_snext = local_free_q;
			local_free_q = m;
		} else {
			if (dwp->dw_mask & DW_vm_page_deactivate_internal) {
				vm_page_deactivate_internal(m, FALSE);
			} else if (dwp->dw_mask & DW_vm_page_activate) {
				if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
					vm_page_activate(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_speculate) {
				vm_page_speculate(m, TRUE);
			} else if (dwp->dw_mask & DW_enqueue_cleaned) {
				/*
				 * if we didn't hold the object lock and did this,
				 * we might disconnect the page, then someone might
				 * soft fault it back in, then we would put it on the
				 * cleaned queue, and so we would have a referenced (maybe even dirty)
				 * page on that queue, which we don't want
				 */
				int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

				if ((refmod_state & VM_MEM_REFERENCED)) {
					/*
					 * this page has been touched since it got cleaned; let's activate it
					 * if it hasn't already been
					 */
					VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);

					if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
						vm_page_activate(m);
					}
				} else {
					m->vmp_reference = FALSE;
					vm_page_enqueue_cleaned(m);
				}
			} else if (dwp->dw_mask & DW_vm_page_lru) {
				vm_page_lru(m);
			} else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, TRUE);
				}
			}
			if (dwp->dw_mask & DW_set_reference) {
				m->vmp_reference = TRUE;
			} else if (dwp->dw_mask & DW_clear_reference) {
				m->vmp_reference = FALSE;
			}

			if (dwp->dw_mask & DW_move_page) {
				if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
					vm_page_queues_remove(m, FALSE);

					assert(VM_PAGE_OBJECT(m) != kernel_object);

					vm_page_enqueue_inactive(m, FALSE);
				}
			}
			if (dwp->dw_mask & DW_clear_busy) {
				m->vmp_busy = FALSE;
			}

			if (dwp->dw_mask & DW_PAGE_WAKEUP) {
				PAGE_WAKEUP(m);
			}
		}
	}
	vm_page_unlock_queues();

	/* free the reclaimed pages now that the hot queue lock is dropped */
	if (local_free_q) {
		vm_page_free_list(local_free_q, TRUE);
	}

	VM_CHECK_MEMORYSTATUS;
}
6777 
/*
 * Panic helper for vm_page_alloc_list(): a KMA_NOFAIL allocation
 * must never fail, so any error return is fatal.  Kept out of line
 * (and marked __abortlike) so the caller's error path stays small.
 */
__abortlike
static void
__vm_page_alloc_list_failed_panic(
	vm_size_t       page_count,
	kma_flags_t     flags,
	kern_return_t   kr)
{
	panic("vm_page_alloc_list(%zd, 0x%x) failed unexpectedly with %d",
	    (size_t)page_count, flags, kr);
}
6788 
/*
 * Allocate page_count physical pages and return them chained through
 * vmp_snext in *list (list is in reverse order of allocation).
 *
 * flags:
 *   KMA_LOMEM      - grab from the low-memory pool (vm_page_grablo)
 *   KMA_NOPAGEWAIT - fail with KERN_RESOURCE_SHORTAGE instead of blocking
 *   KMA_NOFAIL     - never return failure; panics if the request cannot
 *                    possibly be satisfied
 *   KMA_ZERO / KMA_NOENCRYPT - zero-fill the pages before returning
 *
 * On failure every page grabbed so far is returned to the free list
 * and *list is left untouched.
 */
kern_return_t
vm_page_alloc_list(
	vm_size_t   page_count,
	kma_flags_t flags,
	vm_page_t  *list)
{
	vm_page_t       page_list = VM_PAGE_NULL;
	vm_page_t       mem;
	kern_return_t   kr = KERN_SUCCESS;
	int             page_grab_count = 0;
#if DEVELOPMENT || DEBUG
	task_t          task;
#endif /* DEVELOPMENT || DEBUG */

	for (vm_size_t i = 0; i < page_count; i++) {
		/* retry loop: keep grabbing (or waiting) until a page arrives or we give up */
		for (;;) {
			if (flags & KMA_LOMEM) {
				mem = vm_page_grablo();
			} else {
				mem = vm_page_grab();
			}

			if (mem != VM_PAGE_NULL) {
				break;
			}

			if (flags & KMA_NOPAGEWAIT) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
				/* no blocking wait exists for the low-memory pool */
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}

			/* VM privileged threads should have waited in vm_page_grab() and not get here. */
			assert(!(current_thread()->options & TH_OPT_VMPRIV));

			if ((flags & KMA_NOFAIL) == 0) {
				/*
				 * If the request can never be satisfied (wired pages plus
				 * the free-page target already exceed max_mem, or the ask
				 * itself doesn't fit in what's left), fail now rather than
				 * waiting forever.
				 */
				uint64_t unavailable = ptoa_64(vm_page_wire_count + vm_page_free_target);
				if (unavailable > max_mem || ptoa_64(page_count) > (max_mem - unavailable)) {
					kr = KERN_RESOURCE_SHORTAGE;
					goto out;
				}
			}
			VM_PAGE_WAIT();
		}

		page_grab_count++;
		mem->vmp_snext = page_list;
		page_list = mem;
	}

	/* NOTE: KMA_NOENCRYPT also implies zero-fill here, matching KMA_ZERO */
	if ((KMA_ZERO | KMA_NOENCRYPT) & flags) {
		for (mem = page_list; mem; mem = mem->vmp_snext) {
			vm_page_zero_fill(mem);
		}
	}

out:
#if DEBUG || DEVELOPMENT
	/* account the grabbed pages against the current task's ledger */
	task = current_task_early();
	if (task != NULL) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
	}
#endif

	if (kr == KERN_SUCCESS) {
		*list = page_list;
	} else if (flags & KMA_NOFAIL) {
		__vm_page_alloc_list_failed_panic(page_count, flags, kr);
	} else {
		/* give back whatever we managed to grab */
		vm_page_free_list(page_list, FALSE);
	}

	return kr;
}
6866 
/*
 * Set the page's offset within its object.
 * Caller is responsible for any locking/queue invariants; this is a
 * raw field setter.
 */
void
vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
{
	page->vmp_offset = offset;
}
6872 
/*
 * Return the next page in a vmp_snext-chained singly linked list
 * (as built by e.g. vm_page_alloc_list()).
 */
vm_page_t
vm_page_get_next(vm_page_t page)
{
	return page->vmp_snext;
}
6878 
/*
 * Return the page's offset within its object.
 */
vm_object_offset_t
vm_page_get_offset(vm_page_t page)
{
	return page->vmp_offset;
}
6884 
/*
 * Return the physical page number backing this vm_page.
 */
ppnum_t
vm_page_get_phys_page(vm_page_t page)
{
	return VM_PAGE_GET_PHYS_PAGE(page);
}
6890 
6891 
6892 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
6893 
6894 #if HIBERNATION
6895 
6896 static vm_page_t hibernate_gobble_queue;
6897 
6898 static int  hibernate_drain_pageout_queue(struct vm_pageout_queue *);
6899 static int  hibernate_flush_dirty_pages(int);
6900 static int  hibernate_flush_queue(vm_page_queue_head_t *, int);
6901 
6902 void hibernate_flush_wait(void);
6903 void hibernate_mark_in_progress(void);
6904 void hibernate_clear_in_progress(void);
6905 
6906 void            hibernate_free_range(int, int);
6907 void            hibernate_hash_insert_page(vm_page_t);
6908 uint32_t        hibernate_mark_as_unneeded(addr64_t, addr64_t, hibernate_page_list_t *, hibernate_page_list_t *);
6909 uint32_t        hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *);
6910 ppnum_t         hibernate_lookup_paddr(unsigned int);
6911 
/*
 * Counters describing one hibernation flush/discard pass.
 * "hibernate_*" fields are updated by the flush/drain paths;
 * "cd_*" fields by the consider-discard / page-list paths.
 * Reset via hibernate_reset_stats().
 */
struct hibernate_statistics {
	int hibernate_considered;         /* pages examined by hibernate_flush_queue */
	int hibernate_reentered_on_q;     /* pages put back on the tail of their queue */
	int hibernate_found_dirty;        /* dirty pages handed to vm_pageout_cluster */
	int hibernate_skipped_cleaning;   /* skipped: page had vmp_cleaning set */
	int hibernate_skipped_transient;  /* skipped: busy/absent/error/dead-object etc. */
	int hibernate_skipped_precious;   /* skipped: clean but precious */
	int hibernate_skipped_external;   /* skipped: external page while hibernate_skip_external */
	int hibernate_queue_nolock;       /* gave up trying to lock the page's object */
	int hibernate_queue_paused;       /* times we mutex_paused waiting for an object lock */
	int hibernate_throttled;          /* times we waited on a throttled pageout queue */
	int hibernate_throttle_timeout;   /* throttle waits that exhausted their retries */
	int hibernate_drained;            /* successful pageout-queue drain wakeups */
	int hibernate_drain_timeout;      /* drain waits that timed out */
	int cd_lock_failed;               /* consider_discard: object try-lock failed */
	int cd_found_precious;            /* consider_discard: page precious */
	int cd_found_wired;               /* consider_discard: page wired */
	int cd_found_busy;                /* consider_discard: page busy or object dead */
	int cd_found_unusual;             /* consider_discard: absent/unusual/error */
	int cd_found_cleaning;            /* consider_discard: vmp_cleaning set */
	int cd_found_laundry;             /* consider_discard: vmp_laundry set */
	int cd_found_dirty;               /* consider_discard: dirty, must be saved */
	int cd_found_xpmapped;            /* clean xpmapped pages kept (under limit) */
	int cd_skipped_xpmapped;          /* clean xpmapped pages discarded (over limit) */
	int cd_local_free;                /* pages found on per-cpu free lists */
	int cd_total_free;                /* total free pages accounted */
	int cd_vm_page_wire_count;        /* snapshot of vm_page_wire_count */
	int cd_vm_struct_pages_unneeded;  /* VM-struct pages excluded from the image */
	int cd_pages;                     /* total pages considered for the image */
	int cd_discarded;                 /* pages marked discardable */
	int cd_count_wire;                /* wired-page count after subtractions */
} hibernate_stats;
6944 
6945 
6946 /*
6947  * clamp the number of 'xpmapped' pages we'll sweep into the hibernation image
6948  * so that we don't overrun the estimated image size, which would
6949  * result in a hibernation failure.
6950  *
6951  * We use a size value instead of pages because we don't want to take up more space
6952  * on disk if the system has a 16K page size vs 4K. Also, we are not guaranteed
6953  * to have that additional space available.
6954  *
6955  * Since this was set at 40000 pages on X86 we are going to use 160MB as our
6956  * xpmapped size.
6957  */
6958 #define HIBERNATE_XPMAPPED_LIMIT        ((160 * 1024 * 1024ULL) / PAGE_SIZE)
6959 
6960 
/*
 * Wait for a pageout queue to empty before hibernation proceeds.
 * Repeatedly sets pgo_draining and sleeps (5s timeout per round,
 * queues unlocked while blocked) until the queue drains.
 *
 * Returns 0 on success, 1 on a timed-out drain of the internal
 * queue (a timeout on the external queue is tolerated and also
 * returns 0, since external pageout may be stalled by a filesystem).
 */
static int
hibernate_drain_pageout_queue(struct vm_pageout_queue *q)
{
	wait_result_t   wait_result;

	vm_page_lock_queues();

	while (!vm_page_queue_empty(&q->pgo_pending)) {
		q->pgo_draining = TRUE;

		/* NOTE(review): &q->pgo_laundry + 1 appears to be the "drained" wakeup event — confirm against the pageout thread */
		assert_wait_timeout((event_t) (&q->pgo_laundry + 1), THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);

		vm_page_unlock_queues();

		wait_result = thread_block(THREAD_CONTINUE_NULL);

		if (wait_result == THREAD_TIMED_OUT && !vm_page_queue_empty(&q->pgo_pending)) {
			hibernate_stats.hibernate_drain_timeout++;

			if (q == &vm_pageout_queue_external) {
				return 0;
			}

			return 1;
		}
		vm_page_lock_queues();

		hibernate_stats.hibernate_drained++;
	}
	vm_page_unlock_queues();

	return 0;
}
6994 
6995 
6996 boolean_t hibernate_skip_external = FALSE;
6997 
/*
 * Walk up to qcount pages of a page queue, pushing every dirty page
 * to the pageout (compressor) path so the hibernation image only has
 * to capture clean memory.  Pages that cannot or need not be cleaned
 * are rotated to the tail of the queue.
 *
 * Runs with the page queues lock held; per-page object locks are
 * taken with try-locks (we already hold the queues lock, so blocking
 * would invert the scan order) and the queues lock is dropped while
 * waiting on throttled pageout queues.
 *
 * Returns 0 on success, 1 if the flush should be aborted (abort
 * requested, or the internal pageout queue stayed throttled).
 */
static int
hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     l_object = NULL;
	vm_object_t     m_object = NULL;
	int             refmod_state = 0;
	int             try_failed_count = 0;
	int             retval = 0;
	int             current_run = 0;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_pageout_queue *tq;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(q), qcount);

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;

	vm_page_lock_queues();

	while (qcount && !vm_page_queue_empty(q)) {
		/* every 1000 pages, check whether the hibernation was aborted */
		if (current_run++ == 1000) {
			if (hibernate_should_abort()) {
				retval = 1;
				break;
			}
			current_run = 0;
		}

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			/*
			 * Try to lock object; since we've alread got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* lock is too contended; rotate the page and move on */
					hibernate_stats.hibernate_queue_nolock++;

					goto reenter_pg_on_q;
				}

				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();

				hibernate_stats.hibernate_queue_paused++;
				continue;
			} else {
				l_object = m_object;
			}
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m)) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_cleaning) {
				hibernate_stats.hibernate_skipped_cleaning++;
			} else {
				hibernate_stats.hibernate_skipped_transient++;
			}

			goto reenter_pg_on_q;
		}
		if (m_object->copy == VM_OBJECT_NULL) {
			if (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) {
				/*
				 * let the normal hibernate image path
				 * deal with these
				 */
				goto reenter_pg_on_q;
			}
		}
		if (!m->vmp_dirty && m->vmp_pmapped) {
			/* pick up any modified state recorded only in the pmap */
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if ((refmod_state & VM_MEM_MODIFIED)) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		} else {
			refmod_state = 0;
		}

		if (!m->vmp_dirty) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			if (m->vmp_precious) {
				hibernate_stats.hibernate_skipped_precious++;
			}

			goto reenter_pg_on_q;
		}

		if (hibernate_skip_external == TRUE && !m_object->internal) {
			hibernate_stats.hibernate_skipped_external++;

			goto reenter_pg_on_q;
		}
		tq = NULL;

		/* figure out whether the target pageout queue is throttled */
		if (m_object->internal) {
			if (VM_PAGE_Q_THROTTLED(iq)) {
				tq = iq;
			}
		} else if (VM_PAGE_Q_THROTTLED(eq)) {
			tq = eq;
		}

		if (tq != NULL) {
			wait_result_t   wait_result;
			int             wait_count = 5;

			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}

			/* wait (up to 5 x 1s) for the throttled queue to open up */
			while (retval == 0) {
				tq->pgo_throttled = TRUE;

				assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);

				vm_page_unlock_queues();

				wait_result = thread_block(THREAD_CONTINUE_NULL);

				vm_page_lock_queues();

				if (wait_result != THREAD_TIMED_OUT) {
					break;
				}
				if (!VM_PAGE_Q_THROTTLED(tq)) {
					break;
				}

				if (hibernate_should_abort()) {
					retval = 1;
				}

				if (--wait_count == 0) {
					hibernate_stats.hibernate_throttle_timeout++;

					if (tq == eq) {
						/* external queue stuck: skip external pages from here on */
						hibernate_skip_external = TRUE;
						break;
					}
					retval = 1;
				}
			}
			if (retval) {
				break;
			}

			hibernate_stats.hibernate_throttled++;

			continue;
		}
		/*
		 * we've already factored out pages in the laundry which
		 * means this page can't be on the pageout queue so it's
		 * safe to do the vm_page_queues_remove
		 */
		vm_page_queues_remove(m, TRUE);

		if (m_object->internal == TRUE) {
			pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m), PMAP_OPTIONS_COMPRESSOR, NULL);
		}

		vm_pageout_cluster(m);

		hibernate_stats.hibernate_found_dirty++;

		goto next_pg;

reenter_pg_on_q:
		/* rotate the page to the tail so we don't revisit it this pass */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		hibernate_stats.hibernate_reentered_on_q++;
next_pg:
		hibernate_stats.hibernate_considered++;

		qcount--;
		try_failed_count = 0;
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	vm_page_unlock_queues();

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0);

	return retval;
}
7218 
7219 
/*
 * Flush all dirty pageable memory ahead of hibernation, queue by
 * queue: speculative ages, inactive, anonymous, cleaned, then active,
 * draining the internal pageout queue between phases (and finally the
 * external one unless it has been flagged to skip).
 *
 * pass == 1 brackets the active-queue flush with compressor warmup
 * recording.  Returns 0 on success, 1 if any flush/drain aborted.
 */
static int
hibernate_flush_dirty_pages(int pass)
{
	struct vm_speculative_age_q     *aq;
	uint32_t        i;

	/* first fold any per-cpu local pages back into the global queues */
	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, FALSE);
		}
	}

	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		int             qcount;
		vm_page_t       m;

		aq = &vm_page_queue_speculative[i];

		if (vm_page_queue_empty(&aq->age_q)) {
			continue;
		}
		qcount = 0;

		/* count the queue under the lock; the count bounds the flush pass */
		vm_page_lockspin_queues();

		vm_page_queue_iterate(&aq->age_q, m, vmp_pageq) {
			qcount++;
		}
		vm_page_unlock_queues();

		if (qcount) {
			if (hibernate_flush_queue(&aq->age_q, qcount)) {
				return 1;
			}
		}
	}
	if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) {
		return 1;
	}
	/* XXX FBDP TODO: flush secluded queue */
	if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) {
		return 1;
	}
	if (hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) {
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		return 1;
	}

	if (pass == 1) {
		vm_compressor_record_warmup_start();
	}

	if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) {
		if (pass == 1) {
			vm_compressor_record_warmup_end();
		}
		return 1;
	}
	if (pass == 1) {
		vm_compressor_record_warmup_end();
	}

	if (hibernate_skip_external == FALSE && hibernate_drain_pageout_queue(&vm_pageout_queue_external)) {
		return 1;
	}

	return 0;
}
7296 
7297 
7298 void
hibernate_reset_stats()7299 hibernate_reset_stats()
7300 {
7301 	bzero(&hibernate_stats, sizeof(struct hibernate_statistics));
7302 }
7303 
7304 
/*
 * Top-level pre-hibernation memory flush: push dirty pages to the
 * compressor (hibernate_flush_dirty_pages pass 1), flush the
 * compressor itself, then try to shed wired memory via the buffer
 * cache collector and a zone GC.  Emits KDBG tracepoints and HIBLOG
 * summaries throughout.
 *
 * Returns 0 on success, nonzero if the flush was aborted.
 */
int
hibernate_flush_memory()
{
	int     retval;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_START, vm_page_free_count, 0, 0, 0, 0);

	hibernate_cleaning_in_progress = TRUE;
	hibernate_skip_external = FALSE;

	if ((retval = hibernate_flush_dirty_pages(1)) == 0) {
		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_START, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		vm_compressor_flush();

		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 10) | DBG_FUNC_END, VM_PAGE_COMPRESSOR_COUNT, 0, 0, 0, 0);

		/* optional hook: shrink the buffer cache, then GC zones, to free wired pages */
		if (consider_buffer_cache_collect != NULL) {
			unsigned int orig_wire_count;

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_START, 0, 0, 0, 0, 0);
			orig_wire_count = vm_page_wire_count;

			(void)(*consider_buffer_cache_collect)(1);
			zone_gc(ZONE_GC_DRAIN);

			HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);

			KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 7) | DBG_FUNC_END, orig_wire_count - vm_page_wire_count, 0, 0, 0, 0);
		}
	}
	hibernate_cleaning_in_progress = FALSE;

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 3) | DBG_FUNC_END, vm_page_free_count, hibernate_stats.hibernate_found_dirty, retval, 0, 0);

	if (retval) {
		HIBLOG("hibernate_flush_memory() failed to finish - vm_page_compressor_count(%d)\n", VM_PAGE_COMPRESSOR_COUNT);
	}


	HIBPRINT("hibernate_flush_memory() considered(%d) reentered_on_q(%d) found_dirty(%d)\n",
	    hibernate_stats.hibernate_considered,
	    hibernate_stats.hibernate_reentered_on_q,
	    hibernate_stats.hibernate_found_dirty);
	HIBPRINT("   skipped_cleaning(%d) skipped_transient(%d) skipped_precious(%d) skipped_external(%d) queue_nolock(%d)\n",
	    hibernate_stats.hibernate_skipped_cleaning,
	    hibernate_stats.hibernate_skipped_transient,
	    hibernate_stats.hibernate_skipped_precious,
	    hibernate_stats.hibernate_skipped_external,
	    hibernate_stats.hibernate_queue_nolock);
	HIBPRINT("   queue_paused(%d) throttled(%d) throttle_timeout(%d) drained(%d) drain_timeout(%d)\n",
	    hibernate_stats.hibernate_queue_paused,
	    hibernate_stats.hibernate_throttled,
	    hibernate_stats.hibernate_throttle_timeout,
	    hibernate_stats.hibernate_drained,
	    hibernate_stats.hibernate_drain_timeout);

	return retval;
}
7366 
7367 
/*
 * Clear every bank bitmap in a hibernate page list ("bit clear" ==
 * page must be saved, per the convention documented above
 * hibernate_page_list_setall), then set the out-of-range bits in the
 * final 32-bit word of each bank so pages past last_page are never
 * treated as needing to be saved.  Banks are laid out back-to-back,
 * so the next bank header starts right after this bank's bitmap words.
 */
static void
hibernate_page_list_zero(hibernate_page_list_t *list)
{
	uint32_t             bank;
	hibernate_bitmap_t * bitmap;

	bitmap = &list->bank_bitmap[0];
	for (bank = 0; bank < list->bank_count; bank++) {
		uint32_t last_bit;

		/* bitmapwords << 2 == byte count (32-bit words) */
		bzero((void *) &bitmap->bitmap[0], bitmap->bitmapwords << 2);
		// set out-of-bound bits at end of bitmap.
		last_bit = ((bitmap->last_page - bitmap->first_page + 1) & 31);
		if (last_bit) {
			bitmap->bitmap[bitmap->bitmapwords - 1] = (0xFFFFFFFF >> last_bit);
		}

		/* banks are contiguous: next header follows this bank's bitmap words */
		bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
	}
}
7388 
7389 void
hibernate_free_gobble_pages(void)7390 hibernate_free_gobble_pages(void)
7391 {
7392 	vm_page_t m, next;
7393 	uint32_t  count = 0;
7394 
7395 	m = (vm_page_t) hibernate_gobble_queue;
7396 	while (m) {
7397 		next = m->vmp_snext;
7398 		vm_page_free(m);
7399 		count++;
7400 		m = next;
7401 	}
7402 	hibernate_gobble_queue = VM_PAGE_NULL;
7403 
7404 	if (count) {
7405 		HIBLOG("Freed %d pages\n", count);
7406 	}
7407 }
7408 
/*
 * Decide whether page m can be discarded on wakeup instead of being
 * written into the hibernation image.  A page is discardable when it
 * is clean (after consulting the pmap ref/mod state) or belongs to a
 * volatile/empty purgeable object — except that a limited budget of
 * clean, referenced, executable-mapped (xpmapped) file pages is kept
 * in the image to speed up resume.
 *
 * The object is only try-locked (we may be called with the page
 * queues held); any failure to qualify just returns FALSE.  When
 * preflight is true, no statistics are updated.
 */
static boolean_t
hibernate_consider_discard(vm_page_t m, boolean_t preflight)
{
	vm_object_t object = NULL;
	int                  refmod_state;
	boolean_t            discard = FALSE;

	do {
		if (m->vmp_private) {
			panic("hibernate_consider_discard: private");
		}

		object = VM_PAGE_OBJECT(m);

		if (!vm_object_lock_try(object)) {
			/* leave object NULL so we don't unlock something we don't hold */
			object = NULL;
			if (!preflight) {
				hibernate_stats.cd_lock_failed++;
			}
			break;
		}
		if (VM_PAGE_WIRED(m)) {
			if (!preflight) {
				hibernate_stats.cd_found_wired++;
			}
			break;
		}
		if (m->vmp_precious) {
			if (!preflight) {
				hibernate_stats.cd_found_precious++;
			}
			break;
		}
		if (m->vmp_busy || !object->alive) {
			/*
			 *	Somebody is playing with this page.
			 */
			if (!preflight) {
				hibernate_stats.cd_found_busy++;
			}
			break;
		}
		if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
			/*
			 * If it's unusual in anyway, ignore it
			 */
			if (!preflight) {
				hibernate_stats.cd_found_unusual++;
			}
			break;
		}
		if (m->vmp_cleaning) {
			if (!preflight) {
				hibernate_stats.cd_found_cleaning++;
			}
			break;
		}
		if (m->vmp_laundry) {
			if (!preflight) {
				hibernate_stats.cd_found_laundry++;
			}
			break;
		}
		if (!m->vmp_dirty) {
			/* fold in ref/mod state the pmap may be holding */
			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		/*
		 * If it's clean or purgeable we can discard the page on wakeup.
		 */
		discard = (!m->vmp_dirty)
		    || (VM_PURGABLE_VOLATILE == object->purgable)
		    || (VM_PURGABLE_EMPTY == object->purgable);


		if (discard == FALSE) {
			if (!preflight) {
				hibernate_stats.cd_found_dirty++;
			}
		} else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
			/* keep hot executable-mapped file pages, up to the image-size budget */
			if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
				if (!preflight) {
					hibernate_stats.cd_found_xpmapped++;
				}
				discard = FALSE;
			} else {
				if (!preflight) {
					hibernate_stats.cd_skipped_xpmapped++;
				}
			}
		}
	} while (FALSE);

	if (object) {
		vm_object_unlock(object);
	}

	return discard;
}
7515 
7516 
/*
 * Actually discard a page that hibernate_consider_discard() approved:
 * disconnect its mappings and free it.  If the page belongs to a
 * volatile purgeable object, the whole object is transitioned to
 * EMPTY (removed from its purgeable queue, token deleted if ripe)
 * and vm_page_purgeable_count is adjusted accordingly.
 *
 * Sanity panics mirror the preconditions consider_discard enforced;
 * the object lock is only taken on MACH_ASSERT/DEBUG builds (see the
 * comment about hibernate_vm_unlock() below).
 */
static void
hibernate_discard_page(vm_page_t m)
{
	vm_object_t m_object;

	if (m->vmp_absent || m->vmp_unusual || VMP_ERROR_GET(m)) {
		/*
		 * If it's unusual in anyway, ignore
		 */
		return;
	}

	m_object = VM_PAGE_OBJECT(m);

#if MACH_ASSERT || DEBUG
	if (!vm_object_lock_try(m_object)) {
		panic("hibernate_discard_page(%p) !vm_object_lock_try", m);
	}
#else
	/* No need to lock page queue for token delete, hibernate_vm_unlock()
	 *  makes sure these locks are uncontended before sleep */
#endif /* MACH_ASSERT || DEBUG */

	if (m->vmp_pmapped == TRUE) {
		/* tear down all mappings; ref/mod result is irrelevant for a discard */
		__unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	}

	if (m->vmp_laundry) {
		panic("hibernate_discard_page(%p) laundry", m);
	}
	if (m->vmp_private) {
		panic("hibernate_discard_page(%p) private", m);
	}
	if (m->vmp_fictitious) {
		panic("hibernate_discard_page(%p) fictitious", m);
	}

	if (VM_PURGABLE_VOLATILE == m_object->purgable) {
		/* object should be on a queue */
		assert((m_object->objq.next != NULL) && (m_object->objq.prev != NULL));
		purgeable_q_t old_queue = vm_purgeable_object_remove(m_object);
		assert(old_queue);
		if (m_object->purgeable_when_ripe) {
			vm_purgeable_token_delete_first(old_queue);
		}
		vm_object_lock_assert_exclusive(m_object);
		m_object->purgable = VM_PURGABLE_EMPTY;

		/*
		 * Purgeable ledgers:  pages of VOLATILE and EMPTY objects are
		 * accounted in the "volatile" ledger, so no change here.
		 * We have to update vm_page_purgeable_count, though, since we're
		 * effectively purging this object.
		 */
		unsigned int delta;
		assert(m_object->resident_page_count >= m_object->wired_page_count);
		delta = (m_object->resident_page_count - m_object->wired_page_count);
		assert(vm_page_purgeable_count >= delta);
		assert(delta > 0);
		OSAddAtomic(-delta, (SInt32 *)&vm_page_purgeable_count);
	}

	vm_page_free(m);

#if MACH_ASSERT || DEBUG
	vm_object_unlock(m_object);
#endif  /* MACH_ASSERT || DEBUG */
}
7585 
7586 /*
7587  *  Grab locks for hibernate_page_list_setall()
7588  */
/*
 *  Grab locks for hibernate_page_list_setall()
 *
 *  Acquisition order (must mirror hibernate_vm_unlock_queues in
 *  reverse): compressor object -> page queues -> free-page lock ->
 *  purgeable queue lock -> every per-cpu local page-queue lock.
 */
void
hibernate_vm_lock_queues(void)
{
	vm_object_lock(compressor_object);
	vm_page_lock_queues();
	vm_free_page_lock();
	lck_mtx_lock(&vm_purgeable_queue_lock);

	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
}
7603 
/*
 *  Release the locks taken by hibernate_vm_lock_queues(), in the
 *  exact reverse order of acquisition.
 */
void
hibernate_vm_unlock_queues(void)
{
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	lck_mtx_unlock(&vm_purgeable_queue_lock);
	vm_free_page_unlock();
	vm_page_unlock_queues();
	vm_object_unlock(compressor_object);
}
7617 
7618 /*
7619  *  Bits zero in the bitmaps => page needs to be saved. All pages default to be saved,
7620  *  pages known to VM to not need saving are subtracted.
7621  *  Wired pages to be saved are present in page_list_wired, pageable in page_list.
7622  */
7623 
/*
 * Build the hibernation bitmaps describing which physical pages need
 * to be preserved in the image.
 *
 * A TRUE bit in a bitmap means "this page does NOT need saving".
 *
 * page_list:       pageable pages (TRUE = no save needed).
 * page_list_wired: wired pages (TRUE = no save needed).
 * page_list_pal:   platform (PAL) page list; only zeroed here.
 * preflight:       TRUE to only count pages; the bitmaps are not
 *                  written and nothing is discarded.
 * will_discard:    when not preflighting, discard clean pages selected
 *                  by hibernate_consider_discard() as the queues are walked.
 * pagesOut:        out parameter: number of pages the image must hold.
 */
void
hibernate_page_list_setall(hibernate_page_list_t * page_list,
    hibernate_page_list_t * page_list_wired,
    hibernate_page_list_t * page_list_pal,
    boolean_t preflight,
    boolean_t will_discard,
    uint32_t * pagesOut)
{
	uint64_t start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	uint32_t pages = page_list->page_count;
	uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0;
	uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0;
	/* start from the total; every page found on a non-wired queue decrements */
	uint32_t count_wire = pages;
	uint32_t count_discard_active    = 0;
	uint32_t count_discard_inactive  = 0;
	uint32_t count_retired = 0;
	uint32_t count_discard_cleaned   = 0;
	uint32_t count_discard_purgeable = 0;
	uint32_t count_discard_speculative = 0;
	uint32_t count_discard_vm_struct_pages = 0;
	uint32_t i;
	uint32_t             bank;
	hibernate_bitmap_t * bitmap;
	hibernate_bitmap_t * bitmap_wired;
	boolean_t                    discard_all;
	boolean_t            discard = FALSE;

	HIBLOG("hibernate_page_list_setall(preflight %d) start\n", preflight);

	if (preflight) {
		/* preflight only counts: never touch the bitmaps, never discard */
		page_list       = NULL;
		page_list_wired = NULL;
		page_list_pal   = NULL;
		discard_all     = FALSE;
	} else {
		discard_all     = will_discard;
	}

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		assert(hibernate_vm_locks_are_safe());
		vm_page_lock_queues();
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_LOCK(&lq->vpl_lock);
			}
		}
	}
#endif  /* MACH_ASSERT || DEBUG */


	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0);

	clock_get_uptime(&start);

	if (!preflight) {
		hibernate_page_list_zero(page_list);
		hibernate_page_list_zero(page_list_wired);
		hibernate_page_list_zero(page_list_pal);

		hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count;
		hibernate_stats.cd_pages = pages;
	}

	/* drain the per-cpu local queues back onto the global active queue */
	if (vm_page_local_q) {
		zpercpu_foreach_cpu(lid) {
			vm_page_reactivate_local(lid, TRUE, !preflight);
		}
	}

	/*
	 * In the non-preflight path these locks are expected to already be
	 * held by our caller (see the LCK_MTX_ASSERT below).
	 */
	if (preflight) {
		vm_object_lock(compressor_object);
		vm_page_lock_queues();
		vm_free_page_lock();
	}

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	hibernation_vmqueues_inspection = TRUE;

	/* gobbled pages: already pulled off the free list; never need saving */
	m = (vm_page_t) hibernate_gobble_queue;
	while (m) {
		pages--;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		m = m->vmp_snext;
	}

	/* per-cpu free lists: free pages, excluded from both bitmaps */
	if (!preflight) {
		percpu_foreach(free_pages_head, free_pages) {
			for (m = *free_pages_head; m; m = m->vmp_snext) {
				assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);

				pages--;
				count_wire--;
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

				hibernate_stats.cd_local_free++;
				hibernate_stats.cd_total_free++;
			}
		}
	}

	/* global free queues, one per color: free pages never need saving */
	for (i = 0; i < vm_colors; i++) {
		vm_page_queue_iterate(&vm_page_queue_free[i].qhead, m, vmp_pageq) {
			assert(m->vmp_q_state == VM_PAGE_ON_FREE_Q);

			pages--;
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

				hibernate_stats.cd_total_free++;
			}
		}
	}

	/* low-memory free queue */
	vm_page_queue_iterate(&vm_lopage_queue_free, m, vmp_pageq) {
		assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);

		pages--;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));

			hibernate_stats.cd_total_free++;
		}
	}

	/*
	 * Throttled queue: pageable pages; discard the clean ones when the
	 * hibernate mode allows it, otherwise they must be saved.
	 */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
	while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);

		/* capture next before a potential discard unlinks m */
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode)
		    && hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			count_discard_inactive++;
			discard = discard_all;
		} else {
			count_throttled++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}

		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* anonymous (internal inactive) queue */
	m = (vm_page_t)vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			/* a dirty-but-discardable page here is a purgeable page */
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			count_anonymous++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* cleaned queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			discard = discard_all;
		} else {
			count_cleaned++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* active queue: discarding clean active pages needs its own mode bit */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			discard = discard_all;
		} else {
			count_active++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* external inactive queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);

		next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		discard = FALSE;
		if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
		    hibernate_consider_discard(m, preflight)) {
			if (!preflight) {
				hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			discard = discard_all;
		} else {
			count_inactive++;
		}
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
		if (discard) {
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	/* speculative queues, one per age bucket */
	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
			    "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
			    m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);

			next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			discard = FALSE;
			if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) &&
			    hibernate_consider_discard(m, preflight)) {
				if (!preflight) {
					hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
				}
				count_discard_speculative++;
				discard = discard_all;
			} else {
				count_speculative++;
			}
			count_wire--;
			if (!preflight) {
				hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
			}
			if (discard) {
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	/* compressor-held pages are always preserved (wired list only) */
	vm_page_queue_iterate(&compressor_object->memq, m, vmp_listq) {
		assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);

		count_compressor++;
		count_wire--;
		if (!preflight) {
			hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
		}
	}


	/* tear down the VM's own bookkeeping so its pages drop out of the image */
	if (preflight == FALSE && discard_all == TRUE) {
		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_START);

		HIBLOG("hibernate_teardown started\n");
		count_discard_vm_struct_pages = hibernate_teardown_vm_structs(page_list, page_list_wired);
		HIBLOG("hibernate_teardown completed - discarded %d\n", count_discard_vm_struct_pages);

		pages -= count_discard_vm_struct_pages;
		count_wire -= count_discard_vm_struct_pages;

		hibernate_stats.cd_vm_struct_pages_unneeded = count_discard_vm_struct_pages;

		KDBG(IOKDBG_CODE(DBG_HIBERNATE, 12) | DBG_FUNC_END);
	}

	if (!preflight) {
		// pull wired from hibernate_bitmap
		/*
		 * Fold the wired information into the pageable list: any page
		 * that will be saved as wired (bit clear in page_list_wired)
		 * does not also need saving as pageable, so set its bit here.
		 */
		bitmap = &page_list->bank_bitmap[0];
		bitmap_wired = &page_list_wired->bank_bitmap[0];
		for (bank = 0; bank < page_list->bank_count; bank++) {
			for (i = 0; i < bitmap->bitmapwords; i++) {
				bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i];
			}
			/* banks are variable-length; step to the next one manually */
			bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords];
			bitmap_wired = (hibernate_bitmap_t *) &bitmap_wired->bitmap[bitmap_wired->bitmapwords];
		}
	}

	// machine dependent adjustments
	hibernate_page_list_setall_machine(page_list, page_list_wired, preflight, &pages);

	if (!preflight) {
		hibernate_stats.cd_count_wire = count_wire;
		hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable +
		    count_discard_speculative + count_discard_cleaned + count_discard_vm_struct_pages;
	}

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);

	HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, zf %d, throt %d, compr %d, xpmapped %d\n  %s discard act %d inact %d purgeable %d spec %d cleaned %d retired %d\n",
	    pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped,
	    discard_all ? "did" : "could",
	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned, count_retired);

	if (hibernate_stats.cd_skipped_xpmapped) {
		HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n", hibernate_stats.cd_skipped_xpmapped);
	}

	*pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative - count_discard_cleaned - count_retired;

	if (preflight && will_discard) {
		/* preflight: predict what the real (discarding) pass will remove */
		*pagesOut -= count_compressor + count_throttled + count_anonymous + count_inactive + count_cleaned + count_speculative + count_active;
		/*
		 * We try to keep max HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image
		 * even if these are clean and so we need to size the hibernation image accordingly.
		 *
		 * NB: We have to assume all HIBERNATE_XPMAPPED_LIMIT pages might show up because 'dirty'
		 * xpmapped pages aren't distinguishable from other 'dirty' pages in preflight. So we might
		 * only see part of the xpmapped pages if we look at 'cd_found_xpmapped' which solely tracks
		 * clean xpmapped pages.
		 *
		 * Since these pages are all cleaned by the time we are in the post-preflight phase, we might
		 * see a much larger number in 'cd_found_xpmapped' now than we did in the preflight phase
		 */
		*pagesOut +=  HIBERNATE_XPMAPPED_LIMIT;
	}

	hibernation_vmqueues_inspection = FALSE;

#if MACH_ASSERT || DEBUG
	if (!preflight) {
		if (vm_page_local_q) {
			zpercpu_foreach(lq, vm_page_local_q) {
				VPL_UNLOCK(&lq->vpl_lock);
			}
		}
		vm_page_unlock_queues();
	}
#endif  /* MACH_ASSERT || DEBUG */

	/* drop the locks we took ourselves for the preflight pass */
	if (preflight) {
		vm_free_page_unlock();
		vm_page_unlock_queues();
		vm_object_unlock(compressor_object);
	}

	KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0);
}
8040 
/*
 * Discard the pages that hibernate_page_list_setall() marked as
 * discardable in page_list (bit TRUE).  Each pageable queue is
 * re-walked and any page whose bit is set is handed to
 * hibernate_discard_page(); per-queue discard counters are logged.
 */
void
hibernate_page_list_discard(hibernate_page_list_t * page_list)
{
	uint64_t  start, end, nsec;
	vm_page_t m;
	vm_page_t next;
	uint32_t  i;
	uint32_t  count_discard_active    = 0;
	uint32_t  count_discard_inactive  = 0;
	uint32_t  count_discard_purgeable = 0;
	uint32_t  count_discard_cleaned   = 0;
	uint32_t  count_discard_speculative = 0;


#if MACH_ASSERT || DEBUG
	vm_page_lock_queues();
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_LOCK(&lq->vpl_lock);
		}
	}
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&start);

	/* anonymous (internal inactive) queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
	while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);

		/* capture next before hibernate_discard_page() unlinks m */
		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			/* a dirty-but-discardable page here is a purgeable page */
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* speculative queues, one per age bucket */
	for (i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
		while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) {
			assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

			next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
			if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
				count_discard_speculative++;
				hibernate_discard_page(m);
			}
			m = next;
		}
	}

	/* external inactive queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
	while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_inactive++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}
	/* XXX FBDP TODO: secluded queue */

	/* active queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
	while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_active++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

	/* cleaned queue */
	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
	while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) {
		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
		if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) {
			if (m->vmp_dirty) {
				count_discard_purgeable++;
			} else {
				count_discard_cleaned++;
			}
			hibernate_discard_page(m);
		}
		m = next;
	}

#if MACH_ASSERT || DEBUG
	if (vm_page_local_q) {
		zpercpu_foreach(lq, vm_page_local_q) {
			VPL_UNLOCK(&lq->vpl_lock);
		}
	}
	vm_page_unlock_queues();
#endif  /* MACH_ASSERT || DEBUG */

	clock_get_uptime(&end);
	absolutetime_to_nanoseconds(end - start, &nsec);
	HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n",
	    nsec / 1000000ULL,
	    count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned);
}
8160 
/* TRUE once hibernate_create_paddr_map() has built the ppnum map */
boolean_t       hibernate_paddr_map_inited = FALSE;
/* highest vm_pages[] index still holding a live entry after compaction */
unsigned int    hibernate_teardown_last_valid_compact_indx = -1;
/* fictitious pages pulled from the hash; re-inserted on rebuild */
vm_page_t       hibernate_rebuild_hash_list = NULL;

/* teardown statistics, for debugging/inspection */
unsigned int    hibernate_teardown_found_tabled_pages = 0;
unsigned int    hibernate_teardown_found_created_pages = 0;
unsigned int    hibernate_teardown_found_free_pages = 0;
/* free-page count snapshot taken at teardown, checked on rebuild */
unsigned int    hibernate_teardown_vm_page_free_count;


/*
 * One contiguous run of physical page numbers covering
 * vm_pages[ppnm_sindx .. ppnm_eindx); used to map a vm_pages[]
 * index back to its physical page number after teardown.
 */
struct ppnum_mapping {
	struct ppnum_mapping    *ppnm_next;
	ppnum_t                 ppnm_base_paddr;        /* phys page of first entry in run */
	unsigned int            ppnm_sindx;             /* first vm_pages[] index in run */
	unsigned int            ppnm_eindx;             /* one past last index in run */
};

struct ppnum_mapping    *ppnm_head;
struct ppnum_mapping    *ppnm_last_found = NULL;    /* one-entry lookup cache */
8180 
8181 
8182 void
hibernate_create_paddr_map(void)8183 hibernate_create_paddr_map(void)
8184 {
8185 	unsigned int    i;
8186 	ppnum_t         next_ppnum_in_run = 0;
8187 	struct ppnum_mapping *ppnm = NULL;
8188 
8189 	if (hibernate_paddr_map_inited == FALSE) {
8190 		for (i = 0; i < vm_pages_count; i++) {
8191 			if (ppnm) {
8192 				ppnm->ppnm_eindx = i;
8193 			}
8194 
8195 			if (ppnm == NULL || VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) != next_ppnum_in_run) {
8196 				ppnm = zalloc_permanent_type(struct ppnum_mapping);
8197 
8198 				ppnm->ppnm_next = ppnm_head;
8199 				ppnm_head = ppnm;
8200 
8201 				ppnm->ppnm_sindx = i;
8202 				ppnm->ppnm_base_paddr = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]);
8203 			}
8204 			next_ppnum_in_run = VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]) + 1;
8205 		}
8206 		ppnm->ppnm_eindx = vm_pages_count;
8207 
8208 		hibernate_paddr_map_inited = TRUE;
8209 	}
8210 }
8211 
8212 ppnum_t
hibernate_lookup_paddr(unsigned int indx)8213 hibernate_lookup_paddr(unsigned int indx)
8214 {
8215 	struct ppnum_mapping *ppnm = NULL;
8216 
8217 	ppnm = ppnm_last_found;
8218 
8219 	if (ppnm) {
8220 		if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
8221 			goto done;
8222 		}
8223 	}
8224 	for (ppnm = ppnm_head; ppnm; ppnm = ppnm->ppnm_next) {
8225 		if (indx >= ppnm->ppnm_sindx && indx < ppnm->ppnm_eindx) {
8226 			ppnm_last_found = ppnm;
8227 			break;
8228 		}
8229 	}
8230 	if (ppnm == NULL) {
8231 		panic("hibernate_lookup_paddr of %d failed", indx);
8232 	}
8233 done:
8234 	return ppnm->ppnm_base_paddr + (indx - ppnm->ppnm_sindx);
8235 }
8236 
8237 
8238 uint32_t
hibernate_mark_as_unneeded(addr64_t saddr,addr64_t eaddr,hibernate_page_list_t * page_list,hibernate_page_list_t * page_list_wired)8239 hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
8240 {
8241 	addr64_t        saddr_aligned;
8242 	addr64_t        eaddr_aligned;
8243 	addr64_t        addr;
8244 	ppnum_t         paddr;
8245 	unsigned int    mark_as_unneeded_pages = 0;
8246 
8247 	saddr_aligned = (saddr + PAGE_MASK_64) & ~PAGE_MASK_64;
8248 	eaddr_aligned = eaddr & ~PAGE_MASK_64;
8249 
8250 	for (addr = saddr_aligned; addr < eaddr_aligned; addr += PAGE_SIZE_64) {
8251 		paddr = pmap_find_phys(kernel_pmap, addr);
8252 
8253 		assert(paddr);
8254 
8255 		hibernate_page_bitset(page_list, TRUE, paddr);
8256 		hibernate_page_bitset(page_list_wired, TRUE, paddr);
8257 
8258 		mark_as_unneeded_pages++;
8259 	}
8260 	return mark_as_unneeded_pages;
8261 }
8262 
8263 
8264 void
hibernate_hash_insert_page(vm_page_t mem)8265 hibernate_hash_insert_page(vm_page_t mem)
8266 {
8267 	vm_page_bucket_t *bucket;
8268 	int             hash_id;
8269 	vm_object_t     m_object;
8270 
8271 	m_object = VM_PAGE_OBJECT(mem);
8272 
8273 	assert(mem->vmp_hashed);
8274 	assert(m_object);
8275 	assert(mem->vmp_offset != (vm_object_offset_t) -1);
8276 
8277 	/*
8278 	 *	Insert it into the object_object/offset hash table
8279 	 */
8280 	hash_id = vm_page_hash(m_object, mem->vmp_offset);
8281 	bucket = &vm_page_buckets[hash_id];
8282 
8283 	mem->vmp_next_m = bucket->page_list;
8284 	bucket->page_list = VM_PAGE_PACK_PTR(mem);
8285 }
8286 
8287 
8288 void
hibernate_free_range(int sindx,int eindx)8289 hibernate_free_range(int sindx, int eindx)
8290 {
8291 	vm_page_t       mem;
8292 	unsigned int    color;
8293 
8294 	while (sindx < eindx) {
8295 		mem = &vm_pages[sindx];
8296 
8297 		vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE);
8298 
8299 		mem->vmp_lopage = FALSE;
8300 		mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
8301 
8302 		color = VM_PAGE_GET_COLOR(mem);
8303 #if defined(__x86_64__)
8304 		vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem);
8305 #else
8306 		vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vmp_pageq);
8307 #endif
8308 		vm_page_free_count++;
8309 
8310 		sindx++;
8311 	}
8312 }
8313 
/*
 * Undo hibernate_teardown_vm_structs(): move each compacted vm_page_t
 * back to its original vm_pages[] slot, re-create the vacated ranges
 * as free pages, and rebuild the vm_object/offset page hash.
 * No-op unless a teardown actually happened (hibernate_rebuild_needed).
 */
void
hibernate_rebuild_vm_structs(void)
{
	int             i, cindx, sindx, eindx;
	vm_page_t       mem, tmem, mem_next;
	AbsoluteTime    startTime, endTime;
	uint64_t        nsec;

	if (hibernate_rebuild_needed == FALSE) {
		return;
	}

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_START);
	HIBLOG("hibernate_rebuild started\n");

	clock_get_uptime(&startTime);

	pal_hib_rebuild_pmap_structs();

	/* the hash is rebuilt from scratch below */
	bzero(&vm_page_buckets[0], vm_page_bucket_count * sizeof(vm_page_bucket_t));
	eindx = vm_pages_count;

	/*
	 * Mark all the vm_pages[] that have not been initialized yet as being
	 * transient. This is needed to ensure that buddy page search is corrrect.
	 * Without this random data in these vm_pages[] can trip the buddy search
	 */
	for (i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) {
		vm_pages[i].vmp_q_state = VM_PAGE_NOT_ON_Q;
	}

	/* walk the compacted entries from highest to lowest */
	for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		mem = &vm_pages[cindx];
		assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
		/*
		 * hibernate_teardown_vm_structs leaves the location where
		 * this vm_page_t must be located in "next".
		 */
		tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
		mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);

		sindx = (int)(tmem - &vm_pages[0]);

		if (mem != tmem) {
			/*
			 * this vm_page_t was moved by hibernate_teardown_vm_structs,
			 * so move it back to its real location
			 */
			*tmem = *mem;
			mem = tmem;
		}
		if (mem->vmp_hashed) {
			hibernate_hash_insert_page(mem);
		}
		/*
		 * the 'hole' between this vm_page_t and the previous
		 * vm_page_t we moved needs to be initialized as
		 * a range of free vm_page_t's
		 */
		hibernate_free_range(sindx + 1, eindx);

		eindx = sindx;
	}
	/*
	 * NOTE(review): sindx is only assigned inside the loop above; if
	 * hibernate_teardown_last_valid_compact_indx were -1 (no compacted
	 * entries) it would be read uninitialized here -- presumably the
	 * rebuild_needed guard implies at least one entry. TODO confirm.
	 */
	if (sindx) {
		hibernate_free_range(0, sindx);
	}

	/* the free count must match the snapshot taken at teardown */
	assert(vm_page_free_count == hibernate_teardown_vm_page_free_count);

	/*
	 * process the list of vm_page_t's that were entered in the hash,
	 * but were not located in the vm_pages arrary... these are
	 * vm_page_t's that were created on the fly (i.e. fictitious)
	 */
	for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
		mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

		mem->vmp_next_m = 0;
		hibernate_hash_insert_page(mem);
	}
	hibernate_rebuild_hash_list = NULL;

	clock_get_uptime(&endTime);
	SUB_ABSOLUTETIME(&endTime, &startTime);
	absolutetime_to_nanoseconds(endTime, &nsec);

	HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL);

	hibernate_rebuild_needed = FALSE;

	KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END);
}
8406 
/*
 * Shrink the VM's own bookkeeping before the image is written:
 *  - pull fictitious (non-vm_pages[]) pages out of the page hash onto
 *    hibernate_rebuild_hash_list,
 *  - compact vm_pages[] by copying in-use entries down over free ones,
 *  - mark the page hash, the tail of vm_pages[], and unneeded pmap
 *    structures as not needing to be saved.
 * Returns the number of pages marked as unneeded.  Undone by
 * hibernate_rebuild_vm_structs().
 */
uint32_t
hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired)
{
	unsigned int    i;
	unsigned int    compact_target_indx;
	vm_page_t       mem, mem_next;
	vm_page_bucket_t *bucket;
	unsigned int    mark_as_unneeded_pages = 0;
	unsigned int    unneeded_vm_page_bucket_pages = 0;
	unsigned int    unneeded_vm_pages_pages = 0;
	unsigned int    unneeded_pmap_pages = 0;
	addr64_t        start_of_unneeded = 0;
	addr64_t        end_of_unneeded = 0;


	if (hibernate_should_abort()) {
		return 0;
	}

	hibernate_rebuild_needed = TRUE;

	HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, active_pages %d, inactive_pages %d, speculative_pages %d, cleaned_pages %d, compressor_pages %d\n",
	    vm_page_wire_count, vm_page_free_count, vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count,
	    vm_page_cleaned_count, compressor_object->resident_page_count);

	/*
	 * Save hashed pages that live outside vm_pages[] (fictitious
	 * pages) on a side list; the hash itself will not be preserved.
	 */
	for (i = 0; i < vm_page_bucket_count; i++) {
		bucket = &vm_page_buckets[i];

		for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
			assert(mem->vmp_hashed);

			mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));

			if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) {
				mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
				hibernate_rebuild_hash_list = mem;
			}
		}
	}
	unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0], (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_page_bucket_pages;

	/* snapshot for the consistency assert in hibernate_rebuild_vm_structs() */
	hibernate_teardown_vm_page_free_count = vm_page_free_count;

	compact_target_indx = 0;

	/* compact vm_pages[]: copy in-use entries down into freed slots */
	for (i = 0; i < vm_pages_count; i++) {
		mem = &vm_pages[i];

		if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
			unsigned int color;

			assert(mem->vmp_busy);
			assert(!mem->vmp_lopage);

			color = VM_PAGE_GET_COLOR(mem);

			vm_page_queue_remove(&vm_page_queue_free[color].qhead, mem, vmp_pageq);

			VM_PAGE_ZERO_PAGEQ_ENTRY(mem);

			vm_page_free_count--;

			hibernate_teardown_found_free_pages++;

			/* remember the first hole we can compact into */
			if (vm_pages[compact_target_indx].vmp_q_state != VM_PAGE_ON_FREE_Q) {
				compact_target_indx = i;
			}
		} else {
			/*
			 * record this vm_page_t's original location
			 * we need this even if it doesn't get moved
			 * as an indicator to the rebuild function that
			 * we don't have to move it
			 */
			mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);

			if (vm_pages[compact_target_indx].vmp_q_state == VM_PAGE_ON_FREE_Q) {
				/*
				 * we've got a hole to fill, so
				 * move this vm_page_t to it's new home
				 */
				vm_pages[compact_target_indx] = *mem;
				/* mark the vacated slot free so it can be skipped/reused */
				mem->vmp_q_state = VM_PAGE_ON_FREE_Q;

				hibernate_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				hibernate_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/*
	 * NOTE(review): the end bound is &vm_pages[vm_pages_count - 1], so
	 * the final vm_page_t is not covered -- presumably a conservative
	 * choice (partial trailing page); TODO confirm intent.
	 */
	unneeded_vm_pages_pages = hibernate_mark_as_unneeded((addr64_t)&vm_pages[hibernate_teardown_last_valid_compact_indx + 1],
	    (addr64_t)&vm_pages[vm_pages_count - 1], page_list, page_list_wired);
	mark_as_unneeded_pages += unneeded_vm_pages_pages;

	pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded);

	if (start_of_unneeded) {
		unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded, end_of_unneeded, page_list, page_list_wired);
		mark_as_unneeded_pages += unneeded_pmap_pages;
	}
	HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n", unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages);

	return mark_as_unneeded_pages;
}
8513 
8514 
8515 #endif /* HIBERNATION */
8516 
8517 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8518 
8519 #include <mach_vm_debug.h>
8520 #if     MACH_VM_DEBUG
8521 
8522 #include <mach_debug/hash_info.h>
8523 #include <vm/vm_debug.h>
8524 
8525 /*
8526  *	Routine:	vm_page_info
8527  *	Purpose:
8528  *		Return information about the global VP table.
8529  *		Fills the buffer with as much information as possible
8530  *		and returns the desired size of the buffer.
8531  *	Conditions:
8532  *		Nothing locked.  The caller should provide
8533  *		possibly-pageable memory.
8534  */
8535 
8536 unsigned int
vm_page_info(hash_info_bucket_t * info,unsigned int count)8537 vm_page_info(
8538 	hash_info_bucket_t *info,
8539 	unsigned int count)
8540 {
8541 	unsigned int i;
8542 	lck_spin_t      *bucket_lock;
8543 
8544 	if (vm_page_bucket_count < count) {
8545 		count = vm_page_bucket_count;
8546 	}
8547 
8548 	for (i = 0; i < count; i++) {
8549 		vm_page_bucket_t *bucket = &vm_page_buckets[i];
8550 		unsigned int bucket_count = 0;
8551 		vm_page_t m;
8552 
8553 		bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
8554 		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);
8555 
8556 		for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
8557 		    m != VM_PAGE_NULL;
8558 		    m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) {
8559 			bucket_count++;
8560 		}
8561 
8562 		lck_spin_unlock(bucket_lock);
8563 
8564 		/* don't touch pageable memory while holding locks */
8565 		info[i].hib_count = bucket_count;
8566 	}
8567 
8568 	return vm_page_bucket_count;
8569 }
8570 #endif  /* MACH_VM_DEBUG */
8571 
8572 #if VM_PAGE_BUCKETS_CHECK
8573 void
vm_page_buckets_check(void)8574 vm_page_buckets_check(void)
8575 {
8576 	unsigned int i;
8577 	vm_page_t p;
8578 	unsigned int p_hash;
8579 	vm_page_bucket_t *bucket;
8580 	lck_spin_t      *bucket_lock;
8581 
8582 	if (!vm_page_buckets_check_ready) {
8583 		return;
8584 	}
8585 
8586 #if HIBERNATION
8587 	if (hibernate_rebuild_needed ||
8588 	    hibernate_rebuild_hash_list) {
8589 		panic("BUCKET_CHECK: hibernation in progress: "
8590 		    "rebuild_needed=%d rebuild_hash_list=%p\n",
8591 		    hibernate_rebuild_needed,
8592 		    hibernate_rebuild_hash_list);
8593 	}
8594 #endif /* HIBERNATION */
8595 
8596 #if VM_PAGE_FAKE_BUCKETS
8597 	char *cp;
8598 	for (cp = (char *) vm_page_fake_buckets_start;
8599 	    cp < (char *) vm_page_fake_buckets_end;
8600 	    cp++) {
8601 		if (*cp != 0x5a) {
8602 			panic("BUCKET_CHECK: corruption at %p in fake buckets "
8603 			    "[0x%llx:0x%llx]\n",
8604 			    cp,
8605 			    (uint64_t) vm_page_fake_buckets_start,
8606 			    (uint64_t) vm_page_fake_buckets_end);
8607 		}
8608 	}
8609 #endif /* VM_PAGE_FAKE_BUCKETS */
8610 
8611 	for (i = 0; i < vm_page_bucket_count; i++) {
8612 		vm_object_t     p_object;
8613 
8614 		bucket = &vm_page_buckets[i];
8615 		if (!bucket->page_list) {
8616 			continue;
8617 		}
8618 
8619 		bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK];
8620 		lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket);
8621 		p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
8622 
8623 		while (p != VM_PAGE_NULL) {
8624 			p_object = VM_PAGE_OBJECT(p);
8625 
8626 			if (!p->vmp_hashed) {
8627 				panic("BUCKET_CHECK: page %p (%p,0x%llx) "
8628 				    "hash %d in bucket %d at %p "
8629 				    "is not hashed\n",
8630 				    p, p_object, p->vmp_offset,
8631 				    p_hash, i, bucket);
8632 			}
8633 			p_hash = vm_page_hash(p_object, p->vmp_offset);
8634 			if (p_hash != i) {
8635 				panic("BUCKET_CHECK: corruption in bucket %d "
8636 				    "at %p: page %p object %p offset 0x%llx "
8637 				    "hash %d\n",
8638 				    i, bucket, p, p_object, p->vmp_offset,
8639 				    p_hash);
8640 			}
8641 			p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
8642 		}
8643 		lck_spin_unlock(bucket_lock);
8644 	}
8645 
8646 //	printf("BUCKET_CHECK: checked buckets\n");
8647 }
8648 #endif /* VM_PAGE_BUCKETS_CHECK */
8649 
/*
 * 'vm_fault_enter' will place newly created pages (zero-fill and COW) onto the
 * local queues if they exist... it's the only spot in the system where we add pages
 * to those queues... once on those queues, those pages can only move to one of the
 * global page queues or the free queues... they NEVER move from local q to local q.
 * The 'local' state is stable when vm_page_queues_remove is called since we're behind
 * the global vm_page_queue_lock at this point... we still need to take the local lock
 * in case this operation is being run on a different CPU than the local queue's identity,
 * but we don't have to worry about the page moving to a global queue or becoming wired
 * while we're grabbing the local lock since those operations would require the global
 * vm_page_queue_lock to be held, and we already own it.
 *
 * This is why it's safe to utilize the wire_count field in the vm_page_t as the local_id...
 * 'wired' and 'local' are ALWAYS mutually exclusive conditions.
 */
8665 
8666 void
vm_page_queues_remove(vm_page_t mem,boolean_t remove_from_specialq)8667 vm_page_queues_remove(vm_page_t mem, boolean_t remove_from_specialq)
8668 {
8669 	boolean_t       was_pageable = TRUE;
8670 	vm_object_t     m_object;
8671 
8672 	m_object = VM_PAGE_OBJECT(mem);
8673 
8674 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
8675 
8676 	if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) {
8677 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
8678 		if (remove_from_specialq == TRUE) {
8679 			vm_page_remove_from_specialq(mem);
8680 		}
8681 		/*if (mem->vmp_on_specialq != VM_PAGE_SPECIAL_Q_EMPTY) {
8682 		 *       assert(mem->vmp_specialq.next != 0);
8683 		 *       assert(mem->vmp_specialq.prev != 0);
8684 		 *  } else {*/
8685 		if (mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY) {
8686 			assert(mem->vmp_specialq.next == 0);
8687 			assert(mem->vmp_specialq.prev == 0);
8688 		}
8689 		return;
8690 	}
8691 
8692 	if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8693 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
8694 		assert(mem->vmp_specialq.next == 0 &&
8695 		    mem->vmp_specialq.prev == 0 &&
8696 		    mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
8697 		return;
8698 	}
8699 	if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
8700 		/*
8701 		 * might put these guys on a list for debugging purposes
8702 		 * if we do, we'll need to remove this assert
8703 		 */
8704 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
8705 		assert(mem->vmp_specialq.next == 0 &&
8706 		    mem->vmp_specialq.prev == 0);
8707 		/*
8708 		 * Recall that vmp_on_specialq also means a request to put
8709 		 * it on the special Q. So we don't want to reset that bit
8710 		 * just because a wiring request came in. We might want to
8711 		 * put it on the special queue post-unwiring.
8712 		 *
8713 		 * &&
8714 		 * mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY);
8715 		 */
8716 		return;
8717 	}
8718 
8719 	assert(m_object != compressor_object);
8720 	assert(m_object != kernel_object);
8721 	assert(!mem->vmp_fictitious);
8722 
8723 	switch (mem->vmp_q_state) {
8724 	case VM_PAGE_ON_ACTIVE_LOCAL_Q:
8725 	{
8726 		struct vpl      *lq;
8727 
8728 		lq = zpercpu_get_cpu(vm_page_local_q, mem->vmp_local_id);
8729 		VPL_LOCK(&lq->vpl_lock);
8730 		vm_page_queue_remove(&lq->vpl_queue, mem, vmp_pageq);
8731 		mem->vmp_local_id = 0;
8732 		lq->vpl_count--;
8733 		if (m_object->internal) {
8734 			lq->vpl_internal_count--;
8735 		} else {
8736 			lq->vpl_external_count--;
8737 		}
8738 		VPL_UNLOCK(&lq->vpl_lock);
8739 		was_pageable = FALSE;
8740 		break;
8741 	}
8742 	case VM_PAGE_ON_ACTIVE_Q:
8743 	{
8744 		vm_page_queue_remove(&vm_page_queue_active, mem, vmp_pageq);
8745 		vm_page_active_count--;
8746 		break;
8747 	}
8748 
8749 	case VM_PAGE_ON_INACTIVE_INTERNAL_Q:
8750 	{
8751 		assert(m_object->internal == TRUE);
8752 
8753 		vm_page_inactive_count--;
8754 		vm_page_queue_remove(&vm_page_queue_anonymous, mem, vmp_pageq);
8755 		vm_page_anonymous_count--;
8756 
8757 		vm_purgeable_q_advance_all();
8758 		vm_page_balance_inactive(3);
8759 		break;
8760 	}
8761 
8762 	case VM_PAGE_ON_INACTIVE_EXTERNAL_Q:
8763 	{
8764 		assert(m_object->internal == FALSE);
8765 
8766 		vm_page_inactive_count--;
8767 		vm_page_queue_remove(&vm_page_queue_inactive, mem, vmp_pageq);
8768 		vm_purgeable_q_advance_all();
8769 		vm_page_balance_inactive(3);
8770 		break;
8771 	}
8772 
8773 	case VM_PAGE_ON_INACTIVE_CLEANED_Q:
8774 	{
8775 		assert(m_object->internal == FALSE);
8776 
8777 		vm_page_inactive_count--;
8778 		vm_page_queue_remove(&vm_page_queue_cleaned, mem, vmp_pageq);
8779 		vm_page_cleaned_count--;
8780 		vm_page_balance_inactive(3);
8781 		break;
8782 	}
8783 
8784 	case VM_PAGE_ON_THROTTLED_Q:
8785 	{
8786 		assert(m_object->internal == TRUE);
8787 
8788 		vm_page_queue_remove(&vm_page_queue_throttled, mem, vmp_pageq);
8789 		vm_page_throttled_count--;
8790 		was_pageable = FALSE;
8791 		break;
8792 	}
8793 
8794 	case VM_PAGE_ON_SPECULATIVE_Q:
8795 	{
8796 		assert(m_object->internal == FALSE);
8797 
8798 		vm_page_remque(&mem->vmp_pageq);
8799 		vm_page_speculative_count--;
8800 		vm_page_balance_inactive(3);
8801 		break;
8802 	}
8803 
8804 #if CONFIG_SECLUDED_MEMORY
8805 	case VM_PAGE_ON_SECLUDED_Q:
8806 	{
8807 		vm_page_queue_remove(&vm_page_queue_secluded, mem, vmp_pageq);
8808 		vm_page_secluded_count--;
8809 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
8810 		if (m_object == VM_OBJECT_NULL) {
8811 			vm_page_secluded_count_free--;
8812 			was_pageable = FALSE;
8813 		} else {
8814 			assert(!m_object->internal);
8815 			vm_page_secluded_count_inuse--;
8816 			was_pageable = FALSE;
8817 //			was_pageable = TRUE;
8818 		}
8819 		break;
8820 	}
8821 #endif /* CONFIG_SECLUDED_MEMORY */
8822 
8823 	default:
8824 	{
8825 		/*
8826 		 *	if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
8827 		 *              NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
8828 		 *              the caller is responsible for determing if the page is on that queue, and if so, must
8829 		 *              either first remove it (it needs both the page queues lock and the object lock to do
8830 		 *              this via vm_pageout_steal_laundry), or avoid the call to vm_page_queues_remove
8831 		 *
8832 		 *	we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
8833 		 *	or any of the undefined states
8834 		 */
8835 		panic("vm_page_queues_remove - bad page q_state (%p, %d)", mem, mem->vmp_q_state);
8836 		break;
8837 	}
8838 	}
8839 	VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
8840 	mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
8841 
8842 	if (remove_from_specialq == TRUE) {
8843 		vm_page_remove_from_specialq(mem);
8844 	}
8845 	if (was_pageable) {
8846 		if (m_object->internal) {
8847 			vm_page_pageable_internal_count--;
8848 		} else {
8849 			vm_page_pageable_external_count--;
8850 		}
8851 	}
8852 }
8853 
8854 void
vm_page_remove_internal(vm_page_t page)8855 vm_page_remove_internal(vm_page_t page)
8856 {
8857 	vm_object_t __object = VM_PAGE_OBJECT(page);
8858 	if (page == __object->memq_hint) {
8859 		vm_page_t       __new_hint;
8860 		vm_page_queue_entry_t   __qe;
8861 		__qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
8862 		if (vm_page_queue_end(&__object->memq, __qe)) {
8863 			__qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
8864 			if (vm_page_queue_end(&__object->memq, __qe)) {
8865 				__qe = NULL;
8866 			}
8867 		}
8868 		__new_hint = (vm_page_t)((uintptr_t) __qe);
8869 		__object->memq_hint = __new_hint;
8870 	}
8871 	vm_page_queue_remove(&__object->memq, page, vmp_listq);
8872 #if CONFIG_SECLUDED_MEMORY
8873 	if (__object->eligible_for_secluded) {
8874 		vm_page_secluded.eligible_for_secluded--;
8875 	}
8876 #endif /* CONFIG_SECLUDED_MEMORY */
8877 }
8878 
8879 void
vm_page_enqueue_inactive(vm_page_t mem,boolean_t first)8880 vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
8881 {
8882 	vm_object_t     m_object;
8883 
8884 	m_object = VM_PAGE_OBJECT(mem);
8885 
8886 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
8887 	assert(!mem->vmp_fictitious);
8888 	assert(!mem->vmp_laundry);
8889 	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
8890 	vm_page_check_pageable_safe(mem);
8891 
8892 	if (m_object->internal) {
8893 		mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
8894 
8895 		if (first == TRUE) {
8896 			vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vmp_pageq);
8897 		} else {
8898 			vm_page_queue_enter(&vm_page_queue_anonymous, mem, vmp_pageq);
8899 		}
8900 
8901 		vm_page_anonymous_count++;
8902 		vm_page_pageable_internal_count++;
8903 	} else {
8904 		mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
8905 
8906 		if (first == TRUE) {
8907 			vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vmp_pageq);
8908 		} else {
8909 			vm_page_queue_enter(&vm_page_queue_inactive, mem, vmp_pageq);
8910 		}
8911 
8912 		vm_page_pageable_external_count++;
8913 	}
8914 	vm_page_inactive_count++;
8915 	token_new_pagecount++;
8916 
8917 	vm_page_add_to_specialq(mem, FALSE);
8918 }
8919 
8920 void
vm_page_enqueue_active(vm_page_t mem,boolean_t first)8921 vm_page_enqueue_active(vm_page_t mem, boolean_t first)
8922 {
8923 	vm_object_t     m_object;
8924 
8925 	m_object = VM_PAGE_OBJECT(mem);
8926 
8927 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
8928 	assert(!mem->vmp_fictitious);
8929 	assert(!mem->vmp_laundry);
8930 	assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
8931 	vm_page_check_pageable_safe(mem);
8932 
8933 	mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
8934 	if (first == TRUE) {
8935 		vm_page_queue_enter_first(&vm_page_queue_active, mem, vmp_pageq);
8936 	} else {
8937 		vm_page_queue_enter(&vm_page_queue_active, mem, vmp_pageq);
8938 	}
8939 	vm_page_active_count++;
8940 
8941 	if (m_object->internal) {
8942 		vm_page_pageable_internal_count++;
8943 	} else {
8944 		vm_page_pageable_external_count++;
8945 	}
8946 
8947 	vm_page_add_to_specialq(mem, FALSE);
8948 	vm_page_balance_inactive(3);
8949 }
8950 
8951 /*
8952  * Pages from special kernel objects shouldn't
8953  * be placed on pageable queues.
8954  */
8955 void
vm_page_check_pageable_safe(vm_page_t page)8956 vm_page_check_pageable_safe(vm_page_t page)
8957 {
8958 	vm_object_t     page_object;
8959 
8960 	page_object = VM_PAGE_OBJECT(page);
8961 
8962 	if (page_object == kernel_object) {
8963 		panic("vm_page_check_pageable_safe: trying to add page"
8964 		    "from kernel object (%p) to pageable queue", kernel_object);
8965 	}
8966 
8967 	if (page_object == compressor_object) {
8968 		panic("vm_page_check_pageable_safe: trying to add page"
8969 		    "from compressor object (%p) to pageable queue", compressor_object);
8970 	}
8971 }
8972 
8973 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
8974 * wired page diagnose
8975 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
8976 
8977 #include <libkern/OSKextLibPrivate.h>
8978 
/*
 * Allocation-site layout helpers.  A vm_allocation_site is followed in
 * memory by its subtotals array and then its NUL-terminated name.
 *
 * Fix: the "alloc" argument is now fully parenthesized in KA_NAME and
 * KA_NAME_LEN (macro-hygiene; the expansions previously used a bare
 * "alloc->..." in places).
 */
/* Total allocation size for a site with "namelen" name bytes (+NUL) and "subtotalscount" subtotal slots. */
#define KA_SIZE(namelen, subtotalscount)        \
	(sizeof(struct vm_allocation_site) + (namelen) + 1 + ((subtotalscount) * sizeof(struct vm_allocation_total)))

/* The site's name is stored immediately after its subtotals array. */
#define KA_NAME(alloc)  \
	((char *)(&(alloc)->subtotals[((alloc)->subtotalscount)]))

/* The name length is packed into the upper bits of the flags field. */
#define KA_NAME_LEN(alloc)      \
    (VM_TAG_NAME_LEN_MAX & ((alloc)->flags >> VM_TAG_NAME_LEN_SHIFT))
8987 
8988 vm_tag_t
vm_tag_bt(void)8989 vm_tag_bt(void)
8990 {
8991 	uintptr_t* frameptr;
8992 	uintptr_t* frameptr_next;
8993 	uintptr_t retaddr;
8994 	uintptr_t kstackb, kstackt;
8995 	const vm_allocation_site_t * site;
8996 	thread_t cthread;
8997 	kern_allocation_name_t name;
8998 
8999 	cthread = current_thread();
9000 	if (__improbable(cthread == NULL)) {
9001 		return VM_KERN_MEMORY_OSFMK;
9002 	}
9003 
9004 	if ((name = thread_get_kernel_state(cthread)->allocation_name)) {
9005 		if (!name->tag) {
9006 			vm_tag_alloc(name);
9007 		}
9008 		return name->tag;
9009 	}
9010 
9011 	kstackb = cthread->kernel_stack;
9012 	kstackt = kstackb + kernel_stack_size;
9013 
9014 	/* Load stack frame pointer (EBP on x86) into frameptr */
9015 	frameptr = __builtin_frame_address(0);
9016 	site = NULL;
9017 	while (frameptr != NULL) {
9018 		/* Verify thread stack bounds */
9019 		if (((uintptr_t)(frameptr + 2) > kstackt) || ((uintptr_t)frameptr < kstackb)) {
9020 			break;
9021 		}
9022 
9023 		/* Next frame pointer is pointed to by the previous one */
9024 		frameptr_next = (uintptr_t*) *frameptr;
9025 
9026 		/* Pull return address from one spot above the frame pointer */
9027 		retaddr = *(frameptr + 1);
9028 
9029 #if defined(HAS_APPLE_PAC)
9030 		retaddr = (uintptr_t) ptrauth_strip((void *)retaddr, ptrauth_key_return_address);
9031 #endif
9032 
9033 		if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
9034 		    || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) {
9035 			site = OSKextGetAllocationSiteForCaller(retaddr);
9036 			break;
9037 		}
9038 		frameptr = frameptr_next;
9039 	}
9040 
9041 	return site ? site->tag : VM_KERN_MEMORY_NONE;
9042 }
9043 
9044 static uint64_t free_tag_bits[VM_MAX_TAG_VALUE / 64];
9045 
9046 void
vm_tag_alloc_locked(vm_allocation_site_t * site,vm_allocation_site_t ** releasesiteP)9047 vm_tag_alloc_locked(vm_allocation_site_t * site, vm_allocation_site_t ** releasesiteP)
9048 {
9049 	vm_tag_t tag;
9050 	uint64_t avail;
9051 	uint32_t idx;
9052 	vm_allocation_site_t * prev;
9053 
9054 	if (site->tag) {
9055 		return;
9056 	}
9057 
9058 	idx = 0;
9059 	while (TRUE) {
9060 		avail = free_tag_bits[idx];
9061 		if (avail) {
9062 			tag = (vm_tag_t)__builtin_clzll(avail);
9063 			avail &= ~(1ULL << (63 - tag));
9064 			free_tag_bits[idx] = avail;
9065 			tag += (idx << 6);
9066 			break;
9067 		}
9068 		idx++;
9069 		if (idx >= ARRAY_COUNT(free_tag_bits)) {
9070 			for (idx = 0; idx < ARRAY_COUNT(vm_allocation_sites); idx++) {
9071 				prev = vm_allocation_sites[idx];
9072 				if (!prev) {
9073 					continue;
9074 				}
9075 				if (!KA_NAME_LEN(prev)) {
9076 					continue;
9077 				}
9078 				if (!prev->tag) {
9079 					continue;
9080 				}
9081 				if (prev->total) {
9082 					continue;
9083 				}
9084 				if (1 != prev->refcount) {
9085 					continue;
9086 				}
9087 
9088 				assert(idx == prev->tag);
9089 				tag = (vm_tag_t)idx;
9090 				prev->tag = VM_KERN_MEMORY_NONE;
9091 				*releasesiteP = prev;
9092 				break;
9093 			}
9094 			if (idx >= ARRAY_COUNT(vm_allocation_sites)) {
9095 				tag = VM_KERN_MEMORY_ANY;
9096 			}
9097 			break;
9098 		}
9099 	}
9100 	site->tag = tag;
9101 
9102 	OSAddAtomic16(1, &site->refcount);
9103 
9104 	if (VM_KERN_MEMORY_ANY != tag) {
9105 		vm_allocation_sites[tag] = site;
9106 	}
9107 
9108 	if (tag > vm_allocation_tag_highest) {
9109 		vm_allocation_tag_highest = tag;
9110 	}
9111 }
9112 
9113 static void
vm_tag_free_locked(vm_tag_t tag)9114 vm_tag_free_locked(vm_tag_t tag)
9115 {
9116 	uint64_t avail;
9117 	uint32_t idx;
9118 	uint64_t bit;
9119 
9120 	if (VM_KERN_MEMORY_ANY == tag) {
9121 		return;
9122 	}
9123 
9124 	idx = (tag >> 6);
9125 	avail = free_tag_bits[idx];
9126 	tag &= 63;
9127 	bit = (1ULL << (63 - tag));
9128 	assert(!(avail & bit));
9129 	free_tag_bits[idx] = (avail | bit);
9130 }
9131 
9132 static void
vm_tag_init(void)9133 vm_tag_init(void)
9134 {
9135 	vm_tag_t tag;
9136 	for (tag = VM_KERN_MEMORY_FIRST_DYNAMIC; tag < VM_KERN_MEMORY_ANY; tag++) {
9137 		vm_tag_free_locked(tag);
9138 	}
9139 
9140 	for (tag = VM_KERN_MEMORY_ANY + 1; tag < VM_MAX_TAG_VALUE; tag++) {
9141 		vm_tag_free_locked(tag);
9142 	}
9143 }
9144 
9145 vm_tag_t
vm_tag_alloc(vm_allocation_site_t * site)9146 vm_tag_alloc(vm_allocation_site_t * site)
9147 {
9148 	vm_allocation_site_t * releasesite;
9149 
9150 	if (!site->tag) {
9151 		releasesite = NULL;
9152 		lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
9153 		vm_tag_alloc_locked(site, &releasesite);
9154 		lck_ticket_unlock(&vm_allocation_sites_lock);
9155 		if (releasesite) {
9156 			kern_allocation_name_release(releasesite);
9157 		}
9158 	}
9159 
9160 	return site->tag;
9161 }
9162 
9163 void
vm_tag_update_size(vm_tag_t tag,int64_t delta)9164 vm_tag_update_size(vm_tag_t tag, int64_t delta)
9165 {
9166 	assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
9167 
9168 	kern_allocation_update_size(vm_allocation_sites[tag], delta);
9169 }
9170 
9171 uint64_t
vm_tag_get_size(vm_tag_t tag)9172 vm_tag_get_size(vm_tag_t tag)
9173 {
9174 	vm_allocation_site_t *allocation;
9175 
9176 	assert(VM_KERN_MEMORY_NONE != tag && tag < VM_MAX_TAG_VALUE);
9177 
9178 	allocation = vm_allocation_sites[tag];
9179 	return allocation ? os_atomic_load(&allocation->total, relaxed) : 0;
9180 }
9181 
9182 void
kern_allocation_update_size(kern_allocation_name_t allocation,int64_t delta)9183 kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta)
9184 {
9185 	uint64_t value;
9186 
9187 	value = os_atomic_add(&allocation->total, delta, relaxed);
9188 	if (delta < 0) {
9189 		assertf(value + (uint64_t)-delta > value,
9190 		    "tag %d, site %p", allocation->tag, allocation);
9191 	}
9192 
9193 #if DEBUG || DEVELOPMENT
9194 	if (value > allocation->peak) {
9195 		os_atomic_max(&allocation->peak, value, relaxed);
9196 	}
9197 #endif /* DEBUG || DEVELOPMENT */
9198 
9199 	if (value == (uint64_t)delta && !allocation->tag) {
9200 		vm_tag_alloc(allocation);
9201 	}
9202 }
9203 
9204 #if VM_TAG_SIZECLASSES
9205 
9206 void
vm_allocation_zones_init(void)9207 vm_allocation_zones_init(void)
9208 {
9209 	vm_offset_t   addr;
9210 	vm_size_t     size;
9211 
9212 	const vm_tag_t early_tags[] = {
9213 		VM_KERN_MEMORY_DIAG,
9214 		VM_KERN_MEMORY_KALLOC,
9215 		VM_KERN_MEMORY_KALLOC_DATA,
9216 		VM_KERN_MEMORY_KALLOC_TYPE,
9217 		VM_KERN_MEMORY_LIBKERN,
9218 		VM_KERN_MEMORY_OSFMK,
9219 		VM_KERN_MEMORY_RECOUNT,
9220 	};
9221 
9222 	size = VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *)
9223 	    + ARRAY_COUNT(early_tags) * VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
9224 
9225 	kmem_alloc(kernel_map, &addr, round_page(size),
9226 	    KMA_NOFAIL | KMA_KOBJECT | KMA_ZERO | KMA_PERMANENT,
9227 	    VM_KERN_MEMORY_DIAG);
9228 
9229 	vm_allocation_zone_totals = (vm_allocation_zone_total_t **) addr;
9230 	addr += VM_MAX_TAG_VALUE * sizeof(vm_allocation_zone_total_t * *);
9231 
9232 	// prepopulate early tag ranges so allocations
9233 	// in vm_tag_update_zone_size() and early boot won't recurse
9234 	for (size_t i = 0; i < ARRAY_COUNT(early_tags); i++) {
9235 		vm_allocation_zone_totals[early_tags[i]] = (vm_allocation_zone_total_t *)addr;
9236 		addr += VM_TAG_SIZECLASSES * sizeof(vm_allocation_zone_total_t);
9237 	}
9238 }
9239 
9240 __attribute__((noinline))
9241 static vm_tag_t
vm_tag_zone_stats_alloc(vm_tag_t tag,zalloc_flags_t flags)9242 vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
9243 {
9244 	vm_allocation_zone_total_t *stats;
9245 	vm_size_t size = sizeof(*stats) * VM_TAG_SIZECLASSES;
9246 
9247 	flags = Z_VM_TAG(Z_ZERO | flags, VM_KERN_MEMORY_DIAG);
9248 	stats = kalloc_data(size, flags);
9249 	if (!stats) {
9250 		return VM_KERN_MEMORY_NONE;
9251 	}
9252 	if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
9253 		kfree_data(stats, size);
9254 	}
9255 	return tag;
9256 }
9257 
9258 vm_tag_t
vm_tag_will_update_zone(vm_tag_t tag,uint32_t zidx,uint32_t zflags)9259 vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags)
9260 {
9261 	assert(VM_KERN_MEMORY_NONE != tag);
9262 	assert(tag < VM_MAX_TAG_VALUE);
9263 
9264 	if (zidx >= VM_TAG_SIZECLASSES) {
9265 		return VM_KERN_MEMORY_NONE;
9266 	}
9267 
9268 	if (__probable(vm_allocation_zone_totals[tag])) {
9269 		return tag;
9270 	}
9271 	return vm_tag_zone_stats_alloc(tag, zflags);
9272 }
9273 
9274 void
vm_tag_update_zone_size(vm_tag_t tag,uint32_t zidx,long delta)9275 vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
9276 {
9277 	vm_allocation_zone_total_t *stats;
9278 	vm_size_t value;
9279 
9280 	assert(VM_KERN_MEMORY_NONE != tag);
9281 	assert(tag < VM_MAX_TAG_VALUE);
9282 
9283 	if (zidx >= VM_TAG_SIZECLASSES) {
9284 		return;
9285 	}
9286 
9287 	stats = vm_allocation_zone_totals[tag];
9288 	assert(stats);
9289 	stats += zidx;
9290 
9291 	value = os_atomic_add(&stats->vazt_total, delta, relaxed);
9292 	if (delta < 0) {
9293 		assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
9294 		return;
9295 	} else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
9296 		os_atomic_max(&stats->vazt_peak, value, relaxed);
9297 	}
9298 }
9299 
9300 #endif /* VM_TAG_SIZECLASSES */
9301 
9302 void
kern_allocation_update_subtotal(kern_allocation_name_t allocation,uint32_t subtag,int64_t delta)9303 kern_allocation_update_subtotal(kern_allocation_name_t allocation, uint32_t subtag, int64_t delta)
9304 {
9305 	kern_allocation_name_t other;
9306 	struct vm_allocation_total * total;
9307 	uint32_t subidx;
9308 
9309 	subidx = 0;
9310 	assert(VM_KERN_MEMORY_NONE != subtag);
9311 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
9312 	for (; subidx < allocation->subtotalscount; subidx++) {
9313 		if (VM_KERN_MEMORY_NONE == allocation->subtotals[subidx].tag) {
9314 			allocation->subtotals[subidx].tag = (vm_tag_t)subtag;
9315 			break;
9316 		}
9317 		if (subtag == allocation->subtotals[subidx].tag) {
9318 			break;
9319 		}
9320 	}
9321 	lck_ticket_unlock(&vm_allocation_sites_lock);
9322 	assert(subidx < allocation->subtotalscount);
9323 	if (subidx >= allocation->subtotalscount) {
9324 		return;
9325 	}
9326 
9327 	total = &allocation->subtotals[subidx];
9328 	other = vm_allocation_sites[subtag];
9329 	assert(other);
9330 
9331 	if (delta < 0) {
9332 		assertf(total->total >= ((uint64_t)-delta), "name %p", allocation);
9333 		assertf(other->mapped >= ((uint64_t)-delta), "other %p", other);
9334 	}
9335 	OSAddAtomic64(delta, &other->mapped);
9336 	OSAddAtomic64(delta, &total->total);
9337 }
9338 
9339 const char *
kern_allocation_get_name(kern_allocation_name_t allocation)9340 kern_allocation_get_name(kern_allocation_name_t allocation)
9341 {
9342 	return KA_NAME(allocation);
9343 }
9344 
9345 kern_allocation_name_t
kern_allocation_name_allocate(const char * name,uint16_t subtotalscount)9346 kern_allocation_name_allocate(const char * name, uint16_t subtotalscount)
9347 {
9348 	kern_allocation_name_t allocation;
9349 	uint16_t namelen;
9350 
9351 	namelen = (uint16_t)strnlen(name, MACH_MEMORY_INFO_NAME_MAX_LEN - 1);
9352 
9353 	allocation = kalloc_data(KA_SIZE(namelen, subtotalscount), Z_WAITOK | Z_ZERO);
9354 	allocation->refcount       = 1;
9355 	allocation->subtotalscount = subtotalscount;
9356 	allocation->flags          = (uint16_t)(namelen << VM_TAG_NAME_LEN_SHIFT);
9357 	strlcpy(KA_NAME(allocation), name, namelen + 1);
9358 
9359 	vm_tag_alloc(allocation);
9360 	return allocation;
9361 }
9362 
9363 void
kern_allocation_name_release(kern_allocation_name_t allocation)9364 kern_allocation_name_release(kern_allocation_name_t allocation)
9365 {
9366 	assert(allocation->refcount > 0);
9367 	if (1 == OSAddAtomic16(-1, &allocation->refcount)) {
9368 		kfree_data(allocation,
9369 		    KA_SIZE(KA_NAME_LEN(allocation), allocation->subtotalscount));
9370 	}
9371 }
9372 
9373 vm_tag_t
kern_allocation_name_get_vm_tag(kern_allocation_name_t allocation)9374 kern_allocation_name_get_vm_tag(kern_allocation_name_t allocation)
9375 {
9376 	return vm_tag_alloc(allocation);
9377 }
9378 
9379 #if !VM_TAG_ACTIVE_UPDATE
9380 static void
vm_page_count_object(mach_memory_info_t * info,unsigned int __unused num_info,vm_object_t object)9381 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
9382 {
9383 	if (!object->wired_page_count) {
9384 		return;
9385 	}
9386 	if (object != kernel_object) {
9387 		assert(object->wire_tag < num_info);
9388 		info[object->wire_tag].size += ptoa_64(object->wired_page_count);
9389 	}
9390 }
9391 
9392 typedef void (*vm_page_iterate_proc)(mach_memory_info_t * info,
9393     unsigned int num_info, vm_object_t object);
9394 
9395 static void
vm_page_iterate_purgeable_objects(mach_memory_info_t * info,unsigned int num_info,vm_page_iterate_proc proc,purgeable_q_t queue,int group)9396 vm_page_iterate_purgeable_objects(mach_memory_info_t * info, unsigned int num_info,
9397     vm_page_iterate_proc proc, purgeable_q_t queue,
9398     int group)
9399 {
9400 	vm_object_t object;
9401 
9402 	for (object = (vm_object_t) queue_first(&queue->objq[group]);
9403 	    !queue_end(&queue->objq[group], (queue_entry_t) object);
9404 	    object = (vm_object_t) queue_next(&object->objq)) {
9405 		proc(info, num_info, object);
9406 	}
9407 }
9408 
9409 static void
vm_page_iterate_objects(mach_memory_info_t * info,unsigned int num_info,vm_page_iterate_proc proc)9410 vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
9411     vm_page_iterate_proc proc)
9412 {
9413 	vm_object_t     object;
9414 
9415 	lck_spin_lock_grp(&vm_objects_wired_lock, &vm_page_lck_grp_bucket);
9416 	queue_iterate(&vm_objects_wired,
9417 	    object,
9418 	    vm_object_t,
9419 	    wired_objq)
9420 	{
9421 		proc(info, num_info, object);
9422 	}
9423 	lck_spin_unlock(&vm_objects_wired_lock);
9424 }
9425 #endif /* ! VM_TAG_ACTIVE_UPDATE */
9426 
9427 static uint64_t
process_account(mach_memory_info_t * info,unsigned int num_info,uint64_t zones_collectable_bytes,boolean_t iterated,bool redact_info __unused)9428 process_account(mach_memory_info_t * info, unsigned int num_info,
9429     uint64_t zones_collectable_bytes, boolean_t iterated, bool redact_info __unused)
9430 {
9431 	size_t                 namelen;
9432 	unsigned int           idx, count, nextinfo;
9433 	vm_allocation_site_t * site;
9434 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
9435 
9436 	for (idx = 0; idx <= vm_allocation_tag_highest; idx++) {
9437 		site = vm_allocation_sites[idx];
9438 		if (!site) {
9439 			continue;
9440 		}
9441 		info[idx].mapped = site->mapped;
9442 		info[idx].tag    = site->tag;
9443 		if (!iterated) {
9444 			info[idx].size = site->total;
9445 #if DEBUG || DEVELOPMENT
9446 			info[idx].peak = site->peak;
9447 #endif /* DEBUG || DEVELOPMENT */
9448 		} else {
9449 			if (!site->subtotalscount && (site->total != info[idx].size)) {
9450 				printf("tag mismatch[%d] 0x%qx, iter 0x%qx\n", idx, site->total, info[idx].size);
9451 				info[idx].size = site->total;
9452 			}
9453 		}
9454 		info[idx].flags |= VM_KERN_SITE_WIRED;
9455 		if (idx < VM_KERN_MEMORY_FIRST_DYNAMIC) {
9456 			info[idx].site   = idx;
9457 			info[idx].flags |= VM_KERN_SITE_TAG;
9458 			if (VM_KERN_MEMORY_ZONE == idx) {
9459 				info[idx].flags |= VM_KERN_SITE_HIDE;
9460 				info[idx].flags &= ~VM_KERN_SITE_WIRED;
9461 				info[idx].collectable_bytes = zones_collectable_bytes;
9462 			}
9463 		} else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) {
9464 			info[idx].site   = 0;
9465 			info[idx].flags |= VM_KERN_SITE_NAMED;
9466 			if (namelen > sizeof(info[idx].name)) {
9467 				namelen = sizeof(info[idx].name);
9468 			}
9469 			strncpy(&info[idx].name[0], KA_NAME(site), namelen);
9470 		} else if (VM_TAG_KMOD & site->flags) {
9471 			info[idx].site   = OSKextGetKmodIDForSite(site, NULL, 0);
9472 			info[idx].flags |= VM_KERN_SITE_KMOD;
9473 		} else {
9474 			info[idx].site   = VM_KERNEL_UNSLIDE(site);
9475 			info[idx].flags |= VM_KERN_SITE_KERNEL;
9476 		}
9477 	}
9478 
9479 	nextinfo = (vm_allocation_tag_highest + 1);
9480 	count    = nextinfo;
9481 	if (count >= num_info) {
9482 		count = num_info;
9483 	}
9484 
9485 	for (idx = 0; idx < count; idx++) {
9486 		site = vm_allocation_sites[idx];
9487 		if (!site) {
9488 			continue;
9489 		}
9490 #if VM_TAG_SIZECLASSES
9491 		vm_allocation_zone_total_t * zone;
9492 		unsigned int                 zidx;
9493 
9494 		if (!redact_info
9495 		    && vm_allocation_zone_totals
9496 		    && (zone = vm_allocation_zone_totals[idx])
9497 		    && (nextinfo < num_info)) {
9498 			for (zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
9499 				if (!zone[zidx].vazt_peak) {
9500 					continue;
9501 				}
9502 				info[nextinfo]        = info[idx];
9503 				info[nextinfo].zone   = zone_index_from_tag_index(zidx);
9504 				info[nextinfo].flags  &= ~VM_KERN_SITE_WIRED;
9505 				info[nextinfo].flags  |= VM_KERN_SITE_ZONE;
9506 				info[nextinfo].flags  |= VM_KERN_SITE_KALLOC;
9507 				info[nextinfo].size   = zone[zidx].vazt_total;
9508 				info[nextinfo].peak   = zone[zidx].vazt_peak;
9509 				info[nextinfo].mapped = 0;
9510 				nextinfo++;
9511 			}
9512 		}
9513 #endif /* VM_TAG_SIZECLASSES */
9514 		if (site->subtotalscount) {
9515 			uint64_t mapped, mapcost, take;
9516 			uint32_t sub;
9517 			vm_tag_t alloctag;
9518 
9519 			info[idx].size = site->total;
9520 			mapped = info[idx].size;
9521 			info[idx].mapped = mapped;
9522 			mapcost = 0;
9523 			for (sub = 0; sub < site->subtotalscount; sub++) {
9524 				alloctag = site->subtotals[sub].tag;
9525 				assert(alloctag < num_info);
9526 				if (info[alloctag].name[0]) {
9527 					continue;
9528 				}
9529 				take = site->subtotals[sub].total;
9530 				if (take > info[alloctag].size) {
9531 					take = info[alloctag].size;
9532 				}
9533 				if (take > mapped) {
9534 					take = mapped;
9535 				}
9536 				info[alloctag].mapped  -= take;
9537 				info[alloctag].size    -= take;
9538 				mapped                 -= take;
9539 				mapcost                += take;
9540 			}
9541 			info[idx].size = mapcost;
9542 		}
9543 	}
9544 	lck_ticket_unlock(&vm_allocation_sites_lock);
9545 
9546 	return 0;
9547 }
9548 
9549 uint32_t
vm_page_diagnose_estimate(void)9550 vm_page_diagnose_estimate(void)
9551 {
9552 	vm_allocation_site_t * site;
9553 	uint32_t               count = zone_view_count;
9554 	uint32_t               idx;
9555 
9556 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
9557 	for (idx = 0; idx < VM_MAX_TAG_VALUE; idx++) {
9558 		site = vm_allocation_sites[idx];
9559 		if (!site) {
9560 			continue;
9561 		}
9562 		count++;
9563 #if VM_TAG_SIZECLASSES
9564 		if (vm_allocation_zone_totals) {
9565 			vm_allocation_zone_total_t * zone;
9566 			zone = vm_allocation_zone_totals[idx];
9567 			if (!zone) {
9568 				continue;
9569 			}
9570 			for (uint32_t zidx = 0; zidx < VM_TAG_SIZECLASSES; zidx++) {
9571 				count += (zone[zidx].vazt_peak != 0);
9572 			}
9573 		}
9574 #endif
9575 	}
9576 	lck_ticket_unlock(&vm_allocation_sites_lock);
9577 
9578 	/* some slop for new tags created */
9579 	count += 8;
9580 	count += VM_KERN_COUNTER_COUNT;
9581 
9582 	return count;
9583 }
9584 
9585 static void
vm_page_diagnose_zone_stats(mach_memory_info_t * info,zone_stats_t zstats,bool percpu)9586 vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
9587     bool percpu)
9588 {
9589 	zpercpu_foreach(zs, zstats) {
9590 		info->size += zs->zs_mem_allocated - zs->zs_mem_freed;
9591 	}
9592 	if (percpu) {
9593 		info->size *= zpercpu_count();
9594 	}
9595 	info->flags |= VM_KERN_SITE_NAMED | VM_KERN_SITE_ZONE_VIEW;
9596 }
9597 
9598 static void
vm_page_diagnose_zone(mach_memory_info_t * info,zone_t z)9599 vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
9600 {
9601 	vm_page_diagnose_zone_stats(info, z->z_stats, z->z_percpu);
9602 	snprintf(info->name, sizeof(info->name),
9603 	    "%s%s[raw]", zone_heap_name(z), z->z_name);
9604 }
9605 
9606 static int
vm_page_diagnose_heap(mach_memory_info_t * info,kalloc_heap_t kheap)9607 vm_page_diagnose_heap(mach_memory_info_t *info, kalloc_heap_t kheap)
9608 {
9609 	struct kalloc_heap *kh = kheap->kh_views;
9610 	int i = 0;
9611 
9612 	for (; i < KHEAP_NUM_ZONES; i++) {
9613 		vm_page_diagnose_zone(info + i, zone_by_id(kheap->kh_zstart + i));
9614 	}
9615 
9616 	while (kh) {
9617 		vm_page_diagnose_zone_stats(info + i, kh->kh_stats, false);
9618 		snprintf(info[i].name, sizeof(info[i].name),
9619 		    "%skalloc[%s]", kheap->kh_name, kh->kh_name);
9620 		kh = kh->kh_views;
9621 		i++;
9622 	}
9623 
9624 	return i;
9625 }
9626 
9627 static int
vm_page_diagnose_kt_heaps(mach_memory_info_t * info)9628 vm_page_diagnose_kt_heaps(mach_memory_info_t *info)
9629 {
9630 	uint32_t idx = 0;
9631 	vm_page_diagnose_zone_stats(info + idx, KHEAP_KT_VAR->kh_stats, false);
9632 	snprintf(info[idx].name, sizeof(info[idx].name),
9633 	    "%s[raw]", KHEAP_KT_VAR->kh_name);
9634 	idx++;
9635 
9636 	for (uint32_t i = 0; i < KT_VAR_MAX_HEAPS; i++) {
9637 		struct kheap_info heap = kalloc_type_heap_array[i];
9638 
9639 		for (kalloc_type_var_view_t ktv = heap.kt_views; ktv;
9640 		    ktv = (kalloc_type_var_view_t) ktv->kt_next) {
9641 			if (ktv->kt_stats && ktv->kt_stats != KHEAP_KT_VAR->kh_stats) {
9642 				vm_page_diagnose_zone_stats(info + idx, ktv->kt_stats, false);
9643 				snprintf(info[idx].name, sizeof(info[idx].name),
9644 				    "%s[%s]", KHEAP_KT_VAR->kh_name, ktv->kt_name);
9645 				idx++;
9646 			}
9647 		}
9648 	}
9649 
9650 	return idx;
9651 }
9652 
/*
 * Populate @info (an array of @num_info entries, zeroed here) with kernel
 * wired-memory accounting: fixed global counters at the tail of the array,
 * per-zone/heap detail (unless @redact_info), and per-tag wired sizes —
 * gathered by walking the kernel map when tags are not actively updated.
 * Returns KERN_ABORTED if called before VM bootstrap has initialized
 * vm_page_wire_count_initial.
 */
kern_return_t
vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, bool redact_info)
{
	uint64_t                 wired_size;
	uint64_t                 wired_managed_size;
	uint64_t                 wired_reserved_size;
	boolean_t                iterate;
	mach_memory_info_t     * counts;
	uint32_t                 i;

	bzero(info, num_info * sizeof(mach_memory_info_t));

	/* too early in boot to report anything meaningful */
	if (!vm_page_wire_count_initial) {
		return KERN_ABORTED;
	}

	/* wired/reserved totals are composed differently on macOS vs embedded */
#if !XNU_TARGET_OS_OSX
	wired_size          = ptoa_64(vm_page_wire_count);
	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
#else /* !XNU_TARGET_OS_OSX */
	wired_size          = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count);
	wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count + vm_page_throttled_count);
#endif /* !XNU_TARGET_OS_OSX */
	wired_managed_size  = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);

	wired_size += booter_size;

	/* the fixed counters occupy the last VM_KERN_COUNTER_COUNT slots */
	assert(num_info >= VM_KERN_COUNTER_COUNT);
	num_info -= VM_KERN_COUNTER_COUNT;
	counts = &info[num_info];

#define SET_COUNT(xcount, xsize, xflags)                        \
    counts[xcount].tag   = VM_MAX_TAG_VALUE + xcount;   \
    counts[xcount].site  = (xcount);                            \
    counts[xcount].size  = (xsize);                                 \
    counts[xcount].mapped  = (xsize);                           \
    counts[xcount].flags = VM_KERN_SITE_COUNTER | xflags;

	SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0);
	SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0);
	SET_COUNT(VM_KERN_COUNT_WIRED_MANAGED, wired_managed_size, 0);
	SET_COUNT(VM_KERN_COUNT_RESERVED, wired_reserved_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_STOLEN, ptoa_64(vm_page_stolen_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_LOPAGE, ptoa_64(vm_lopage_free_count), VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_BOOT, ptoa_64(vm_page_wire_count_on_boot), 0);
	SET_COUNT(VM_KERN_COUNT_BOOT_STOLEN, booter_size, VM_KERN_SITE_WIRED);
	SET_COUNT(VM_KERN_COUNT_WIRED_STATIC_KERNELCACHE, ptoa_64(vm_page_kernelcache_count), 0);

#define SET_MAP(xcount, xsize, xfree, xlargest) \
    counts[xcount].site    = (xcount);                  \
    counts[xcount].size    = (xsize);                   \
    counts[xcount].mapped  = (xsize);                   \
    counts[xcount].free    = (xfree);                   \
    counts[xcount].largest = (xlargest);                \
    counts[xcount].flags   = VM_KERN_SITE_COUNTER;

	vm_map_size_t map_size, map_free, map_largest;

	vm_map_sizes(kernel_map, &map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_KERNEL, map_size, map_free, map_largest);

	zone_map_sizes(&map_size, &map_free, &map_largest);
	SET_MAP(VM_KERN_COUNT_MAP_ZONE, map_size, map_free, map_largest);

	/* zone-view entries sit just before the fixed counters */
	assert(num_info >= zone_view_count);
	num_info -= zone_view_count;
	counts = &info[num_info];
	i = 0;

	if (!redact_info) {
		/* per-heap zone detail */
		i += vm_page_diagnose_heap(counts + i, KHEAP_DEFAULT);
		if (KHEAP_DATA_BUFFERS->kh_heap_id == KHEAP_ID_DATA_BUFFERS) {
			i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS);
		}
		if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) {
			i += vm_page_diagnose_kt_heaps(counts + i);
		}
		assert(i <= zone_view_count);

		/* per-zone views (and raw entries where appropriate) */
		zone_index_foreach(zidx) {
			zone_t z = &zone_array[zidx];
			zone_security_flags_t zsflags = zone_security_array[zidx];
			zone_view_t zv = z->z_views;

			if (zv == NULL) {
				continue;
			}

			zone_stats_t zv_stats_head = z->z_stats;
			bool has_raw_view = false;

			for (; zv; zv = zv->zv_next) {
				/*
				 * kalloc_types that allocate from the same zone are linked
				 * as views. Only print the ones that have their own stats.
				 */
				if (zv->zv_stats == zv_stats_head) {
					continue;
				}
				has_raw_view = true;
				vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
				    z->z_percpu);
				snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
				    zone_heap_name(z), z->z_name, zv->zv_name);
				i++;
				assert(i <= zone_view_count);
			}

			/*
			 * Print raw views for non kalloc or kalloc_type zones
			 */
			bool kalloc_type = zsflags.z_kalloc_type;
			if ((zsflags.z_kheap_id == KHEAP_ID_NONE && !kalloc_type) ||
			    (kalloc_type && has_raw_view)) {
				vm_page_diagnose_zone(counts + i, z);
				i++;
				assert(i <= zone_view_count);
			}
		}
	}

	/*
	 * When tags are not maintained incrementally, walk the kernel map
	 * (one submap level deep) and attribute wired kernel_object pages
	 * to their allocation tags.
	 */
	iterate = !VM_TAG_ACTIVE_UPDATE;
	if (iterate) {
		enum                       { kMaxKernelDepth = 1 };
		vm_map_t                     maps[kMaxKernelDepth];
		vm_map_entry_t               entries[kMaxKernelDepth];
		vm_map_t                     map;
		vm_map_entry_t               entry;
		vm_object_offset_t           offset;
		vm_page_t                    page;
		int                          stackIdx, count;

#if !VM_TAG_ACTIVE_UPDATE
		vm_page_iterate_objects(info, num_info, &vm_page_count_object);
#endif /* ! VM_TAG_ACTIVE_UPDATE */

		map = kernel_map;
		stackIdx = 0;
		while (map) {
			vm_map_lock(map);
			/*
			 * NOTE: the loop condition tests "map", not "entry" —
			 * submap descent and end-of-map popping inside the body
			 * adjust both variables.
			 */
			for (entry = map->hdr.links.next; map; entry = entry->vme_next) {
				if (entry->is_sub_map) {
					/* descend into the submap; parent stays locked on the stack */
					assert(stackIdx < kMaxKernelDepth);
					maps[stackIdx] = map;
					entries[stackIdx] = entry;
					stackIdx++;
					map = VME_SUBMAP(entry);
					entry = NULL;
					break;
				}
				if (VME_OBJECT(entry) == kernel_object) {
					/* count the wired pages covered by this entry */
					count = 0;
					vm_object_lock(VME_OBJECT(entry));
					for (offset = entry->vme_start; offset < entry->vme_end; offset += page_size) {
						page = vm_page_lookup(VME_OBJECT(entry), offset);
						if (page && VM_PAGE_WIRED(page)) {
							count++;
						}
					}
					vm_object_unlock(VME_OBJECT(entry));

					if (count) {
						assert(VME_ALIAS(entry) != VM_KERN_MEMORY_NONE);
						assert(VME_ALIAS(entry) < num_info);
						/* attribute the wired bytes to the entry's tag */
						info[VME_ALIAS(entry)].size += ptoa_64(count);
					}
				}
				/* pop back to parent map(s) once this map is exhausted */
				while (map && (entry == vm_map_last_entry(map))) {
					vm_map_unlock(map);
					if (!stackIdx) {
						map = NULL;
					} else {
						--stackIdx;
						map = maps[stackIdx];
						entry = entries[stackIdx];
					}
				}
			}
		}
	}

	/* fold per-site subtotals and zone data into the final report */
	process_account(info, num_info, zones_collectable_bytes, iterate, redact_info);

	return KERN_SUCCESS;
}
9838 
9839 #if DEBUG || DEVELOPMENT
9840 
/*
 * Look up the size and tag of the kernel allocation containing @addr.
 * Zone-backed allocations are resolved via zone_element_info() (which
 * also sets *tag); otherwise the kernel map — at most one submap level
 * deep — is searched for an entry starting exactly at @addr.
 * *zone_size is non-zero only for zone allocations.
 * Returns KERN_INVALID_ADDRESS when no matching map entry is found.
 */
kern_return_t
vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_size_t * zone_size)
{
	kern_return_t  ret;
	vm_size_t      zsize;
	vm_map_t       map;
	vm_map_entry_t entry;

	/* fast path: the address belongs to a zone element */
	zsize = zone_element_info((void *) addr, tag);
	if (zsize) {
		*zone_size = *size = zsize;
		return KERN_SUCCESS;
	}

	*zone_size = 0;
	ret = KERN_INVALID_ADDRESS;
	/*
	 * Lock discipline: kernel_map is locked first; descending into a
	 * submap locks that map too while kernel_map remains held.  All
	 * exits from the loop leave "map" (and kernel_map) locked — they
	 * are released together below.
	 */
	for (map = kernel_map; map;) {
		vm_map_lock(map);
		if (!vm_map_lookup_entry_allow_pgz(map, addr, &entry)) {
			break;
		}
		if (entry->is_sub_map) {
			/* only one level of submap nesting is followed */
			if (map != kernel_map) {
				break;
			}
			map = VME_SUBMAP(entry);
			continue;
		}
		/* only exact allocation starts are reported */
		if (entry->vme_start != addr) {
			break;
		}
		*tag = (vm_tag_t)VME_ALIAS(entry);
		*size = (entry->vme_end - addr);
		ret = KERN_SUCCESS;
		break;
	}
	if (map != kernel_map) {
		vm_map_unlock(map);
	}
	vm_map_unlock(kernel_map);

	return ret;
}
9884 
9885 #endif /* DEBUG || DEVELOPMENT */
9886 
9887 uint32_t
vm_tag_get_kext(vm_tag_t tag,char * name,vm_size_t namelen)9888 vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
9889 {
9890 	vm_allocation_site_t * site;
9891 	uint32_t               kmodId;
9892 
9893 	kmodId = 0;
9894 	lck_ticket_lock(&vm_allocation_sites_lock, &vm_page_lck_grp_bucket);
9895 	if ((site = vm_allocation_sites[tag])) {
9896 		if (VM_TAG_KMOD & site->flags) {
9897 			kmodId = OSKextGetKmodIDForSite(site, name, namelen);
9898 		}
9899 	}
9900 	lck_ticket_unlock(&vm_allocation_sites_lock);
9901 
9902 	return kmodId;
9903 }
9904 
9905 
9906 #if CONFIG_SECLUDED_MEMORY
9907 /*
9908  * Note that there's no locking around other accesses to vm_page_secluded_target.
9909  * That should be OK, since these are the only place where it can be changed after
9910  * initialization. Other users (like vm_pageout) may see the wrong value briefly,
9911  * but will eventually get the correct value. This brief mismatch is OK as pageout
9912  * and page freeing will auto-adjust the vm_page_secluded_count to match the target
9913  * over time.
9914  */
9915 unsigned int vm_page_secluded_suppress_cnt = 0;
9916 unsigned int vm_page_secluded_save_target;
9917 
9918 LCK_GRP_DECLARE(secluded_suppress_slock_grp, "secluded_suppress_slock");
9919 LCK_SPIN_DECLARE(secluded_suppress_slock, &secluded_suppress_slock_grp);
9920 
9921 void
start_secluded_suppression(task_t task)9922 start_secluded_suppression(task_t task)
9923 {
9924 	if (task->task_suppressed_secluded) {
9925 		return;
9926 	}
9927 	lck_spin_lock(&secluded_suppress_slock);
9928 	if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
9929 		task->task_suppressed_secluded = TRUE;
9930 		vm_page_secluded_save_target = vm_page_secluded_target;
9931 		vm_page_secluded_target = 0;
9932 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
9933 	}
9934 	lck_spin_unlock(&secluded_suppress_slock);
9935 }
9936 
9937 void
stop_secluded_suppression(task_t task)9938 stop_secluded_suppression(task_t task)
9939 {
9940 	lck_spin_lock(&secluded_suppress_slock);
9941 	if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
9942 		task->task_suppressed_secluded = FALSE;
9943 		vm_page_secluded_target = vm_page_secluded_save_target;
9944 		VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE();
9945 	}
9946 	lck_spin_unlock(&secluded_suppress_slock);
9947 }
9948 
9949 #endif /* CONFIG_SECLUDED_MEMORY */
9950 
9951 /*
9952  * Move the list of retired pages on the vm_page_queue_retired to
9953  * their final resting place on retired_pages_object.
9954  */
void
vm_retire_boot_pages(void)
{
	/*
	 * NOTE(review): intentionally empty in this build — the comment
	 * above describes the intended page-retirement behavior, but no
	 * handling is compiled in here (presumably gated by a config
	 * option elsewhere; verify before relying on it).
	 */
}
9959 
9960 /*
9961  * This holds the reported physical address if an ECC error leads to a panic.
9962  * SMC will store it in PMU SRAM under the 'sECC' key.
9963  */
9964 uint64_t ecc_panic_physical_address = 0;
9965 
9966