xref: /xnu-12377.1.9/osfmk/vm/vm_pageout.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69 
70 #include <debug.h>
71 
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80 
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92 
93 #include <os/atomic_private.h>
94 #include <os/log.h>
95 
96 #include <machine/vm_tuning.h>
97 #include <machine/commpage.h>
98 
99 #include <vm/pmap.h>
100 #include <vm/vm_compressor_pager_internal.h>
101 #include <vm/vm_fault_internal.h>
102 #include <vm/vm_log.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115 #include <vm/vm_reclaim_xnu.h>
116 
117 #include <san/kasan.h>
118 #include <sys/kdebug_triage.h>
119 #include <sys/kern_memorystatus_xnu.h>
120 #include <sys/kdebug.h>
121 
122 #if CONFIG_PHANTOM_CACHE
123 #include <vm/vm_phantom_cache_internal.h>
124 #endif
125 
126 
127 #if UPL_DEBUG
128 #include <libkern/OSDebug.h>
129 #endif
130 
131 os_log_t vm_log_handle = OS_LOG_DEFAULT;
132 TUNABLE(bool, vm_log_to_serial, "vm_log_to_serial", false);
133 TUNABLE(bool, vm_log_debug_enabled, "vm_log_debug", false);
134 
135 extern int cs_debug;
136 
137 #if CONFIG_MBUF_MCACHE
138 extern void mbuf_drain(boolean_t);
139 #endif /* CONFIG_MBUF_MCACHE */
140 
141 #if CONFIG_FREEZE
142 extern unsigned int memorystatus_frozen_count;
143 extern unsigned int memorystatus_suspended_count;
144 #endif /* CONFIG_FREEZE */
145 extern vm_pressure_level_t memorystatus_vm_pressure_level;
146 
147 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
148 extern uint32_t memorystatus_jetsam_fg_band_waiters;
149 extern uint32_t memorystatus_jetsam_bg_band_waiters;
150 
151 void vm_pressure_response(void);
152 extern void consider_vm_pressure_events(void);
153 
154 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
155 
156 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
157 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
158 sched_cond_atomic_t vm_pageout_gc_cond;
159 #if CONFIG_VPS_DYNAMIC_PRIO
160 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
161 #else
162 const bool vps_dynamic_priority_enabled = false;
163 #endif
164 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
165 
166 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
167 #if !XNU_TARGET_OS_OSX
168 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
169 #else /* !XNU_TARGET_OS_OSX */
170 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
171 #endif /* !XNU_TARGET_OS_OSX */
172 #endif
173 
174 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
175 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
176 #endif
177 
178 #ifndef VM_PAGE_LAUNDRY_MAX
179 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
180 #endif  /* VM_PAGEOUT_LAUNDRY_MAX */
181 
182 #ifndef VM_PAGEOUT_BURST_WAIT
183 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
184 #endif  /* VM_PAGEOUT_BURST_WAIT */
185 
186 #ifndef VM_PAGEOUT_EMPTY_WAIT
187 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
188 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
189 
190 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
191 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
192 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
193 
194 #ifndef VM_PAGEOUT_IDLE_WAIT
195 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
196 #endif  /* VM_PAGEOUT_IDLE_WAIT */
197 
198 #ifndef VM_PAGEOUT_SWAP_WAIT
199 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
200 #endif  /* VM_PAGEOUT_SWAP_WAIT */
201 
202 /*
203  * vm_page_max_speculative_age_q should be less than or equal to
204  * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q which is number of allocated
205  * vm_page_queue_speculative entries.
206  */
207 
208 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
209 #ifndef VM_PAGE_SPECULATIVE_TARGET
210 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
211 #endif /* VM_PAGE_SPECULATIVE_TARGET */
212 
213 
214 /*
215  *	To obtain a reasonable LRU approximation, the inactive queue
216  *	needs to be large enough to give pages on it a chance to be
217  *	referenced a second time.  This macro defines the fraction
218  *	of active+inactive pages that should be inactive.
219  *	The pageout daemon uses it to update vm_page_inactive_target.
220  *
221  *	If vm_page_free_count falls below vm_page_free_target and
222  *	vm_page_inactive_count is below vm_page_inactive_target,
223  *	then the pageout daemon starts running.
224  */
225 
226 #ifndef VM_PAGE_INACTIVE_TARGET
227 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
228 #endif  /* VM_PAGE_INACTIVE_TARGET */
229 
230 /*
231  *	Once the pageout daemon starts running, it keeps going
232  *	until vm_page_free_count meets or exceeds vm_page_free_target.
233  */
234 
235 #ifndef VM_PAGE_FREE_TARGET
236 #if !XNU_TARGET_OS_OSX
237 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
238 #else /* !XNU_TARGET_OS_OSX */
239 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
240 #endif /* !XNU_TARGET_OS_OSX */
241 #endif  /* VM_PAGE_FREE_TARGET */
242 
243 
244 /*
245  *	The pageout daemon always starts running once vm_page_free_count
246  *	falls below vm_page_free_min.
247  */
248 
249 #ifndef VM_PAGE_FREE_MIN
250 #if !XNU_TARGET_OS_OSX
251 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
252 #else /* !XNU_TARGET_OS_OSX */
253 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
254 #endif /* !XNU_TARGET_OS_OSX */
255 #endif  /* VM_PAGE_FREE_MIN */
256 
257 #if !XNU_TARGET_OS_OSX
258 #define VM_PAGE_FREE_RESERVED_LIMIT     100
259 #define VM_PAGE_FREE_MIN_LIMIT          1500
260 #define VM_PAGE_FREE_TARGET_LIMIT       2000
261 #else /* !XNU_TARGET_OS_OSX */
262 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
263 #define VM_PAGE_FREE_MIN_LIMIT          3500
264 #define VM_PAGE_FREE_TARGET_LIMIT       4000
265 #endif /* !XNU_TARGET_OS_OSX */
266 
267 /*
268  *	When vm_page_free_count falls below vm_page_free_reserved,
269  *	only vm-privileged threads can allocate pages.  vm-privilege
270  *	allows the pageout daemon and default pager (and any other
271  *	associated threads needed for default pageout) to continue
272  *	operation by dipping into the reserved pool of pages.
273  */
274 
275 #ifndef VM_PAGE_FREE_RESERVED
276 #define VM_PAGE_FREE_RESERVED(n)        \
277 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
278 #endif  /* VM_PAGE_FREE_RESERVED */
279 
280 /*
281  *	When we dequeue pages from the inactive list, they are
282  *	reactivated (ie, put back on the active queue) if referenced.
283  *	However, it is possible to starve the free list if other
284  *	processors are referencing pages faster than we can turn off
285  *	the referenced bit.  So we limit the number of reactivations
286  *	we will make per call of vm_pageout_scan().
287  */
288 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
289 
290 #ifndef VM_PAGE_REACTIVATE_LIMIT
291 #if !XNU_TARGET_OS_OSX
292 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
293 #else /* !XNU_TARGET_OS_OSX */
294 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
295 #endif /* !XNU_TARGET_OS_OSX */
296 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
297 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
298 
299 int vm_pageout_protect_realtime = true;
300 
301 extern boolean_t hibernate_cleaning_in_progress;
302 
303 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
304 struct pgo_iothread_state pgo_iothread_external_state;
305 
306 #if VM_PRESSURE_EVENTS
307 void vm_pressure_thread(void);
308 
309 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
310 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
311 
312 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
313 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
314 #endif
315 
316 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
317 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
318 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
319 
320 extern void vm_pageout_continue(void);
321 extern void vm_pageout_scan(void);
322 
323 boolean_t vm_pageout_running = FALSE;
324 
325 uint32_t vm_page_upl_tainted = 0;
326 uint32_t vm_page_iopl_tainted = 0;
327 
328 #if XNU_TARGET_OS_OSX
329 static boolean_t vm_pageout_waiter  = FALSE;
330 #endif /* XNU_TARGET_OS_OSX */
331 
332 
333 #if DEVELOPMENT || DEBUG
334 struct vm_pageout_debug vm_pageout_debug;
335 #endif
336 struct vm_pageout_vminfo vm_pageout_vminfo;
337 struct vm_pageout_state  vm_pageout_state;
338 struct vm_config         vm_config;
339 
340 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
341 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
342 #if DEVELOPMENT || DEBUG
343 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
344 #endif /* DEVELOPMENT || DEBUG */
345 
346 int         vm_upl_wait_for_pages = 0;
347 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
348 
349 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
350 
351 int     vm_debug_events = 0;
352 
353 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
354 
355 #if CONFIG_MEMORYSTATUS
356 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
357 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
358 #endif
359 
360 #if __AMP__
361 
362 
363 /*
364  * Bind compressor threads to e-cores unless there are multiple non-e clusters
365  */
366 #if (MAX_CPU_CLUSTERS > 2)
367 #define VM_COMPRESSOR_EBOUND_DEFAULT false
368 #elif defined(XNU_TARGET_OS_XR)
369 #define VM_COMPRESSOR_EBOUND_DEFAULT false
370 #else
371 #define VM_COMPRESSOR_EBOUND_DEFAULT true
372 #endif
373 
374 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
375 int vm_pgo_pbound = 0;
376 extern kern_return_t thread_soft_bind_cluster_type(thread_t, char);
377 
378 #endif /* __AMP__ */
379 
380 
381 /*
382  *	Routine:	vm_pageout_object_terminate
383  *	Purpose:
384  *		Destroy the pageout_object, and perform all of the
385  *		required cleanup actions.
386  *
387  *	In/Out conditions:
388  *		The object must be locked, and will be returned locked.
389  */
390 void
vm_pageout_object_terminate(vm_object_t object)391 vm_pageout_object_terminate(
392 	vm_object_t     object)
393 {
394 	vm_object_t     shadow_object;
395 
396 	/*
397 	 * Deal with the deallocation (last reference) of a pageout object
398 	 * (used for cleaning-in-place) by dropping the paging references/
399 	 * freeing pages in the original object.
400 	 */
401 
402 	assert(object->pageout);
403 	shadow_object = object->shadow;
404 	vm_object_lock(shadow_object);
405 
406 	while (!vm_page_queue_empty(&object->memq)) {
407 		vm_page_t               p, m;
408 		vm_object_offset_t      offset;
409 
410 		p = (vm_page_t) vm_page_queue_first(&object->memq);
411 
412 		assert(vm_page_is_private(p));
413 		assert(p->vmp_free_when_done);
414 		p->vmp_free_when_done = FALSE;
415 		assert(!p->vmp_cleaning);
416 		assert(!p->vmp_laundry);
417 
418 		offset = p->vmp_offset;
419 		VM_PAGE_FREE(p);
420 		p = VM_PAGE_NULL;
421 
422 		m = vm_page_lookup(shadow_object,
423 		    offset + object->vo_shadow_offset);
424 
425 		if (m == VM_PAGE_NULL) {
426 			continue;
427 		}
428 
429 		assert((m->vmp_dirty) || (m->vmp_precious) ||
430 		    (m->vmp_busy && m->vmp_cleaning));
431 
432 		/*
433 		 * Handle the trusted pager throttle.
434 		 * Also decrement the burst throttle (if external).
435 		 */
436 		vm_page_lock_queues();
437 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
438 			vm_pageout_throttle_up(m);
439 		}
440 
441 		/*
442 		 * Handle the "target" page(s). These pages are to be freed if
443 		 * successfully cleaned. Target pages are always busy, and are
444 		 * wired exactly once. The initial target pages are not mapped,
445 		 * (so cannot be referenced or modified) but converted target
446 		 * pages may have been modified between the selection as an
447 		 * adjacent page and conversion to a target.
448 		 */
449 		if (m->vmp_free_when_done) {
450 			assert(m->vmp_busy);
451 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
452 			assert(m->vmp_wire_count == 1);
453 			m->vmp_cleaning = FALSE;
454 			m->vmp_free_when_done = FALSE;
455 			/*
456 			 * Revoke all access to the page. Since the object is
457 			 * locked, and the page is busy, this prevents the page
458 			 * from being dirtied after the pmap_disconnect() call
459 			 * returns.
460 			 *
461 			 * Since the page is left "dirty" but "not modifed", we
462 			 * can detect whether the page was redirtied during
463 			 * pageout by checking the modify state.
464 			 */
465 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
466 				SET_PAGE_DIRTY(m, FALSE);
467 			} else {
468 				m->vmp_dirty = FALSE;
469 			}
470 
471 			if (m->vmp_dirty) {
472 				vm_page_unwire(m, TRUE);        /* reactivates */
473 				counter_inc(&vm_statistics_reactivations);
474 				vm_page_wakeup_done(object, m);
475 			} else {
476 				vm_page_free(m);  /* clears busy, etc. */
477 			}
478 			vm_page_unlock_queues();
479 			continue;
480 		}
481 		/*
482 		 * Handle the "adjacent" pages. These pages were cleaned in
483 		 * place, and should be left alone.
484 		 * If prep_pin_count is nonzero, then someone is using the
485 		 * page, so make it active.
486 		 */
487 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
488 			if (m->vmp_reference) {
489 				vm_page_activate(m);
490 			} else {
491 				vm_page_deactivate(m);
492 			}
493 		}
494 		if (m->vmp_overwriting) {
495 			/*
496 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
497 			 */
498 			if (m->vmp_busy) {
499 				/*
500 				 * We do not re-set m->vmp_dirty !
501 				 * The page was busy so no extraneous activity
502 				 * could have occurred. COPY_INTO is a read into the
503 				 * new pages. CLEAN_IN_PLACE does actually write
504 				 * out the pages but handling outside of this code
505 				 * will take care of resetting dirty. We clear the
506 				 * modify however for the Programmed I/O case.
507 				 */
508 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
509 
510 				m->vmp_busy = FALSE;
511 				m->vmp_absent = FALSE;
512 			} else {
513 				/*
514 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
515 				 * Occurs when the original page was wired
516 				 * at the time of the list request
517 				 */
518 				assert(VM_PAGE_WIRED(m));
519 				vm_page_unwire(m, TRUE);        /* reactivates */
520 			}
521 			m->vmp_overwriting = FALSE;
522 		} else {
523 			m->vmp_dirty = FALSE;
524 		}
525 		m->vmp_cleaning = FALSE;
526 
527 		/*
528 		 * Wakeup any thread waiting for the page to be un-cleaning.
529 		 */
530 		vm_page_wakeup(object, m);
531 		vm_page_unlock_queues();
532 	}
533 	/*
534 	 * Account for the paging reference taken in vm_paging_object_allocate.
535 	 */
536 	vm_object_activity_end(shadow_object);
537 	vm_object_unlock(shadow_object);
538 
539 	assert(os_ref_get_count_raw(&object->ref_count) == 0);
540 	assert(object->paging_in_progress == 0);
541 	assert(object->activity_in_progress == 0);
542 	assert(object->resident_page_count == 0);
543 	return;
544 }
545 
546 /*
547  * Routine:	vm_pageclean_setup
548  *
549  * Purpose:	setup a page to be cleaned (made non-dirty), but not
550  *		necessarily flushed from the VM page cache.
551  *		This is accomplished by cleaning in place.
552  *
553  *		The page must not be busy, and new_object
554  *		must be locked.
555  *
556  */
557 static void
vm_pageclean_setup(vm_page_t m,vm_page_t new_m,vm_object_t new_object,vm_object_offset_t new_offset)558 vm_pageclean_setup(
559 	vm_page_t               m,
560 	vm_page_t               new_m,
561 	vm_object_t             new_object,
562 	vm_object_offset_t      new_offset)
563 {
564 	assert(!m->vmp_busy);
565 #if 0
566 	assert(!m->vmp_cleaning);
567 #endif
568 
569 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
570 
571 	/*
572 	 * Mark original page as cleaning in place.
573 	 */
574 	m->vmp_cleaning = TRUE;
575 	SET_PAGE_DIRTY(m, FALSE);
576 	m->vmp_precious = FALSE;
577 
578 	/*
579 	 * Convert the fictitious page to a private shadow of
580 	 * the real page.
581 	 */
582 	new_m->vmp_free_when_done = TRUE;
583 
584 	vm_page_lockspin_queues();
585 	vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
586 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
587 	vm_page_unlock_queues();
588 
589 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
590 	assert(!new_m->vmp_wanted);
591 	new_m->vmp_busy = FALSE;
592 }
593 
594 /*
595  *	Routine:	vm_pageout_initialize_page
596  *	Purpose:
597  *		Causes the specified page to be initialized in
598  *		the appropriate memory object. This routine is used to push
599  *		pages into a copy-object when they are modified in the
600  *		permanent object.
601  *
602  *		The page is moved to a temporary object and paged out.
603  *
604  *	In/out conditions:
605  *		The page in question must not be on any pageout queues.
606  *		The object to which it belongs must be locked.
607  *		The page must be busy, but not hold a paging reference.
608  *
609  *	Implementation:
610  *		Move this page to a completely new object.
611  */
612 void
vm_pageout_initialize_page(vm_page_t m)613 vm_pageout_initialize_page(
614 	vm_page_t       m)
615 {
616 	vm_object_t             object;
617 	vm_object_offset_t      paging_offset;
618 	memory_object_t         pager;
619 
620 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
621 
622 	object = VM_PAGE_OBJECT(m);
623 
624 	assert(m->vmp_busy);
625 	assert(object->internal);
626 
627 	/*
628 	 *	Verify that we really want to clean this page
629 	 */
630 	assert(!m->vmp_absent);
631 	assert(m->vmp_dirty);
632 
633 	/*
634 	 *	Create a paging reference to let us play with the object.
635 	 */
636 	paging_offset = m->vmp_offset + object->paging_offset;
637 
638 	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
639 		panic("reservation without pageout?"); /* alan */
640 
641 		VM_PAGE_FREE(m);
642 		vm_object_unlock(object);
643 
644 		return;
645 	}
646 
647 	/*
648 	 * If there's no pager, then we can't clean the page.  This should
649 	 * never happen since this should be a copy object and therefore not
650 	 * an external object, so the pager should always be there.
651 	 */
652 
653 	pager = object->pager;
654 
655 	if (pager == MEMORY_OBJECT_NULL) {
656 		panic("missing pager for copy object");
657 
658 		VM_PAGE_FREE(m);
659 		return;
660 	}
661 
662 	/*
663 	 * set the page for future call to vm_fault_list_request
664 	 */
665 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
666 	SET_PAGE_DIRTY(m, FALSE);
667 
668 	/*
669 	 * keep the object from collapsing or terminating
670 	 */
671 	vm_object_paging_begin(object);
672 	vm_object_unlock(object);
673 
674 	/*
675 	 *	Write the data to its pager.
676 	 *	Note that the data is passed by naming the new object,
677 	 *	not a virtual address; the pager interface has been
678 	 *	manipulated to use the "internal memory" data type.
679 	 *	[The object reference from its allocation is donated
680 	 *	to the eventual recipient.]
681 	 */
682 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
683 
684 	vm_object_lock(object);
685 	vm_object_paging_end(object);
686 }
687 
688 
689 /*
690  * vm_pageout_cluster:
691  *
692  * Given a page, queue it to the appropriate I/O thread,
693  * which will page it out and attempt to clean adjacent pages
694  * in the same operation.
695  *
696  * The object and queues must be locked. We will take a
697  * paging reference to prevent deallocation or collapse when we
698  * release the object lock back at the call site.  The I/O thread
699  * is responsible for consuming this reference
700  *
701  * The page must not be on any pageout queue.
702  */
703 #if DEVELOPMENT || DEBUG
704 vmct_stats_t vmct_stats;
705 
706 int32_t vmct_active = 0;
707 uint64_t vm_compressor_epoch_start = 0;
708 uint64_t vm_compressor_epoch_stop = 0;
709 
710 typedef enum vmct_state_t {
711 	VMCT_IDLE,
712 	VMCT_AWAKENED,
713 	VMCT_ACTIVE,
714 } vmct_state_t;
715 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
716 #endif
717 
718 
719 
720 static void
vm_pageout_cluster_to_queue(vm_page_t m,struct vm_pageout_queue * q)721 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
722 {
723 	vm_object_t object = VM_PAGE_OBJECT(m);
724 
725 	VM_PAGE_CHECK(m);
726 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
727 	vm_object_lock_assert_exclusive(object);
728 
729 	/*
730 	 * Make sure it's OK to page this out.
731 	 */
732 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
733 	assert(!m->vmp_cleaning && !m->vmp_laundry);
734 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
735 
736 	/*
737 	 * protect the object from collapse or termination
738 	 */
739 	vm_object_activity_begin(object);
740 
741 
742 	/*
743 	 * pgo_laundry count is tied to the laundry bit
744 	 */
745 	m->vmp_laundry = TRUE;
746 	q->pgo_laundry++;
747 
748 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
749 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
750 
751 	if (object->internal == TRUE) {
752 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
753 		m->vmp_busy = TRUE;
754 #if DEVELOPMENT || DEBUG
755 		/*
756 		 * The benchmark queue will be woken up independently by the benchmark
757 		 * itself.
758 		 */
759 		if (q != &vm_pageout_queue_benchmark) {
760 #else /* DEVELOPMENT || DEBUG */
761 		if (true) {
762 #endif /* DEVELOPMENT || DEBUG */
763 			/*
764 			 * Wake up the first compressor thread. It will wake subsequent
765 			 * threads if necessary.
766 			 */
767 			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
768 			    pgo_iothread_internal_state[0].pgo_iothread);
769 		}
770 	} else {
771 		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
772 	}
773 	VM_PAGE_CHECK(m);
774 }
775 
776 void
777 vm_pageout_cluster(vm_page_t m)
778 {
779 	struct          vm_pageout_queue *q;
780 	vm_object_t     object = VM_PAGE_OBJECT(m);
781 	if (object->internal) {
782 		q = &vm_pageout_queue_internal;
783 	} else {
784 		q = &vm_pageout_queue_external;
785 	}
786 	vm_pageout_cluster_to_queue(m, q);
787 }
788 
789 
790 /*
791  * A page is back from laundry or we are stealing it back from
792  * the laundering state.  See if there are some pages waiting to
793  * go to laundry and if we can let some of them go now.
794  *
795  * Object and page queues must be locked.
796  */
797 void
798 vm_pageout_throttle_up(
799 	vm_page_t       m)
800 {
801 	struct vm_pageout_queue *q;
802 	vm_object_t      m_object;
803 
804 	m_object = VM_PAGE_OBJECT(m);
805 
806 	assert(m_object != VM_OBJECT_NULL);
807 	assert(!is_kernel_object(m_object));
808 
809 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
810 	vm_object_lock_assert_exclusive(m_object);
811 
812 	if (m_object->internal == TRUE) {
813 		q = &vm_pageout_queue_internal;
814 	} else {
815 		q = &vm_pageout_queue_external;
816 	}
817 
818 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
819 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
820 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
821 
822 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
823 
824 		vm_object_activity_end(m_object);
825 
826 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
827 	}
828 	if (m->vmp_laundry == TRUE) {
829 		m->vmp_laundry = FALSE;
830 		q->pgo_laundry--;
831 
832 		if (q->pgo_throttled == TRUE) {
833 			q->pgo_throttled = FALSE;
834 			thread_wakeup((event_t) &q->pgo_laundry);
835 		}
836 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
837 			q->pgo_draining = FALSE;
838 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
839 		}
840 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
841 	}
842 }
843 
844 
845 static void
846 vm_pageout_throttle_up_batch(
847 	struct vm_pageout_queue *q,
848 	int             batch_cnt)
849 {
850 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
851 
852 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
853 
854 	q->pgo_laundry -= batch_cnt;
855 
856 	if (q->pgo_throttled == TRUE) {
857 		q->pgo_throttled = FALSE;
858 		thread_wakeup((event_t) &q->pgo_laundry);
859 	}
860 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
861 		q->pgo_draining = FALSE;
862 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
863 	}
864 }
865 
866 
867 
868 /*
869  * VM memory pressure monitoring.
870  *
871  * vm_pageout_scan() keeps track of the number of pages it considers and
872  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
873  *
874  * compute_memory_pressure() is called every second from compute_averages()
875  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
876  * of recalimed pages in a new vm_pageout_stat[] bucket.
877  *
878  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
879  * The caller provides the number of seconds ("nsecs") worth of statistics
880  * it wants, up to 30 seconds.
881  * It computes the number of pages reclaimed in the past "nsecs" seconds and
882  * also returns the number of pages the system still needs to reclaim at this
883  * moment in time.
884  */
885 #if DEVELOPMENT || DEBUG
886 #define VM_PAGEOUT_STAT_SIZE    (30 * 8) + 1
887 #else
888 #define VM_PAGEOUT_STAT_SIZE    (1 * 8) + 1
889 #endif
890 struct vm_pageout_stat {
891 	unsigned long vm_page_active_count;
892 	unsigned long vm_page_speculative_count;
893 	unsigned long vm_page_inactive_count;
894 	unsigned long vm_page_anonymous_count;
895 
896 	unsigned long vm_page_free_count;
897 	unsigned long vm_page_wire_count;
898 	unsigned long vm_page_compressor_count;
899 
900 	unsigned long vm_page_pages_compressed;
901 	unsigned long vm_page_pageable_internal_count;
902 	unsigned long vm_page_pageable_external_count;
903 	unsigned long vm_page_xpmapped_external_count;
904 
905 	unsigned long vm_page_swapped_count;
906 	uint64_t swapouts;
907 	uint64_t swapins;
908 
909 	unsigned int pages_grabbed;
910 	unsigned int pages_freed;
911 
912 	unsigned int pages_compressed;
913 	unsigned int pages_grabbed_by_compressor;
914 	unsigned int failed_compressions;
915 
916 	unsigned int pages_evicted;
917 	unsigned int pages_purged;
918 
919 	unsigned int considered;
920 	unsigned int considered_bq_internal;
921 	unsigned int considered_bq_external;
922 
923 	unsigned int skipped_external;
924 	unsigned int skipped_internal;
925 	unsigned int filecache_min_reactivations;
926 
927 	unsigned int freed_speculative;
928 	unsigned int freed_cleaned;
929 	unsigned int freed_internal;
930 	unsigned int freed_external;
931 
932 	unsigned int cleaned_dirty_external;
933 	unsigned int cleaned_dirty_internal;
934 
935 	unsigned int inactive_referenced;
936 	unsigned int inactive_nolock;
937 	unsigned int reactivation_limit_exceeded;
938 	unsigned int forced_inactive_reclaim;
939 
940 	unsigned int throttled_internal_q;
941 	unsigned int throttled_external_q;
942 
943 	unsigned int phantom_ghosts_found;
944 	unsigned int phantom_ghosts_added;
945 
946 	unsigned int vm_page_realtime_count;
947 	unsigned int forcereclaimed_sharedcache;
948 	unsigned int forcereclaimed_realtime;
949 	unsigned int protected_sharedcache;
950 	unsigned int protected_realtime;
951 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
952 
953 unsigned int vm_pageout_stat_now = 0;
954 
955 #define VM_PAGEOUT_STAT_BEFORE(i) \
956 	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
957 #define VM_PAGEOUT_STAT_AFTER(i) \
958 	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
959 
960 #if VM_PAGE_BUCKETS_CHECK
961 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
962 #endif /* VM_PAGE_BUCKETS_CHECK */
963 
964 
965 void
966 record_memory_pressure(void);
967 void
968 record_memory_pressure(void)
969 {
970 	unsigned int vm_pageout_next;
971 
972 #if VM_PAGE_BUCKETS_CHECK
973 	/* check the consistency of VM page buckets at regular interval */
974 	static int counter = 0;
975 	if ((++counter % vm_page_buckets_check_interval) == 0) {
976 		vm_page_buckets_check();
977 	}
978 #endif /* VM_PAGE_BUCKETS_CHECK */
979 
980 	vm_pageout_state.vm_memory_pressure =
981 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
982 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
983 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
984 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
985 
986 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
987 
988 	/* move "now" forward */
989 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
990 
991 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
992 
993 	vm_pageout_stat_now = vm_pageout_next;
994 }
995 
996 
997 /*
998  * IMPORTANT
999  * mach_vm_ctl_page_free_wanted() is called indirectly, via
1000  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1001  * it must be safe in the restricted stackshot context. Locks and/or
1002  * blocking are not allowable.
1003  */
1004 unsigned int
1005 mach_vm_ctl_page_free_wanted(void)
1006 {
1007 	unsigned int page_free_target, page_free_count, page_free_wanted;
1008 
1009 	page_free_target = vm_page_free_target;
1010 	page_free_count = vm_page_free_count;
1011 	if (page_free_target > page_free_count) {
1012 		page_free_wanted = page_free_target - page_free_count;
1013 	} else {
1014 		page_free_wanted = 0;
1015 	}
1016 
1017 	return page_free_wanted;
1018 }
1019 
1020 
1021 /*
1022  * IMPORTANT:
1023  * mach_vm_pressure_monitor() is called when taking a stackshot, with
1024  * wait_for_pressure FALSE, so that code path must remain safe in the
1025  * restricted stackshot context. No blocking or locks are allowable.
1026  * on that code path.
1027  */
1028 
1029 kern_return_t
1030 mach_vm_pressure_monitor(
1031 	boolean_t       wait_for_pressure,
1032 	unsigned int    nsecs_monitored,
1033 	unsigned int    *pages_reclaimed_p,
1034 	unsigned int    *pages_wanted_p)
1035 {
1036 	wait_result_t   wr;
1037 	unsigned int    vm_pageout_then, vm_pageout_now;
1038 	unsigned int    pages_reclaimed;
1039 	unsigned int    units_of_monitor;
1040 
1041 	units_of_monitor = 8 * nsecs_monitored;
1042 	/*
1043 	 * We don't take the vm_page_queue_lock here because we don't want
1044 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1045 	 * thread when it's trying to reclaim memory.  We don't need fully
1046 	 * accurate monitoring anyway...
1047 	 */
1048 
1049 	if (wait_for_pressure) {
1050 		/* wait until there's memory pressure */
1051 		while (vm_page_free_count >= vm_page_free_target) {
1052 			wr = assert_wait((event_t) &vm_page_free_wanted,
1053 			    THREAD_INTERRUPTIBLE);
1054 			if (wr == THREAD_WAITING) {
1055 				wr = thread_block(THREAD_CONTINUE_NULL);
1056 			}
1057 			if (wr == THREAD_INTERRUPTED) {
1058 				return KERN_ABORTED;
1059 			}
1060 			if (wr == THREAD_AWAKENED) {
1061 				/*
1062 				 * The memory pressure might have already
1063 				 * been relieved but let's not block again
1064 				 * and let's report that there was memory
1065 				 * pressure at some point.
1066 				 */
1067 				break;
1068 			}
1069 		}
1070 	}
1071 
1072 	/* provide the number of pages the system wants to reclaim */
1073 	if (pages_wanted_p != NULL) {
1074 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1075 	}
1076 
1077 	if (pages_reclaimed_p == NULL) {
1078 		return KERN_SUCCESS;
1079 	}
1080 
1081 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1082 	vm_pageout_now = vm_pageout_stat_now;
1083 	pages_reclaimed = 0;
1084 	for (vm_pageout_then =
1085 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1086 	    vm_pageout_then != vm_pageout_now &&
1087 	    units_of_monitor-- != 0;
1088 	    vm_pageout_then =
1089 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1090 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1091 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1092 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1093 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1094 	}
1095 	*pages_reclaimed_p = pages_reclaimed;
1096 
1097 	return KERN_SUCCESS;
1098 }
1099 
1100 
1101 
1102 #if DEVELOPMENT || DEBUG
1103 
1104 static void
1105 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1106 
1107 /*
1108  * condition variable used to make sure there is
1109  * only a single sweep going on at a time
1110  */
1111 bool vm_pageout_disconnect_all_pages_active = false;
1112 
1113 void
1114 vm_pageout_disconnect_all_pages()
1115 {
1116 	vm_page_lock_queues();
1117 
1118 	if (vm_pageout_disconnect_all_pages_active) {
1119 		vm_page_unlock_queues();
1120 		return;
1121 	}
1122 	vm_pageout_disconnect_all_pages_active = true;
1123 
1124 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1125 	    vm_page_throttled_count);
1126 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1127 	    vm_page_anonymous_count);
1128 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1129 	    (vm_page_inactive_count - vm_page_anonymous_count));
1130 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1131 	    vm_page_active_count);
1132 #ifdef CONFIG_SECLUDED_MEMORY
1133 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1134 	    vm_page_secluded_count);
1135 #endif /* CONFIG_SECLUDED_MEMORY */
1136 	vm_page_unlock_queues();
1137 
1138 	vm_pageout_disconnect_all_pages_active = false;
1139 }
1140 
1141 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1142 void
1143 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1144 {
1145 	vm_page_t       m;
1146 	vm_object_t     t_object = NULL;
1147 	vm_object_t     l_object = NULL;
1148 	vm_object_t     m_object = NULL;
1149 	int             delayed_unlock = 0;
1150 	int             try_failed_count = 0;
1151 	int             disconnected_count = 0;
1152 	int             paused_count = 0;
1153 	int             object_locked_count = 0;
1154 
1155 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1156 	    DBG_FUNC_START),
1157 	    q, qcount);
1158 
1159 	while (qcount && !vm_page_queue_empty(q)) {
1160 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1161 
1162 		m = (vm_page_t) vm_page_queue_first(q);
1163 		m_object = VM_PAGE_OBJECT(m);
1164 
1165 		if (m_object == VM_OBJECT_NULL) {
1166 			/*
1167 			 * Bumped into a free page. This should only happen on the
1168 			 * secluded queue
1169 			 */
1170 #if CONFIG_SECLUDED_MEMORY
1171 			assert(q == &vm_page_queue_secluded);
1172 #endif /* CONFIG_SECLUDED_MEMORY */
1173 			goto reenter_pg_on_q;
1174 		}
1175 
1176 		/*
1177 		 * check to see if we currently are working
1178 		 * with the same object... if so, we've
1179 		 * already got the lock
1180 		 */
1181 		if (m_object != l_object) {
1182 			/*
1183 			 * the object associated with candidate page is
1184 			 * different from the one we were just working
1185 			 * with... dump the lock if we still own it
1186 			 */
1187 			if (l_object != NULL) {
1188 				vm_object_unlock(l_object);
1189 				l_object = NULL;
1190 			}
1191 			if (m_object != t_object) {
1192 				try_failed_count = 0;
1193 			}
1194 
1195 			/*
1196 			 * Try to lock object; since we've alread got the
1197 			 * page queues lock, we can only 'try' for this one.
1198 			 * if the 'try' fails, we need to do a mutex_pause
1199 			 * to allow the owner of the object lock a chance to
1200 			 * run...
1201 			 */
1202 			if (!vm_object_lock_try_scan(m_object)) {
1203 				if (try_failed_count > 20) {
1204 					goto reenter_pg_on_q;
1205 				}
1206 				vm_page_unlock_queues();
1207 				mutex_pause(try_failed_count++);
1208 				vm_page_lock_queues();
1209 				delayed_unlock = 0;
1210 
1211 				paused_count++;
1212 
1213 				t_object = m_object;
1214 				continue;
1215 			}
1216 			object_locked_count++;
1217 
1218 			l_object = m_object;
1219 		}
1220 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1221 		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1222 		    m->vmp_free_when_done) {
1223 			/*
1224 			 * put it back on the head of its queue
1225 			 */
1226 			goto reenter_pg_on_q;
1227 		}
1228 		if (m->vmp_pmapped == TRUE) {
1229 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1230 
1231 			disconnected_count++;
1232 		}
1233 reenter_pg_on_q:
1234 		vm_page_queue_remove(q, m, vmp_pageq);
1235 		vm_page_queue_enter(q, m, vmp_pageq);
1236 
1237 		qcount--;
1238 		try_failed_count = 0;
1239 
1240 		if (delayed_unlock++ > 128) {
1241 			if (l_object != NULL) {
1242 				vm_object_unlock(l_object);
1243 				l_object = NULL;
1244 			}
1245 			lck_mtx_yield(&vm_page_queue_lock);
1246 			delayed_unlock = 0;
1247 		}
1248 	}
1249 	if (l_object != NULL) {
1250 		vm_object_unlock(l_object);
1251 		l_object = NULL;
1252 	}
1253 
1254 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1255 	    DBG_FUNC_END),
1256 	    q, disconnected_count, object_locked_count, paused_count);
1257 }
1258 
1259 extern const char *proc_best_name(struct proc* proc);
1260 
1261 int
1262 vm_toggle_task_selfdonate_pages(task_t task)
1263 {
1264 	int state = 0;
1265 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1266 		printf("VM Donation mode is OFF on the system\n");
1267 		return state;
1268 	}
1269 	if (task != kernel_task) {
1270 		task_lock(task);
1271 		if (!task->donates_own_pages) {
1272 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1273 			task->donates_own_pages = true;
1274 			state = 1;
1275 		} else if (task->donates_own_pages) {
1276 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1277 			task->donates_own_pages = false;
1278 			state = 0;
1279 		}
1280 		task_unlock(task);
1281 	}
1282 	return state;
1283 }
1284 #endif /* DEVELOPMENT || DEBUG */
1285 
1286 void
1287 vm_task_set_selfdonate_pages(task_t task, bool donate)
1288 {
1289 	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1290 	assert(task != kernel_task);
1291 
1292 	task_lock(task);
1293 	task->donates_own_pages = donate;
1294 	task_unlock(task);
1295 }
1296 
1297 
1298 
1299 static size_t
1300 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1301 
1302 /*
1303  * condition variable used to make sure there is
1304  * only a single sweep going on at a time
1305  */
1306 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1307 
1308 
1309 kern_return_t
1310 vm_pageout_anonymous_pages()
1311 {
1312 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1313 		size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1314 		vm_page_lock_queues();
1315 
1316 		if (vm_pageout_anonymous_pages_active == TRUE) {
1317 			vm_page_unlock_queues();
1318 			return KERN_RESOURCE_SHORTAGE;
1319 		}
1320 		vm_pageout_anonymous_pages_active = TRUE;
1321 		vm_page_unlock_queues();
1322 
1323 		throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1324 		anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1325 		active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1326 
1327 		os_log(OS_LOG_DEFAULT,
1328 		    "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1329 		    __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1330 
1331 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1332 			vm_consider_swapping();
1333 		}
1334 
1335 		vm_page_lock_queues();
1336 		vm_pageout_anonymous_pages_active = FALSE;
1337 		vm_page_unlock_queues();
1338 		return KERN_SUCCESS;
1339 	} else {
1340 		return KERN_NOT_SUPPORTED;
1341 	}
1342 }
1343 
1344 
1345 size_t
1346 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1347 {
1348 	vm_page_t       m;
1349 	vm_object_t     t_object = NULL;
1350 	vm_object_t     l_object = NULL;
1351 	vm_object_t     m_object = NULL;
1352 	int             delayed_unlock = 0;
1353 	int             try_failed_count = 0;
1354 	int             refmod_state;
1355 	int             pmap_options;
1356 	struct          vm_pageout_queue *iq;
1357 	ppnum_t         phys_page;
1358 	size_t          pages_moved = 0;
1359 
1360 
1361 	iq = &vm_pageout_queue_internal;
1362 
1363 	vm_page_lock_queues();
1364 
1365 #if DEVELOPMENT || DEBUG
1366 	if (perf_test) {
1367 		iq = &vm_pageout_queue_benchmark;
1368 		// ensure the benchmark queue isn't throttled
1369 		iq->pgo_maxlaundry = (unsigned int) qcount;
1370 	}
1371 #endif /* DEVELOPMENT ||DEBUG */
1372 
1373 	while (qcount && !vm_page_queue_empty(q)) {
1374 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1375 
1376 		if (VM_PAGE_Q_THROTTLED(iq)) {
1377 			if (l_object != NULL) {
1378 				vm_object_unlock(l_object);
1379 				l_object = NULL;
1380 			}
1381 			iq->pgo_draining = TRUE;
1382 
1383 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1384 			vm_page_unlock_queues();
1385 
1386 			thread_block(THREAD_CONTINUE_NULL);
1387 
1388 			vm_page_lock_queues();
1389 			delayed_unlock = 0;
1390 			continue;
1391 		}
1392 		m = (vm_page_t) vm_page_queue_first(q);
1393 		m_object = VM_PAGE_OBJECT(m);
1394 
1395 		/*
1396 		 * check to see if we currently are working
1397 		 * with the same object... if so, we've
1398 		 * already got the lock
1399 		 */
1400 		if (m_object != l_object) {
1401 			if (!m_object->internal) {
1402 				goto reenter_pg_on_q;
1403 			}
1404 
1405 			/*
1406 			 * the object associated with candidate page is
1407 			 * different from the one we were just working
1408 			 * with... dump the lock if we still own it
1409 			 */
1410 			if (l_object != NULL) {
1411 				vm_object_unlock(l_object);
1412 				l_object = NULL;
1413 			}
1414 			if (m_object != t_object) {
1415 				try_failed_count = 0;
1416 			}
1417 
1418 			/*
1419 			 * Try to lock object; since we've alread got the
1420 			 * page queues lock, we can only 'try' for this one.
1421 			 * if the 'try' fails, we need to do a mutex_pause
1422 			 * to allow the owner of the object lock a chance to
1423 			 * run...
1424 			 */
1425 			if (!vm_object_lock_try_scan(m_object)) {
1426 				if (try_failed_count > 20) {
1427 					goto reenter_pg_on_q;
1428 				}
1429 				vm_page_unlock_queues();
1430 				mutex_pause(try_failed_count++);
1431 				vm_page_lock_queues();
1432 				delayed_unlock = 0;
1433 
1434 				t_object = m_object;
1435 				continue;
1436 			}
1437 			l_object = m_object;
1438 		}
1439 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1440 			/*
1441 			 * page is not to be cleaned
1442 			 * put it back on the head of its queue
1443 			 */
1444 			goto reenter_pg_on_q;
1445 		}
1446 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1447 
1448 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1449 			refmod_state = pmap_get_refmod(phys_page);
1450 
1451 			if (refmod_state & VM_MEM_REFERENCED) {
1452 				m->vmp_reference = TRUE;
1453 			}
1454 			if (refmod_state & VM_MEM_MODIFIED) {
1455 				SET_PAGE_DIRTY(m, FALSE);
1456 			}
1457 		}
1458 		if (m->vmp_reference == TRUE) {
1459 			m->vmp_reference = FALSE;
1460 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1461 			goto reenter_pg_on_q;
1462 		}
1463 		if (m->vmp_pmapped == TRUE) {
1464 			if (m->vmp_dirty || m->vmp_precious) {
1465 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1466 			} else {
1467 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1468 			}
1469 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1470 			if (refmod_state & VM_MEM_MODIFIED) {
1471 				SET_PAGE_DIRTY(m, FALSE);
1472 			}
1473 		}
1474 
1475 		if (!m->vmp_dirty && !m->vmp_precious) {
1476 			vm_page_unlock_queues();
1477 			VM_PAGE_FREE(m);
1478 			vm_page_lock_queues();
1479 			delayed_unlock = 0;
1480 
1481 			goto next_pg;
1482 		}
1483 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1484 			if (!m_object->pager_initialized) {
1485 				vm_page_unlock_queues();
1486 
1487 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1488 
1489 				if (!m_object->pager_initialized) {
1490 					vm_object_compressor_pager_create(m_object);
1491 				}
1492 
1493 				vm_page_lock_queues();
1494 				delayed_unlock = 0;
1495 			}
1496 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1497 				/*
1498 				 * We dropped the page queues lock above, so
1499 				 * "m" might no longer be on this queue...
1500 				 */
1501 				if (m != (vm_page_t) vm_page_queue_first(q)) {
1502 					continue;
1503 				}
1504 				goto reenter_pg_on_q;
1505 			}
1506 			/*
1507 			 * vm_object_compressor_pager_create will drop the object lock
1508 			 * which means 'm' may no longer be valid to use
1509 			 */
1510 			continue;
1511 		}
1512 
1513 		if (!perf_test) {
1514 			/*
1515 			 * we've already factored out pages in the laundry which
1516 			 * means this page can't be on the pageout queue so it's
1517 			 * safe to do the vm_page_queues_remove
1518 			 */
1519 			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1520 			vm_page_queues_remove(m, TRUE);
1521 			if (donate) {
1522 				/*
1523 				 * The compressor needs to see this bit to know
1524 				 * where this page needs to land. Also if stolen,
1525 				 * this bit helps put the page back in the right
1526 				 * special queue where it belongs.
1527 				 */
1528 				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1529 			}
1530 		} else {
1531 			vm_page_queue_remove(q, m, vmp_pageq);
1532 		}
1533 
1534 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1535 
1536 		vm_pageout_cluster_to_queue(m, iq);
1537 
1538 		pages_moved++;
1539 		goto next_pg;
1540 
1541 reenter_pg_on_q:
1542 		vm_page_queue_remove(q, m, vmp_pageq);
1543 		vm_page_queue_enter(q, m, vmp_pageq);
1544 next_pg:
1545 		qcount--;
1546 		try_failed_count = 0;
1547 
1548 		if (delayed_unlock++ > 128) {
1549 			if (l_object != NULL) {
1550 				vm_object_unlock(l_object);
1551 				l_object = NULL;
1552 			}
1553 			lck_mtx_yield(&vm_page_queue_lock);
1554 			delayed_unlock = 0;
1555 		}
1556 	}
1557 	if (l_object != NULL) {
1558 		vm_object_unlock(l_object);
1559 		l_object = NULL;
1560 	}
1561 	vm_page_unlock_queues();
1562 	return pages_moved;
1563 }
1564 
1565 
1566 
1567 /*
1568  * function in BSD to apply I/O throttle to the pageout thread
1569  */
1570 extern void vm_pageout_io_throttle(void);
1571 
1572 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
1573 	MACRO_BEGIN                                                     \
1574 	/* \
1575 	 * If a "reusable" page somehow made it back into \
1576 	 * the active queue, it's been re-used and is not \
1577 	 * quite re-usable. \
1578 	 * If the VM object was "all_reusable", consider it \
1579 	 * as "all re-used" instead of converting it to \
1580 	 * "partially re-used", which could be expensive. \
1581 	 */                                                             \
1582 	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
1583 	if ((m)->vmp_reusable ||                                        \
1584 	    (obj)->all_reusable) {                                      \
1585 	        vm_object_reuse_pages((obj),                            \
1586 	                              (m)->vmp_offset,                  \
1587 	                              (m)->vmp_offset + PAGE_SIZE_64,   \
1588 	                              FALSE);                           \
1589 	}                                                               \
1590 	MACRO_END
1591 
1592 
1593 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1594 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1595 
1596 #define FCS_IDLE                0
1597 #define FCS_DELAYED             1
1598 #define FCS_DEADLOCK_DETECTED   2
1599 
1600 struct flow_control {
1601 	int             state;
1602 	mach_timespec_t ts;
1603 };
1604 
1605 
1606 uint64_t vm_pageout_rejected_bq_internal = 0;
1607 uint64_t vm_pageout_rejected_bq_external = 0;
1608 uint64_t vm_pageout_skipped_bq_internal = 0;
1609 uint64_t vm_pageout_skipped_bq_external = 0;
1610 
1611 #define ANONS_GRABBED_LIMIT     2
1612 
1613 
1614 #if 0
1615 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1616 #endif
1617 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1618 
1619 #define VM_PAGEOUT_PB_NO_ACTION                         0
1620 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1621 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1622 
1623 
1624 #if 0
1625 static void
1626 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1627 {
1628 	if (*local_freeq) {
1629 		vm_page_unlock_queues();
1630 
1631 		VM_DEBUG_CONSTANT_EVENT(
1632 			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1633 			vm_page_free_count, 0, 0, 1);
1634 
1635 		vm_page_free_list(*local_freeq, TRUE);
1636 
1637 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1638 		    vm_page_free_count, *local_freed, 0, 1);
1639 
1640 		*local_freeq = NULL;
1641 		*local_freed = 0;
1642 
1643 		vm_page_lock_queues();
1644 	} else {
1645 		lck_mtx_yield(&vm_page_queue_lock);
1646 	}
1647 	*delayed_unlock = 1;
1648 }
1649 #endif
1650 
1651 
1652 static void
1653 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1654     vm_page_t *local_freeq, int *local_freed, int action)
1655 {
1656 	vm_page_unlock_queues();
1657 
1658 	if (*object != NULL) {
1659 		vm_object_unlock(*object);
1660 		*object = NULL;
1661 	}
1662 	if (*local_freeq) {
1663 		vm_page_free_list(*local_freeq, TRUE);
1664 
1665 		*local_freeq = NULL;
1666 		*local_freed = 0;
1667 	}
1668 	*delayed_unlock = 1;
1669 
1670 	switch (action) {
1671 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1672 		vm_consider_waking_compactor_swapper();
1673 		break;
1674 	case VM_PAGEOUT_PB_THREAD_YIELD:
1675 		thread_yield_internal(1);
1676 		break;
1677 	case VM_PAGEOUT_PB_NO_ACTION:
1678 	default:
1679 		break;
1680 	}
1681 	vm_page_lock_queues();
1682 }
1683 
1684 
1685 static struct vm_pageout_vminfo last;
1686 static uint64_t last_swapouts;
1687 static uint64_t last_swapins;
1688 
1689 uint64_t last_vm_page_pages_grabbed = 0;
1690 
1691 extern  uint32_t c_segment_pages_compressed;
1692 
1693 extern uint64_t shared_region_pager_reclaimed;
1694 extern struct memory_object_pager_ops shared_region_pager_ops;
1695 
1696 void
1697 update_vm_info(void)
1698 {
1699 	unsigned long tmp;
1700 	uint64_t tmp64;
1701 
1702 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1703 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1704 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1705 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1706 
1707 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1708 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1709 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1710 	vm_pageout_stats[vm_pageout_stat_now].vm_page_swapped_count = os_atomic_load(&vm_page_swapped_count, relaxed);
1711 
1712 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1713 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1714 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1715 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1716 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1717 
1718 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1719 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1720 	last.vm_pageout_considered_page = tmp;
1721 
1722 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1723 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1724 	last.vm_pageout_compressions = tmp64;
1725 
1726 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1727 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1728 	last.vm_compressor_failed = tmp;
1729 
1730 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1731 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1732 	last.vm_compressor_pages_grabbed = tmp64;
1733 
1734 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1735 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1736 	last.vm_phantom_cache_found_ghost = tmp;
1737 
1738 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1739 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1740 	last.vm_phantom_cache_added_ghost = tmp;
1741 
1742 	tmp64 = counter_load(&vm_page_grab_count);
1743 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1744 	last_vm_page_pages_grabbed = tmp64;
1745 
1746 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1747 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1748 	last.vm_page_pages_freed = tmp;
1749 
1750 	tmp64 = counter_load(&vm_statistics_swapouts);
1751 	vm_pageout_stats[vm_pageout_stat_now].swapouts = tmp64 - last_swapouts;
1752 	last_swapouts = tmp64;
1753 
1754 	tmp64 = counter_load(&vm_statistics_swapins);
1755 	vm_pageout_stats[vm_pageout_stat_now].swapins = tmp64 - last_swapins;
1756 	last_swapins = tmp64;
1757 
1758 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1759 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1760 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1761 		last.vm_pageout_pages_evicted = tmp;
1762 
1763 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1764 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1765 		last.vm_pageout_pages_purged = tmp;
1766 
1767 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1768 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1769 		last.vm_pageout_freed_speculative = tmp;
1770 
1771 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1772 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1773 		last.vm_pageout_freed_external = tmp;
1774 
1775 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1776 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1777 		last.vm_pageout_inactive_referenced = tmp;
1778 
1779 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1780 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1781 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1782 
1783 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1784 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1785 		last.vm_pageout_inactive_dirty_external = tmp;
1786 
1787 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1788 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1789 		last.vm_pageout_freed_cleaned = tmp;
1790 
1791 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1792 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1793 		last.vm_pageout_inactive_nolock = tmp;
1794 
1795 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1796 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1797 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1798 
1799 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1800 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1801 		last.vm_pageout_skipped_external = tmp;
1802 
1803 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1804 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1805 		last.vm_pageout_skipped_internal = tmp;
1806 
1807 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1808 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1809 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1810 
1811 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1812 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1813 		last.vm_pageout_inactive_force_reclaim = tmp;
1814 
1815 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1816 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1817 		last.vm_pageout_freed_internal = tmp;
1818 
1819 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1820 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1821 		last.vm_pageout_considered_bq_internal = tmp;
1822 
1823 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1824 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1825 		last.vm_pageout_considered_bq_external = tmp;
1826 
1827 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1828 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1829 		last.vm_pageout_filecache_min_reactivated = tmp;
1830 
1831 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1832 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1833 		last.vm_pageout_inactive_dirty_internal = tmp;
1834 
1835 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1836 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1837 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1838 
1839 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1840 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1841 		last.vm_pageout_forcereclaimed_realtime = tmp;
1842 
1843 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1844 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1845 		last.vm_pageout_protected_sharedcache = tmp;
1846 
1847 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1848 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1849 		last.vm_pageout_protected_realtime = tmp;
1850 	}
1851 
1852 	KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT1) | DBG_FUNC_NONE,
1853 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1854 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1855 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1856 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1857 
1858 	KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT2) | DBG_FUNC_NONE,
1859 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1860 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1861 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1862 
1863 	KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT3) | DBG_FUNC_NONE,
1864 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1865 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1866 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1867 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1868 
1869 	KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT4) | DBG_FUNC_NONE,
1870 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_swapped_count);
1871 
1872 
1873 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1874 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1875 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1876 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT1) | DBG_FUNC_NONE,
1877 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1878 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1879 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1880 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1881 
1882 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT2) | DBG_FUNC_NONE,
1883 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1884 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1885 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1886 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1887 
1888 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT3) | DBG_FUNC_NONE,
1889 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1890 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1891 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1892 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1893 
1894 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT4) | DBG_FUNC_NONE,
1895 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1896 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1897 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1898 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1899 
1900 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT5) | DBG_FUNC_NONE,
1901 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1902 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1903 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1904 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1905 
1906 		KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT6) | DBG_FUNC_NONE,
1907 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1908 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1909 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1910 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1911 	}
1912 	KDBG(MEMINFO_CODE(DBG_MEMINFO_DEMAND1) | DBG_FUNC_NONE,
1913 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1914 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1915 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1916 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1917 
1918 	KDBG(MEMINFO_CODE(DBG_MEMINFO_DEMAND2) | DBG_FUNC_NONE,
1919 	    vm_pageout_stats[vm_pageout_stat_now].swapouts,
1920 	    vm_pageout_stats[vm_pageout_stat_now].swapins);
1921 
1922 	record_memory_pressure();
1923 }
1924 
1925 extern boolean_t hibernation_vmqueues_inspection;
1926 
1927 /*
1928  * Return values for functions called by vm_pageout_scan
1929  * that control its flow.
1930  *
1931  * PROCEED -- vm_pageout_scan will keep making forward progress.
1932  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1933  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1934  */
1935 
1936 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1937 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1938 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1939 
1940 /*
1941  * This function is called only from vm_pageout_scan and
1942  * it moves overflow secluded pages (one-at-a-time) to the
1943  * batched 'local' free Q or active Q.
1944  */
1945 static void
1946 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1947 {
1948 #if CONFIG_SECLUDED_MEMORY
1949 	/*
1950 	 * Deal with secluded_q overflow.
1951 	 */
1952 	if (vm_page_secluded_count > vm_page_secluded_target) {
1953 		vm_page_t secluded_page;
1954 
1955 		/*
1956 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1957 		 * Excess secluded pages go to the active queue and
1958 		 * will later go to the inactive queue.
1959 		 */
1960 		assert((vm_page_secluded_count_free +
1961 		    vm_page_secluded_count_inuse) ==
1962 		    vm_page_secluded_count);
1963 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1964 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1965 
1966 		vm_page_queues_remove(secluded_page, FALSE);
1967 		assert(!vm_page_is_fictitious(secluded_page));
1968 		assert(!VM_PAGE_WIRED(secluded_page));
1969 
1970 		if (secluded_page->vmp_object == 0) {
1971 			/* transfer to free queue */
1972 			assert(secluded_page->vmp_busy);
1973 			secluded_page->vmp_snext = *local_freeq;
1974 			*local_freeq = secluded_page;
1975 			*local_freed += 1;
1976 		} else {
1977 			/* transfer to head of active queue */
1978 			vm_page_enqueue_active(secluded_page, FALSE);
1979 			secluded_page = VM_PAGE_NULL;
1980 		}
1981 	}
1982 #else /* CONFIG_SECLUDED_MEMORY */
1983 
1984 #pragma unused(local_freeq)
1985 #pragma unused(local_freed)
1986 
1987 	return;
1988 
1989 #endif /* CONFIG_SECLUDED_MEMORY */
1990 }
1991 
1992 /*
1993  * This function is called only from vm_pageout_scan and
1994  * it initializes the loop targets for vm_pageout_scan().
1995  */
1996 static void
1997 vps_init_page_targets(void)
1998 {
1999 	/*
2000 	 * LD TODO: Other page targets should be calculated here too.
2001 	 */
2002 	vm_page_anonymous_min = vm_page_inactive_target / 20;
2003 
2004 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
2005 		vm_pageout_state.vm_page_speculative_percentage = 50;
2006 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
2007 		vm_pageout_state.vm_page_speculative_percentage = 1;
2008 	}
2009 
2010 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
2011 	    vm_page_inactive_count);
2012 }
2013 
2014 /*
2015  * This function is called only from vm_pageout_scan and
2016  * it purges a single VM object at-a-time and will either
2017  * make vm_pageout_scan() restart the loop or keeping moving forward.
2018  */
2019 static int
2020 vps_purge_object()
2021 {
2022 	int             force_purge;
2023 
2024 	assert(available_for_purge >= 0);
2025 	force_purge = 0; /* no force-purging */
2026 
2027 #if VM_PRESSURE_EVENTS
2028 	vm_pressure_level_t pressure_level;
2029 
2030 	pressure_level = memorystatus_vm_pressure_level;
2031 
2032 	if (pressure_level > kVMPressureNormal) {
2033 		if (pressure_level >= kVMPressureCritical) {
2034 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2035 		} else if (pressure_level >= kVMPressureUrgent) {
2036 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2037 		} else if (pressure_level >= kVMPressureWarning) {
2038 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2039 		}
2040 	}
2041 #endif /* VM_PRESSURE_EVENTS */
2042 
2043 	if (available_for_purge || force_purge) {
2044 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2045 
2046 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2047 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2048 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2049 			VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2050 			memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2051 
2052 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2053 		}
2054 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2055 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2056 	}
2057 
2058 	return VM_PAGEOUT_SCAN_PROCEED;
2059 }
2060 
2061 /*
2062  * This function is called only from vm_pageout_scan and
2063  * it will try to age the next speculative Q if the oldest
2064  * one is empty.
2065  */
2066 static int
2067 vps_age_speculative_queue(boolean_t force_speculative_aging)
2068 {
2069 #define DELAY_SPECULATIVE_AGE   1000
2070 
2071 	/*
2072 	 * try to pull pages from the aging bins...
2073 	 * see vm_page_internal.h for an explanation of how
2074 	 * this mechanism works
2075 	 */
2076 	boolean_t                       can_steal = FALSE;
2077 	int                             num_scanned_queues;
2078 	static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
2079 	mach_timespec_t                 ts;
2080 	struct vm_speculative_age_q     *aq;
2081 	struct vm_speculative_age_q     *sq;
2082 
2083 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2084 
2085 	aq = &vm_page_queue_speculative[speculative_steal_index];
2086 
2087 	num_scanned_queues = 0;
2088 	while (vm_page_queue_empty(&aq->age_q) &&
2089 	    num_scanned_queues++ != vm_page_max_speculative_age_q) {
2090 		speculative_steal_index++;
2091 
2092 		if (speculative_steal_index > vm_page_max_speculative_age_q) {
2093 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2094 		}
2095 
2096 		aq = &vm_page_queue_speculative[speculative_steal_index];
2097 	}
2098 
2099 	if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2100 		/*
2101 		 * XXX We've scanned all the speculative
2102 		 * queues but still haven't found one
2103 		 * that is not empty, even though
2104 		 * vm_page_speculative_count is not 0.
2105 		 */
2106 		if (!vm_page_queue_empty(&sq->age_q)) {
2107 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2108 		}
2109 #if DEVELOPMENT || DEBUG
2110 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2111 #endif
2112 		/* readjust... */
2113 		vm_page_speculative_count = 0;
2114 		/* ... and continue */
2115 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2116 	}
2117 
2118 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2119 		can_steal = TRUE;
2120 	} else {
2121 		if (!delay_speculative_age) {
2122 			mach_timespec_t ts_fully_aged;
2123 
2124 			ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2125 			ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2126 			    * 1000 * NSEC_PER_USEC;
2127 
2128 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2129 
2130 			clock_sec_t sec;
2131 			clock_nsec_t nsec;
2132 			clock_get_system_nanotime(&sec, &nsec);
2133 			ts.tv_sec = (unsigned int) sec;
2134 			ts.tv_nsec = nsec;
2135 
2136 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2137 				can_steal = TRUE;
2138 			} else {
2139 				delay_speculative_age++;
2140 			}
2141 		} else {
2142 			delay_speculative_age++;
2143 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2144 				delay_speculative_age = 0;
2145 			}
2146 		}
2147 	}
2148 	if (can_steal == TRUE) {
2149 		vm_page_speculate_ageit(aq);
2150 	}
2151 
2152 	return VM_PAGEOUT_SCAN_PROCEED;
2153 }
2154 
2155 /*
2156  * This function is called only from vm_pageout_scan and
2157  * it evicts a single VM object from the cache.
2158  */
2159 static int inline
2160 vps_object_cache_evict(vm_object_t *object_to_unlock)
2161 {
2162 	static int                      cache_evict_throttle = 0;
2163 	struct vm_speculative_age_q     *sq;
2164 
2165 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2166 
2167 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2168 		int     pages_evicted;
2169 
2170 		if (*object_to_unlock != NULL) {
2171 			vm_object_unlock(*object_to_unlock);
2172 			*object_to_unlock = NULL;
2173 		}
2174 		KDBG(0x13001ec | DBG_FUNC_START);
2175 
2176 		pages_evicted = vm_object_cache_evict(100, 10);
2177 
2178 		KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2179 
2180 		if (pages_evicted) {
2181 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2182 
2183 			VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2184 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2185 			memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2186 
2187 			/*
2188 			 * we just freed up to 100 pages,
2189 			 * so go back to the top of the main loop
2190 			 * and re-evaulate the memory situation
2191 			 */
2192 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2193 		} else {
2194 			cache_evict_throttle = 1000;
2195 		}
2196 	}
2197 	if (cache_evict_throttle) {
2198 		cache_evict_throttle--;
2199 	}
2200 
2201 	return VM_PAGEOUT_SCAN_PROCEED;
2202 }
2203 
2204 
2205 /*
2206  * This function is called only from vm_pageout_scan and
2207  * it calculates the filecache min. that needs to be maintained
2208  * as we start to steal pages.
2209  */
2210 static void
2211 vps_calculate_filecache_min(void)
2212 {
2213 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2214 
2215 #if CONFIG_JETSAM
2216 	/*
2217 	 * don't let the filecache_min fall below 15% of available memory
2218 	 * on systems with an active compressor that isn't nearing its
2219 	 * limits w/r to accepting new data
2220 	 *
2221 	 * on systems w/o the compressor/swapper, the filecache is always
2222 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2223 	 * since most (if not all) of the anonymous pages are in the
2224 	 * throttled queue (which isn't counted as available) which
2225 	 * effectively disables this filter
2226 	 */
2227 	if (vm_compressor_low_on_space() || divisor == 0) {
2228 		vm_pageout_state.vm_page_filecache_min = 0;
2229 	} else {
2230 		vm_pageout_state.vm_page_filecache_min =
2231 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2232 	}
2233 #else
2234 	if (vm_compressor_out_of_space() || divisor == 0) {
2235 		vm_pageout_state.vm_page_filecache_min = 0;
2236 	} else {
2237 		/*
2238 		 * don't let the filecache_min fall below the specified critical level
2239 		 */
2240 		vm_pageout_state.vm_page_filecache_min =
2241 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2242 	}
2243 #endif
2244 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2245 		vm_pageout_state.vm_page_filecache_min = 0;
2246 	}
2247 }
2248 
2249 /*
2250  * This function is called only from vm_pageout_scan and
2251  * it updates the flow control time to detect if VM pageoutscan
2252  * isn't making progress.
2253  */
2254 static void
2255 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2256 {
2257 	mach_timespec_t ts;
2258 	clock_sec_t sec;
2259 	clock_nsec_t nsec;
2260 
2261 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2262 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2263 	clock_get_system_nanotime(&sec, &nsec);
2264 	flow_control->ts.tv_sec = (unsigned int) sec;
2265 	flow_control->ts.tv_nsec = nsec;
2266 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2267 
2268 	flow_control->state = FCS_DELAYED;
2269 
2270 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2271 }
2272 
2273 /*
2274  * This function is called only from vm_pageout_scan and
2275  * it is the flow control logic of VM pageout scan which
2276  * controls if it should block and for how long.
2277  * Any blocking of vm_pageout_scan happens ONLY in this function.
2278  */
2279 static int
2280 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2281     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2282 {
2283 	boolean_t       exceeded_burst_throttle = FALSE;
2284 	unsigned int    msecs = 0;
2285 	uint32_t        inactive_external_count;
2286 	mach_timespec_t ts;
2287 	struct  vm_pageout_queue *iq;
2288 	struct  vm_pageout_queue *eq;
2289 	struct  vm_speculative_age_q *sq;
2290 
2291 	iq = &vm_pageout_queue_internal;
2292 	eq = &vm_pageout_queue_external;
2293 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2294 
2295 	/*
2296 	 * Sometimes we have to pause:
2297 	 *	1) No inactive pages - nothing to do.
2298 	 *	2) Loop control - no acceptable pages found on the inactive queue
2299 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2300 	 *	3) Flow control - default pageout queue is full
2301 	 */
2302 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2303 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2304 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2305 	    vm_page_queue_empty(&sq->age_q)) {
2306 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2307 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2308 	} else if (inactive_burst_count >=
2309 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2310 	    (vm_page_inactive_count +
2311 	    vm_page_speculative_count))) {
2312 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2313 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2314 
2315 		exceeded_burst_throttle = TRUE;
2316 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2317 	    VM_DYNAMIC_PAGING_ENABLED()) {
2318 		clock_sec_t sec;
2319 		clock_nsec_t nsec;
2320 
2321 		switch (flow_control->state) {
2322 		case FCS_IDLE:
2323 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2324 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2325 				/*
2326 				 * since the compressor is running independently of vm_pageout_scan
2327 				 * let's not wait for it just yet... as long as we have a healthy supply
2328 				 * of filecache pages to work with, let's keep stealing those.
2329 				 */
2330 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2331 
2332 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2333 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2334 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2335 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2336 					return VM_PAGEOUT_SCAN_PROCEED;
2337 				}
2338 			}
2339 
2340 			vps_flow_control_reset_deadlock_timer(flow_control);
2341 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2342 
2343 			break;
2344 
2345 		case FCS_DELAYED:
2346 			clock_get_system_nanotime(&sec, &nsec);
2347 			ts.tv_sec = (unsigned int) sec;
2348 			ts.tv_nsec = nsec;
2349 
2350 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2351 				/*
2352 				 * the pageout thread for the default pager is potentially
2353 				 * deadlocked since the
2354 				 * default pager queue has been throttled for more than the
2355 				 * allowable time... we need to move some clean pages or dirty
2356 				 * pages belonging to the external pagers if they aren't throttled
2357 				 * vm_page_free_wanted represents the number of threads currently
2358 				 * blocked waiting for pages... we'll move one page for each of
2359 				 * these plus a fixed amount to break the logjam... once we're done
2360 				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
2361 				 * with a new timeout target since we have no way of knowing
2362 				 * whether we've broken the deadlock except through observation
2363 				 * of the queue associated with the default pager... we need to
2364 				 * stop moving pages and allow the system to run to see what
2365 				 * state it settles into.
2366 				 */
2367 
2368 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2369 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2370 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2371 				flow_control->state = FCS_DEADLOCK_DETECTED;
2372 				sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
2373 				return VM_PAGEOUT_SCAN_PROCEED;
2374 			}
2375 			/*
2376 			 * just resniff instead of trying
2377 			 * to compute a new delay time... we're going to be
2378 			 * awakened immediately upon a laundry completion,
2379 			 * so we won't wait any longer than necessary
2380 			 */
2381 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2382 			break;
2383 
2384 		case FCS_DEADLOCK_DETECTED:
2385 			if (*vm_pageout_deadlock_target) {
2386 				return VM_PAGEOUT_SCAN_PROCEED;
2387 			}
2388 
2389 			vps_flow_control_reset_deadlock_timer(flow_control);
2390 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2391 
2392 			break;
2393 		}
2394 	} else {
2395 		/*
2396 		 * No need to pause...
2397 		 */
2398 		return VM_PAGEOUT_SCAN_PROCEED;
2399 	}
2400 
2401 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2402 
2403 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2404 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2405 
2406 	if (vm_page_free_count >= vm_page_free_target) {
2407 		/*
2408 		 * we're here because
2409 		 *  1) someone else freed up some pages while we had
2410 		 *     the queues unlocked above
2411 		 * and we've hit one of the 3 conditions that
2412 		 * cause us to pause the pageout scan thread
2413 		 *
2414 		 * since we already have enough free pages,
2415 		 * let's avoid stalling and return normally
2416 		 *
2417 		 * before we return, make sure the pageout I/O threads
2418 		 * are running throttled in case there are still requests
2419 		 * in the laundry... since we have enough free pages
2420 		 * we don't need the laundry to be cleaned in a timely
2421 		 * fashion... so let's avoid interfering with foreground
2422 		 * activity
2423 		 *
2424 		 * we don't want to hold vm_page_queue_free_lock when
2425 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2426 		 * may cause other locks to be taken), we do the intitial
2427 		 * check outside of the lock.  Once we take the lock,
2428 		 * we recheck the condition since it may have changed.
2429 		 * if it has, no problem, we will make the threads
2430 		 * non-throttled before actually blocking
2431 		 */
2432 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2433 	}
2434 	vm_free_page_lock();
2435 
2436 	if (vm_page_free_count >= vm_page_free_target &&
2437 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2438 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2439 	}
2440 	vm_free_page_unlock();
2441 
2442 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2443 		/*
2444 		 * we're most likely about to block due to one of
2445 		 * the 3 conditions that cause vm_pageout_scan to
2446 		 * not be able to make forward progress w/r
2447 		 * to providing new pages to the free queue,
2448 		 * so unthrottle the I/O threads in case we
2449 		 * have laundry to be cleaned... it needs
2450 		 * to be completed ASAP.
2451 		 *
2452 		 * even if we don't block, we want the io threads
2453 		 * running unthrottled since the sum of free +
2454 		 * clean pages is still under our free target
2455 		 */
2456 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2457 	}
2458 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2459 		/*
2460 		 * if we get here we're below our free target and
2461 		 * we're stalling due to a full laundry queue or
2462 		 * we don't have any inactive pages other then
2463 		 * those in the clean queue...
2464 		 * however, we have pages on the clean queue that
2465 		 * can be moved to the free queue, so let's not
2466 		 * stall the pageout scan
2467 		 */
2468 		flow_control->state = FCS_IDLE;
2469 		return VM_PAGEOUT_SCAN_PROCEED;
2470 	}
2471 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2472 		flow_control->state = FCS_IDLE;
2473 		return VM_PAGEOUT_SCAN_PROCEED;
2474 	}
2475 
2476 	VM_CHECK_MEMORYSTATUS;
2477 
2478 	if (flow_control->state != FCS_IDLE) {
2479 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2480 	}
2481 
2482 	iq->pgo_throttled = TRUE;
2483 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2484 
2485 	vm_page_unlock_queues();
2486 
2487 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2488 
2489 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2490 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2491 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2492 
2493 	thread_block(THREAD_CONTINUE_NULL);
2494 
2495 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2496 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2497 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2498 
2499 	vm_page_lock_queues();
2500 
2501 	iq->pgo_throttled = FALSE;
2502 
2503 	vps_init_page_targets();
2504 
2505 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2506 }
2507 
2508 extern boolean_t vm_darkwake_mode;
2509 /*
2510  * This function is called only from vm_pageout_scan and
2511  * it will find and return the most appropriate page to be
2512  * reclaimed.
2513  */
2514 static int
2515 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2516     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2517 {
2518 	vm_page_t                       m = NULL;
2519 	vm_object_t                     m_object = VM_OBJECT_NULL;
2520 	uint32_t                        inactive_external_count;
2521 	struct vm_speculative_age_q     *sq;
2522 	struct vm_pageout_queue         *iq;
2523 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2524 
2525 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2526 	iq = &vm_pageout_queue_internal;
2527 
2528 	*is_page_from_bg_q = FALSE;
2529 
2530 	m = NULL;
2531 	m_object = VM_OBJECT_NULL;
2532 
2533 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2534 		assert(vm_page_throttled_count == 0);
2535 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2536 	}
2537 
2538 	/*
2539 	 * Try for a clean-queue inactive page.
2540 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2541 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2542 	 */
2543 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2544 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2545 
2546 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2547 
2548 		goto found_page;
2549 	}
2550 
2551 	/*
2552 	 * The next most eligible pages are ones we paged in speculatively,
2553 	 * but which have not yet been touched and have been aged out.
2554 	 */
2555 	if (!vm_page_queue_empty(&sq->age_q)) {
2556 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2557 
2558 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2559 
2560 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2561 			goto found_page;
2562 		} else {
2563 			m = NULL;
2564 		}
2565 	}
2566 
2567 #if !CONFIG_JETSAM
2568 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2569 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2570 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2571 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2572 			goto found_page;
2573 		}
2574 	}
2575 #endif /* !CONFIG_JETSAM */
2576 
2577 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2578 		vm_object_t     bg_m_object = NULL;
2579 
2580 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2581 
2582 		bg_m_object = VM_PAGE_OBJECT(m);
2583 
2584 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2585 			/*
2586 			 * This page is on the background queue
2587 			 * but not on a pageable queue OR is busy during
2588 			 * darkwake mode when the target is artificially lowered.
2589 			 * If it is busy during darkwake mode, and we don't skip it,
2590 			 * we will just swing back around and try again with the same
2591 			 * queue and might hit the same page or its neighbor in a
2592 			 * similar state. Both of these are transient states and will
2593 			 * get resolved, but, at this point let's ignore this page.
2594 			 */
2595 			if (vm_darkwake_mode && m->vmp_busy) {
2596 				if (bg_m_object->internal) {
2597 					vm_pageout_skipped_bq_internal++;
2598 				} else {
2599 					vm_pageout_skipped_bq_external++;
2600 				}
2601 			}
2602 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2603 			if (bg_m_object->internal &&
2604 			    (VM_PAGE_Q_THROTTLED(iq) ||
2605 			    vm_compressor_out_of_space() == TRUE ||
2606 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2607 				vm_pageout_skipped_bq_internal++;
2608 			} else {
2609 				*is_page_from_bg_q = TRUE;
2610 
2611 				if (bg_m_object->internal) {
2612 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2613 				} else {
2614 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2615 				}
2616 				goto found_page;
2617 			}
2618 		}
2619 	}
2620 
2621 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2622 
2623 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2624 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2625 		*grab_anonymous = TRUE;
2626 		*anons_grabbed = 0;
2627 
2628 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2629 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2630 		} else {
2631 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2632 				/*
2633 				 * No swap and we are in dangerously low levels of free memory.
2634 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2635 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2636 				 *
2637 				 * So, pick a file backed page...
2638 				 */
2639 				*grab_anonymous = FALSE;
2640 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2641 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2642 			}
2643 		}
2644 		goto want_anonymous;
2645 	}
2646 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2647 
2648 #if CONFIG_JETSAM
2649 	/* If the file-backed pool has accumulated
2650 	 * significantly more pages than the jetsam
2651 	 * threshold, prefer to reclaim those
2652 	 * inline to minimise compute overhead of reclaiming
2653 	 * anonymous pages.
2654 	 * This calculation does not account for the CPU local
2655 	 * external page queues, as those are expected to be
2656 	 * much smaller relative to the global pools.
2657 	 */
2658 
2659 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2660 
2661 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2662 		if (vm_page_pageable_external_count >
2663 		    vm_pageout_state.vm_page_filecache_min) {
2664 			if ((vm_page_pageable_external_count *
2665 			    vm_pageout_memorystatus_fb_factor_dr) >
2666 			    (memorystatus_get_critical_page_shortage_threshold() *
2667 			    vm_pageout_memorystatus_fb_factor_nr)) {
2668 				*grab_anonymous = FALSE;
2669 
2670 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2671 			}
2672 		}
2673 		if (*grab_anonymous) {
2674 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2675 		}
2676 	}
2677 #endif /* CONFIG_JETSAM */
2678 
2679 want_anonymous:
2680 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2681 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2682 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2683 
2684 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2685 			*anons_grabbed = 0;
2686 
2687 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2688 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2689 					if ((++(*reactivated_this_call) % 100)) {
2690 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2691 
2692 						vm_page_activate(m);
2693 						counter_inc(&vm_statistics_reactivations);
2694 #if DEVELOPMENT || DEBUG
2695 						if (*is_page_from_bg_q == TRUE) {
2696 							if (m_object->internal) {
2697 								vm_pageout_rejected_bq_internal++;
2698 							} else {
2699 								vm_pageout_rejected_bq_external++;
2700 							}
2701 						}
2702 #endif /* DEVELOPMENT || DEBUG */
2703 						vm_pageout_state.vm_pageout_inactive_used++;
2704 
2705 						m = NULL;
2706 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2707 
2708 						goto found_page;
2709 					}
2710 
2711 					/*
2712 					 * steal 1 of the file backed pages even if
2713 					 * we are under the limit that has been set
2714 					 * for a healthy filecache
2715 					 */
2716 				}
2717 			}
2718 			goto found_page;
2719 		}
2720 	}
2721 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2722 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2723 
2724 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2725 		*anons_grabbed += 1;
2726 
2727 		goto found_page;
2728 	}
2729 
2730 	m = NULL;
2731 
2732 found_page:
2733 	*victim_page = m;
2734 
2735 	return retval;
2736 }
2737 
2738 /*
2739  * This function is called only from vm_pageout_scan and
2740  * it will put a page back on the active/inactive queue
2741  * if we can't reclaim it for some reason.
2742  */
2743 static void
2744 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2745 {
2746 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2747 		vm_page_enqueue_inactive(m, FALSE);
2748 	} else {
2749 		vm_page_activate(m);
2750 	}
2751 
2752 #if DEVELOPMENT || DEBUG
2753 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2754 
2755 	if (page_from_bg_q == TRUE) {
2756 		if (m_object->internal) {
2757 			vm_pageout_rejected_bq_internal++;
2758 		} else {
2759 			vm_pageout_rejected_bq_external++;
2760 		}
2761 	}
2762 #endif /* DEVELOPMENT || DEBUG */
2763 }
2764 
2765 /*
2766  * This function is called only from vm_pageout_scan and
2767  * it will try to grab the victim page's VM object (m_object)
2768  * which differs from the previous victim page's object (object).
2769  */
2770 static int
2771 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2772 {
2773 	struct vm_speculative_age_q *sq;
2774 
2775 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2776 
2777 	/*
2778 	 * the object associated with candidate page is
2779 	 * different from the one we were just working
2780 	 * with... dump the lock if we still own it
2781 	 */
2782 	if (*object != NULL) {
2783 		vm_object_unlock(*object);
2784 		*object = NULL;
2785 	}
2786 	/*
2787 	 * Try to lock object; since we've alread got the
2788 	 * page queues lock, we can only 'try' for this one.
2789 	 * if the 'try' fails, we need to do a mutex_pause
2790 	 * to allow the owner of the object lock a chance to
2791 	 * run... otherwise, we're likely to trip over this
2792 	 * object in the same state as we work our way through
2793 	 * the queue... clumps of pages associated with the same
2794 	 * object are fairly typical on the inactive and active queues
2795 	 */
2796 	if (!vm_object_lock_try_scan(m_object)) {
2797 		vm_page_t m_want = NULL;
2798 
2799 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2800 
2801 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2802 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2803 		}
2804 
2805 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2806 
2807 		m->vmp_reference = FALSE;
2808 
2809 		if (!m_object->object_is_shared_cache) {
2810 			/*
2811 			 * don't apply this optimization if this is the shared cache
2812 			 * object, it's too easy to get rid of very hot and important
2813 			 * pages...
2814 			 * m->vmp_object must be stable since we hold the page queues lock...
2815 			 * we can update the scan_collisions field sans the object lock
2816 			 * since it is a separate field and this is the only spot that does
2817 			 * a read-modify-write operation and it is never executed concurrently...
2818 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2819 			 * is possible for the value to be a bit non-determistic, but that's ok
2820 			 * since it's only used as a hint
2821 			 */
2822 			m_object->scan_collisions = 1;
2823 		}
2824 		if (page_from_bg_q) {
2825 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2826 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2827 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2828 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2829 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2830 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2831 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2832 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2833 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2834 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2835 		}
2836 
2837 		/*
2838 		 * this is the next object we're going to be interested in
2839 		 * try to make sure its available after the mutex_pause
2840 		 * returns control
2841 		 */
2842 		if (m_want) {
2843 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2844 		}
2845 
2846 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2847 
2848 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2849 	} else {
2850 		*object = m_object;
2851 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2852 	}
2853 
2854 	return VM_PAGEOUT_SCAN_PROCEED;
2855 }
2856 
2857 /*
2858  * This function is called only from vm_pageout_scan and
2859  * it notices that pageout scan may be rendered ineffective
2860  * due to a FS deadlock and will jetsam a process if possible.
2861  * If jetsam isn't supported, it'll move the page to the active
2862  * queue to try and get some different pages pushed onwards so
2863  * we can try to get out of this scenario.
2864  */
2865 static void
2866 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2867     boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2868 {
2869 	struct  vm_pageout_queue *eq;
2870 	vm_object_t cur_object = VM_OBJECT_NULL;
2871 
2872 	cur_object = *object;
2873 
2874 	eq = &vm_pageout_queue_external;
2875 
2876 	if (cur_object->internal == FALSE) {
2877 		/*
2878 		 * we need to break up the following potential deadlock case...
2879 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2880 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2881 		 *  c) Most of the pages in the inactive queue belong to this file.
2882 		 *
2883 		 * we are potentially in this deadlock because...
2884 		 *  a) the external pageout queue is throttled
2885 		 *  b) we're done with the active queue and moved on to the inactive queue
2886 		 *  c) we've got a dirty external page
2887 		 *
2888 		 * since we don't know the reason for the external pageout queue being throttled we
2889 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2890 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2891 		 *
2892 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2893 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2894 		 * pool the next time we select a victim page... if we can make enough new free pages,
2895 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2896 		 * be throttled
2897 		 *
2898 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2899 		 * that we can try to find clean pages in the active/inactive queues before
2900 		 * deciding to jetsam a process
2901 		 */
2902 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2903 
2904 		vm_page_check_pageable_safe(m);
2905 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2906 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2907 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2908 		vm_page_active_count++;
2909 		vm_page_pageable_external_count++;
2910 
2911 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2912 
2913 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2914 
2915 #pragma unused(force_anonymous)
2916 
2917 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2918 
2919 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2920 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2921 			/*
2922 			 * Possible deadlock scenario so request jetsam action
2923 			 */
2924 			memorystatus_kill_on_vps_starvation();
2925 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2926 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2927 		}
2928 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2929 
2930 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2931 
2932 		*force_anonymous = TRUE;
2933 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2934 	} else {
2935 		vm_page_activate(m);
2936 		counter_inc(&vm_statistics_reactivations);
2937 
2938 #if DEVELOPMENT || DEBUG
2939 		if (is_page_from_bg_q == TRUE) {
2940 			if (cur_object->internal) {
2941 				vm_pageout_rejected_bq_internal++;
2942 			} else {
2943 				vm_pageout_rejected_bq_external++;
2944 			}
2945 		}
2946 #endif /* DEVELOPMENT || DEBUG */
2947 
2948 		vm_pageout_state.vm_pageout_inactive_used++;
2949 	}
2950 }
2951 
2952 
2953 void
2954 vm_page_balance_inactive(int max_to_move)
2955 {
2956 	vm_page_t m;
2957 
2958 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2959 
2960 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2961 		/*
2962 		 * It is likely that the hibernation code path is
2963 		 * dealing with these very queues as we are about
2964 		 * to move pages around in/from them and completely
2965 		 * change the linkage of the pages.
2966 		 *
2967 		 * And so we skip the rebalancing of these queues.
2968 		 */
2969 		return;
2970 	}
2971 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2972 	    vm_page_inactive_count +
2973 	    vm_page_speculative_count);
2974 
2975 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2976 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2977 
2978 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2979 
2980 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2981 		assert(!m->vmp_laundry);
2982 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2983 		assert(!vm_page_is_guard(m));
2984 
2985 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2986 
2987 		/*
2988 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2989 		 *
2990 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2991 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2992 		 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2993 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2994 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2995 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2996 		 * have happened before we moved the page
2997 		 */
2998 		if (m->vmp_pmapped == TRUE) {
2999 			/*
3000 			 * We might be holding the page queue lock as a
3001 			 * spin lock and clearing the "referenced" bit could
3002 			 * take a while if there are lots of mappings of
3003 			 * that page, so make sure we acquire the lock as
3004 			 * as mutex to avoid a spinlock timeout.
3005 			 */
3006 			vm_page_lockconvert_queues();
3007 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
3008 		}
3009 
3010 		/*
3011 		 * The page might be absent or busy,
3012 		 * but vm_page_deactivate can handle that.
3013 		 * FALSE indicates that we don't want a H/W clear reference
3014 		 */
3015 		vm_page_deactivate_internal(m, FALSE);
3016 	}
3017 }
3018 
3019 /*
3020  *	vm_pageout_scan does the dirty work for the pageout daemon.
3021  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
3022  *	held and vm_page_free_wanted == 0.
3023  */
3024 void
3025 vm_pageout_scan(void)
3026 {
3027 	unsigned int loop_count = 0;
3028 	unsigned int inactive_burst_count = 0;
3029 	unsigned int reactivated_this_call;
3030 	unsigned int reactivate_limit;
3031 	vm_page_t   local_freeq = NULL;
3032 	int         local_freed = 0;
3033 	int         delayed_unlock;
3034 	int         delayed_unlock_limit = 0;
3035 	int         refmod_state = 0;
3036 	int     vm_pageout_deadlock_target = 0;
3037 	struct  vm_pageout_queue *iq;
3038 	struct  vm_pageout_queue *eq;
3039 	struct  vm_speculative_age_q *sq;
3040 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3041 	boolean_t inactive_throttled = FALSE;
3042 	vm_object_t     object = NULL;  /* object that we're currently working on from previous iterations */
3043 	uint32_t        inactive_reclaim_run;
3044 	boolean_t       grab_anonymous = FALSE;
3045 	boolean_t       force_anonymous = FALSE;
3046 	boolean_t       force_speculative_aging = FALSE;
3047 	int             anons_grabbed = 0;
3048 	int             page_prev_q_state = 0;
3049 	boolean_t       page_from_bg_q = FALSE;
3050 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3051 	vm_object_t     m_object = VM_OBJECT_NULL;  /* object of the current page (m) */
3052 	int             retval = 0;
3053 	boolean_t       lock_yield_check = FALSE;
3054 
3055 
3056 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3057 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3058 	    vm_pageout_state.vm_pageout_inactive_clean,
3059 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3060 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3061 
3062 	flow_control.state = FCS_IDLE;
3063 	iq = &vm_pageout_queue_internal;
3064 	eq = &vm_pageout_queue_external;
3065 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3066 
3067 	/* Ask the pmap layer to return any pages it no longer needs. */
3068 	pmap_release_pages_fast();
3069 
3070 	vm_page_lock_queues();
3071 
3072 	delayed_unlock = 1;
3073 
3074 	/*
3075 	 *	Calculate the max number of referenced pages on the inactive
3076 	 *	queue that we will reactivate.
3077 	 */
3078 	reactivated_this_call = 0;
3079 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3080 	    vm_page_inactive_count);
3081 	inactive_reclaim_run = 0;
3082 
3083 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3084 
3085 	/*
3086 	 *	We must limit the rate at which we send pages to the pagers
3087 	 *	so that we don't tie up too many pages in the I/O queues.
3088 	 *	We implement a throttling mechanism using the laundry count
3089 	 *      to limit the number of pages outstanding to the default
3090 	 *	and external pagers.  We can bypass the throttles and look
3091 	 *	for clean pages if the pageout queues don't drain in a timely
3092 	 *	fashion since this may indicate that the pageout paths are
3093 	 *	stalled waiting for memory, which only we can provide.
3094 	 */
3095 
3096 	vps_init_page_targets();
3097 	assert(object == NULL);
3098 	assert(delayed_unlock != 0);
3099 
3100 	for (;;) {
3101 		vm_page_t m;
3102 
3103 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3104 
3105 		if (lock_yield_check) {
3106 			lock_yield_check = FALSE;
3107 
3108 			if (delayed_unlock++ > delayed_unlock_limit) {
3109 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3110 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3111 			} else if (vm_pageout_scan_wants_object) {
3112 				vm_page_unlock_queues();
3113 				mutex_pause(0);
3114 				vm_page_lock_queues();
3115 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3116 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3117 			}
3118 		}
3119 
3120 		if (vm_upl_wait_for_pages < 0) {
3121 			vm_upl_wait_for_pages = 0;
3122 		}
3123 
3124 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3125 
3126 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3127 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3128 		}
3129 
3130 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3131 
3132 		assert(delayed_unlock);
3133 
3134 		/*
3135 		 * maintain our balance
3136 		 */
3137 		vm_page_balance_inactive(1);
3138 
3139 
3140 		/**********************************************************************
3141 		* above this point we're playing with the active and secluded queues
3142 		* below this point we're playing with the throttling mechanisms
3143 		* and the inactive queue
3144 		**********************************************************************/
3145 
3146 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3147 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3148 
3149 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3150 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3151 			/*
3152 			 * make sure the pageout I/O threads are running
3153 			 * throttled in case there are still requests
3154 			 * in the laundry... since we have met our targets
3155 			 * we don't need the laundry to be cleaned in a timely
3156 			 * fashion... so let's avoid interfering with foreground
3157 			 * activity
3158 			 */
3159 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3160 
3161 			vm_free_page_lock();
3162 
3163 			if ((vm_page_free_count >= vm_page_free_target) &&
3164 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3165 				/*
3166 				 * done - we have met our target *and*
3167 				 * there is no one waiting for a page.
3168 				 */
3169 return_from_scan:
3170 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3171 
3172 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3173 				    vm_pageout_state.vm_pageout_inactive,
3174 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3175 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3176 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3177 				    vm_pageout_state.vm_pageout_inactive_clean,
3178 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3179 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3180 
3181 				return;
3182 			}
3183 			vm_free_page_unlock();
3184 		}
3185 
3186 		/*
3187 		 * Before anything, we check if we have any ripe volatile
3188 		 * objects around. If so, try to purge the first object.
3189 		 * If the purge fails, fall through to reclaim a page instead.
3190 		 * If the purge succeeds, go back to the top and reevalute
3191 		 * the new memory situation.
3192 		 */
3193 		retval = vps_purge_object();
3194 
3195 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3196 			/*
3197 			 * Success
3198 			 */
3199 			if (object != NULL) {
3200 				vm_object_unlock(object);
3201 				object = NULL;
3202 			}
3203 
3204 			lock_yield_check = FALSE;
3205 			continue;
3206 		}
3207 
3208 		/*
3209 		 * If our 'aged' queue is empty and we have some speculative pages
3210 		 * in the other queues, let's go through and see if we need to age
3211 		 * them.
3212 		 *
3213 		 * If we succeeded in aging a speculative Q or just that everything
3214 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3215 		 *
3216 		 * If, for some reason, we seem to have a mismatch between the spec.
3217 		 * page count and the page queues, we reset those variables and
3218 		 * restart the loop (LD TODO: Track this better?).
3219 		 */
3220 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3221 			retval = vps_age_speculative_queue(force_speculative_aging);
3222 
3223 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3224 				lock_yield_check = FALSE;
3225 				continue;
3226 			}
3227 		}
3228 		force_speculative_aging = FALSE;
3229 
3230 		/*
3231 		 * Check to see if we need to evict objects from the cache.
3232 		 *
3233 		 * Note: 'object' here doesn't have anything to do with
3234 		 * the eviction part. We just need to make sure we have dropped
3235 		 * any object lock we might be holding if we need to go down
3236 		 * into the eviction logic.
3237 		 */
3238 		retval = vps_object_cache_evict(&object);
3239 
3240 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3241 			lock_yield_check = FALSE;
3242 			continue;
3243 		}
3244 
3245 
3246 		/*
3247 		 * Calculate our filecache_min that will affect the loop
3248 		 * going forward.
3249 		 */
3250 		vps_calculate_filecache_min();
3251 
3252 		/*
3253 		 * LD TODO: Use a structure to hold all state variables for a single
3254 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3255 		 */
3256 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3257 		    &delayed_unlock, &local_freeq, &local_freed,
3258 		    &vm_pageout_deadlock_target, inactive_burst_count);
3259 
3260 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3261 			if (loop_count >= vm_page_inactive_count) {
3262 				loop_count = 0;
3263 			}
3264 
3265 			inactive_burst_count = 0;
3266 
3267 			assert(object == NULL);
3268 			assert(delayed_unlock != 0);
3269 
3270 			lock_yield_check = FALSE;
3271 			continue;
3272 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3273 			goto return_from_scan;
3274 		}
3275 
3276 		flow_control.state = FCS_IDLE;
3277 
3278 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3279 		    vm_pageout_inactive_external_forced_reactivate_limit);
3280 		loop_count++;
3281 		inactive_burst_count++;
3282 		vm_pageout_state.vm_pageout_inactive++;
3283 
3284 		/*
3285 		 * Choose a victim.
3286 		 */
3287 
3288 		m = NULL;
3289 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3290 
3291 		if (m == NULL) {
3292 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3293 				inactive_burst_count = 0;
3294 
3295 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3296 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3297 				}
3298 
3299 				lock_yield_check = TRUE;
3300 				continue;
3301 			}
3302 
3303 			/*
3304 			 * if we've gotten here, we have no victim page.
3305 			 * check to see if we've not finished balancing the queues
3306 			 * or we have a page on the aged speculative queue that we
3307 			 * skipped due to force_anonymous == TRUE.. or we have
3308 			 * speculative  pages that we can prematurely age... if
3309 			 * one of these cases we'll keep going, else panic
3310 			 */
3311 			force_anonymous = FALSE;
3312 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3313 
3314 			if (!vm_page_queue_empty(&sq->age_q)) {
3315 				lock_yield_check = TRUE;
3316 				continue;
3317 			}
3318 
3319 			if (vm_page_speculative_count) {
3320 				force_speculative_aging = TRUE;
3321 				lock_yield_check = TRUE;
3322 				continue;
3323 			}
3324 			panic("vm_pageout: no victim");
3325 
3326 			/* NOTREACHED */
3327 		}
3328 
3329 		assert(VM_PAGE_PAGEABLE(m));
3330 		m_object = VM_PAGE_OBJECT(m);
3331 		force_anonymous = FALSE;
3332 
3333 		page_prev_q_state = m->vmp_q_state;
3334 		/*
3335 		 * we just found this page on one of our queues...
3336 		 * it can't also be on the pageout queue, so safe
3337 		 * to call vm_page_queues_remove
3338 		 */
3339 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3340 		vm_page_queues_remove(m, TRUE);
3341 		if (donate) {
3342 			/*
3343 			 * The compressor needs to see this bit to know
3344 			 * where this page needs to land. Also if stolen,
3345 			 * this bit helps put the page back in the right
3346 			 * special queue where it belongs.
3347 			 */
3348 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3349 		}
3350 
3351 		assert(!m->vmp_laundry);
3352 		assert(vm_page_is_canonical(m));
3353 		assert(!is_kernel_object(m_object));
3354 
3355 		vm_pageout_vminfo.vm_pageout_considered_page++;
3356 
3357 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3358 
3359 		/*
3360 		 * check to see if we currently are working
3361 		 * with the same object... if so, we've
3362 		 * already got the lock
3363 		 */
3364 		if (m_object != object) {
3365 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3366 
3367 			/*
3368 			 * vps_switch_object() will always drop the 'object' lock first
3369 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3370 			 * either 'm_object' or NULL.
3371 			 */
3372 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3373 
3374 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3375 				lock_yield_check = TRUE;
3376 				continue;
3377 			}
3378 		}
3379 		assert(m_object == object);
3380 		assert(VM_PAGE_OBJECT(m) == m_object);
3381 
3382 		if (m->vmp_busy) {
3383 			/*
3384 			 *	Somebody is already playing with this page.
3385 			 *	Put it back on the appropriate queue
3386 			 *
3387 			 */
3388 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3389 
3390 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3391 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3392 			}
3393 
3394 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3395 
3396 			lock_yield_check = TRUE;
3397 			continue;
3398 		}
3399 
3400 		/*
3401 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3402 		 *	If already cleaning this page in place
3403 		 *	just leave if off the paging queues.
3404 		 *	We can leave the page mapped, and upl_commit_range
3405 		 *	will put it on the clean queue.
3406 		 *
3407 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3408 		 *	an msync INVALIDATE is in progress...
3409 		 *	this page has been marked for destruction
3410 		 *      after it has been cleaned,
3411 		 *      but not yet gathered into a UPL
3412 		 *	where 'cleaning' will be set...
3413 		 *	just leave it off the paging queues
3414 		 *
3415 		 *   if (m->vmp_free_when_done && m->vmp_clenaing)
3416 		 *	an msync INVALIDATE is in progress
3417 		 *	and the UPL has already gathered this page...
3418 		 *	just leave it off the paging queues
3419 		 */
3420 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3421 			lock_yield_check = TRUE;
3422 			continue;
3423 		}
3424 
3425 
3426 		/*
3427 		 *	If it's absent, in error or the object is no longer alive,
3428 		 *	we can reclaim the page... in the no longer alive case,
3429 		 *	there are 2 states the page can be in that preclude us
3430 		 *	from reclaiming it - busy or cleaning - that we've already
3431 		 *	dealt with
3432 		 */
3433 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3434 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3435 			if (m->vmp_absent) {
3436 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3437 			} else if (!object->alive ||
3438 			    (!object->internal &&
3439 			    object->pager == MEMORY_OBJECT_NULL)) {
3440 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3441 			} else {
3442 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3443 			}
3444 			if (m->vmp_pmapped) {
3445 				int refmod;
3446 
3447 				/*
3448 				 * If this page was file-backed and wired while its pager
3449 				 * was lost (during a forced unmount, for example), there
3450 				 * could still be some pmap mappings that need to be
3451 				 * cleaned up before we can free the page.
3452 				 */
3453 				refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3454 				if ((refmod & VM_MEM_MODIFIED) &&
3455 				    !m->vmp_dirty) {
3456 					SET_PAGE_DIRTY(m, FALSE);
3457 				}
3458 			}
3459 reclaim_page:
3460 			if (vm_pageout_deadlock_target) {
3461 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3462 				vm_pageout_deadlock_target--;
3463 			}
3464 
3465 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3466 
3467 			if (object->internal) {
3468 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3469 			} else {
3470 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3471 			}
3472 			assert(!m->vmp_cleaning);
3473 			assert(!m->vmp_laundry);
3474 
3475 			if (!object->internal &&
3476 			    object->pager != NULL &&
3477 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3478 				shared_region_pager_reclaimed++;
3479 			}
3480 
3481 			m->vmp_busy = TRUE;
3482 
3483 			/*
3484 			 * remove page from object here since we're already
3485 			 * behind the object lock... defer the rest of the work
3486 			 * we'd normally do in vm_page_free_prepare_object
3487 			 * until 'vm_page_free_list' is called
3488 			 */
3489 			if (m->vmp_tabled) {
3490 				vm_page_remove(m, TRUE);
3491 			}
3492 
3493 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3494 			m->vmp_snext = local_freeq;
3495 			local_freeq = m;
3496 			local_freed++;
3497 
3498 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3499 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3500 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3501 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3502 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3503 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3504 			} else {
3505 				vm_pageout_vminfo.vm_pageout_freed_external++;
3506 			}
3507 
3508 			inactive_burst_count = 0;
3509 
3510 			lock_yield_check = TRUE;
3511 			continue;
3512 		}
3513 		if (object->vo_copy == VM_OBJECT_NULL) {
3514 			/*
3515 			 * No one else can have any interest in this page.
3516 			 * If this is an empty purgable object, the page can be
3517 			 * reclaimed even if dirty.
3518 			 * If the page belongs to a volatile purgable object, we
3519 			 * reactivate it if the compressor isn't active.
3520 			 */
3521 			if (object->purgable == VM_PURGABLE_EMPTY) {
3522 				if (m->vmp_pmapped == TRUE) {
3523 					/* unmap the page */
3524 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3525 					if (refmod_state & VM_MEM_MODIFIED) {
3526 						SET_PAGE_DIRTY(m, FALSE);
3527 					}
3528 				}
3529 				if (m->vmp_dirty || m->vmp_precious) {
3530 					/* we saved the cost of cleaning this page ! */
3531 					vm_page_purged_count++;
3532 				}
3533 				goto reclaim_page;
3534 			}
3535 
3536 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3537 				/*
3538 				 * With the VM compressor, the cost of
3539 				 * reclaiming a page is much lower (no I/O),
3540 				 * so if we find a "volatile" page, it's better
3541 				 * to let it get compressed rather than letting
3542 				 * it occupy a full page until it gets purged.
3543 				 * So no need to check for "volatile" here.
3544 				 */
3545 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3546 				/*
3547 				 * Avoid cleaning a "volatile" page which might
3548 				 * be purged soon.
3549 				 */
3550 
3551 				/* if it's wired, we can't put it on our queue */
3552 				assert(!VM_PAGE_WIRED(m));
3553 
3554 				/* just stick it back on! */
3555 				reactivated_this_call++;
3556 
3557 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3558 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3559 				}
3560 
3561 				goto reactivate_page;
3562 			}
3563 		} /* vo_copy NULL */
3564 		/*
3565 		 *	If it's being used, reactivate.
3566 		 *	(Fictitious pages are either busy or absent.)
3567 		 *	First, update the reference and dirty bits
3568 		 *	to make sure the page is unreferenced.
3569 		 */
3570 		refmod_state = -1;
3571 
3572 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3573 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3574 
3575 			if (refmod_state & VM_MEM_REFERENCED) {
3576 				m->vmp_reference = TRUE;
3577 			}
3578 			if (refmod_state & VM_MEM_MODIFIED) {
3579 				SET_PAGE_DIRTY(m, FALSE);
3580 			}
3581 		}
3582 
3583 		if (m->vmp_reference || m->vmp_dirty) {
3584 			/* deal with a rogue "reusable" page */
3585 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3586 		}
3587 
3588 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3589 			vm_pageout_state.vm_page_xpmapped_min = 0;
3590 		} else {
3591 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3592 			    vm_pageout_state.vm_page_xpmapped_min_divisor;
3593 		}
3594 
3595 		if (!m->vmp_no_cache &&
3596 		    page_from_bg_q == FALSE &&
3597 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3598 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3599 			/*
3600 			 * The page we pulled off the inactive list has
3601 			 * been referenced.  It is possible for other
3602 			 * processors to be touching pages faster than we
3603 			 * can clear the referenced bit and traverse the
3604 			 * inactive queue, so we limit the number of
3605 			 * reactivations.
3606 			 */
3607 			if (++reactivated_this_call >= reactivate_limit &&
3608 			    !object->object_is_shared_cache &&
3609 			    !((m->vmp_realtime ||
3610 			    object->for_realtime) &&
3611 			    vm_pageout_protect_realtime)) {
3612 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3613 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3614 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3615 				if (object->object_is_shared_cache) {
3616 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3617 				} else if (m->vmp_realtime ||
3618 				    object->for_realtime) {
3619 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3620 				}
3621 			} else {
3622 				uint32_t isinuse;
3623 
3624 				if (reactivated_this_call >= reactivate_limit) {
3625 					if (object->object_is_shared_cache) {
3626 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3627 					} else if ((m->vmp_realtime ||
3628 					    object->for_realtime) &&
3629 					    vm_pageout_protect_realtime) {
3630 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3631 					}
3632 				}
3633 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3634 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3635 				}
3636 
3637 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3638 reactivate_page:
3639 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3640 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3641 					/*
3642 					 * no explict mappings of this object exist
3643 					 * and it's not open via the filesystem
3644 					 */
3645 					vm_page_deactivate(m);
3646 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3647 				} else {
3648 					/*
3649 					 * The page was/is being used, so put back on active list.
3650 					 */
3651 					vm_page_activate(m);
3652 					counter_inc(&vm_statistics_reactivations);
3653 					inactive_burst_count = 0;
3654 				}
3655 #if DEVELOPMENT || DEBUG
3656 				if (page_from_bg_q == TRUE) {
3657 					if (m_object->internal) {
3658 						vm_pageout_rejected_bq_internal++;
3659 					} else {
3660 						vm_pageout_rejected_bq_external++;
3661 					}
3662 				}
3663 #endif /* DEVELOPMENT || DEBUG */
3664 
3665 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3666 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3667 				}
3668 				vm_pageout_state.vm_pageout_inactive_used++;
3669 
3670 				lock_yield_check = TRUE;
3671 				continue;
3672 			}
3673 			/*
3674 			 * Make sure we call pmap_get_refmod() if it
3675 			 * wasn't already called just above, to update
3676 			 * the dirty bit.
3677 			 */
3678 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3679 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3680 				if (refmod_state & VM_MEM_MODIFIED) {
3681 					SET_PAGE_DIRTY(m, FALSE);
3682 				}
3683 			}
3684 		}
3685 
3686 		/*
3687 		 * we've got a candidate page to steal...
3688 		 *
3689 		 * m->vmp_dirty is up to date courtesy of the
3690 		 * preceding check for m->vmp_reference... if
3691 		 * we get here, then m->vmp_reference had to be
3692 		 * FALSE (or possibly "reactivate_limit" was
3693 		 * exceeded), but in either case we called
3694 		 * pmap_get_refmod() and updated both
3695 		 * m->vmp_reference and m->vmp_dirty
3696 		 *
3697 		 * if it's dirty or precious we need to
3698 		 * see if the target queue is throtttled
3699 		 * it if is, we need to skip over it by moving it back
3700 		 * to the end of the inactive queue
3701 		 */
3702 
3703 		inactive_throttled = FALSE;
3704 
3705 		if (m->vmp_dirty || m->vmp_precious) {
3706 			if (object->internal) {
3707 				if (VM_PAGE_Q_THROTTLED(iq)) {
3708 					inactive_throttled = TRUE;
3709 				}
3710 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3711 				inactive_throttled = TRUE;
3712 			}
3713 		}
3714 throttle_inactive:
3715 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3716 		    object->internal && m->vmp_dirty &&
3717 		    (object->purgable == VM_PURGABLE_DENY ||
3718 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3719 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3720 			vm_page_check_pageable_safe(m);
3721 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3722 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3723 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3724 			vm_page_throttled_count++;
3725 
3726 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3727 
3728 			inactive_burst_count = 0;
3729 
3730 			lock_yield_check = TRUE;
3731 			continue;
3732 		}
3733 		if (inactive_throttled == TRUE) {
3734 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3735 			    &force_anonymous, page_from_bg_q);
3736 
3737 			inactive_burst_count = 0;
3738 
3739 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3740 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3741 			}
3742 
3743 			lock_yield_check = TRUE;
3744 			continue;
3745 		}
3746 
3747 		/*
3748 		 * we've got a page that we can steal...
3749 		 * eliminate all mappings and make sure
3750 		 * we have the up-to-date modified state
3751 		 *
3752 		 * if we need to do a pmap_disconnect then we
3753 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3754 		 * provides the true state atomically... the
3755 		 * page was still mapped up to the pmap_disconnect
3756 		 * and may have been dirtied at the last microsecond
3757 		 *
3758 		 * Note that if 'pmapped' is FALSE then the page is not
3759 		 * and has not been in any map, so there is no point calling
3760 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3761 		 * of likely usage of the page.
3762 		 */
3763 		if (m->vmp_pmapped == TRUE) {
3764 			int pmap_options;
3765 
3766 			/*
3767 			 * Don't count this page as going into the compressor
3768 			 * if any of these are true:
3769 			 * 1) compressed pager isn't enabled
3770 			 * 2) Freezer enabled device with compressed pager
3771 			 *    backend (exclusive use) i.e. most of the VM system
3772 			 *    (including vm_pageout_scan) has no knowledge of
3773 			 *    the compressor
3774 			 * 3) This page belongs to a file and hence will not be
3775 			 *    sent into the compressor
3776 			 */
3777 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3778 			    object->internal == FALSE) {
3779 				pmap_options = 0;
3780 			} else if (m->vmp_dirty || m->vmp_precious) {
3781 				/*
3782 				 * VM knows that this page is dirty (or
3783 				 * precious) and needs to be compressed
3784 				 * rather than freed.
3785 				 * Tell the pmap layer to count this page
3786 				 * as "compressed".
3787 				 */
3788 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3789 			} else {
3790 				/*
3791 				 * VM does not know if the page needs to
3792 				 * be preserved but the pmap layer might tell
3793 				 * us if any mapping has "modified" it.
3794 				 * Let's the pmap layer to count this page
3795 				 * as compressed if and only if it has been
3796 				 * modified.
3797 				 */
3798 				pmap_options =
3799 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3800 			}
3801 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3802 			    pmap_options,
3803 			    NULL);
3804 			if (refmod_state & VM_MEM_MODIFIED) {
3805 				SET_PAGE_DIRTY(m, FALSE);
3806 			}
3807 		}
3808 
3809 		/*
3810 		 * reset our count of pages that have been reclaimed
3811 		 * since the last page was 'stolen'
3812 		 */
3813 		inactive_reclaim_run = 0;
3814 
3815 		/*
3816 		 *	If it's clean and not precious, we can free the page.
3817 		 */
3818 		if (!m->vmp_dirty && !m->vmp_precious) {
3819 			vm_pageout_state.vm_pageout_inactive_clean++;
3820 
3821 			/*
3822 			 * OK, at this point we have found a page we are going to free.
3823 			 */
3824 #if CONFIG_PHANTOM_CACHE
3825 			if (!object->internal) {
3826 				vm_phantom_cache_add_ghost(m);
3827 			}
3828 #endif
3829 			goto reclaim_page;
3830 		}
3831 
3832 		/*
3833 		 * The page may have been dirtied since the last check
3834 		 * for a throttled target queue (which may have been skipped
3835 		 * if the page was clean then).  With the dirty page
3836 		 * disconnected here, we can make one final check.
3837 		 */
3838 		if (object->internal) {
3839 			if (VM_PAGE_Q_THROTTLED(iq)) {
3840 				inactive_throttled = TRUE;
3841 			}
3842 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3843 			inactive_throttled = TRUE;
3844 		}
3845 
3846 		if (inactive_throttled == TRUE) {
3847 			goto throttle_inactive;
3848 		}
3849 #if !CONFIG_JETSAM
3850 		memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3851 #endif /* !CONFIG_JETSAM */
3852 
3853 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3854 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3855 		}
3856 
3857 		if (object->internal) {
3858 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3859 		} else {
3860 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3861 		}
3862 
3863 		/*
3864 		 * internal pages will go to the compressor...
3865 		 * external pages will go to the appropriate pager to be cleaned
3866 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3867 		 * is a preferred queue to steal from
3868 		 */
3869 		vm_pageout_cluster(m);
3870 		inactive_burst_count = 0;
3871 
3872 		/*
3873 		 * back to top of pageout scan loop
3874 		 */
3875 	}
3876 }
3877 
3878 
3879 void
3880 vm_page_free_reserve(
3881 	int pages)
3882 {
3883 	int             free_after_reserve;
3884 
3885 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3886 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3887 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3888 		} else {
3889 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3890 		}
3891 	} else {
3892 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3893 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3894 		} else {
3895 			vm_page_free_reserved += pages;
3896 		}
3897 	}
3898 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3899 
3900 	vm_page_free_min = vm_page_free_reserved +
3901 	    VM_PAGE_FREE_MIN(free_after_reserve);
3902 
3903 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3904 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3905 	}
3906 
3907 	vm_page_free_target = vm_page_free_reserved +
3908 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3909 
3910 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3911 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3912 	}
3913 
3914 	if (vm_page_free_target < vm_page_free_min + 5) {
3915 		vm_page_free_target = vm_page_free_min + 5;
3916 	}
3917 
3918 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3919 }
3920 
3921 /*
3922  *	vm_pageout is the high level pageout daemon.
3923  */
3924 
3925 void
3926 vm_pageout_continue(void)
3927 {
3928 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3929 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3930 
3931 	vm_free_page_lock();
3932 	vm_pageout_running = TRUE;
3933 	vm_free_page_unlock();
3934 
3935 	vm_pageout_scan();
3936 	/*
3937 	 * we hold both the vm_page_queue_free_lock
3938 	 * and the vm_page_queues_lock at this point
3939 	 */
3940 	assert(vm_page_free_wanted == 0);
3941 	assert(vm_page_free_wanted_privileged == 0);
3942 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3943 
3944 	vm_pageout_running = FALSE;
3945 #if XNU_TARGET_OS_OSX
3946 	if (vm_pageout_waiter) {
3947 		vm_pageout_waiter = FALSE;
3948 		thread_wakeup((event_t)&vm_pageout_waiter);
3949 	}
3950 #endif /* XNU_TARGET_OS_OSX */
3951 
3952 	vm_free_page_unlock();
3953 	vm_page_unlock_queues();
3954 
3955 	thread_block((thread_continue_t)vm_pageout_continue);
3956 	/*NOTREACHED*/
3957 }
3958 
3959 #if XNU_TARGET_OS_OSX
3960 kern_return_t
3961 vm_pageout_wait(uint64_t deadline)
3962 {
3963 	kern_return_t kr;
3964 
3965 	vm_free_page_lock();
3966 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3967 		vm_pageout_waiter = TRUE;
3968 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3969 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3970 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3971 			kr = KERN_OPERATION_TIMED_OUT;
3972 		}
3973 	}
3974 	vm_free_page_unlock();
3975 
3976 	return kr;
3977 }
3978 #endif /* XNU_TARGET_OS_OSX */
3979 
3980 OS_NORETURN
3981 static void
3982 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3983 {
3984 	vm_page_t       m = NULL;
3985 	vm_object_t     object;
3986 	vm_object_offset_t offset;
3987 	memory_object_t pager;
3988 	struct vm_pageout_queue *q = ethr->q;
3989 
3990 	/* On systems with a compressor, the external IO thread clears its
3991 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3992 	 * creation)
3993 	 */
3994 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3995 		current_thread()->options &= ~TH_OPT_VMPRIV;
3996 	}
3997 
3998 	sched_cond_ack(&(ethr->pgo_wakeup));
3999 
4000 	while (true) {
4001 		vm_page_lockspin_queues();
4002 
4003 		while (!vm_page_queue_empty(&q->pgo_pending)) {
4004 			q->pgo_busy = TRUE;
4005 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4006 
4007 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4008 			VM_PAGE_CHECK(m);
4009 			/*
4010 			 * grab a snapshot of the object and offset this
4011 			 * page is tabled in so that we can relookup this
4012 			 * page after we've taken the object lock - these
4013 			 * fields are stable while we hold the page queues lock
4014 			 * but as soon as we drop it, there is nothing to keep
4015 			 * this page in this object... we hold an activity_in_progress
4016 			 * on this object which will keep it from terminating
4017 			 */
4018 			object = VM_PAGE_OBJECT(m);
4019 			offset = m->vmp_offset;
4020 
4021 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4022 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4023 
4024 			vm_page_unlock_queues();
4025 
4026 			vm_object_lock(object);
4027 
4028 			m = vm_page_lookup(object, offset);
4029 
4030 			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4031 			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4032 				/*
4033 				 * it's either the same page that someone else has
4034 				 * started cleaning (or it's finished cleaning or
4035 				 * been put back on the pageout queue), or
4036 				 * the page has been freed or we have found a
4037 				 * new page at this offset... in all of these cases
4038 				 * we merely need to release the activity_in_progress
4039 				 * we took when we put the page on the pageout queue
4040 				 */
4041 				vm_object_activity_end(object);
4042 				vm_object_unlock(object);
4043 
4044 				vm_page_lockspin_queues();
4045 				continue;
4046 			}
4047 			pager = object->pager;
4048 
4049 			if (pager == MEMORY_OBJECT_NULL) {
4050 				/*
4051 				 * This pager has been destroyed by either
4052 				 * memory_object_destroy or vm_object_destroy, and
4053 				 * so there is nowhere for the page to go.
4054 				 */
4055 				if (m->vmp_free_when_done) {
4056 					/*
4057 					 * Just free the page... VM_PAGE_FREE takes
4058 					 * care of cleaning up all the state...
4059 					 * including doing the vm_pageout_throttle_up
4060 					 */
4061 					VM_PAGE_FREE(m);
4062 				} else {
4063 					vm_page_lockspin_queues();
4064 
4065 					vm_pageout_throttle_up(m);
4066 					vm_page_activate(m);
4067 
4068 					vm_page_unlock_queues();
4069 
4070 					/*
4071 					 *	And we are done with it.
4072 					 */
4073 				}
4074 				vm_object_activity_end(object);
4075 				vm_object_unlock(object);
4076 
4077 				vm_page_lockspin_queues();
4078 				continue;
4079 			}
4080 	#if 0
4081 			/*
4082 			 * we don't hold the page queue lock
4083 			 * so this check isn't safe to make
4084 			 */
4085 			VM_PAGE_CHECK(m);
4086 	#endif
4087 			/*
4088 			 * give back the activity_in_progress reference we
4089 			 * took when we queued up this page and replace it
4090 			 * it with a paging_in_progress reference that will
4091 			 * also hold the paging offset from changing and
4092 			 * prevent the object from terminating
4093 			 */
4094 			vm_object_activity_end(object);
4095 			vm_object_paging_begin(object);
4096 			vm_object_unlock(object);
4097 
4098 			/*
4099 			 * Send the data to the pager.
4100 			 * any pageout clustering happens there
4101 			 */
4102 			memory_object_data_return(pager,
4103 			    m->vmp_offset + object->paging_offset,
4104 			    PAGE_SIZE,
4105 			    NULL,
4106 			    NULL,
4107 			    FALSE,
4108 			    FALSE,
4109 			    0);
4110 
4111 			vm_object_lock(object);
4112 			vm_object_paging_end(object);
4113 			vm_object_unlock(object);
4114 
4115 			vm_pageout_io_throttle();
4116 
4117 			vm_page_lockspin_queues();
4118 		}
4119 		q->pgo_busy = FALSE;
4120 
4121 		vm_page_unlock_queues();
4122 		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4123 	}
4124 	/*NOTREACHED*/
4125 }
4126 
4127 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4128 
4129 #if DEVELOPMENT || DEBUG
4130 static void
4131 vm_pageout_record_thread_time(int cqid, int ncomps)
4132 {
4133 	if (__improbable(vm_compressor_time_thread)) {
4134 		vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4135 		vmct_stats.vmct_pages[cqid] += ncomps;
4136 		vmct_stats.vmct_iterations[cqid]++;
4137 		if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4138 			vmct_stats.vmct_maxpages[cqid] = ncomps;
4139 		}
4140 		if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4141 			vmct_stats.vmct_minpages[cqid] = ncomps;
4142 		}
4143 	}
4144 }
4145 #endif
4146 
4147 static void *
4148 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4149 {
4150 	/*
4151 	 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4152 	 * However, this page has been removed from all queues and is only
4153 	 * known to this compressor thread dealing with this local queue.
4154 	 *
4155 	 * TODO: Add a second localq that is the early localq and
4156 	 * put special pages like this one on that queue in the block above
4157 	 * under the pageq lock to avoid this 'works but not clean' logic.
4158 	 */
4159 	void *donate_queue_head;
4160 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4161 	donate_queue_head = &cq->current_early_swapout_chead;
4162 #else /* XNU_TARGET_OS_OSX */
4163 	donate_queue_head = &cq->current_late_swapout_chead;
4164 #endif /* XNU_TARGET_OS_OSX */
4165 	if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4166 		m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4167 		return donate_queue_head;
4168 	}
4169 
4170 	uint32_t sel_i = 0;
4171 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
4172 	vm_object_t object = VM_PAGE_OBJECT(m);
4173 	sel_i = object->vo_chead_hint;
4174 #endif
4175 	assert(sel_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT);
4176 	return &cq->current_regular_swapout_cheads[sel_i];
4177 }
4178 
4179 #define         MAX_FREE_BATCH          32
4180 
4181 OS_NORETURN
4182 static void
4183 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4184 {
4185 	struct vm_pageout_queue *q;
4186 	vm_page_t       m = NULL;
4187 	boolean_t       pgo_draining;
4188 	vm_page_t   local_q;
4189 	int         local_cnt;
4190 	vm_page_t   local_freeq = NULL;
4191 	int         local_freed = 0;
4192 	int         local_batch_size;
4193 #if DEVELOPMENT || DEBUG
4194 	int       ncomps = 0;
4195 	boolean_t marked_active = FALSE;
4196 	int       num_pages_processed = 0;
4197 #endif
4198 	void *chead = NULL;
4199 
4200 	KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4201 
4202 	sched_cond_ack(&(cq->pgo_wakeup));
4203 
4204 	q = cq->q;
4205 
4206 	while (true) { /* this top loop is for the compressor_running_perf_test running a full speed without blocking */
4207 #if DEVELOPMENT || DEBUG
4208 		bool benchmark_accounting = false;
4209 		/* If we're running the compressor perf test, only process the benchmark pages.
4210 		 * We'll get back to our regular queue once the benchmark is done */
4211 		if (compressor_running_perf_test) {
4212 			q = cq->benchmark_q;
4213 			if (!vm_page_queue_empty(&q->pgo_pending)) {
4214 				benchmark_accounting = true;
4215 			} else {
4216 				q = cq->q;
4217 				benchmark_accounting = false;
4218 			}
4219 		}
4220 #endif /* DEVELOPMENT || DEBUG */
4221 
4222 #if __AMP__
4223 		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4224 			local_batch_size = (q->pgo_maxlaundry >> 3);
4225 			local_batch_size = MAX(local_batch_size, 16);
4226 		} else {
4227 			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4228 		}
4229 #else
4230 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4231 #endif
4232 
4233 #if RECORD_THE_COMPRESSED_DATA
4234 		if (q->pgo_laundry) {
4235 			c_compressed_record_init();
4236 		}
4237 #endif
4238 		while (true) { /* this loop is for working though all the pages in the pending queue */
4239 			int     pages_left_on_q = 0;
4240 
4241 			local_cnt = 0;
4242 			local_q = NULL;
4243 
4244 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4245 
4246 			vm_page_lock_queues();
4247 #if DEVELOPMENT || DEBUG
4248 			if (marked_active == FALSE) {
4249 				vmct_active++;
4250 				vmct_state[cq->id] = VMCT_ACTIVE;
4251 				marked_active = TRUE;
4252 				if (vmct_active == 1) {
4253 					vm_compressor_epoch_start = mach_absolute_time();
4254 				}
4255 			}
4256 #endif
4257 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4258 
4259 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4260 
4261 			/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4262 			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4263 				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4264 				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4265 				VM_PAGE_CHECK(m);
4266 
4267 				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4268 				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4269 				m->vmp_laundry = FALSE;
4270 
4271 				m->vmp_snext = local_q;
4272 				local_q = m;
4273 				local_cnt++;
4274 			}
4275 			if (local_q == NULL) {
4276 				break;
4277 			}
4278 
4279 			q->pgo_busy = TRUE;
4280 
4281 			if ((pgo_draining = q->pgo_draining) == FALSE) {
4282 				vm_pageout_throttle_up_batch(q, local_cnt);
4283 				pages_left_on_q = q->pgo_laundry;
4284 			} else {
4285 				pages_left_on_q = q->pgo_laundry - local_cnt;
4286 			}
4287 
4288 			vm_page_unlock_queues();
4289 
4290 #if !RECORD_THE_COMPRESSED_DATA
4291 			/* if we have lots to compress, wake up the other thread to help.
4292 			 * disabled when recording data since record data is not protected with a mutex so this may cause races */
4293 			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4294 				// wake up the next compressor thread
4295 				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4296 				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4297 			}
4298 #endif
4299 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4300 
4301 			while (local_q) {
4302 				KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4303 
4304 				m = local_q;
4305 				local_q = m->vmp_snext;
4306 				m->vmp_snext = NULL;
4307 
4308 
4309 				chead = vm_pageout_select_filling_chead(cq, m);
4310 
4311 				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4312 #if DEVELOPMENT || DEBUG
4313 					ncomps++;
4314 #endif
4315 					KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4316 
4317 					m->vmp_snext = local_freeq;
4318 					local_freeq = m;
4319 					local_freed++;
4320 
4321 					/* if we gathered enough free pages, free them now */
4322 					if (local_freed >= MAX_FREE_BATCH) {
4323 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4324 
4325 						vm_page_free_list(local_freeq, TRUE);
4326 
4327 						local_freeq = NULL;
4328 						local_freed = 0;
4329 					}
4330 				}
4331 #if DEVELOPMENT || DEBUG
4332 				num_pages_processed++;
4333 #endif /* DEVELOPMENT || DEBUG */
4334 #if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
4335 				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4336 					kern_return_t   wait_result;
4337 					int             need_wakeup = 0;
4338 
4339 					if (local_freeq) {
4340 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4341 
4342 						vm_page_free_list(local_freeq, TRUE);
4343 						local_freeq = NULL;
4344 						local_freed = 0;
4345 
4346 						continue;
4347 					}
4348 					vm_free_page_lock_spin();
4349 
4350 					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4351 						if (vm_page_free_wanted_privileged++ == 0) {
4352 							need_wakeup = 1;
4353 						}
4354 						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4355 
4356 						vm_free_page_unlock();
4357 
4358 						if (need_wakeup) {
4359 							thread_wakeup((event_t)&vm_page_free_wanted);
4360 						}
4361 
4362 						if (wait_result == THREAD_WAITING) {
4363 							thread_block(THREAD_CONTINUE_NULL);
4364 						}
4365 					} else {
4366 						vm_free_page_unlock();
4367 					}
4368 				}
4369 #endif
4370 			}  /* while (local_q) */
4371 			/* free any leftovers in the freeq */
4372 			if (local_freeq) {
4373 				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4374 
4375 				vm_page_free_list(local_freeq, TRUE);
4376 				local_freeq = NULL;
4377 				local_freed = 0;
4378 			}
4379 			if (pgo_draining == TRUE) {
4380 				vm_page_lockspin_queues();
4381 				vm_pageout_throttle_up_batch(q, local_cnt);
4382 				vm_page_unlock_queues();
4383 			}
4384 		}
4385 		KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4386 
4387 		/*
4388 		 * queue lock is held and our q is empty
4389 		 */
4390 		q->pgo_busy = FALSE;
4391 #if DEVELOPMENT || DEBUG
4392 		if (marked_active == TRUE) {
4393 			vmct_active--;
4394 			vmct_state[cq->id] = VMCT_IDLE;
4395 
4396 			if (vmct_active == 0) {
4397 				vm_compressor_epoch_stop = mach_absolute_time();
4398 				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4399 				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4400 				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4401 				/* This interval includes intervals where one or more
4402 				 * compressor threads were pre-empted
4403 				 */
4404 				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4405 			}
4406 		}
4407 		if (compressor_running_perf_test && benchmark_accounting) {
4408 			/*
4409 			 * We could turn ON compressor_running_perf_test while still processing
4410 			 * regular non-benchmark pages. We shouldn't count them here else we
4411 			 * could overshoot. We might also still be populating that benchmark Q
4412 			 * and be under pressure. So we will go back to the regular queues. And
4413 			 * benchmark accounting will be off for that case too.
4414 			 */
4415 			compressor_perf_test_pages_processed += num_pages_processed;
4416 			thread_wakeup(&compressor_perf_test_pages_processed);
4417 		}
4418 #endif
4419 		vm_page_unlock_queues();
4420 #if DEVELOPMENT || DEBUG
4421 		vm_pageout_record_thread_time(cq->id, ncomps);
4422 #endif
4423 
4424 		KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4425 #if DEVELOPMENT || DEBUG
4426 		if (compressor_running_perf_test && benchmark_accounting) {
4427 			/*
4428 			 * We've been exclusively compressing pages from the benchmark queue,
4429 			 * do 1 pass over the internal queue before blocking.
4430 			 */
4431 			continue;
4432 		}
4433 #endif
4434 
4435 		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4436 	}
4437 	/*NOTREACHED*/
4438 }
4439 
4440 /* resolves the pager and maintain stats in the pager and in the vm_object */
4441 kern_return_t
4442 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4443 {
4444 	vm_object_t     object;
4445 	memory_object_t pager;
4446 	int             compressed_count_delta;
4447 	kern_return_t   retval;
4448 
4449 	object = VM_PAGE_OBJECT(m);
4450 
4451 	assert(!m->vmp_free_when_done);
4452 	assert(!m->vmp_laundry);
4453 
4454 	pager = object->pager;
4455 
4456 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4457 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4458 
4459 		vm_object_lock(object);
4460 
4461 		/*
4462 		 * If there is no memory object for the page, create
4463 		 * one and hand it to the compression pager.
4464 		 */
4465 
4466 		if (!object->pager_initialized) {
4467 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4468 		}
4469 		if (!object->pager_initialized) {
4470 			vm_object_compressor_pager_create(object);
4471 		}
4472 
4473 		pager = object->pager;
4474 
4475 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4476 			/*
4477 			 * Still no pager for the object,
4478 			 * or the pager has been destroyed.
4479 			 * Reactivate the page.
4480 			 *
4481 			 * Should only happen if there is no
4482 			 * compression pager
4483 			 */
4484 			vm_page_wakeup_done(object, m);
4485 
4486 			vm_page_lockspin_queues();
4487 			vm_page_activate(m);
4488 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4489 			vm_page_unlock_queues();
4490 
4491 			/*
4492 			 *	And we are done with it.
4493 			 */
4494 			vm_object_activity_end(object);
4495 			vm_object_unlock(object);
4496 
4497 			return KERN_FAILURE;
4498 		}
4499 		vm_object_unlock(object);
4500 
4501 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4502 	}
4503 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4504 	assert(object->activity_in_progress > 0);
4505 
4506 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4507 	if (m->vmp_unmodified_ro == true) {
4508 		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4509 	}
4510 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4511 
4512 	vm_compressor_options_t flags = 0;
4513 
4514 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4515 	if (m->vmp_unmodified_ro) {
4516 		flags |= C_PAGE_UNMODIFIED;
4517 	}
4518 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4519 
4520 
4521 	retval = vm_compressor_pager_put(
4522 		pager,
4523 		m->vmp_offset + object->paging_offset,
4524 		VM_PAGE_GET_PHYS_PAGE(m),
4525 		current_chead,
4526 		scratch_buf,
4527 		&compressed_count_delta,
4528 		flags);
4529 
4530 	vm_object_lock(object);
4531 
4532 	assert(object->activity_in_progress > 0);
4533 	assert(VM_PAGE_OBJECT(m) == object);
4534 	assert( !VM_PAGE_WIRED(m));
4535 
4536 	vm_compressor_pager_count(pager,
4537 	    compressed_count_delta,
4538 	    FALSE,                       /* shared_lock */
4539 	    object);
4540 
4541 	if (retval == KERN_SUCCESS) {
4542 		/*
4543 		 * If the object is purgeable, its owner's
4544 		 * purgeable ledgers will be updated in
4545 		 * vm_page_remove() but the page still
4546 		 * contributes to the owner's memory footprint,
4547 		 * so account for it as such.
4548 		 */
4549 		if (m->vmp_tabled) {
4550 			vm_page_remove(m, TRUE);
4551 		}
4552 		if ((object->purgable != VM_PURGABLE_DENY ||
4553 		    object->vo_ledger_tag) &&
4554 		    object->vo_owner != NULL) {
4555 			/* one more compressed purgeable/tagged page */
4556 			vm_object_owner_compressed_update(object,
4557 			    compressed_count_delta);
4558 		}
4559 		counter_inc(&vm_statistics_compressions);
4560 	} else {
4561 		vm_page_wakeup_done(object, m);
4562 
4563 		vm_page_lockspin_queues();
4564 
4565 		vm_page_activate(m);
4566 		vm_pageout_vminfo.vm_compressor_failed++;
4567 
4568 		vm_page_unlock_queues();
4569 	}
4570 	vm_object_activity_end(object);
4571 	vm_object_unlock(object);
4572 
4573 	return retval;
4574 }
4575 
4576 
4577 static void
4578 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4579 {
4580 	uint32_t        policy;
4581 
4582 	if (hibernate_cleaning_in_progress == TRUE) {
4583 		req_lowpriority = FALSE;
4584 	}
4585 
4586 	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4587 		vm_page_unlock_queues();
4588 
4589 		if (req_lowpriority == TRUE) {
4590 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4591 			DTRACE_VM(laundrythrottle);
4592 		} else {
4593 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4594 			DTRACE_VM(laundryunthrottle);
4595 		}
4596 		proc_set_thread_policy(ethr->pgo_iothread,
4597 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4598 
4599 		vm_page_lock_queues();
4600 		ethr->q->pgo_lowpriority = req_lowpriority;
4601 	}
4602 }
4603 
4604 OS_NORETURN
4605 static void
4606 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4607 {
4608 	thread_t        self = current_thread();
4609 
4610 	self->options |= TH_OPT_VMPRIV;
4611 
4612 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4613 
4614 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4615 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4616 
4617 	vm_page_lock_queues();
4618 
4619 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4620 	vm_pageout_queue_external.pgo_inited = TRUE;
4621 
4622 	vm_page_unlock_queues();
4623 
4624 #if CONFIG_THREAD_GROUPS
4625 	thread_group_vm_add();
4626 #endif /* CONFIG_THREAD_GROUPS */
4627 
4628 	vm_pageout_iothread_external_continue(ethr, 0);
4629 	/*NOTREACHED*/
4630 }
4631 
4632 
4633 OS_NORETURN
4634 static void
4635 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4636 {
4637 	thread_t        self = current_thread();
4638 
4639 	self->options |= TH_OPT_VMPRIV;
4640 
4641 	vm_page_lock_queues();
4642 
4643 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4644 	vm_pageout_queue_internal.pgo_inited = TRUE;
4645 
4646 #if DEVELOPMENT || DEBUG
4647 	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4648 	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4649 	vm_pageout_queue_benchmark.pgo_busy = FALSE;
4650 #endif /* DEVELOPMENT || DEBUG */
4651 
4652 	vm_page_unlock_queues();
4653 
4654 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4655 		thread_vm_bind_group_add();
4656 	}
4657 
4658 #if CONFIG_THREAD_GROUPS
4659 	thread_group_vm_add();
4660 #endif /* CONFIG_THREAD_GROUPS */
4661 
4662 #if __AMP__
4663 	if (vm_compressor_ebound) {
4664 		/*
4665 		 * Use the soft bound option for vm_compressor to allow it to run on
4666 		 * P-cores if E-cluster is unavailable.
4667 		 */
4668 		(void) thread_soft_bind_cluster_type(self, 'E');
4669 	}
4670 #endif /* __AMP__ */
4671 
4672 	thread_set_thread_name(current_thread(), "VM_compressor");
4673 #if DEVELOPMENT || DEBUG
4674 	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4675 #endif
4676 	vm_pageout_iothread_internal_continue(cthr, 0);
4677 
4678 	/*NOTREACHED*/
4679 }
4680 
4681 kern_return_t
4682 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4683 {
4684 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4685 		return KERN_SUCCESS;
4686 	} else {
4687 		return KERN_FAILURE; /* Already set */
4688 	}
4689 }
4690 
4691 extern boolean_t        memorystatus_manual_testing_on;
4692 extern unsigned int     memorystatus_level;
4693 
4694 
4695 #if VM_PRESSURE_EVENTS
4696 
4697 boolean_t vm_pressure_events_enabled = FALSE;
4698 
4699 extern uint64_t next_warning_notification_sent_at_ts;
4700 extern uint64_t next_critical_notification_sent_at_ts;
4701 
4702 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4703 
4704 /*
4705  * The last time there was change in pressure level OR we forced a check
4706  * because the system is stuck in a non-normal pressure level.
4707  */
4708 uint64_t  vm_pressure_last_level_transition_abs = 0;
4709 
4710 /*
4711  *  This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4712  * level before resending out notifications for that level again.
4713  */
4714 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4715 
4716 void
4717 vm_pressure_response()
4718 {
4719 	vm_pressure_level_t     old_level = kVMPressureNormal;
4720 	int                     new_level = -1;
4721 	unsigned int            total_pages;
4722 	uint64_t                available_memory = 0;
4723 	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
4724 	bool                    force_check = false;
4725 	int                     time_in_mins;
4726 
4727 
4728 	if (vm_pressure_events_enabled == FALSE) {
4729 		return;
4730 	}
4731 
4732 	available_memory = (uint64_t) memorystatus_get_available_page_count();
4733 
4734 	total_pages = (unsigned int) atop_64(max_mem);
4735 #if CONFIG_SECLUDED_MEMORY
4736 	total_pages -= vm_page_secluded_count;
4737 #endif /* CONFIG_SECLUDED_MEMORY */
4738 	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4739 
4740 	if (memorystatus_manual_testing_on) {
4741 		return;
4742 	}
4743 
4744 	curr_ts = mach_absolute_time();
4745 	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4746 
4747 	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4748 	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4749 	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4750 
4751 	old_level = memorystatus_vm_pressure_level;
4752 
4753 	switch (memorystatus_vm_pressure_level) {
4754 	case kVMPressureNormal:
4755 	{
4756 		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4757 			new_level = kVMPressureCritical;
4758 		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4759 			new_level = kVMPressureWarning;
4760 		}
4761 		break;
4762 	}
4763 
4764 	case kVMPressureWarning:
4765 	case kVMPressureUrgent:
4766 	{
4767 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4768 			new_level = kVMPressureNormal;
4769 		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4770 			new_level = kVMPressureCritical;
4771 		} else if (force_check) {
4772 			new_level = kVMPressureWarning;
4773 			next_warning_notification_sent_at_ts = curr_ts;
4774 		}
4775 		break;
4776 	}
4777 
4778 	case kVMPressureCritical:
4779 	{
4780 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4781 			new_level = kVMPressureNormal;
4782 		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4783 			new_level = kVMPressureWarning;
4784 		} else if (force_check) {
4785 			new_level = kVMPressureCritical;
4786 			next_critical_notification_sent_at_ts = curr_ts;
4787 		}
4788 		break;
4789 	}
4790 
4791 	default:
4792 		return;
4793 	}
4794 
4795 	if (new_level != -1 || force_check) {
4796 		if (new_level != -1) {
4797 			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4798 
4799 			if (new_level != (int) old_level) {
4800 				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4801 				    new_level, old_level, 0, 0);
4802 			}
4803 		} else {
4804 			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4805 			    new_level, old_level, force_check, 0);
4806 		}
4807 
4808 		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4809 			/*
4810 			 * We don't want to schedule a wakeup while hibernation is in progress
4811 			 * because that could collide with checks for non-monotonicity in the scheduler.
4812 			 * We do however do all the updates to memorystatus_vm_pressure_level because
4813 			 * we _might_ want to use that for decisions regarding which pages or how
4814 			 * many pages we want to dump in hibernation.
4815 			 */
4816 			return;
4817 		}
4818 
4819 		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4820 			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4821 				thread_wakeup(&vm_pressure_thread);
4822 			}
4823 
4824 			if (old_level != memorystatus_vm_pressure_level) {
4825 				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4826 			}
4827 			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4828 		}
4829 	}
4830 }
4831 #endif /* VM_PRESSURE_EVENTS */
4832 
4833 
4834 /**
4835  * Called by a kernel thread to ask if a number of pages may be wired.
4836  */
4837 kern_return_t
4838 mach_vm_wire_level_monitor(int64_t requested_pages)
4839 {
4840 	if (requested_pages <= 0) {
4841 		return KERN_INVALID_ARGUMENT;
4842 	}
4843 
4844 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4845 	/**
4846 	 * Available pages can be negative in the case where more system memory is
4847 	 * wired than the threshold, so we must use a signed integer.
4848 	 */
4849 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4850 
4851 	if (requested_pages > available_pages) {
4852 		return KERN_RESOURCE_SHORTAGE;
4853 	}
4854 	return KERN_SUCCESS;
4855 }
4856 
4857 /*
4858  * Function called by a kernel thread to either get the current pressure level or
4859  * wait until memory pressure changes from a given level.
4860  */
4861 kern_return_t
4862 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4863 {
4864 #if !VM_PRESSURE_EVENTS
4865 	(void)wait_for_pressure;
4866 	(void)pressure_level;
4867 	return KERN_NOT_SUPPORTED;
4868 #else /* VM_PRESSURE_EVENTS */
4869 
4870 	uint32_t *waiters = NULL;
4871 	wait_result_t wr = 0;
4872 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4873 
4874 	if (pressure_level == NULL) {
4875 		return KERN_INVALID_ARGUMENT;
4876 	}
4877 	if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4878 	    *pressure_level == kVMPressureForegroundJetsam)) {
4879 		return KERN_INVALID_ARGUMENT;
4880 	}
4881 
4882 	if (wait_for_pressure) {
4883 		switch (*pressure_level) {
4884 		case kVMPressureForegroundJetsam:
4885 		case kVMPressureBackgroundJetsam:
4886 
4887 			if (*pressure_level == kVMPressureForegroundJetsam) {
4888 				waiters = &memorystatus_jetsam_fg_band_waiters;
4889 			} else {
4890 				/* kVMPressureBackgroundJetsam */
4891 				waiters = &memorystatus_jetsam_bg_band_waiters;
4892 			}
4893 
4894 			lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4895 			wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4896 			if (wr == THREAD_WAITING) {
4897 				*waiters += 1;
4898 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4899 				wr = thread_block(THREAD_CONTINUE_NULL);
4900 			} else {
4901 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4902 			}
4903 
4904 			if (wr != THREAD_AWAKENED) {
4905 				return KERN_ABORTED;
4906 			}
4907 
4908 			return KERN_SUCCESS;
4909 		case kVMPressureNormal:
4910 		case kVMPressureWarning:
4911 		case kVMPressureUrgent:
4912 		case kVMPressureCritical:
4913 			while (old_level == *pressure_level) {
4914 				wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4915 				    THREAD_INTERRUPTIBLE);
4916 				if (wr == THREAD_WAITING) {
4917 					wr = thread_block(THREAD_CONTINUE_NULL);
4918 				}
4919 				if (wr == THREAD_INTERRUPTED) {
4920 					return KERN_ABORTED;
4921 				}
4922 
4923 				if (wr == THREAD_AWAKENED) {
4924 					old_level = memorystatus_vm_pressure_level;
4925 				}
4926 			}
4927 			break;
4928 		default:
4929 			return KERN_INVALID_ARGUMENT;
4930 		}
4931 	}
4932 
4933 	*pressure_level = old_level;
4934 	return KERN_SUCCESS;
4935 #endif /* VM_PRESSURE_EVENTS */
4936 }
4937 
4938 #if VM_PRESSURE_EVENTS
4939 void
4940 vm_pressure_thread(void)
4941 {
4942 	static boolean_t thread_initialized = FALSE;
4943 
4944 	if (thread_initialized == TRUE) {
4945 		vm_pageout_state.vm_pressure_thread_running = TRUE;
4946 		consider_vm_pressure_events();
4947 		vm_pageout_state.vm_pressure_thread_running = FALSE;
4948 	}
4949 
4950 #if CONFIG_THREAD_GROUPS
4951 	thread_group_vm_add();
4952 #endif /* CONFIG_THREAD_GROUPS */
4953 
4954 	thread_set_thread_name(current_thread(), "VM_pressure");
4955 	thread_initialized = TRUE;
4956 	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4957 	thread_block((thread_continue_t)vm_pressure_thread);
4958 }
4959 #endif /* VM_PRESSURE_EVENTS */
4960 
4961 
4962 /*
4963  * called once per-second via "compute_averages"
4964  */
4965 void
4966 compute_pageout_gc_throttle(__unused void *arg)
4967 {
4968 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4969 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4970 		sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4971 	}
4972 }
4973 
4974 /*
4975  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4976  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4977  * jetsams. We need to check if the zone map size is above its jetsam limit to
4978  * decide if this was indeed the case.
4979  *
4980  * We need to do this on a different thread because of the following reasons:
4981  *
4982  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4983  * itself causing the system to hang. We perform synchronous jetsams if we're
4984  * leaking in the VM map entries zone, so the leaking process could be doing a
4985  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4986  * jetsam itself. We also need the vm_map lock on the process termination path,
4987  * which would now lead the dying process to deadlock against itself.
4988  *
4989  * 2. The jetsam path might need to allocate zone memory itself. We could try
4990  * using the non-blocking variant of zalloc for this path, but we can still
4991  * end up trying to do a kmem_alloc when the zone maps are almost full.
4992  */
4993 __dead2
4994 void
4995 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4996 {
4997 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4998 
4999 	if (step != VM_PAGEOUT_GC_INIT) {
5000 		sched_cond_ack(&vm_pageout_gc_cond);
5001 	}
5002 
5003 	while (true) {
5004 		if (step == VM_PAGEOUT_GC_INIT) {
5005 			/* first time being called is not about GC */
5006 #if CONFIG_THREAD_GROUPS
5007 			thread_group_vm_add();
5008 #endif /* CONFIG_THREAD_GROUPS */
5009 			step = VM_PAGEOUT_GC_COLLECT;
5010 		} else if (zone_map_nearing_exhaustion()) {
5011 			/*
5012 			 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
5013 			 *
5014 			 * Bail out after calling zone_gc (which triggers the
5015 			 * zone-map-exhaustion jetsams). If we fall through, the subsequent
5016 			 * operations that clear out a bunch of caches might allocate zone
5017 			 * memory themselves (for eg. vm_map operations would need VM map
5018 			 * entries). Since the zone map is almost full at this point, we
5019 			 * could end up with a panic. We just need to quickly jetsam a
5020 			 * process and exit here.
5021 			 *
5022 			 * It could so happen that we were woken up to relieve memory
5023 			 * pressure and the zone map also happened to be near its limit at
5024 			 * the time, in which case we'll skip out early. But that should be
5025 			 * ok; if memory pressure persists, the thread will simply be woken
5026 			 * up again.
5027 			 */
5028 
5029 			zone_gc(ZONE_GC_JETSAM);
5030 		} else {
5031 			/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
5032 			boolean_t buf_large_zfree = FALSE;
5033 			boolean_t first_try = TRUE;
5034 
5035 			stack_collect();
5036 
5037 			consider_machine_collect();
5038 #if CONFIG_DEFERRED_RECLAIM
5039 			mach_vm_size_t bytes_reclaimed;
5040 			vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
5041 #endif /* CONFIG_DEFERRED_RECLAIM */
5042 #if CONFIG_MBUF_MCACHE
5043 			mbuf_drain(FALSE);
5044 #endif /* CONFIG_MBUF_MCACHE */
5045 
5046 			do {
5047 				if (consider_buffer_cache_collect != NULL) {
5048 					buf_large_zfree = (*consider_buffer_cache_collect)(0);
5049 				}
5050 				if (first_try == TRUE || buf_large_zfree == TRUE) {
5051 					/*
5052 					 * zone_gc should be last, because the other operations
5053 					 * might return memory to zones.
5054 					 */
5055 					zone_gc(ZONE_GC_TRIM);
5056 				}
5057 				first_try = FALSE;
5058 			} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5059 
5060 			consider_machine_adjust();
5061 		}
5062 
5063 		sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5064 	}
5065 	__builtin_unreachable();
5066 }
5067 
5068 
5069 #if VM_PAGE_BUCKETS_CHECK
5070 #if VM_PAGE_FAKE_BUCKETS
5071 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5072 #endif /* VM_PAGE_FAKE_BUCKETS */
5073 #endif /* VM_PAGE_BUCKETS_CHECK */
5074 
5075 
5076 
5077 void
5078 vm_set_restrictions(unsigned int num_cpus)
5079 {
5080 	int vm_restricted_to_single_processor = 0;
5081 
5082 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5083 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5084 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5085 	} else {
5086 		assert(num_cpus > 0);
5087 
5088 		if (num_cpus <= 3) {
5089 			/*
5090 			 * on systems with a limited number of CPUS, bind the
5091 			 * 4 major threads that can free memory and that tend to use
5092 			 * a fair bit of CPU under pressured conditions to a single processor.
5093 			 * This insures that these threads don't hog all of the available CPUs
5094 			 * (important for camera launch), while allowing them to run independently
5095 			 * w/r to locks... the 4 threads are
5096 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
5097 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
5098 			 * memorystatus_thread (jetsams).
5099 			 *
5100 			 * the first time the thread is run, it is responsible for checking the
5101 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
5102 			 * thread_bind_master...  someday this should be replaced with a group
5103 			 * scheduling mechanism and KPI.
5104 			 */
5105 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5106 		} else {
5107 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5108 		}
5109 	}
5110 }
5111 
5112 /*
5113  * Set up vm_config based on the vm_compressor_mode.
5114  * Must run BEFORE the pageout thread starts up.
5115  */
5116 __startup_func
5117 void
5118 vm_config_init(void)
5119 {
5120 	bzero(&vm_config, sizeof(vm_config));
5121 
5122 	switch (vm_compressor_mode) {
5123 	case VM_PAGER_DEFAULT:
5124 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5125 		OS_FALLTHROUGH;
5126 
5127 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5128 		vm_config.compressor_is_present = TRUE;
5129 		vm_config.swap_is_present = TRUE;
5130 		vm_config.compressor_is_active = TRUE;
5131 		vm_config.swap_is_active = TRUE;
5132 		break;
5133 
5134 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5135 		vm_config.compressor_is_present = TRUE;
5136 		vm_config.swap_is_present = TRUE;
5137 		vm_config.compressor_is_active = TRUE;
5138 		break;
5139 
5140 	case VM_PAGER_FREEZER_DEFAULT:
5141 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5142 		OS_FALLTHROUGH;
5143 
5144 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5145 		vm_config.compressor_is_present = TRUE;
5146 		vm_config.swap_is_present = TRUE;
5147 		break;
5148 
5149 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5150 		vm_config.compressor_is_present = TRUE;
5151 		vm_config.swap_is_present = TRUE;
5152 		vm_config.compressor_is_active = TRUE;
5153 		vm_config.freezer_swap_is_active = TRUE;
5154 		break;
5155 
5156 	case VM_PAGER_NOT_CONFIGURED:
5157 		break;
5158 
5159 	default:
5160 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5161 		break;
5162 	}
5163 }
5164 
5165 __startup_func
5166 static void
5167 vm_pageout_create_gc_thread(void)
5168 {
5169 	thread_t thread;
5170 
5171 	sched_cond_init(&vm_pageout_gc_cond);
5172 	if (kernel_thread_create(vm_pageout_garbage_collect,
5173 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5174 		panic("vm_pageout_garbage_collect: create failed");
5175 	}
5176 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5177 	if (thread->reserved_stack == 0) {
5178 		assert(thread->kernel_stack);
5179 		thread->reserved_stack = thread->kernel_stack;
5180 	}
5181 
5182 	/* thread is started in vm_pageout() */
5183 	vm_pageout_gc_thread = thread;
5184 }
5185 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5186 
5187 void
5188 vm_pageout(void)
5189 {
5190 	thread_t        self = current_thread();
5191 	thread_t        thread;
5192 	kern_return_t   result;
5193 	spl_t           s;
5194 
5195 	/*
5196 	 * Set thread privileges.
5197 	 */
5198 	s = splsched();
5199 
5200 #if CONFIG_VPS_DYNAMIC_PRIO
5201 	if (vps_dynamic_priority_enabled) {
5202 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5203 		thread_set_eager_preempt(self);
5204 	} else {
5205 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5206 	}
5207 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5208 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5209 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5210 
5211 	thread_lock(self);
5212 	self->options |= TH_OPT_VMPRIV;
5213 	thread_unlock(self);
5214 
5215 	if (!self->reserved_stack) {
5216 		self->reserved_stack = self->kernel_stack;
5217 	}
5218 
5219 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5220 	    !vps_dynamic_priority_enabled) {
5221 		thread_vm_bind_group_add();
5222 	}
5223 
5224 
5225 #if CONFIG_THREAD_GROUPS
5226 	thread_group_vm_add();
5227 #endif /* CONFIG_THREAD_GROUPS */
5228 
5229 #if __AMP__
5230 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5231 	if (vm_pgo_pbound) {
5232 		/*
5233 		 * Use the soft bound option for vm pageout to allow it to run on
5234 		 * E-cores if P-cluster is unavailable.
5235 		 */
5236 		(void) thread_soft_bind_cluster_type(self, 'P');
5237 	}
5238 #endif /* __AMP__ */
5239 
5240 	PE_parse_boot_argn("vmpgo_protect_realtime",
5241 	    &vm_pageout_protect_realtime,
5242 	    sizeof(vm_pageout_protect_realtime));
5243 	splx(s);
5244 
5245 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5246 
5247 	vm_log_handle = os_log_create("com.apple.xnu", "virtual-memory");
5248 
5249 	/*
5250 	 *	Initialize some paging parameters.
5251 	 */
5252 
5253 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5254 	vm_pageout_state.vm_pressure_changed = FALSE;
5255 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5256 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5257 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5258 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5259 	vm_pageout_state.vm_page_speculative_percentage = 5;
5260 	vm_pageout_state.vm_page_speculative_target = 0;
5261 
5262 	vm_pageout_state.vm_pageout_swap_wait = 0;
5263 	vm_pageout_state.vm_pageout_idle_wait = 0;
5264 	vm_pageout_state.vm_pageout_empty_wait = 0;
5265 	vm_pageout_state.vm_pageout_burst_wait = 0;
5266 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5267 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5268 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5269 
5270 	vm_pageout_state.vm_pageout_inactive = 0;
5271 	vm_pageout_state.vm_pageout_inactive_used = 0;
5272 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5273 
5274 	vm_pageout_state.vm_memory_pressure = 0;
5275 	vm_pageout_state.vm_page_filecache_min = 0;
5276 #if CONFIG_JETSAM
5277 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5278 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5279 #else
5280 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5281 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5282 #endif
5283 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5284 
5285 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5286 
5287 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5288 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5289 	}
5290 
5291 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5292 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5293 	}
5294 
5295 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5296 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5297 	}
5298 
5299 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5300 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5301 	}
5302 
5303 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5304 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5305 	}
5306 
5307 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5308 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5309 	}
5310 
5311 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5312 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5313 	}
5314 	/*
5315 	 * even if we've already called vm_page_free_reserve
5316 	 * call it again here to insure that the targets are
5317 	 * accurately calculated (it uses vm_page_free_count_init)
5318 	 * calling it with an arg of 0 will not change the reserve
5319 	 * but will re-calculate free_min and free_target
5320 	 */
5321 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5322 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5323 	} else {
5324 		vm_page_free_reserve(0);
5325 	}
5326 
5327 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5328 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5329 
5330 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5331 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5332 
5333 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5334 
5335 #if DEVELOPMENT || DEBUG
5336 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5337 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5338 #endif /* DEVELOPMENT || DEBUG */
5339 
5340 
5341 	/* internal pageout thread started when default pager registered first time */
5342 	/* external pageout and garbage collection threads started here */
5343 	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5344 	ethr->id = 0;
5345 	ethr->q = &vm_pageout_queue_external;
5346 	/* in external_state these cheads are never used, they are used only in internal_state for te compressor */
5347 	ethr->current_early_swapout_chead = NULL;
5348 	for (int reg_i = 0; reg_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT; ++reg_i) {
5349 		ethr->current_regular_swapout_cheads[reg_i] = NULL;
5350 	}
5351 	ethr->current_late_swapout_chead = NULL;
5352 	ethr->scratch_buf = NULL;
5353 #if DEVELOPMENT || DEBUG
5354 	ethr->benchmark_q = NULL;
5355 #endif /* DEVELOPMENT || DEBUG */
5356 	sched_cond_init(&(ethr->pgo_wakeup));
5357 
5358 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5359 	    (void *)ethr, BASEPRI_VM,
5360 	    &(ethr->pgo_iothread));
5361 	if (result != KERN_SUCCESS) {
5362 		panic("vm_pageout: Unable to create external thread (%d)\n", result);
5363 	}
5364 	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5365 
5366 	thread_mtx_lock(vm_pageout_gc_thread );
5367 	thread_start(vm_pageout_gc_thread );
5368 	thread_mtx_unlock(vm_pageout_gc_thread);
5369 
5370 #if VM_PRESSURE_EVENTS
5371 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5372 	    BASEPRI_DEFAULT,
5373 	    &thread);
5374 
5375 	if (result != KERN_SUCCESS) {
5376 		panic("vm_pressure_thread: create failed");
5377 	}
5378 
5379 	thread_deallocate(thread);
5380 #endif
5381 
5382 	vm_object_reaper_init();
5383 
5384 
5385 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5386 		vm_compressor_init();
5387 	}
5388 
5389 #if VM_PRESSURE_EVENTS
5390 	vm_pressure_events_enabled = TRUE;
5391 #endif /* VM_PRESSURE_EVENTS */
5392 
5393 #if CONFIG_PHANTOM_CACHE
5394 	vm_phantom_cache_init();
5395 #endif
5396 #if VM_PAGE_BUCKETS_CHECK
5397 #if VM_PAGE_FAKE_BUCKETS
5398 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5399 	    (uint64_t) vm_page_fake_buckets_start,
5400 	    (uint64_t) vm_page_fake_buckets_end);
5401 	pmap_protect(kernel_pmap,
5402 	    vm_page_fake_buckets_start,
5403 	    vm_page_fake_buckets_end,
5404 	    VM_PROT_READ);
5405 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5406 #endif /* VM_PAGE_FAKE_BUCKETS */
5407 #endif /* VM_PAGE_BUCKETS_CHECK */
5408 
5409 #if VM_OBJECT_TRACKING
5410 	vm_object_tracking_init();
5411 #endif /* VM_OBJECT_TRACKING */
5412 
5413 #if __arm64__
5414 //	vm_tests();
5415 #endif /* __arm64__ */
5416 
5417 	vm_pageout_continue();
5418 
5419 	/*
5420 	 * Unreached code!
5421 	 *
5422 	 * The vm_pageout_continue() call above never returns, so the code below is never
5423 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5424 	 * points that our kernel doesn't have an analog for.  These are probe points that
5425 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5426 	 * scripts that use them.  Declaring the probe points here means their scripts will
5427 	 * compile and execute which we want for portability of the scripts, but since this
5428 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5429 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5430 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5431 	 */
5432 
5433 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5434 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5435 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5436 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5437 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5438 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5439 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5440 	/*NOTREACHED*/
5441 }
5442 
5443 
5444 
5445 kern_return_t
5446 vm_pageout_internal_start(void)
5447 {
5448 	kern_return_t   result = KERN_SUCCESS;
5449 	host_basic_info_data_t hinfo;
5450 	vm_offset_t     buf, bufsize;
5451 
5452 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5453 
5454 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5455 #define BSD_HOST 1
5456 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5457 
5458 	assert(hinfo.max_cpus > 0);
5459 
5460 #if !XNU_TARGET_OS_OSX
5461 	vm_pageout_state.vm_compressor_thread_count = 1;
5462 #else /* !XNU_TARGET_OS_OSX */
5463 	if (hinfo.max_cpus > 4) {
5464 		vm_pageout_state.vm_compressor_thread_count = 2;
5465 	} else {
5466 		vm_pageout_state.vm_compressor_thread_count = 1;
5467 	}
5468 #endif /* !XNU_TARGET_OS_OSX */
5469 #if     __AMP__
5470 	if (vm_compressor_ebound) {
5471 		vm_pageout_state.vm_compressor_thread_count = 2;
5472 	}
5473 #endif
5474 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5475 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5476 
5477 	/* did we get from the bootargs an unreasonable number? */
5478 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5479 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5480 	}
5481 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5482 		vm_pageout_state.vm_compressor_thread_count = 1;
5483 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5484 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5485 	}
5486 
5487 	vm_pageout_queue_internal.pgo_maxlaundry =
5488 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5489 
5490 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5491 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5492 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5493 
5494 #if DEVELOPMENT || DEBUG
5495 	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5496 	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5497 #endif /* DEVELOPMENT || DEBUG */
5498 
5499 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5500 
5501 	kmem_alloc(kernel_map, &buf,
5502 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5503 	    KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5504 	    VM_KERN_MEMORY_COMPRESSOR);
5505 
5506 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5507 		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5508 		iq->id = i;
5509 		iq->q = &vm_pageout_queue_internal;
5510 		iq->current_early_swapout_chead = NULL;
5511 		for (int reg_i = 0; reg_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT; ++reg_i) {
5512 			iq->current_regular_swapout_cheads[reg_i] = NULL;
5513 		}
5514 		iq->current_late_swapout_chead = NULL;
5515 		iq->scratch_buf = (char *)(buf + i * bufsize);
5516 #if DEVELOPMENT || DEBUG
5517 		iq->benchmark_q = &vm_pageout_queue_benchmark;
5518 #endif /* DEVELOPMENT || DEBUG */
5519 		sched_cond_init(&(iq->pgo_wakeup));
5520 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5521 		    (void *)iq, BASEPRI_VM,
5522 		    &(iq->pgo_iothread));
5523 
5524 		if (result != KERN_SUCCESS) {
5525 			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5526 		}
5527 	}
5528 	return result;
5529 }
5530 
5531 #if CONFIG_IOSCHED
5532 /*
5533  * To support I/O Expedite for compressed files we mark the upls with special flags.
5534  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5535  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5536  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5537  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5538  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5539  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5540  * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5541  * unless the real I/O upl is being destroyed).
5542  */
5543 
5544 
5545 static void
5546 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5547 {
5548 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5549 
5550 	upl_lock(src_upl);
5551 	if (src_upl->decmp_io_upl) {
5552 		/*
5553 		 * If there is already an alive real I/O UPL, ignore this new UPL.
5554 		 * This case should rarely happen and even if it does, it just means
5555 		 * that we might issue a spurious expedite which the driver is expected
5556 		 * to handle.
5557 		 */
5558 		upl_unlock(src_upl);
5559 		return;
5560 	}
5561 	src_upl->decmp_io_upl = (void *)upl;
5562 	src_upl->ref_count++;
5563 
5564 	upl->flags |= UPL_DECMP_REAL_IO;
5565 	upl->decmp_io_upl = (void *)src_upl;
5566 	upl_unlock(src_upl);
5567 }
5568 #endif /* CONFIG_IOSCHED */
5569 
5570 #if UPL_DEBUG
5571 int     upl_debug_enabled = 1;
5572 #else
5573 int     upl_debug_enabled = 0;
5574 #endif
5575 
5576 static upl_t
5577 upl_create(int type, int flags, upl_size_t size)
5578 {
5579 	uint32_t pages = (uint32_t)atop(round_page_32(size));
5580 	upl_t    upl;
5581 
5582 	assert(page_aligned(size));
5583 
5584 	/*
5585 	 * FIXME: this code assumes the allocation always succeeds,
5586 	 *        however `pages` can be up to MAX_UPL_SIZE.
5587 	 *
5588 	 *        The allocation size is above 32k (resp. 128k)
5589 	 *        on 16k pages (resp. 4k), which kalloc might fail
5590 	 *        to allocate.
5591 	 */
5592 	upl = kalloc_type(struct upl, struct upl_page_info,
5593 	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5594 	if (type & UPL_CREATE_INTERNAL) {
5595 		flags |= UPL_INTERNAL;
5596 	}
5597 
5598 	if (type & UPL_CREATE_LITE) {
5599 		flags |= UPL_LITE;
5600 		if (pages) {
5601 			upl->lite_list = bitmap_alloc(pages);
5602 		}
5603 	}
5604 
5605 	upl->flags = flags;
5606 	upl->ref_count = 1;
5607 	upl_lock_init(upl);
5608 #if CONFIG_IOSCHED
5609 	if (type & UPL_CREATE_IO_TRACKING) {
5610 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5611 	}
5612 
5613 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5614 		/* Only support expedite on internal UPLs */
5615 		thread_t        curthread = current_thread();
5616 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5617 		    Z_WAITOK | Z_ZERO);
5618 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5619 		if (curthread->decmp_upl != NULL) {
5620 			upl_set_decmp_info(upl, curthread->decmp_upl);
5621 		}
5622 	}
5623 #endif
5624 #if CONFIG_IOSCHED || UPL_DEBUG
5625 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5626 		upl->upl_creator = current_thread();
5627 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5628 	}
5629 #endif
5630 
5631 #if UPL_DEBUG
5632 	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
5633 #endif /* UPL_DEBUG */
5634 
5635 	return upl;
5636 }
5637 
5638 static void
5639 upl_destroy(upl_t upl)
5640 {
5641 	uint32_t pages;
5642 
5643 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5644 
5645 	if (upl->ext_ref_count) {
5646 		panic("upl(%p) ext_ref_count", upl);
5647 	}
5648 
5649 #if CONFIG_IOSCHED
5650 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5651 		upl_t src_upl;
5652 		src_upl = upl->decmp_io_upl;
5653 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5654 		upl_lock(src_upl);
5655 		src_upl->decmp_io_upl = NULL;
5656 		upl_unlock(src_upl);
5657 		upl_deallocate(src_upl);
5658 	}
5659 #endif /* CONFIG_IOSCHED */
5660 
5661 #if CONFIG_IOSCHED || UPL_DEBUG
5662 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5663 	    !(upl->flags & UPL_VECTOR)) {
5664 		vm_object_t     object;
5665 
5666 		if (upl->flags & UPL_SHADOWED) {
5667 			object = upl->map_object->shadow;
5668 		} else {
5669 			object = upl->map_object;
5670 		}
5671 
5672 		vm_object_lock(object);
5673 		queue_remove(&object->uplq, upl, upl_t, uplq);
5674 		vm_object_activity_end(object);
5675 		vm_object_collapse(object, 0, TRUE);
5676 		vm_object_unlock(object);
5677 	}
5678 #endif
5679 	/*
5680 	 * drop a reference on the map_object whether or
5681 	 * not a pageout object is inserted
5682 	 */
5683 	if (upl->flags & UPL_SHADOWED) {
5684 		vm_object_deallocate(upl->map_object);
5685 	}
5686 
5687 	if (upl->flags & UPL_DEVICE_MEMORY) {
5688 		pages = 1;
5689 	} else {
5690 		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5691 	}
5692 
5693 	upl_lock_destroy(upl);
5694 
5695 #if CONFIG_IOSCHED
5696 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5697 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5698 	}
5699 #endif
5700 
5701 	if (upl->flags & UPL_HAS_FS_VERIFY_INFO) {
5702 		assert(upl->u_fs_un.verify_info && upl->u_fs_un.verify_info->verify_data_len > 0 &&
5703 		    upl->u_fs_un.verify_info->verify_data_len <= upl_adjusted_size(upl, PAGE_MASK));
5704 
5705 		kfree_data(upl->u_fs_un.verify_info->verify_data_ptr,
5706 		    upl->u_fs_un.verify_info->verify_data_len);
5707 		kfree_type(struct upl_fs_verify_info, upl->u_fs_un.verify_info);
5708 	}
5709 
5710 #if UPL_DEBUG
5711 	for (int i = 0; i < upl->upl_commit_index; i++) {
5712 		btref_put(upl->upl_commit_records[i].c_btref);
5713 	}
5714 	btref_put(upl->upl_create_btref);
5715 #endif /* UPL_DEBUG */
5716 
5717 	if ((upl->flags & UPL_LITE) && pages) {
5718 		bitmap_free(upl->lite_list, pages);
5719 	}
5720 	kfree_type(struct upl, struct upl_page_info,
5721 	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5722 }
5723 
5724 void
5725 upl_deallocate(upl_t upl)
5726 {
5727 	upl_lock(upl);
5728 
5729 	if (--upl->ref_count == 0) {
5730 		if (vector_upl_is_valid(upl)) {
5731 			vector_upl_deallocate(upl);
5732 		}
5733 		upl_unlock(upl);
5734 
5735 		if (upl->upl_iodone) {
5736 			upl_callout_iodone(upl);
5737 		}
5738 
5739 		upl_destroy(upl);
5740 	} else {
5741 		upl_unlock(upl);
5742 	}
5743 }
5744 
5745 #if CONFIG_IOSCHED
5746 void
5747 upl_mark_decmp(upl_t upl)
5748 {
5749 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5750 		upl->flags |= UPL_DECMP_REQ;
5751 		upl->upl_creator->decmp_upl = (void *)upl;
5752 	}
5753 }
5754 
5755 void
5756 upl_unmark_decmp(upl_t upl)
5757 {
5758 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5759 		upl->upl_creator->decmp_upl = NULL;
5760 	}
5761 }
5762 
5763 #endif /* CONFIG_IOSCHED */
5764 
5765 #define VM_PAGE_Q_BACKING_UP(q)         \
5766 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5767 
5768 static boolean_t
5769 must_throttle_writes()
5770 {
5771 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5772 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5773 		/*
5774 		 * The external pageout queue is saturated, and there is an abundance of
5775 		 * filecache on the system that VM_pageout still needs to get to. Likely the
5776 		 * pageout thread is contending at the filesystem or storage layers with a
5777 		 * high volume of other I/Os. Attempt to give the pageout thread a chance to
5778 		 * catch up by applying a blanket throttle to all outgoing I/Os.
5779 		 */
5780 		return TRUE;
5781 	}
5782 
5783 	return FALSE;
5784 }
5785 
5786 int vm_page_delayed_work_ctx_needed = 0;
5787 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5788 
5789 __startup_func
5790 static void
5791 vm_page_delayed_work_init_ctx(void)
5792 {
5793 	uint16_t min_delayed_work_ctx_allocated = 16;
5794 
5795 	/*
5796 	 * try really hard to always keep NCPU elements around in the zone
5797 	 * in order for the UPL code to almost always get an element.
5798 	 */
5799 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5800 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5801 	}
5802 
5803 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5804 }
5805 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5806 
5807 struct vm_page_delayed_work*
5808 vm_page_delayed_work_get_ctx(void)
5809 {
5810 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5811 
5812 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5813 
5814 	if (__probable(dw_ctx)) {
5815 		dw_ctx->delayed_owner = current_thread();
5816 	} else {
5817 		vm_page_delayed_work_ctx_needed++;
5818 	}
5819 	return dw_ctx ? dw_ctx->dwp : NULL;
5820 }
5821 
5822 void
5823 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5824 {
5825 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5826 
5827 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5828 	ldw_ctx->delayed_owner = NULL;
5829 
5830 	zfree(dw_ctx_zone, ldw_ctx);
5831 }
5832 
5833 uint64_t vm_object_upl_throttle_cnt;
5834 
5835 TUNABLE(uint32_t, vm_object_throttle_delay_us,
5836     "vm_object_upl_throttle_delay_us", 1000); /* 1ms */
5837 
5838 /*
5839  * @func vm_object_upl_throttle
5840  *
5841  * @brief
5842  * Throttle the current UPL request to give the external pageout thread
5843  * a chance to catch up to system I/O demand.
5844  *
5845  * @discussion
5846  * We may end up in a situation where the file-cache is large, and we need to
5847  * evict some of it. However, the external pageout thread either can't keep up
5848  * with demand or is contending with other I/Os for the storage device (see
5849  * @c must_throttle_writes()). In these situations, we apply a throttle to
5850  * outgoing writes to give the pageout thread a chance to catch up.
5851  */
5852 OS_NOINLINE OS_NOT_TAIL_CALLED
5853 static void
5854 vm_object_upl_throttle(vm_object_t object, upl_size_t size)
5855 {
5856 	int delay_us = vm_object_throttle_delay_us;
5857 #if XNU_TARGET_OS_OSX
5858 	if (memory_object_is_vnode_pager(object->pager)) {
5859 		boolean_t isSSD = FALSE;
5860 		__assert_only kern_return_t kr;
5861 		kr = vnode_pager_get_isSSD(object->pager, &isSSD);
5862 		assert3u(kr, ==, KERN_SUCCESS);
5863 		if (!isSSD) {
5864 			delay_us = 5000; /* 5 ms */
5865 		}
5866 	}
5867 #endif /* !XNU_TARGET_OS_OSX */
5868 
5869 	KDBG(VMDBG_CODE(DBG_VM_UPL_THROTTLE) | DBG_FUNC_START, VM_OBJECT_ID(object),
5870 	    size, delay_us);
5871 
5872 	if (delay_us == 0) {
5873 		goto done;
5874 	}
5875 
5876 	vm_object_unlock(object);
5877 
5878 	uint32_t size_pages = size >> PAGE_SHIFT;
5879 	os_atomic_inc(&vm_object_upl_throttle_cnt, relaxed);
5880 
5881 	os_atomic_add(&vm_upl_wait_for_pages, size_pages, relaxed);
5882 
5883 	/*
5884 	 * Unconditionally block for a fixed delay interval.
5885 	 *
5886 	 * FIXME: This mechanism should likely be revisited. (rdar://157163748)
5887 	 *
5888 	 * Should there be a back-pressure mechanisms that un-throttles the I/O if the
5889 	 * situation resolves?
5890 	 *
5891 	 * Is 1ms long enough? The original mechanism scaled the delay with the I/O
5892 	 * size, but that overly penalized large I/Os (which are actually preferrable
5893 	 * if device contention is the problem).
5894 	 *
5895 	 * Can we isolate only I/Os which are to the same device that the external
5896 	 * pageout thread is stuck on? e.g. There is no reason to penalize I/Os to an
5897 	 * external drive if the pageout thread is gummed up on the internal drive.
5898 	 */
5899 	delay(delay_us);
5900 
5901 	os_atomic_sub(&vm_upl_wait_for_pages, size_pages, relaxed);
5902 
5903 	vm_object_lock(object);
5904 done:
5905 	KDBG(VMDBG_CODE(DBG_VM_UPL_THROTTLE) | DBG_FUNC_END);
5906 }
5907 
5908 
5909 /*
5910  *	Routine:	vm_object_upl_request
5911  *	Purpose:
5912  *		Cause the population of a portion of a vm_object.
5913  *		Depending on the nature of the request, the pages
5914  *		returned may be contain valid data or be uninitialized.
5915  *		A page list structure, listing the physical pages
5916  *		will be returned upon request.
5917  *		This function is called by the file system or any other
5918  *		supplier of backing store to a pager.
5919  *		IMPORTANT NOTE: The caller must still respect the relationship
5920  *		between the vm_object and its backing memory object.  The
5921  *		caller MUST NOT substitute changes in the backing file
5922  *		without first doing a memory_object_lock_request on the
5923  *		target range unless it is know that the pages are not
5924  *		shared with another entity at the pager level.
5925  *		Copy_in_to:
5926  *			if a page list structure is present
5927  *			return the mapped physical pages, where a
5928  *			page is not present, return a non-initialized
5929  *			one.  If the no_sync bit is turned on, don't
5930  *			call the pager unlock to synchronize with other
5931  *			possible copies of the page. Leave pages busy
5932  *			in the original object, if a page list structure
5933  *			was specified.  When a commit of the page list
5934  *			pages is done, the dirty bit will be set for each one.
5935  *		Copy_out_from:
5936  *			If a page list structure is present, return
5937  *			all mapped pages.  Where a page does not exist
5938  *			map a zero filled one. Leave pages busy in
5939  *			the original object.  If a page list structure
5940  *			is not specified, this call is a no-op.
5941  *
5942  *		Note:  access of default pager objects has a rather interesting
5943  *		twist.  The caller of this routine, presumably the file system
5944  *		page cache handling code, will never actually make a request
5945  *		against a default pager backed object.  Only the default
5946  *		pager will make requests on backing store related vm_objects
5947  *		In this way the default pager can maintain the relationship
5948  *		between backing store files (abstract memory objects) and
5949  *		the vm_objects (cache objects), they support.
5950  *
5951  */
5952 
5953 __private_extern__ kern_return_t
5954 vm_object_upl_request(
5955 	vm_object_t             object,
5956 	vm_object_offset_t      offset,
5957 	upl_size_t              size,
5958 	upl_t                   *upl_ptr,
5959 	upl_page_info_array_t   user_page_list,
5960 	unsigned int            *page_list_count,
5961 	upl_control_flags_t     cntrl_flags,
5962 	vm_tag_t                tag)
5963 {
5964 	vm_page_t               dst_page = VM_PAGE_NULL;
5965 	vm_object_offset_t      dst_offset;
5966 	upl_size_t              xfer_size;
5967 	unsigned int            size_in_pages;
5968 	boolean_t               dirty;
5969 	boolean_t               hw_dirty;
5970 	upl_t                   upl = NULL;
5971 	unsigned int            entry;
5972 	vm_page_t               alias_page = NULL;
5973 	int                     refmod_state = 0;
5974 	vm_object_t             last_copy_object;
5975 	uint32_t                last_copy_version;
5976 	struct  vm_page_delayed_work    dw_array;
5977 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5978 	bool                    dwp_finish_ctx = TRUE;
5979 	int                     dw_count;
5980 	int                     dw_limit;
5981 	int                     io_tracking_flag = 0;
5982 	vm_grab_options_t       grab_options;
5983 	int                     page_grab_count = 0;
5984 	ppnum_t                 phys_page;
5985 	pmap_flush_context      pmap_flush_context_storage;
5986 	boolean_t               pmap_flushes_delayed = FALSE;
5987 	task_t                  task = current_task();
5988 
5989 	dwp_start = dwp = NULL;
5990 
5991 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5992 		/*
5993 		 * For forward compatibility's sake,
5994 		 * reject any unknown flag.
5995 		 */
5996 		return KERN_INVALID_VALUE;
5997 	}
5998 	if ((!object->internal) && (object->paging_offset != 0)) {
5999 		panic("vm_object_upl_request: external object with non-zero paging offset");
6000 	}
6001 	if (object->phys_contiguous) {
6002 		panic("vm_object_upl_request: contiguous object specified");
6003 	}
6004 
6005 	assertf(page_aligned(offset) && page_aligned(size),
6006 	    "offset 0x%llx size 0x%x",
6007 	    offset, size);
6008 
6009 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
6010 
6011 	dw_count = 0;
6012 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
6013 	dwp_start = vm_page_delayed_work_get_ctx();
6014 	if (dwp_start == NULL) {
6015 		dwp_start = &dw_array;
6016 		dw_limit = 1;
6017 		dwp_finish_ctx = FALSE;
6018 	}
6019 
6020 	dwp = dwp_start;
6021 
6022 	if (size > MAX_UPL_SIZE_BYTES) {
6023 		size = MAX_UPL_SIZE_BYTES;
6024 	}
6025 
6026 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
6027 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
6028 	}
6029 
6030 #if CONFIG_IOSCHED || UPL_DEBUG
6031 	if (object->io_tracking || upl_debug_enabled) {
6032 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
6033 	}
6034 #endif
6035 #if CONFIG_IOSCHED
6036 	if (object->io_tracking) {
6037 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
6038 	}
6039 #endif
6040 
6041 	if (cntrl_flags & UPL_SET_INTERNAL) {
6042 		if (cntrl_flags & UPL_SET_LITE) {
6043 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
6044 		} else {
6045 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
6046 		}
6047 		user_page_list = size ? upl->page_list : NULL;
6048 	} else {
6049 		if (cntrl_flags & UPL_SET_LITE) {
6050 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
6051 		} else {
6052 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
6053 		}
6054 	}
6055 	*upl_ptr = upl;
6056 
6057 	if (user_page_list) {
6058 		user_page_list[0].device = FALSE;
6059 	}
6060 
6061 	if (cntrl_flags & UPL_SET_LITE) {
6062 		upl->map_object = object;
6063 	} else {
6064 		upl->map_object = vm_object_allocate(size, object->vmo_provenance);
6065 		vm_object_lock(upl->map_object);
6066 		/*
6067 		 * No neeed to lock the new object: nobody else knows
6068 		 * about it yet, so it's all ours so far.
6069 		 */
6070 		upl->map_object->shadow = object;
6071 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
6072 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
6073 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6074 		upl->map_object->vo_shadow_offset = offset;
6075 		upl->map_object->wimg_bits = object->wimg_bits;
6076 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
6077 		    "object %p shadow_offset 0x%llx",
6078 		    upl->map_object, upl->map_object->vo_shadow_offset);
6079 		vm_object_unlock(upl->map_object);
6080 
6081 		alias_page = vm_page_create_fictitious();
6082 
6083 		upl->flags |= UPL_SHADOWED;
6084 	}
6085 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
6086 		upl->flags |= UPL_PAGEOUT;
6087 	}
6088 
6089 	vm_object_lock(object);
6090 	vm_object_activity_begin(object);
6091 
6092 	grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
6093 #if CONFIG_SECLUDED_MEMORY
6094 	if (object->can_grab_secluded) {
6095 		grab_options |= VM_PAGE_GRAB_SECLUDED;
6096 	}
6097 #endif /* CONFIG_SECLUDED_MEMORY */
6098 
6099 	/*
6100 	 * we can lock in the paging_offset once paging_in_progress is set
6101 	 */
6102 	upl->u_size = size;
6103 	upl->u_offset = offset + object->paging_offset;
6104 
6105 #if CONFIG_IOSCHED || UPL_DEBUG
6106 	if (object->io_tracking || upl_debug_enabled) {
6107 		vm_object_activity_begin(object);
6108 		queue_enter(&object->uplq, upl, upl_t, uplq);
6109 	}
6110 #endif
6111 
6112 	/* remember which copy object we synchronized with */
6113 	last_copy_object = object->vo_copy;
6114 	last_copy_version = object->vo_copy_version;
6115 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
6116 		/*
6117 		 * Honor copy-on-write obligations
6118 		 *
6119 		 * The caller is gathering these pages and
6120 		 * might modify their contents.  We need to
6121 		 * make sure that the copy object has its own
6122 		 * private copies of these pages before we let
6123 		 * the caller modify them.
6124 		 */
6125 		vm_object_update(object,
6126 		    offset,
6127 		    size,
6128 		    NULL,
6129 		    NULL,
6130 		    FALSE,              /* should_return */
6131 		    MEMORY_OBJECT_COPY_SYNC,
6132 		    VM_PROT_NO_CHANGE);
6133 
6134 		VM_PAGEOUT_DEBUG(upl_cow, 1);
6135 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
6136 	}
6137 	entry = 0;
6138 
6139 	xfer_size = size;
6140 	dst_offset = offset;
6141 	size_in_pages = size / PAGE_SIZE;
6142 
6143 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6144 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6145 		object->scan_collisions = 0;
6146 	}
6147 
6148 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6149 		vm_object_upl_throttle(object, size);
6150 	}
6151 
6152 	while (xfer_size) {
6153 		dwp->dw_mask = 0;
6154 
6155 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6156 			vm_object_unlock(object);
6157 			alias_page = vm_page_create_fictitious();
6158 			vm_object_lock(object);
6159 		}
6160 		if (cntrl_flags & UPL_COPYOUT_FROM) {
6161 			upl->flags |= UPL_PAGE_SYNC_DONE;
6162 
6163 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6164 			    vm_page_is_fictitious(dst_page) ||
6165 			    dst_page->vmp_absent ||
6166 			    VMP_ERROR_GET(dst_page) ||
6167 			    dst_page->vmp_cleaning ||
6168 			    (VM_PAGE_WIRED(dst_page))) {
6169 				if (user_page_list) {
6170 					user_page_list[entry].phys_addr = 0;
6171 				}
6172 
6173 				goto try_next_page;
6174 			}
6175 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6176 
6177 			/*
6178 			 * grab this up front...
6179 			 * a high percentange of the time we're going to
6180 			 * need the hardware modification state a bit later
6181 			 * anyway... so we can eliminate an extra call into
6182 			 * the pmap layer by grabbing it here and recording it
6183 			 */
6184 			if (dst_page->vmp_pmapped) {
6185 				refmod_state = pmap_get_refmod(phys_page);
6186 			} else {
6187 				refmod_state = 0;
6188 			}
6189 
6190 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6191 				/*
6192 				 * page is on inactive list and referenced...
6193 				 * reactivate it now... this gets it out of the
6194 				 * way of vm_pageout_scan which would have to
6195 				 * reactivate it upon tripping over it
6196 				 */
6197 				dwp->dw_mask |= DW_vm_page_activate;
6198 			}
6199 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6200 				/*
6201 				 * we're only asking for DIRTY pages to be returned
6202 				 */
6203 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6204 					/*
6205 					 * if we were the page stolen by vm_pageout_scan to be
6206 					 * cleaned (as opposed to a buddy being clustered in
6207 					 * or this request is not being driven by a PAGEOUT cluster
6208 					 * then we only need to check for the page being dirty or
6209 					 * precious to decide whether to return it
6210 					 */
6211 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6212 						goto check_busy;
6213 					}
6214 					goto dont_return;
6215 				}
6216 				/*
6217 				 * this is a request for a PAGEOUT cluster and this page
6218 				 * is merely along for the ride as a 'buddy'... not only
6219 				 * does it have to be dirty to be returned, but it also
6220 				 * can't have been referenced recently...
6221 				 */
6222 				if ((hibernate_cleaning_in_progress == TRUE ||
6223 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6224 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6225 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6226 					goto check_busy;
6227 				}
6228 dont_return:
6229 				/*
6230 				 * if we reach here, we're not to return
6231 				 * the page... go on to the next one
6232 				 */
6233 				if (dst_page->vmp_laundry == TRUE) {
6234 					/*
6235 					 * if we get here, the page is not 'cleaning' (filtered out above).
6236 					 * since it has been referenced, remove it from the laundry
6237 					 * so we don't pay the cost of an I/O to clean a page
6238 					 * we're just going to take back
6239 					 */
6240 					vm_page_lockspin_queues();
6241 
6242 					vm_pageout_steal_laundry(dst_page, TRUE);
6243 					vm_page_activate(dst_page);
6244 
6245 					vm_page_unlock_queues();
6246 				}
6247 				if (user_page_list) {
6248 					user_page_list[entry].phys_addr = 0;
6249 				}
6250 
6251 				goto try_next_page;
6252 			}
6253 check_busy:
6254 			if (dst_page->vmp_busy) {
6255 				if (cntrl_flags & UPL_NOBLOCK) {
6256 					if (user_page_list) {
6257 						user_page_list[entry].phys_addr = 0;
6258 					}
6259 					dwp->dw_mask = 0;
6260 
6261 					goto try_next_page;
6262 				}
6263 				/*
6264 				 * someone else is playing with the
6265 				 * page.  We will have to wait.
6266 				 */
6267 				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6268 
6269 				continue;
6270 			}
6271 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6272 				vm_page_lockspin_queues();
6273 
6274 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6275 					/*
6276 					 * we've buddied up a page for a clustered pageout
6277 					 * that has already been moved to the pageout
6278 					 * queue by pageout_scan... we need to remove
6279 					 * it from the queue and drop the laundry count
6280 					 * on that queue
6281 					 */
6282 					vm_pageout_throttle_up(dst_page);
6283 				}
6284 				vm_page_unlock_queues();
6285 			}
6286 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6287 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6288 
6289 			if (phys_page > upl->highest_page) {
6290 				upl->highest_page = phys_page;
6291 			}
6292 
6293 			assert(!pmap_is_noencrypt(phys_page));
6294 
6295 			if (cntrl_flags & UPL_SET_LITE) {
6296 				unsigned int    pg_num;
6297 
6298 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6299 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6300 				bitmap_set(upl->lite_list, pg_num);
6301 
6302 				if (hw_dirty) {
6303 					if (pmap_flushes_delayed == FALSE) {
6304 						pmap_flush_context_init(&pmap_flush_context_storage);
6305 						pmap_flushes_delayed = TRUE;
6306 					}
6307 					pmap_clear_refmod_options(phys_page,
6308 					    VM_MEM_MODIFIED,
6309 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6310 					    &pmap_flush_context_storage);
6311 				}
6312 
6313 				/*
6314 				 * Mark original page as cleaning
6315 				 * in place.
6316 				 */
6317 				dst_page->vmp_cleaning = TRUE;
6318 				dst_page->vmp_precious = FALSE;
6319 			} else {
6320 				/*
6321 				 * use pageclean setup, it is more
6322 				 * convenient even for the pageout
6323 				 * cases here
6324 				 */
6325 				vm_object_lock(upl->map_object);
6326 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6327 				vm_object_unlock(upl->map_object);
6328 
6329 				alias_page->vmp_absent = FALSE;
6330 				alias_page = NULL;
6331 			}
6332 			if (dirty) {
6333 				SET_PAGE_DIRTY(dst_page, FALSE);
6334 			} else {
6335 				dst_page->vmp_dirty = FALSE;
6336 			}
6337 
6338 			if (!dirty) {
6339 				dst_page->vmp_precious = TRUE;
6340 			}
6341 
6342 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6343 				if (!VM_PAGE_WIRED(dst_page)) {
6344 					dst_page->vmp_free_when_done = TRUE;
6345 				}
6346 			}
6347 		} else {
6348 			while ((cntrl_flags & UPL_WILL_MODIFY) &&
6349 			    (object->vo_copy != last_copy_object ||
6350 			    object->vo_copy_version != last_copy_version)) {
6351 				/*
6352 				 * Honor copy-on-write obligations
6353 				 *
6354 				 * The copy object has changed since we
6355 				 * last synchronized for copy-on-write.
6356 				 * Another copy object might have been
6357 				 * inserted while we released the object's
6358 				 * lock.  Since someone could have seen the
6359 				 * original contents of the remaining pages
6360 				 * through that new object, we have to
6361 				 * synchronize with it again for the remaining
6362 				 * pages only.  The previous pages are "busy"
6363 				 * so they can not be seen through the new
6364 				 * mapping.  The new mapping will see our
6365 				 * upcoming changes for those previous pages,
6366 				 * but that's OK since they couldn't see what
6367 				 * was there before.  It's just a race anyway
6368 				 * and there's no guarantee of consistency or
6369 				 * atomicity.  We just don't want new mappings
6370 				 * to see both the *before* and *after* pages.
6371 				 */
6372 
6373 				/* first remember the copy object we re-synced with */
6374 				last_copy_object = object->vo_copy;
6375 				last_copy_version = object->vo_copy_version;
6376 				if (object->vo_copy != VM_OBJECT_NULL) {
6377 					vm_object_update(
6378 						object,
6379 						dst_offset,/* current offset */
6380 						xfer_size, /* remaining size */
6381 						NULL,
6382 						NULL,
6383 						FALSE,     /* should_return */
6384 						MEMORY_OBJECT_COPY_SYNC,
6385 						VM_PROT_NO_CHANGE);
6386 
6387 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6388 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6389 				}
6390 			}
6391 			dst_page = vm_page_lookup(object, dst_offset);
6392 
6393 			if (dst_page != VM_PAGE_NULL) {
6394 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6395 					/*
6396 					 * skip over pages already present in the cache
6397 					 */
6398 					if (user_page_list) {
6399 						user_page_list[entry].phys_addr = 0;
6400 					}
6401 
6402 					goto try_next_page;
6403 				}
6404 				if (vm_page_is_fictitious(dst_page)) {
6405 					panic("need corner case for fictitious page");
6406 				}
6407 
6408 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6409 					/*
6410 					 * someone else is playing with the
6411 					 * page.  We will have to wait.
6412 					 */
6413 					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6414 
6415 					continue;
6416 				}
6417 				if (dst_page->vmp_laundry) {
6418 					vm_pageout_steal_laundry(dst_page, FALSE);
6419 				}
6420 			} else {
6421 				if (object->private) {
6422 					/*
6423 					 * This is a nasty wrinkle for users
6424 					 * of upl who encounter device or
6425 					 * private memory however, it is
6426 					 * unavoidable, only a fault can
6427 					 * resolve the actual backing
6428 					 * physical page by asking the
6429 					 * backing device.
6430 					 */
6431 					if (user_page_list) {
6432 						user_page_list[entry].phys_addr = 0;
6433 					}
6434 
6435 					goto try_next_page;
6436 				}
6437 				if (object->scan_collisions) {
6438 					/*
6439 					 * the pageout_scan thread is trying to steal
6440 					 * pages from this object, but has run into our
6441 					 * lock... grab 2 pages from the head of the object...
6442 					 * the first is freed on behalf of pageout_scan, the
6443 					 * 2nd is for our own use... we use vm_object_page_grab
6444 					 * in both cases to avoid taking pages from the free
6445 					 * list since we are under memory pressure and our
6446 					 * lock on this object is getting in the way of
6447 					 * relieving it
6448 					 */
6449 					dst_page = vm_object_page_grab(object);
6450 
6451 					if (dst_page != VM_PAGE_NULL) {
6452 						vm_page_release(dst_page,
6453 						    VMP_RELEASE_NONE);
6454 					}
6455 
6456 					dst_page = vm_object_page_grab(object);
6457 				}
6458 				if (dst_page == VM_PAGE_NULL) {
6459 					/*
6460 					 * need to allocate a page
6461 					 */
6462 					dst_page = vm_page_grab_options(grab_options);
6463 					if (dst_page != VM_PAGE_NULL) {
6464 						page_grab_count++;
6465 					}
6466 				}
6467 				if (dst_page == VM_PAGE_NULL) {
6468 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6469 						/*
6470 						 * we don't want to stall waiting for pages to come onto the free list
6471 						 * while we're already holding absent pages in this UPL
6472 						 * the caller will deal with the empty slots
6473 						 */
6474 						if (user_page_list) {
6475 							user_page_list[entry].phys_addr = 0;
6476 						}
6477 
6478 						goto try_next_page;
6479 					}
6480 					/*
6481 					 * no pages available... wait
6482 					 * then try again for the same
6483 					 * offset...
6484 					 */
6485 					vm_object_unlock(object);
6486 
6487 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6488 
6489 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6490 
6491 					VM_PAGE_WAIT();
6492 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6493 
6494 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6495 
6496 					vm_object_lock(object);
6497 
6498 					continue;
6499 				}
6500 				vm_page_insert(dst_page, object, dst_offset);
6501 
6502 				dst_page->vmp_absent = TRUE;
6503 				dst_page->vmp_busy = FALSE;
6504 
6505 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6506 					/*
6507 					 * if UPL_RET_ONLY_ABSENT was specified,
6508 					 * than we're definitely setting up a
6509 					 * upl for a clustered read/pagein
6510 					 * operation... mark the pages as clustered
6511 					 * so upl_commit_range can put them on the
6512 					 * speculative list
6513 					 */
6514 					dst_page->vmp_clustered = TRUE;
6515 
6516 					if (!(cntrl_flags & UPL_FILE_IO)) {
6517 						counter_inc(&vm_statistics_pageins);
6518 					}
6519 				}
6520 			}
6521 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6522 
6523 			dst_page->vmp_overwriting = TRUE;
6524 
6525 			if (dst_page->vmp_pmapped) {
6526 #if CONFIG_SPTM
6527 				if (__improbable(PMAP_PAGE_IS_USER_EXECUTABLE(dst_page))) {
6528 					/*
6529 					 * Various buffer cache operations may need to reload the page contents
6530 					 * even though the page may have an executable frame type from prior use of
6531 					 * the vnode associated with the VM object.  For those cases, we need to
6532 					 * disconnect all mappings and reset the frame type, regardless of whether
6533 					 * UPL_FILE_IO was passed here, as the SPTM will not allow writable CPU
6534 					 * or IOMMU mappings of exec-typed pages.
6535 					 * NOTE: It's theoretically possible that the retype here could race with
6536 					 * setup/teardown of IOMMU mappings by another thread that went through
6537 					 * the vm_object_iopl_request() path.  I'm not sure that would ever be
6538 					 * expected to happen for an exec page in practice though.  If it does
6539 					 * happen, we may need to change vm_page_do_delayed_work() to forbid all
6540 					 * IOPLs against executable pages rather than only writable ones.
6541 					 */
6542 					refmod_state = pmap_disconnect_options(phys_page, PMAP_OPTIONS_RETYPE, NULL);
6543 				} else
6544 #endif /* CONFIG_SPTM */
6545 				if (!(cntrl_flags & UPL_FILE_IO)) {
6546 					/*
6547 					 * eliminate all mappings from the
6548 					 * original object and its progeny
6549 					 */
6550 					refmod_state = pmap_disconnect(phys_page);
6551 				} else {
6552 					refmod_state = pmap_get_refmod(phys_page);
6553 				}
6554 			} else {
6555 				refmod_state = 0;
6556 			}
6557 
6558 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6559 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6560 
6561 			if (cntrl_flags & UPL_SET_LITE) {
6562 				unsigned int    pg_num;
6563 
6564 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6565 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6566 				bitmap_set(upl->lite_list, pg_num);
6567 
6568 				if (hw_dirty) {
6569 					pmap_clear_modify(phys_page);
6570 				}
6571 
6572 				/*
6573 				 * Mark original page as cleaning
6574 				 * in place.
6575 				 */
6576 				dst_page->vmp_cleaning = TRUE;
6577 				dst_page->vmp_precious = FALSE;
6578 			} else {
6579 				/*
6580 				 * use pageclean setup, it is more
6581 				 * convenient even for the pageout
6582 				 * cases here
6583 				 */
6584 				vm_object_lock(upl->map_object);
6585 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6586 				vm_object_unlock(upl->map_object);
6587 
6588 				alias_page->vmp_absent = FALSE;
6589 				alias_page = NULL;
6590 			}
6591 
6592 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6593 				upl->flags &= ~UPL_CLEAR_DIRTY;
6594 				upl->flags |= UPL_SET_DIRTY;
6595 				dirty = TRUE;
6596 				/*
6597 				 * Page belonging to a code-signed object is about to
6598 				 * be written. Mark it tainted and disconnect it from
6599 				 * all pmaps so processes have to fault it back in and
6600 				 * deal with the tainted bit.
6601 				 */
6602 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6603 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6604 					vm_page_upl_tainted++;
6605 					if (dst_page->vmp_pmapped) {
6606 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6607 						if (refmod_state & VM_MEM_REFERENCED) {
6608 							dst_page->vmp_reference = TRUE;
6609 						}
6610 					}
6611 				}
6612 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6613 				/*
6614 				 * clean in place for read implies
6615 				 * that a write will be done on all
6616 				 * the pages that are dirty before
6617 				 * a upl commit is done.  The caller
6618 				 * is obligated to preserve the
6619 				 * contents of all pages marked dirty
6620 				 */
6621 				upl->flags |= UPL_CLEAR_DIRTY;
6622 			}
6623 			dst_page->vmp_dirty = dirty;
6624 
6625 			if (!dirty) {
6626 				dst_page->vmp_precious = TRUE;
6627 			}
6628 
6629 			if (!VM_PAGE_WIRED(dst_page)) {
6630 				/*
6631 				 * deny access to the target page while
6632 				 * it is being worked on
6633 				 */
6634 				dst_page->vmp_busy = TRUE;
6635 			} else {
6636 				dwp->dw_mask |= DW_vm_page_wire;
6637 			}
6638 
6639 			/*
6640 			 * We might be about to satisfy a fault which has been
6641 			 * requested. So no need for the "restart" bit.
6642 			 */
6643 			dst_page->vmp_restart = FALSE;
6644 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6645 				/*
6646 				 * expect the page to be used
6647 				 */
6648 				dwp->dw_mask |= DW_set_reference;
6649 			}
6650 			if (cntrl_flags & UPL_PRECIOUS) {
6651 				if (object->internal) {
6652 					SET_PAGE_DIRTY(dst_page, FALSE);
6653 					dst_page->vmp_precious = FALSE;
6654 				} else {
6655 					dst_page->vmp_precious = TRUE;
6656 				}
6657 			} else {
6658 				dst_page->vmp_precious = FALSE;
6659 			}
6660 		}
6661 		if (dst_page->vmp_busy) {
6662 			upl->flags |= UPL_HAS_BUSY;
6663 		}
6664 		if (VM_PAGE_WIRED(dst_page)) {
6665 			upl->flags |= UPL_HAS_WIRED;
6666 		}
6667 
6668 		if (phys_page > upl->highest_page) {
6669 			upl->highest_page = phys_page;
6670 		}
6671 		assert(!pmap_is_noencrypt(phys_page));
6672 		if (user_page_list) {
6673 			user_page_list[entry].phys_addr = phys_page;
6674 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6675 			user_page_list[entry].absent    = dst_page->vmp_absent;
6676 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6677 			user_page_list[entry].precious  = dst_page->vmp_precious;
6678 			user_page_list[entry].device    = FALSE;
6679 			user_page_list[entry].needed    = FALSE;
6680 			if (dst_page->vmp_clustered == TRUE) {
6681 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6682 			} else {
6683 				user_page_list[entry].speculative = FALSE;
6684 			}
6685 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6686 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6687 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6688 			user_page_list[entry].mark      = FALSE;
6689 		}
6690 		/*
6691 		 * if UPL_RET_ONLY_ABSENT is set, then
6692 		 * we are working with a fresh page and we've
6693 		 * just set the clustered flag on it to
6694 		 * indicate that it was drug in as part of a
6695 		 * speculative cluster... so leave it alone
6696 		 */
6697 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6698 			/*
6699 			 * someone is explicitly grabbing this page...
6700 			 * update clustered and speculative state
6701 			 *
6702 			 */
6703 			if (dst_page->vmp_clustered) {
6704 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6705 			}
6706 		}
6707 try_next_page:
6708 		if (dwp->dw_mask) {
6709 			if (dwp->dw_mask & DW_vm_page_activate) {
6710 				counter_inc(&vm_statistics_reactivations);
6711 			}
6712 
6713 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6714 
6715 			if (dw_count >= dw_limit) {
6716 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6717 
6718 				dwp = dwp_start;
6719 				dw_count = 0;
6720 			}
6721 		}
6722 		entry++;
6723 		dst_offset += PAGE_SIZE_64;
6724 		xfer_size -= PAGE_SIZE;
6725 	}
6726 	if (dw_count) {
6727 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6728 		dwp = dwp_start;
6729 		dw_count = 0;
6730 	}
6731 
6732 	if (alias_page != NULL) {
6733 		VM_PAGE_FREE(alias_page);
6734 	}
6735 	if (pmap_flushes_delayed == TRUE) {
6736 		pmap_flush(&pmap_flush_context_storage);
6737 	}
6738 
6739 	if (page_list_count != NULL) {
6740 		if (upl->flags & UPL_INTERNAL) {
6741 			*page_list_count = 0;
6742 		} else if (*page_list_count > entry) {
6743 			*page_list_count = entry;
6744 		}
6745 	}
6746 #if UPL_DEBUG
6747 	upl->upl_state = 1;
6748 #endif
6749 	vm_object_unlock(object);
6750 
6751 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6752 	if (task != NULL) {
6753 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6754 	}
6755 	counter_add(&vm_page_grab_count_upl, page_grab_count);
6756 
6757 	if (dwp_start && dwp_finish_ctx) {
6758 		vm_page_delayed_work_finish_ctx(dwp_start);
6759 		dwp_start = dwp = NULL;
6760 	}
6761 
6762 	return KERN_SUCCESS;
6763 }
6764 
6765 int cs_executable_create_upl = 0;
6766 extern int proc_selfpid(void);
6767 extern char *proc_name_address(void *p);
6768 
6769 /**
6770  * Helper for determining whether a writable (!UPL_COPYOUT_FROM) UPL is allowed for a given VA region.
6771  * This is determined not only by the allowed permissions in the relevant vm_map_entry, but also by
6772  * the code integrity enforcement model present on the system.
6773  *
6774  * @param map VM map against which the UPL is being populated.
6775  * @param entry The source vm_map_entry in [map] against which the UPL is being populated.
6776  * @param offset Base offset of UPL request in [map], for debugging purposes.
6777  *
6778  * @return True if the writable UPL is allowed for [entry], false otherwise.
6779  */
6780 static bool
6781 vme_allows_upl_write(
6782 	vm_map_t map __unused,
6783 	vm_map_entry_t entry,
6784 	vm_map_address_t offset __unused)
6785 {
6786 	if (!(entry->protection & VM_PROT_WRITE)) {
6787 		return false;
6788 	}
6789 #if CONFIG_SPTM
6790 	/*
6791 	 * For SPTM configurations, reject any attempt to create a writable UPL against any executable
6792 	 * region.  Even in cases such as JIT/USER_DEBUG in which the vm_map_entry may allow write
6793 	 * access, the SPTM/TXM codesigning model still forbids writable DMA mappings of these pages.
6794 	 */
6795 	if ((entry->protection & VM_PROT_EXECUTE) || entry->vme_xnu_user_debug) {
6796 		vm_map_guard_exception(offset, kGUARD_EXC_SEC_UPL_WRITE_ON_EXEC_REGION);
6797 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
6798 		    KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_UPL_WRITE_ON_EXEC_REGION), (uintptr_t)offset);
6799 		return false;
6800 	}
6801 #endif /* CONFIG_SPTM */
6802 	return true;
6803 }
6804 
6805 /**
6806  * Helper for determining whether a read-only (UPL_COPYOUT_FROM) UPL is allowed for a given VA region,
6807  * possibly with the additional requirement of creating a kernel copy of the source buffer.
6808  * This is determined by the code integrity enforcement model present on the system.
6809  *
6810  * @param map VM map against which the UPL is being populated.
6811  * @param entry The source vm_map_entry in [map] against which the UPL is being populated.
6812  * @param offset Base offset of UPL request in [map], for debugging purposes.
6813  * @param copy_required Output parameter indicating whether the UPL should be created against a kernel
6814  *        copy of the source data.
6815  *
6816  * @return True if the read-only UPL is allowed for [entry], false otherwise.
6817  */
6818 static bool
6819 vme_allows_upl_read(
6820 	vm_map_t map __unused,
6821 	vm_map_entry_t entry __unused,
6822 	vm_map_address_t offset __unused,
6823 	bool *copy_required)
6824 {
6825 	assert(copy_required != NULL);
6826 	*copy_required = false;
6827 #if CONFIG_SPTM
6828 	/*
6829 	 * For SPTM configs, always create a copy when attempting a read-only I/O operation against an
6830 	 * executable or debug (which may become executable) mapping.  The SPTM's stricter security
6831 	 * enforcements against DMA mappings of executable pages may otherwise trigger an SPTM violation
6832 	 * panic.  We expect the added cost of this copy to be manageable as DMA mappings of executable
6833 	 * regions are rare in practice.
6834 	 */
6835 	if ((map->pmap != kernel_pmap) &&
6836 	    ((entry->protection & VM_PROT_EXECUTE) || entry->vme_xnu_user_debug)) {
6837 		*copy_required = true;
6838 	}
6839 #endif /* CONFIG_SPTM */
6840 #if !XNU_TARGET_OS_OSX
6841 	/*
6842 	 * For all non-Mac targets, create a copy when attempting a read-only I/O operation against a
6843 	 * read-only executable region.  These regions are likely to be codesigned and are typically
6844 	 * mapped CoW; our wire operation will be treated as a proactive CoW fault which will copy the
6845 	 * backing pages and thus cause them to no longer be codesigned.
6846 	 */
6847 	if (map->pmap != kernel_pmap &&
6848 	    (entry->protection & VM_PROT_EXECUTE) &&
6849 	    !(entry->protection & VM_PROT_WRITE)) {
6850 		*copy_required = true;
6851 	}
6852 #endif /* !XNU_TARGET_OS_OSX */
6853 	return true;
6854 }
6855 
6856 kern_return_t
6857 vm_map_create_upl(
6858 	vm_map_t                map,
6859 	vm_map_address_t        offset,
6860 	upl_size_t              *upl_size,
6861 	upl_t                   *upl,
6862 	upl_page_info_array_t   page_list,
6863 	unsigned int            *count,
6864 	upl_control_flags_t     *flags,
6865 	vm_tag_t                tag)
6866 {
6867 	vm_map_entry_t          entry;
6868 	upl_control_flags_t     caller_flags;
6869 	int                     force_data_sync;
6870 	int                     sync_cow_data;
6871 	vm_object_t             local_object;
6872 	vm_map_offset_t         local_offset;
6873 	vm_map_offset_t         local_start;
6874 	kern_return_t           ret;
6875 	vm_map_address_t        original_offset;
6876 	vm_map_size_t           original_size, adjusted_size;
6877 	vm_map_offset_t         local_entry_start;
6878 	vm_object_offset_t      local_entry_offset;
6879 	boolean_t               release_map = FALSE;
6880 
6881 	vmlp_api_start(VM_MAP_CREATE_UPL);
6882 start_with_map:
6883 	caller_flags = *flags;
6884 
6885 	if (caller_flags & ~UPL_VALID_FLAGS) {
6886 		/*
6887 		 * For forward compatibility's sake,
6888 		 * reject any unknown flag.
6889 		 */
6890 		ret = KERN_INVALID_VALUE;
6891 		goto done;
6892 	}
6893 
6894 	if (upl == NULL) {
6895 		ret = KERN_INVALID_ARGUMENT;
6896 		goto done;
6897 	}
6898 
6899 
6900 	original_offset = offset;
6901 	original_size = *upl_size;
6902 	adjusted_size = original_size;
6903 
6904 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6905 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6906 
6907 REDISCOVER_ENTRY:
6908 	vm_map_lock_read(map);
6909 
6910 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6911 		vm_map_unlock_read(map);
6912 		ret = KERN_FAILURE;
6913 		goto done;
6914 	}
6915 
6916 	if (!entry->is_sub_map) {
6917 		vmlp_range_event_entry(map, entry);
6918 	}
6919 
6920 	local_entry_start = entry->vme_start;
6921 	local_entry_offset = VME_OFFSET(entry);
6922 
6923 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6924 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6925 	}
6926 
6927 	if (entry->vme_end - original_offset < adjusted_size) {
6928 		adjusted_size = entry->vme_end - original_offset;
6929 		assert(adjusted_size > 0);
6930 		*upl_size = (upl_size_t) adjusted_size;
6931 		assert(*upl_size == adjusted_size);
6932 	}
6933 
6934 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6935 		*flags = 0;
6936 
6937 		if (!entry->is_sub_map &&
6938 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6939 			if (VME_OBJECT(entry)->private) {
6940 				*flags = UPL_DEV_MEMORY;
6941 			}
6942 
6943 			if (VME_OBJECT(entry)->phys_contiguous) {
6944 				*flags |= UPL_PHYS_CONTIG;
6945 			}
6946 		}
6947 		vm_map_unlock_read(map);
6948 		ret = KERN_SUCCESS;
6949 		goto done;
6950 	}
6951 
6952 	bool copy_required = false;
6953 
6954 	if (!entry->is_sub_map) {
6955 		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6956 		    !VME_OBJECT(entry)->phys_contiguous) {
6957 			if (*upl_size > MAX_UPL_SIZE_BYTES) {
6958 				*upl_size = MAX_UPL_SIZE_BYTES;
6959 			}
6960 		}
6961 
6962 		/*
6963 		 *      Create an object if necessary.
6964 		 */
6965 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6966 			if (entry->max_protection == VM_PROT_NONE) {
6967 				/* don't create an object for a reserved range */
6968 				vm_map_unlock_read(map);
6969 				ret = KERN_PROTECTION_FAILURE;
6970 				goto done;
6971 			}
6972 
6973 			if (vm_map_lock_read_to_write(map)) {
6974 				goto REDISCOVER_ENTRY;
6975 			}
6976 
6977 			VME_OBJECT_SET(entry,
6978 			    vm_object_allocate((vm_size_t)
6979 			    vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id),
6980 			    false, 0);
6981 			VME_OFFSET_SET(entry, 0);
6982 			assert(entry->use_pmap);
6983 
6984 			vm_map_lock_write_to_read(map);
6985 		}
6986 
6987 		if (((caller_flags & UPL_COPYOUT_FROM) && !vme_allows_upl_read(map, entry, offset, &copy_required)) ||
6988 		    (!(caller_flags & UPL_COPYOUT_FROM) && !vme_allows_upl_write(map, entry, offset))) {
6989 			vm_map_unlock_read(map);
6990 			ret = KERN_PROTECTION_FAILURE;
6991 			goto done;
6992 		}
6993 	}
6994 
6995 	if (__improbable(copy_required)) {
6996 		vm_offset_t     kaddr;
6997 		vm_size_t       ksize;
6998 
6999 		/*
7000 		 * Depending on the device configuration, wiring certain pages
7001 		 * for I/O may violate the security policy for codesigning-related
7002 		 * reasons.
7003 		 * Instead, let's copy the data into a kernel buffer and
7004 		 * create the UPL from this kernel buffer.
7005 		 * The kernel buffer is then freed, leaving the UPL holding
7006 		 * the last reference on the VM object, so the memory will
7007 		 * be released when the UPL is committed.
7008 		 */
7009 
7010 		vm_map_unlock_read(map);
7011 		entry = VM_MAP_ENTRY_NULL;
7012 		/* allocate kernel buffer */
7013 		ksize = round_page(*upl_size);
7014 		kaddr = 0;
7015 		ret = kmem_alloc(kernel_map, &kaddr, ksize,
7016 		    KMA_PAGEABLE | KMA_DATA_SHARED, tag);
7017 		if (ret == KERN_SUCCESS) {
7018 			/* copyin the user data */
7019 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
7020 		}
7021 		if (ret == KERN_SUCCESS) {
7022 			if (ksize > *upl_size) {
7023 				/* zero out the extra space in kernel buffer */
7024 				memset((void *)(kaddr + *upl_size),
7025 				    0,
7026 				    ksize - *upl_size);
7027 			}
7028 			/* create the UPL from the kernel buffer */
7029 			ret = vm_map_create_upl(kernel_map,
7030 			    (vm_map_address_t)kaddr, upl_size, upl, page_list, count, flags, tag);
7031 		}
7032 		if (kaddr != 0) {
7033 			/* free the kernel buffer */
7034 			kmem_free(kernel_map, kaddr, ksize);
7035 			kaddr = 0;
7036 			ksize = 0;
7037 		}
7038 #if DEVELOPMENT || DEBUG
7039 		DTRACE_VM4(create_upl_from_executable,
7040 		    vm_map_t, map,
7041 		    vm_map_address_t, offset,
7042 		    upl_size_t, *upl_size,
7043 		    kern_return_t, ret);
7044 #endif /* DEVELOPMENT || DEBUG */
7045 		goto done;
7046 	}
7047 
7048 	if (!entry->is_sub_map) {
7049 		local_object = VME_OBJECT(entry);
7050 		assert(local_object != VM_OBJECT_NULL);
7051 	}
7052 
7053 	if (!entry->is_sub_map &&
7054 	    !entry->needs_copy &&
7055 	    *upl_size != 0 &&
7056 	    local_object->vo_size > *upl_size && /* partial UPL */
7057 	    entry->wired_count == 0 && /* No COW for entries that are wired */
7058 	    (map->pmap != kernel_pmap) && /* alias checks */
7059 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
7060 	    ||
7061 	    ( /* case 2 */
7062 		    local_object->internal &&
7063 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
7064 		    os_ref_get_count_raw(&local_object->ref_count) > 1))) {
7065 		vm_prot_t       prot;
7066 
7067 		/*
7068 		 * Case 1:
7069 		 * Set up the targeted range for copy-on-write to avoid
7070 		 * applying true_share/copy_delay to the entire object.
7071 		 *
7072 		 * Case 2:
7073 		 * This map entry covers only part of an internal
7074 		 * object.  There could be other map entries covering
7075 		 * other areas of this object and some of these map
7076 		 * entries could be marked as "needs_copy", which
7077 		 * assumes that the object is COPY_SYMMETRIC.
7078 		 * To avoid marking this object as COPY_DELAY and
7079 		 * "true_share", let's shadow it and mark the new
7080 		 * (smaller) object as "true_share" and COPY_DELAY.
7081 		 */
7082 
7083 		if (vm_map_lock_read_to_write(map)) {
7084 			goto REDISCOVER_ENTRY;
7085 		}
7086 		vm_map_lock_assert_exclusive(map);
7087 		assert(VME_OBJECT(entry) == local_object);
7088 
7089 		vm_map_clip_start(map,
7090 		    entry,
7091 		    vm_map_trunc_page(offset,
7092 		    VM_MAP_PAGE_MASK(map)));
7093 		vm_map_clip_end(map,
7094 		    entry,
7095 		    vm_map_round_page(offset + *upl_size,
7096 		    VM_MAP_PAGE_MASK(map)));
7097 		if ((entry->vme_end - offset) < *upl_size) {
7098 			*upl_size = (upl_size_t) (entry->vme_end - offset);
7099 			assert(*upl_size == entry->vme_end - offset);
7100 		}
7101 
7102 		prot = entry->protection & ~VM_PROT_WRITE;
7103 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
7104 			prot |= VM_PROT_EXECUTE;
7105 		}
7106 		vm_object_pmap_protect(local_object,
7107 		    VME_OFFSET(entry),
7108 		    entry->vme_end - entry->vme_start,
7109 		    ((entry->is_shared ||
7110 		    map->mapped_in_other_pmaps)
7111 		    ? PMAP_NULL
7112 		    : map->pmap),
7113 		    VM_MAP_PAGE_SIZE(map),
7114 		    entry->vme_start,
7115 		    prot);
7116 
7117 		assert(entry->wired_count == 0);
7118 
7119 		/*
7120 		 * Lock the VM object and re-check its status: if it's mapped
7121 		 * in another address space, we could still be racing with
7122 		 * another thread holding that other VM map exclusively.
7123 		 */
7124 		vm_object_lock(local_object);
7125 		if (local_object->true_share) {
7126 			/* object is already in proper state: no COW needed */
7127 			assert(local_object->copy_strategy !=
7128 			    MEMORY_OBJECT_COPY_SYMMETRIC);
7129 		} else {
7130 			/* not true_share: ask for copy-on-write below */
7131 			assert(local_object->copy_strategy ==
7132 			    MEMORY_OBJECT_COPY_SYMMETRIC);
7133 			entry->needs_copy = TRUE;
7134 		}
7135 		vm_object_unlock(local_object);
7136 
7137 		vm_map_lock_write_to_read(map);
7138 	}
7139 
7140 	if (entry->needs_copy) {
7141 		/*
7142 		 * Honor copy-on-write for COPY_SYMMETRIC
7143 		 * strategy.
7144 		 */
7145 		vm_map_t                local_map;
7146 		vm_object_t             object;
7147 		vm_object_offset_t      new_offset;
7148 		vm_prot_t               prot;
7149 		boolean_t               wired;
7150 		vm_map_version_t        version;
7151 		vm_map_t                real_map;
7152 		vm_prot_t               fault_type;
7153 
7154 		local_map = map;
7155 
7156 		if (caller_flags & UPL_COPYOUT_FROM) {
7157 			fault_type = VM_PROT_READ | VM_PROT_COPY;
7158 			vm_counters.create_upl_extra_cow++;
7159 			vm_counters.create_upl_extra_cow_pages +=
7160 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
7161 		} else {
7162 			fault_type = VM_PROT_WRITE;
7163 		}
7164 		if (vm_map_lookup_and_lock_object(&local_map,
7165 		    offset, fault_type,
7166 		    OBJECT_LOCK_EXCLUSIVE,
7167 		    &version, &object,
7168 		    &new_offset, &prot, &wired,
7169 		    NULL,
7170 		    &real_map, NULL) != KERN_SUCCESS) {
7171 			if (fault_type == VM_PROT_WRITE) {
7172 				vm_counters.create_upl_lookup_failure_write++;
7173 			} else {
7174 				vm_counters.create_upl_lookup_failure_copy++;
7175 			}
7176 			vm_map_unlock_read(local_map);
7177 			ret = KERN_FAILURE;
7178 			goto done;
7179 		}
7180 		if (real_map != local_map) {
7181 			vm_map_unlock(real_map);
7182 		}
7183 		vm_map_unlock_read(local_map);
7184 
7185 		vm_object_unlock(object);
7186 
7187 		goto REDISCOVER_ENTRY;
7188 	}
7189 
7190 	if (entry->is_sub_map) {
7191 		vm_map_t        submap;
7192 
7193 		submap = VME_SUBMAP(entry);
7194 		local_start = entry->vme_start;
7195 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7196 
7197 		vm_map_reference(submap);
7198 		vm_map_unlock_read(map);
7199 
7200 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, submap);
7201 
7202 		if (release_map) {
7203 			vm_map_deallocate(map);
7204 		}
7205 		map = submap;
7206 		release_map = TRUE;
7207 		offset = local_offset + (offset - local_start);
7208 		goto start_with_map;
7209 	}
7210 
7211 	if (sync_cow_data &&
7212 	    (VME_OBJECT(entry)->shadow ||
7213 	    VME_OBJECT(entry)->vo_copy)) {
7214 		local_object = VME_OBJECT(entry);
7215 		local_start = entry->vme_start;
7216 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7217 
7218 		vm_object_reference(local_object);
7219 		vm_map_unlock_read(map);
7220 
7221 		if (local_object->shadow && local_object->vo_copy) {
7222 			vm_object_lock_request(local_object->shadow,
7223 			    ((vm_object_offset_t)
7224 			    ((offset - local_start) +
7225 			    local_offset) +
7226 			    local_object->vo_shadow_offset),
7227 			    *upl_size, FALSE,
7228 			    MEMORY_OBJECT_DATA_SYNC,
7229 			    VM_PROT_NO_CHANGE);
7230 		}
7231 		sync_cow_data = FALSE;
7232 		vm_object_deallocate(local_object);
7233 
7234 		goto REDISCOVER_ENTRY;
7235 	}
7236 	if (force_data_sync) {
7237 		local_object = VME_OBJECT(entry);
7238 		local_start = entry->vme_start;
7239 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7240 
7241 		vm_object_reference(local_object);
7242 		vm_map_unlock_read(map);
7243 
7244 		vm_object_lock_request(local_object,
7245 		    ((vm_object_offset_t)
7246 		    ((offset - local_start) +
7247 		    local_offset)),
7248 		    (vm_object_size_t)*upl_size,
7249 		    FALSE,
7250 		    MEMORY_OBJECT_DATA_SYNC,
7251 		    VM_PROT_NO_CHANGE);
7252 
7253 		force_data_sync = FALSE;
7254 		vm_object_deallocate(local_object);
7255 
7256 		goto REDISCOVER_ENTRY;
7257 	}
7258 	if (VME_OBJECT(entry)->private) {
7259 		*flags = UPL_DEV_MEMORY;
7260 	} else {
7261 		*flags = 0;
7262 	}
7263 
7264 	if (VME_OBJECT(entry)->phys_contiguous) {
7265 		*flags |= UPL_PHYS_CONTIG;
7266 	}
7267 
7268 	local_object = VME_OBJECT(entry);
7269 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7270 	local_start = entry->vme_start;
7271 
7272 
7273 	/*
7274 	 * Wiring will copy the pages to the shadow object.
7275 	 * The shadow object will not be code-signed so
7276 	 * attempting to execute code from these copied pages
7277 	 * would trigger a code-signing violation.
7278 	 */
7279 	if (entry->protection & VM_PROT_EXECUTE) {
7280 #if MACH_ASSERT
7281 		printf("pid %d[%s] create_upl out of executable range from "
7282 		    "0x%llx to 0x%llx: side effects may include "
7283 		    "code-signing violations later on\n",
7284 		    proc_selfpid(),
7285 		    (get_bsdtask_info(current_task())
7286 		    ? proc_name_address(get_bsdtask_info(current_task()))
7287 		    : "?"),
7288 		    (uint64_t) entry->vme_start,
7289 		    (uint64_t) entry->vme_end);
7290 #endif /* MACH_ASSERT */
7291 		DTRACE_VM2(cs_executable_create_upl,
7292 		    uint64_t, (uint64_t)entry->vme_start,
7293 		    uint64_t, (uint64_t)entry->vme_end);
7294 		cs_executable_create_upl++;
7295 	}
7296 
7297 	vm_object_lock(local_object);
7298 
7299 	/*
7300 	 * Ensure that this object is "true_share" and "copy_delay" now,
7301 	 * while we're still holding the VM map lock.  After we unlock the map,
7302 	 * anything could happen to that mapping, including some copy-on-write
7303 	 * activity.  We need to make sure that the IOPL will point at the
7304 	 * same memory as the mapping.
7305 	 */
7306 	if (local_object->true_share) {
7307 		assert(local_object->copy_strategy !=
7308 		    MEMORY_OBJECT_COPY_SYMMETRIC);
7309 	} else if (!is_kernel_object(local_object) &&
7310 	    local_object != compressor_object &&
7311 	    !local_object->phys_contiguous) {
7312 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7313 		if (!local_object->true_share &&
7314 		    vm_object_tracking_btlog) {
7315 			btlog_record(vm_object_tracking_btlog, local_object,
7316 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
7317 			    btref_get(__builtin_frame_address(0), 0));
7318 		}
7319 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7320 		VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7321 		if (local_object->copy_strategy ==
7322 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
7323 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7324 		}
7325 	}
7326 
7327 	vm_object_reference_locked(local_object);
7328 	vm_object_unlock(local_object);
7329 
7330 	vm_map_unlock_read(map);
7331 
7332 	ret = vm_object_iopl_request(local_object,
7333 	    ((vm_object_offset_t)
7334 	    ((offset - local_start) + local_offset)),
7335 	    *upl_size,
7336 	    upl,
7337 	    page_list,
7338 	    count,
7339 	    caller_flags,
7340 	    tag);
7341 	vm_object_deallocate(local_object);
7342 
7343 done:
7344 	if (release_map) {
7345 		vm_map_deallocate(map);
7346 	}
7347 
7348 	vmlp_api_end(VM_MAP_CREATE_UPL, ret);
7349 	return ret;
7350 }
7351 
7352 /*
7353  * Internal routine to enter a UPL into a VM map.
7354  *
7355  * JMM - This should just be doable through the standard
7356  * vm_map_enter() API.
7357  */
7358 kern_return_t
7359 vm_map_enter_upl_range(
7360 	vm_map_t                map,
7361 	upl_t                   upl,
7362 	vm_object_offset_t      offset_to_map,
7363 	vm_size_t               size_to_map,
7364 	vm_prot_t               prot_to_map,
7365 	vm_map_offset_t         *dst_addr)
7366 {
7367 	vm_map_size_t           size;
7368 	vm_object_offset_t      offset;
7369 	vm_map_offset_t         addr;
7370 	vm_page_t               m;
7371 	kern_return_t           kr;
7372 	int                     isVectorUPL = 0, curr_upl = 0;
7373 	upl_t                   vector_upl = NULL;
7374 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7375 	upl_offset_t            subupl_offset = 0;
7376 	upl_size_t              subupl_size = 0;
7377 
7378 	if (upl == UPL_NULL) {
7379 		return KERN_INVALID_ARGUMENT;
7380 	}
7381 
7382 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7383 	assert(map == kernel_map);
7384 
7385 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7386 		int mapped = 0, valid_upls = 0;
7387 		vector_upl = upl;
7388 
7389 		upl_lock(vector_upl);
7390 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7391 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7392 			if (upl == NULL) {
7393 				continue;
7394 			}
7395 			valid_upls++;
7396 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7397 				mapped++;
7398 			}
7399 		}
7400 
7401 		if (mapped) {
7402 			if (mapped != valid_upls) {
7403 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7404 			} else {
7405 				upl_unlock(vector_upl);
7406 				return KERN_FAILURE;
7407 			}
7408 		}
7409 
7410 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7411 			panic("TODO4K: vector UPL not implemented");
7412 		}
7413 
7414 		kern_return_t kr2;
7415 		vm_offset_t alloc_addr = 0;
7416 		kr2 = vm_allocate(map, &alloc_addr, vector_upl->u_size, VM_FLAGS_ANYWHERE);
7417 		if (kr2 != KERN_SUCCESS) {
7418 			os_log(OS_LOG_DEFAULT, "%s: vm_allocate(0x%x) -> %d",
7419 			    __func__, vector_upl->u_size, kr2);
7420 			upl_unlock(vector_upl);
7421 			return kr2;
7422 		}
7423 		vector_upl_dst_addr = alloc_addr;
7424 		vector_upl_set_addr(vector_upl, vector_upl_dst_addr);
7425 		curr_upl = 0;
7426 	} else {
7427 		upl_lock(upl);
7428 	}
7429 
7430 process_upl_to_enter:
7431 	if (isVectorUPL) {
7432 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7433 			*dst_addr = vector_upl_dst_addr;
7434 			upl_unlock(vector_upl);
7435 			return KERN_SUCCESS;
7436 		}
7437 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7438 		if (upl == NULL) {
7439 			goto process_upl_to_enter;
7440 		}
7441 
7442 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7443 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7444 	} else {
7445 		/*
7446 		 * check to see if already mapped
7447 		 */
7448 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7449 			upl_unlock(upl);
7450 			return KERN_FAILURE;
7451 		}
7452 	}
7453 
7454 	if ((!(upl->flags & UPL_SHADOWED)) &&
7455 	    ((upl->flags & UPL_HAS_BUSY) ||
7456 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7457 		vm_object_t             object;
7458 		vm_page_t               alias_page;
7459 		vm_object_offset_t      new_offset;
7460 		unsigned int            pg_num;
7461 
7462 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7463 		object = upl->map_object;
7464 		upl->map_object = vm_object_allocate(
7465 			vm_object_round_page(size),
7466 			/* Provenance is copied from the object we're shadowing */
7467 			object->vmo_provenance);
7468 
7469 		vm_object_lock(upl->map_object);
7470 
7471 		upl->map_object->shadow = object;
7472 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7473 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7474 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7475 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7476 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7477 		    "object %p shadow_offset 0x%llx",
7478 		    upl->map_object,
7479 		    (uint64_t)upl->map_object->vo_shadow_offset);
7480 		upl->map_object->wimg_bits = object->wimg_bits;
7481 		offset = upl->map_object->vo_shadow_offset;
7482 		new_offset = 0;
7483 
7484 		upl->flags |= UPL_SHADOWED;
7485 
7486 		while (size) {
7487 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7488 			assert(pg_num == new_offset / PAGE_SIZE);
7489 
7490 			if (bitmap_test(upl->lite_list, pg_num)) {
7491 				alias_page = vm_page_create_fictitious();
7492 
7493 				vm_object_lock(object);
7494 
7495 				m = vm_page_lookup(object, offset);
7496 				if (m == VM_PAGE_NULL) {
7497 					panic("vm_upl_map: page missing");
7498 				}
7499 
7500 				/*
7501 				 * Convert the fictitious page to a private
7502 				 * shadow of the real page.
7503 				 */
7504 				alias_page->vmp_free_when_done = TRUE;
7505 				/*
7506 				 * since m is a page in the upl it must
7507 				 * already be wired or BUSY, so it's
7508 				 * safe to assign the underlying physical
7509 				 * page to the alias
7510 				 */
7511 
7512 				vm_object_unlock(object);
7513 
7514 				vm_page_lockspin_queues();
7515 				vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7516 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7517 				vm_page_unlock_queues();
7518 
7519 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7520 
7521 				assert(!alias_page->vmp_wanted);
7522 				alias_page->vmp_busy = FALSE;
7523 				alias_page->vmp_absent = FALSE;
7524 			}
7525 			size -= PAGE_SIZE;
7526 			offset += PAGE_SIZE_64;
7527 			new_offset += PAGE_SIZE_64;
7528 		}
7529 		vm_object_unlock(upl->map_object);
7530 	}
7531 	if (upl->flags & UPL_SHADOWED) {
7532 		if (isVectorUPL) {
7533 			offset = 0;
7534 		} else {
7535 			offset = offset_to_map;
7536 		}
7537 	} else {
7538 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7539 		if (!isVectorUPL) {
7540 			offset += offset_to_map;
7541 		}
7542 	}
7543 
7544 	if (isVectorUPL) {
7545 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7546 	} else {
7547 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7548 	}
7549 
7550 	vm_object_reference(upl->map_object);
7551 
7552 	if (!isVectorUPL) {
7553 		*dst_addr = 0;
7554 		/*
7555 		 * NEED A UPL_MAP ALIAS
7556 		 */
7557 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7558 		    VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7559 		    upl->map_object, offset, FALSE,
7560 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7561 
7562 		if (kr != KERN_SUCCESS) {
7563 			vm_object_deallocate(upl->map_object);
7564 			upl_unlock(upl);
7565 			return kr;
7566 		}
7567 	} else {
7568 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7569 		    VM_MAP_KERNEL_FLAGS_FIXED(
7570 			    .vm_tag = VM_KERN_MEMORY_OSFMK,
7571 			    .vmf_overwrite = true),
7572 		    upl->map_object, offset, FALSE,
7573 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7574 		if (kr) {
7575 			panic("vm_map_enter failed for a Vector UPL");
7576 		}
7577 	}
7578 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7579 	                                        /* this will have to be an increment rather than */
7580 	                                        /* an assignment. */
7581 	vm_object_lock(upl->map_object);
7582 
7583 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7584 		m = vm_page_lookup(upl->map_object, offset);
7585 
7586 		if (m) {
7587 			m->vmp_pmapped = TRUE;
7588 
7589 			/*
7590 			 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7591 			 * but only in kernel space. If this was on a user map,
7592 			 * we'd have to set the wpmapped bit.
7593 			 */
7594 			/* m->vmp_wpmapped = TRUE; */
7595 			assert(map->pmap == kernel_pmap);
7596 
7597 			kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7598 
7599 			assert(kr == KERN_SUCCESS);
7600 #if KASAN
7601 			kasan_notify_address(addr, PAGE_SIZE_64);
7602 #endif
7603 		}
7604 		offset += PAGE_SIZE_64;
7605 	}
7606 	vm_object_unlock(upl->map_object);
7607 
7608 	/*
7609 	 * hold a reference for the mapping
7610 	 */
7611 	upl->ref_count++;
7612 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7613 	upl->kaddr = (vm_offset_t) *dst_addr;
7614 	assert(upl->kaddr == *dst_addr);
7615 
7616 	if (isVectorUPL) {
7617 		goto process_upl_to_enter;
7618 	}
7619 
7620 	if (!isVectorUPL) {
7621 		vm_map_offset_t addr_adjustment;
7622 
7623 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7624 		if (addr_adjustment) {
7625 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7626 			*dst_addr += addr_adjustment;
7627 		}
7628 	}
7629 
7630 	upl_unlock(upl);
7631 
7632 	return KERN_SUCCESS;
7633 }
7634 
7635 kern_return_t
7636 vm_map_enter_upl(
7637 	vm_map_t                map,
7638 	upl_t                   upl,
7639 	vm_map_offset_t         *dst_addr)
7640 {
7641 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7642 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7643 }
7644 
7645 /*
7646  * Internal routine to remove a UPL mapping from a VM map.
7647  *
7648  * XXX - This should just be doable through a standard
7649  * vm_map_remove() operation.  Otherwise, implicit clean-up
7650  * of the target map won't be able to correctly remove
7651  * these (and release the reference on the UPL).  Having
7652  * to do this means we can't map these into user-space
7653  * maps yet.
7654  */
7655 kern_return_t
7656 vm_map_remove_upl_range(
7657 	vm_map_t        map,
7658 	upl_t           upl,
7659 	__unused vm_object_offset_t    offset_to_unmap,
7660 	__unused vm_size_t             size_to_unmap)
7661 {
7662 	vm_address_t    addr;
7663 	upl_size_t      size;
7664 	int             isVectorUPL = 0, curr_upl = 0;
7665 	upl_t           vector_upl = NULL;
7666 
7667 	if (upl == UPL_NULL) {
7668 		return KERN_INVALID_ARGUMENT;
7669 	}
7670 
7671 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7672 		int     unmapped = 0, valid_upls = 0;
7673 		vector_upl = upl;
7674 		upl_lock(vector_upl);
7675 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7676 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7677 			if (upl == NULL) {
7678 				continue;
7679 			}
7680 			valid_upls++;
7681 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7682 				unmapped++;
7683 			}
7684 		}
7685 
7686 		if (unmapped) {
7687 			if (unmapped != valid_upls) {
7688 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7689 			} else {
7690 				upl_unlock(vector_upl);
7691 				return KERN_FAILURE;
7692 			}
7693 		}
7694 		curr_upl = 0;
7695 	} else {
7696 		upl_lock(upl);
7697 	}
7698 
7699 process_upl_to_remove:
7700 	if (isVectorUPL) {
7701 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7702 			vm_offset_t v_upl_dst_addr;
7703 			kern_return_t kr;
7704 			vector_upl_get_addr(vector_upl, &v_upl_dst_addr);
7705 
7706 			kr = vm_deallocate(map, v_upl_dst_addr, vector_upl->u_size);
7707 			if (kr != KERN_SUCCESS) {
7708 				os_log(OS_LOG_DEFAULT, "%s: vm_deallocate(0x%llx, 0x%x) -> %d",
7709 				    __func__, (uint64_t)v_upl_dst_addr,
7710 				    vector_upl->u_size, kr);
7711 			}
7712 			v_upl_dst_addr = 0;
7713 			vector_upl_set_addr(vector_upl, v_upl_dst_addr);
7714 			upl_unlock(vector_upl);
7715 			return KERN_SUCCESS;
7716 		}
7717 
7718 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7719 		if (upl == NULL) {
7720 			goto process_upl_to_remove;
7721 		}
7722 	}
7723 
7724 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7725 		addr = upl->kaddr;
7726 		size = upl->u_mapped_size;
7727 
7728 		assert(upl->ref_count > 1);
7729 		upl->ref_count--;               /* removing mapping ref */
7730 
7731 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7732 		upl->kaddr = (vm_offset_t) 0;
7733 		upl->u_mapped_size = 0;
7734 
7735 		if (isVectorUPL) {
7736 			/*
7737 			 * If it's a Vectored UPL, we'll be removing the entire
7738 			 * address range anyway, so no need to remove individual UPL
7739 			 * element mappings from within the range
7740 			 */
7741 			goto process_upl_to_remove;
7742 		}
7743 
7744 		upl_unlock(upl);
7745 
7746 		vm_map_remove(map,
7747 		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7748 		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7749 		return KERN_SUCCESS;
7750 	}
7751 	upl_unlock(upl);
7752 
7753 	return KERN_FAILURE;
7754 }
7755 
7756 kern_return_t
7757 vm_map_remove_upl(
7758 	vm_map_t        map,
7759 	upl_t           upl)
7760 {
7761 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7762 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7763 }
7764 
7765 void
7766 iopl_valid_data(
7767 	upl_t    upl,
7768 	vm_tag_t tag)
7769 {
7770 	vm_object_t     object;
7771 	vm_offset_t     offset;
7772 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
7773 	upl_size_t      size;
7774 	int             wired_count = 0;
7775 
7776 	if (upl == NULL) {
7777 		panic("iopl_valid_data: NULL upl");
7778 	}
7779 	if (vector_upl_is_valid(upl)) {
7780 		panic("iopl_valid_data: vector upl");
7781 	}
7782 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7783 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7784 	}
7785 
7786 	object = upl->map_object;
7787 
7788 	if (is_kernel_object(object) || object == compressor_object) {
7789 		panic("iopl_valid_data: object == kernel or compressor");
7790 	}
7791 
7792 	if (object->purgable == VM_PURGABLE_VOLATILE ||
7793 	    object->purgable == VM_PURGABLE_EMPTY) {
7794 		panic("iopl_valid_data: object %p purgable %d",
7795 		    object, object->purgable);
7796 	}
7797 
7798 	size = upl_adjusted_size(upl, PAGE_MASK);
7799 
7800 	vm_object_lock(object);
7801 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7802 
7803 	bool whole_object;
7804 
7805 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7806 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7807 		whole_object = true;
7808 	} else {
7809 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7810 		whole_object = false;
7811 	}
7812 
7813 	while (size) {
7814 		if (whole_object) {
7815 			if (nxt_page != VM_PAGE_NULL) {
7816 				m = nxt_page;
7817 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7818 			}
7819 		} else {
7820 			m = vm_page_lookup(object, offset);
7821 			offset += PAGE_SIZE;
7822 
7823 			if (m == VM_PAGE_NULL) {
7824 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7825 			}
7826 		}
7827 		if (m->vmp_busy) {
7828 			if (!m->vmp_absent) {
7829 				panic("iopl_valid_data: busy page w/o absent");
7830 			}
7831 
7832 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7833 				panic("iopl_valid_data: busy+absent page on page queue");
7834 			}
7835 			if (m->vmp_reusable) {
7836 				panic("iopl_valid_data: %p is reusable", m);
7837 			}
7838 
7839 			m->vmp_absent = FALSE;
7840 			m->vmp_dirty = TRUE;
7841 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7842 			assert(m->vmp_wire_count == 0);
7843 			m->vmp_wire_count++;
7844 			m->vmp_iopl_wired = true;
7845 			assert(m->vmp_wire_count);
7846 			if (m->vmp_wire_count == 1) {
7847 				m->vmp_q_state = VM_PAGE_IS_WIRED;
7848 				wired_count++;
7849 			} else {
7850 				panic("iopl_valid_data: %p already wired", m);
7851 			}
7852 
7853 
7854 			vm_page_wakeup_done(object, m);
7855 		}
7856 		size -= PAGE_SIZE;
7857 	}
7858 	if (wired_count) {
7859 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7860 		assert(object->resident_page_count >= object->wired_page_count);
7861 
7862 		/* no need to adjust purgeable accounting for this object: */
7863 		assert(object->purgable != VM_PURGABLE_VOLATILE);
7864 		assert(object->purgable != VM_PURGABLE_EMPTY);
7865 
7866 		vm_page_lockspin_queues();
7867 		vm_page_wire_count += wired_count;
7868 		vm_page_unlock_queues();
7869 	}
7870 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7871 	vm_object_unlock(object);
7872 }
7873 
7874 
7875 void
7876 vm_object_set_pmap_cache_attr(
7877 	vm_object_t             object,
7878 	upl_page_info_array_t   user_page_list,
7879 	unsigned int            num_pages,
7880 	boolean_t               batch_pmap_op)
7881 {
7882 	unsigned int    cache_attr = 0;
7883 
7884 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
7885 	assert(user_page_list);
7886 	if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7887 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7888 	}
7889 }
7890 
7891 
7892 static bool
7893 vm_object_iopl_wire_full(
7894 	vm_object_t             object,
7895 	upl_t                   upl,
7896 	upl_page_info_array_t   user_page_list,
7897 	upl_control_flags_t     cntrl_flags,
7898 	vm_tag_t                tag)
7899 {
7900 	vm_page_t       dst_page;
7901 	unsigned int    entry;
7902 	int             page_count;
7903 	int             delayed_unlock = 0;
7904 	boolean_t       retval = TRUE;
7905 	ppnum_t         phys_page;
7906 
7907 	vm_object_lock_assert_exclusive(object);
7908 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7909 	assert(object->purgable != VM_PURGABLE_EMPTY);
7910 	assert(object->pager == NULL);
7911 	assert(object->vo_copy == NULL);
7912 	assert(object->shadow == NULL);
7913 
7914 	page_count = object->resident_page_count;
7915 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7916 
7917 	vm_page_lock_queues();
7918 
7919 	while (page_count--) {
7920 		if (dst_page->vmp_busy ||
7921 #if CONFIG_SPTM
7922 		    PMAP_PAGE_IS_USER_EXECUTABLE(dst_page) ||
7923 #endif
7924 		    vm_page_is_fictitious(dst_page) ||
7925 		    dst_page->vmp_absent ||
7926 		    VMP_ERROR_GET(dst_page) ||
7927 		    dst_page->vmp_cleaning ||
7928 		    dst_page->vmp_restart ||
7929 		    dst_page->vmp_laundry) {
7930 			retval = FALSE;
7931 			goto done;
7932 		}
7933 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7934 			retval = FALSE;
7935 			goto done;
7936 		}
7937 		dst_page->vmp_reference = TRUE;
7938 
7939 		vm_page_wire(dst_page, tag, FALSE);
7940 		dst_page->vmp_iopl_wired = true;
7941 
7942 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7943 			SET_PAGE_DIRTY(dst_page, FALSE);
7944 		}
7945 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7946 		assert(entry >= 0 && entry < object->resident_page_count);
7947 		bitmap_set(upl->lite_list, entry);
7948 
7949 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7950 
7951 		if (phys_page > upl->highest_page) {
7952 			upl->highest_page = phys_page;
7953 		}
7954 
7955 		if (user_page_list) {
7956 			user_page_list[entry].phys_addr = phys_page;
7957 			user_page_list[entry].absent    = dst_page->vmp_absent;
7958 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7959 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
7960 			user_page_list[entry].precious  = dst_page->vmp_precious;
7961 			user_page_list[entry].device    = FALSE;
7962 			user_page_list[entry].speculative = FALSE;
7963 			user_page_list[entry].cs_validated = FALSE;
7964 			user_page_list[entry].cs_tainted = FALSE;
7965 			user_page_list[entry].cs_nx     = FALSE;
7966 			user_page_list[entry].needed    = FALSE;
7967 			user_page_list[entry].mark      = FALSE;
7968 		}
7969 		if (delayed_unlock++ > 256) {
7970 			delayed_unlock = 0;
7971 			lck_mtx_yield(&vm_page_queue_lock);
7972 
7973 			VM_CHECK_MEMORYSTATUS;
7974 		}
7975 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7976 	}
7977 done:
7978 	vm_page_unlock_queues();
7979 
7980 	VM_CHECK_MEMORYSTATUS;
7981 
7982 	return retval;
7983 }
7984 
7985 
7986 static kern_return_t
7987 vm_object_iopl_wire_empty(
7988 	vm_object_t             object,
7989 	upl_t                   upl,
7990 	upl_page_info_array_t   user_page_list,
7991 	upl_control_flags_t     cntrl_flags,
7992 	vm_tag_t                tag,
7993 	vm_object_offset_t     *dst_offset,
7994 	int                     page_count,
7995 	int                    *page_grab_count)
7996 {
7997 	vm_page_t         dst_page;
7998 	boolean_t         no_zero_fill = FALSE;
7999 	int               interruptible;
8000 	int               pages_wired = 0;
8001 	int               pages_inserted = 0;
8002 	int               entry = 0;
8003 	uint64_t          delayed_ledger_update = 0;
8004 	kern_return_t     ret = KERN_SUCCESS;
8005 	vm_grab_options_t grab_options;
8006 	ppnum_t           phys_page;
8007 
8008 	vm_object_lock_assert_exclusive(object);
8009 	assert(object->purgable != VM_PURGABLE_VOLATILE);
8010 	assert(object->purgable != VM_PURGABLE_EMPTY);
8011 	assert(object->pager == NULL);
8012 	assert(object->vo_copy == NULL);
8013 	assert(object->shadow == NULL);
8014 
8015 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8016 		interruptible = THREAD_ABORTSAFE;
8017 	} else {
8018 		interruptible = THREAD_UNINT;
8019 	}
8020 
8021 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8022 		no_zero_fill = TRUE;
8023 	}
8024 
8025 	grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
8026 #if CONFIG_SECLUDED_MEMORY
8027 	if (object->can_grab_secluded) {
8028 		grab_options |= VM_PAGE_GRAB_SECLUDED;
8029 	}
8030 #endif /* CONFIG_SECLUDED_MEMORY */
8031 
8032 	while (page_count--) {
8033 		while ((dst_page = vm_page_grab_options(grab_options))
8034 		    == VM_PAGE_NULL) {
8035 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8036 
8037 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8038 
8039 			if (vm_page_wait(interruptible) == FALSE) {
8040 				/*
8041 				 * interrupted case
8042 				 */
8043 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8044 
8045 				VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8046 
8047 				ret = MACH_SEND_INTERRUPTED;
8048 				goto done;
8049 			}
8050 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8051 
8052 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8053 		}
8054 
8055 		dst_page->vmp_absent = no_zero_fill;
8056 		dst_page->vmp_reference = TRUE;
8057 
8058 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8059 			SET_PAGE_DIRTY(dst_page, FALSE);
8060 		}
8061 		if (dst_page->vmp_absent == FALSE) {
8062 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
8063 			assert(dst_page->vmp_wire_count == 0);
8064 			dst_page->vmp_wire_count++;
8065 			dst_page->vmp_iopl_wired = true;
8066 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
8067 			assert(dst_page->vmp_wire_count);
8068 			pages_wired++;
8069 
8070 
8071 			vm_page_wakeup_done(object, dst_page);
8072 		}
8073 		pages_inserted++;
8074 
8075 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8076 
8077 		if (no_zero_fill == FALSE) {
8078 			vm_page_zero_fill(
8079 				dst_page
8080 				);
8081 		}
8082 
8083 		bitmap_set(upl->lite_list, entry);
8084 
8085 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8086 
8087 		if (phys_page > upl->highest_page) {
8088 			upl->highest_page = phys_page;
8089 		}
8090 
8091 		if (user_page_list) {
8092 			user_page_list[entry].phys_addr = phys_page;
8093 			user_page_list[entry].absent    = dst_page->vmp_absent;
8094 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8095 			user_page_list[entry].free_when_done    = FALSE;
8096 			user_page_list[entry].precious  = FALSE;
8097 			user_page_list[entry].device    = FALSE;
8098 			user_page_list[entry].speculative = FALSE;
8099 			user_page_list[entry].cs_validated = FALSE;
8100 			user_page_list[entry].cs_tainted = FALSE;
8101 			user_page_list[entry].cs_nx     = FALSE;
8102 			user_page_list[entry].needed    = FALSE;
8103 			user_page_list[entry].mark      = FALSE;
8104 		}
8105 		entry++;
8106 		*dst_offset += PAGE_SIZE_64;
8107 	}
8108 done:
8109 	if (pages_wired) {
8110 		vm_page_lockspin_queues();
8111 		vm_page_wire_count += pages_wired;
8112 		vm_page_unlock_queues();
8113 	}
8114 	if (pages_inserted) {
8115 		if (object->internal) {
8116 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
8117 		} else {
8118 			OSAddAtomic(pages_inserted, &vm_page_external_count);
8119 		}
8120 	}
8121 	if (delayed_ledger_update) {
8122 		task_t          owner;
8123 		int             ledger_idx_volatile;
8124 		int             ledger_idx_nonvolatile;
8125 		int             ledger_idx_volatile_compressed;
8126 		int             ledger_idx_nonvolatile_compressed;
8127 		int             ledger_idx_composite;
8128 		int             ledger_idx_external_wired;
8129 		boolean_t       do_footprint;
8130 
8131 		owner = VM_OBJECT_OWNER(object);
8132 		assert(owner);
8133 
8134 		vm_object_ledger_tag_ledgers(object,
8135 		    &ledger_idx_volatile,
8136 		    &ledger_idx_nonvolatile,
8137 		    &ledger_idx_volatile_compressed,
8138 		    &ledger_idx_nonvolatile_compressed,
8139 		    &ledger_idx_composite,
8140 		    &ledger_idx_external_wired,
8141 		    &do_footprint);
8142 
8143 		if (object->internal) {
8144 			/* more non-volatile bytes */
8145 			ledger_credit(owner->ledger,
8146 			    ledger_idx_nonvolatile,
8147 			    delayed_ledger_update);
8148 			if (do_footprint) {
8149 				/* more footprint */
8150 				ledger_credit(owner->ledger,
8151 				    task_ledgers.phys_footprint,
8152 				    delayed_ledger_update);
8153 			} else if (ledger_idx_composite != -1) {
8154 				ledger_credit(owner->ledger,
8155 				    ledger_idx_composite,
8156 				    delayed_ledger_update);
8157 			}
8158 		} else {
8159 			/* more external wired bytes */
8160 			ledger_credit(owner->ledger,
8161 			    ledger_idx_external_wired,
8162 			    delayed_ledger_update);
8163 			if (do_footprint) {
8164 				/* more footprint */
8165 				ledger_credit(owner->ledger,
8166 				    task_ledgers.phys_footprint,
8167 				    delayed_ledger_update);
8168 			} else if (ledger_idx_composite != -1) {
8169 				ledger_credit(owner->ledger,
8170 				    ledger_idx_composite,
8171 				    delayed_ledger_update);
8172 			}
8173 		}
8174 	}
8175 
8176 	assert(page_grab_count);
8177 	*page_grab_count = pages_inserted;
8178 
8179 	return ret;
8180 }
8181 
8182 
8183 kern_return_t
8184 vm_object_iopl_request(
8185 	vm_object_t             object,
8186 	vm_object_offset_t      offset,
8187 	upl_size_t              size,
8188 	upl_t                   *upl_ptr,
8189 	upl_page_info_array_t   user_page_list,
8190 	unsigned int            *page_list_count,
8191 	upl_control_flags_t     cntrl_flags,
8192 	vm_tag_t                tag)
8193 {
8194 	vm_page_t               dst_page;
8195 	vm_object_offset_t      dst_offset;
8196 	upl_size_t              xfer_size;
8197 	upl_t                   upl = NULL;
8198 	unsigned int            entry;
8199 	int                     no_zero_fill = FALSE;
8200 	unsigned int            size_in_pages;
8201 	int                     page_grab_count = 0;
8202 	u_int32_t               psize;
8203 	kern_return_t           ret;
8204 	vm_prot_t               prot;
8205 	struct vm_object_fault_info fault_info = {};
8206 	struct  vm_page_delayed_work    dw_array;
8207 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8208 	bool                    dwp_finish_ctx = TRUE;
8209 	int                     dw_count;
8210 	int                     dw_limit;
8211 	int                     dw_index;
8212 	boolean_t               caller_lookup;
8213 	int                     io_tracking_flag = 0;
8214 	int                     interruptible;
8215 	ppnum_t                 phys_page;
8216 
8217 	boolean_t               set_cache_attr_needed = FALSE;
8218 	boolean_t               free_wired_pages = FALSE;
8219 	boolean_t               fast_path_empty_req = FALSE;
8220 	boolean_t               fast_path_full_req = FALSE;
8221 
8222 	task_t                  task = current_task();
8223 
8224 	dwp_start = dwp = NULL;
8225 	*upl_ptr = NULL;
8226 
8227 	vm_object_offset_t original_offset = offset;
8228 	upl_size_t original_size = size;
8229 
8230 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8231 
8232 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8233 	offset = vm_object_trunc_page(offset);
8234 	if (size != original_size || offset != original_offset) {
8235 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8236 	}
8237 
8238 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
8239 		/*
8240 		 * For forward compatibility's sake,
8241 		 * reject any unknown flag.
8242 		 */
8243 		return KERN_INVALID_VALUE;
8244 	}
8245 	if (!vm_lopage_needed) {
8246 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8247 	}
8248 
8249 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8250 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8251 			return KERN_INVALID_VALUE;
8252 		}
8253 
8254 		if (object->phys_contiguous) {
8255 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8256 				return KERN_INVALID_ADDRESS;
8257 			}
8258 
8259 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8260 				return KERN_INVALID_ADDRESS;
8261 			}
8262 		}
8263 	}
8264 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8265 		no_zero_fill = TRUE;
8266 	}
8267 
8268 	if (cntrl_flags & UPL_COPYOUT_FROM) {
8269 		prot = VM_PROT_READ;
8270 	} else {
8271 		prot = VM_PROT_READ | VM_PROT_WRITE;
8272 	}
8273 
8274 	if ((!object->internal) && (object->paging_offset != 0)) {
8275 		panic("vm_object_iopl_request: external object with non-zero paging offset");
8276 	}
8277 
8278 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8279 
8280 #if CONFIG_IOSCHED || UPL_DEBUG
8281 	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8282 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8283 	}
8284 #endif
8285 
8286 #if CONFIG_IOSCHED
8287 	if (object->io_tracking) {
8288 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8289 		if (!is_kernel_object(object)) {
8290 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8291 		}
8292 	}
8293 #endif
8294 
8295 	if (object->phys_contiguous) {
8296 		psize = PAGE_SIZE;
8297 	} else {
8298 		psize = size;
8299 
8300 		dw_count = 0;
8301 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8302 		dwp_start = vm_page_delayed_work_get_ctx();
8303 		if (dwp_start == NULL) {
8304 			dwp_start = &dw_array;
8305 			dw_limit = 1;
8306 			dwp_finish_ctx = FALSE;
8307 		}
8308 
8309 		dwp = dwp_start;
8310 	}
8311 
8312 	if (cntrl_flags & UPL_SET_INTERNAL) {
8313 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8314 		user_page_list = size ? upl->page_list : NULL;
8315 	} else {
8316 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8317 	}
8318 	if (user_page_list) {
8319 		user_page_list[0].device = FALSE;
8320 	}
8321 	*upl_ptr = upl;
8322 
8323 	if (cntrl_flags & UPL_NOZEROFILLIO) {
8324 		DTRACE_VM4(upl_nozerofillio,
8325 		    vm_object_t, object,
8326 		    vm_object_offset_t, offset,
8327 		    upl_size_t, size,
8328 		    upl_t, upl);
8329 	}
8330 
8331 	upl->map_object = object;
8332 	upl->u_offset = original_offset;
8333 	upl->u_size = original_size;
8334 
8335 	size_in_pages = size / PAGE_SIZE;
8336 
8337 	if (is_kernel_object(object) &&
8338 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8339 		upl->flags |= UPL_KERNEL_OBJECT;
8340 #if UPL_DEBUG
8341 		vm_object_lock(object);
8342 #else
8343 		vm_object_lock_shared(object);
8344 #endif
8345 	} else {
8346 		vm_object_lock(object);
8347 		vm_object_activity_begin(object);
8348 	}
8349 	/*
8350 	 * paging in progress also protects the paging_offset
8351 	 */
8352 	upl->u_offset = original_offset + object->paging_offset;
8353 
8354 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8355 		/*
8356 		 * The user requested that access to the pages in this UPL
8357 		 * be blocked until the UPL is commited or aborted.
8358 		 */
8359 		upl->flags |= UPL_ACCESS_BLOCKED;
8360 	}
8361 
8362 #if CONFIG_IOSCHED || UPL_DEBUG
8363 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8364 		vm_object_activity_begin(object);
8365 		queue_enter(&object->uplq, upl, upl_t, uplq);
8366 	}
8367 #endif
8368 
8369 	if (object->phys_contiguous) {
8370 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8371 			assert(!object->blocked_access);
8372 			object->blocked_access = TRUE;
8373 		}
8374 
8375 		vm_object_unlock(object);
8376 
8377 		/*
8378 		 * don't need any shadow mappings for this one
8379 		 * since it is already I/O memory
8380 		 */
8381 		upl->flags |= UPL_DEVICE_MEMORY;
8382 
8383 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8384 
8385 		if (user_page_list) {
8386 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8387 			user_page_list[0].device = TRUE;
8388 		}
8389 		if (page_list_count != NULL) {
8390 			if (upl->flags & UPL_INTERNAL) {
8391 				*page_list_count = 0;
8392 			} else {
8393 				*page_list_count = 1;
8394 			}
8395 		}
8396 
8397 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8398 		if (task != NULL) {
8399 			ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
8400 		}
8401 		counter_add(&vm_page_grab_count_iopl, page_grab_count);
8402 		return KERN_SUCCESS;
8403 	}
8404 	if (!is_kernel_object(object) && object != compressor_object) {
8405 		/*
8406 		 * Protect user space from future COW operations
8407 		 */
8408 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8409 		if (!object->true_share &&
8410 		    vm_object_tracking_btlog) {
8411 			btlog_record(vm_object_tracking_btlog, object,
8412 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
8413 			    btref_get(__builtin_frame_address(0), 0));
8414 		}
8415 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8416 
8417 		vm_object_lock_assert_exclusive(object);
8418 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8419 
8420 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8421 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8422 		}
8423 	}
8424 
8425 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8426 	    object->vo_copy != VM_OBJECT_NULL) {
8427 		/*
8428 		 * Honor copy-on-write obligations
8429 		 *
8430 		 * The caller is gathering these pages and
8431 		 * might modify their contents.  We need to
8432 		 * make sure that the copy object has its own
8433 		 * private copies of these pages before we let
8434 		 * the caller modify them.
8435 		 *
8436 		 * NOTE: someone else could map the original object
8437 		 * after we've done this copy-on-write here, and they
8438 		 * could then see an inconsistent picture of the memory
8439 		 * while it's being modified via the UPL.  To prevent this,
8440 		 * we would have to block access to these pages until the
8441 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
8442 		 * code path for that...
8443 		 */
8444 		vm_object_update(object,
8445 		    offset,
8446 		    size,
8447 		    NULL,
8448 		    NULL,
8449 		    FALSE,              /* should_return */
8450 		    MEMORY_OBJECT_COPY_SYNC,
8451 		    VM_PROT_NO_CHANGE);
8452 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
8453 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8454 	}
8455 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8456 	    object->purgable != VM_PURGABLE_VOLATILE &&
8457 	    object->purgable != VM_PURGABLE_EMPTY &&
8458 	    object->vo_copy == NULL &&
8459 	    size == object->vo_size &&
8460 	    offset == 0 &&
8461 	    object->shadow == NULL &&
8462 	    object->pager == NULL) {
8463 		if (object->resident_page_count == size_in_pages) {
8464 			assert(object != compressor_object);
8465 			assert(!is_kernel_object(object));
8466 			fast_path_full_req = TRUE;
8467 		} else if (object->resident_page_count == 0) {
8468 			assert(object != compressor_object);
8469 			assert(!is_kernel_object(object));
8470 			fast_path_empty_req = TRUE;
8471 			set_cache_attr_needed = TRUE;
8472 		}
8473 	}
8474 
8475 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8476 		interruptible = THREAD_ABORTSAFE;
8477 	} else {
8478 		interruptible = THREAD_UNINT;
8479 	}
8480 
8481 	entry = 0;
8482 
8483 	xfer_size = size;
8484 	dst_offset = offset;
8485 
8486 	if (fast_path_full_req) {
8487 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8488 			goto finish;
8489 		}
8490 		/*
8491 		 * we couldn't complete the processing of this request on the fast path
8492 		 * so fall through to the slow path and finish up
8493 		 */
8494 	} else if (fast_path_empty_req) {
8495 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8496 			ret = KERN_MEMORY_ERROR;
8497 			goto return_err;
8498 		}
8499 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8500 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8501 
8502 		if (ret) {
8503 			free_wired_pages = TRUE;
8504 			goto return_err;
8505 		}
8506 		goto finish;
8507 	}
8508 
8509 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8510 	fault_info.lo_offset = offset;
8511 	fault_info.hi_offset = offset + xfer_size;
8512 	fault_info.mark_zf_absent = TRUE;
8513 	fault_info.interruptible = interruptible;
8514 	fault_info.batch_pmap_op = TRUE;
8515 
8516 	while (xfer_size) {
8517 		vm_fault_return_t       result;
8518 
8519 		dwp->dw_mask = 0;
8520 
8521 		if (fast_path_full_req) {
8522 			/*
8523 			 * if we get here, it means that we ran into a page
8524 			 * state we couldn't handle in the fast path and
8525 			 * bailed out to the slow path... since the order
8526 			 * we look at pages is different between the 2 paths,
8527 			 * the following check is needed to determine whether
8528 			 * this page was already processed in the fast path
8529 			 */
8530 			if (bitmap_test(upl->lite_list, entry)) {
8531 				goto skip_page;
8532 			}
8533 		}
8534 		dst_page = vm_page_lookup(object, dst_offset);
8535 
8536 		if (dst_page == VM_PAGE_NULL ||
8537 		    dst_page->vmp_busy ||
8538 		    VMP_ERROR_GET(dst_page) ||
8539 		    dst_page->vmp_restart ||
8540 		    dst_page->vmp_absent ||
8541 		    vm_page_is_fictitious(dst_page)) {
8542 			if (is_kernel_object(object)) {
8543 				panic("vm_object_iopl_request: missing/bad page in kernel object");
8544 			}
8545 			if (object == compressor_object) {
8546 				panic("vm_object_iopl_request: missing/bad page in compressor object");
8547 			}
8548 
8549 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8550 				ret = KERN_MEMORY_ERROR;
8551 				goto return_err;
8552 			}
8553 
8554 			if (dst_page != VM_PAGE_NULL &&
8555 			    dst_page->vmp_busy) {
8556 				wait_result_t wait_result;
8557 				vm_object_lock_assert_exclusive(object);
8558 				wait_result = vm_page_sleep(object, dst_page,
8559 				    interruptible, LCK_SLEEP_DEFAULT);
8560 				if (wait_result == THREAD_AWAKENED ||
8561 				    wait_result == THREAD_RESTART) {
8562 					continue;
8563 				}
8564 				ret = MACH_SEND_INTERRUPTED;
8565 				goto return_err;
8566 			}
8567 
8568 			set_cache_attr_needed = TRUE;
8569 
8570 			/*
8571 			 * We just looked up the page and the result remains valid
8572 			 * until the object lock is release, so send it to
8573 			 * vm_fault_page() (as "dst_page"), to avoid having to
8574 			 * look it up again there.
8575 			 */
8576 			caller_lookup = TRUE;
8577 
8578 			do {
8579 				vm_page_t       top_page;
8580 				kern_return_t   error_code;
8581 
8582 				fault_info.cluster_size = xfer_size;
8583 				vm_object_paging_begin(object);
8584 
8585 				result = vm_fault_page(object, dst_offset,
8586 				    prot | VM_PROT_WRITE, FALSE,
8587 				    caller_lookup,
8588 				    &prot, &dst_page, &top_page,
8589 				    (int *)0,
8590 				    &error_code, no_zero_fill,
8591 				    &fault_info);
8592 
8593 				/* our lookup is no longer valid at this point */
8594 				caller_lookup = FALSE;
8595 
8596 				switch (result) {
8597 				case VM_FAULT_SUCCESS:
8598 					page_grab_count++;
8599 
8600 					if (!dst_page->vmp_absent) {
8601 						vm_page_wakeup_done(object, dst_page);
8602 					} else {
8603 						/*
8604 						 * we only get back an absent page if we
8605 						 * requested that it not be zero-filled
8606 						 * because we are about to fill it via I/O
8607 						 *
8608 						 * absent pages should be left BUSY
8609 						 * to prevent them from being faulted
8610 						 * into an address space before we've
8611 						 * had a chance to complete the I/O on
8612 						 * them since they may contain info that
8613 						 * shouldn't be seen by the faulting task
8614 						 */
8615 					}
8616 					/*
8617 					 *	Release paging references and
8618 					 *	top-level placeholder page, if any.
8619 					 */
8620 					if (top_page != VM_PAGE_NULL) {
8621 						vm_object_t local_object;
8622 
8623 						local_object = VM_PAGE_OBJECT(top_page);
8624 
8625 						/*
8626 						 * comparing 2 packed pointers
8627 						 */
8628 						if (top_page->vmp_object != dst_page->vmp_object) {
8629 							vm_object_lock(local_object);
8630 							VM_PAGE_FREE(top_page);
8631 							vm_object_paging_end(local_object);
8632 							vm_object_unlock(local_object);
8633 						} else {
8634 							VM_PAGE_FREE(top_page);
8635 							vm_object_paging_end(local_object);
8636 						}
8637 					}
8638 					vm_object_paging_end(object);
8639 					break;
8640 
8641 				case VM_FAULT_RETRY:
8642 					vm_object_lock(object);
8643 					break;
8644 
8645 				case VM_FAULT_MEMORY_SHORTAGE:
8646 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8647 
8648 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8649 
8650 					if (vm_page_wait(interruptible)) {
8651 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8652 
8653 						VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8654 						vm_object_lock(object);
8655 
8656 						break;
8657 					}
8658 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8659 
8660 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8661 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8662 					OS_FALLTHROUGH;
8663 
8664 				case VM_FAULT_INTERRUPTED:
8665 					error_code = MACH_SEND_INTERRUPTED;
8666 					OS_FALLTHROUGH;
8667 				case VM_FAULT_MEMORY_ERROR:
8668 memory_error:
8669 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8670 
8671 					vm_object_lock(object);
8672 					goto return_err;
8673 
8674 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
8675 					/* success but no page: fail */
8676 					vm_object_paging_end(object);
8677 					vm_object_unlock(object);
8678 					goto memory_error;
8679 
8680 				default:
8681 					panic("vm_object_iopl_request: unexpected error"
8682 					    " 0x%x from vm_fault_page()\n", result);
8683 				}
8684 			} while (result != VM_FAULT_SUCCESS);
8685 		}
8686 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8687 
8688 		if (upl->flags & UPL_KERNEL_OBJECT) {
8689 			goto record_phys_addr;
8690 		}
8691 
8692 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8693 			dst_page->vmp_busy = TRUE;
8694 			goto record_phys_addr;
8695 		}
8696 
8697 		if (dst_page->vmp_cleaning) {
8698 			/*
8699 			 * Someone else is cleaning this page in place.
8700 			 * In theory, we should be able to  proceed and use this
8701 			 * page but they'll probably end up clearing the "busy"
8702 			 * bit on it in upl_commit_range() but they didn't set
8703 			 * it, so they would clear our "busy" bit and open
8704 			 * us to race conditions.
8705 			 * We'd better wait for the cleaning to complete and
8706 			 * then try again.
8707 			 */
8708 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8709 			vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8710 			continue;
8711 		}
8712 		if (dst_page->vmp_laundry) {
8713 			vm_pageout_steal_laundry(dst_page, FALSE);
8714 		}
8715 
8716 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8717 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
8718 			vm_page_t       new_page;
8719 			int             refmod;
8720 
8721 			/*
8722 			 * support devices that can't DMA above 32 bits
8723 			 * by substituting pages from a pool of low address
8724 			 * memory for any pages we find above the 4G mark
8725 			 * can't substitute if the page is already wired because
8726 			 * we don't know whether that physical address has been
8727 			 * handed out to some other 64 bit capable DMA device to use
8728 			 */
8729 			if (VM_PAGE_WIRED(dst_page)) {
8730 				ret = KERN_PROTECTION_FAILURE;
8731 				goto return_err;
8732 			}
8733 
8734 			new_page = vm_page_grablo(VM_PAGE_GRAB_OPTIONS_NONE);
8735 
8736 			if (new_page == VM_PAGE_NULL) {
8737 				ret = KERN_RESOURCE_SHORTAGE;
8738 				goto return_err;
8739 			}
8740 			/*
8741 			 * from here until the vm_page_replace completes
8742 			 * we musn't drop the object lock... we don't
8743 			 * want anyone refaulting this page in and using
8744 			 * it after we disconnect it... we want the fault
8745 			 * to find the new page being substituted.
8746 			 */
8747 			if (dst_page->vmp_pmapped) {
8748 				refmod = pmap_disconnect(phys_page);
8749 			} else {
8750 				refmod = 0;
8751 			}
8752 
8753 			if (!dst_page->vmp_absent) {
8754 				vm_page_copy(dst_page, new_page);
8755 			}
8756 
8757 			new_page->vmp_reference = dst_page->vmp_reference;
8758 			new_page->vmp_dirty     = dst_page->vmp_dirty;
8759 			new_page->vmp_absent    = dst_page->vmp_absent;
8760 
8761 			if (refmod & VM_MEM_REFERENCED) {
8762 				new_page->vmp_reference = TRUE;
8763 			}
8764 			if (refmod & VM_MEM_MODIFIED) {
8765 				SET_PAGE_DIRTY(new_page, FALSE);
8766 			}
8767 
8768 			vm_page_replace(new_page, object, dst_offset);
8769 
8770 			dst_page = new_page;
8771 			/*
8772 			 * vm_page_grablo returned the page marked
8773 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
8774 			 * here, because we've never dropped the object lock
8775 			 */
8776 			if (!dst_page->vmp_absent) {
8777 				dst_page->vmp_busy = FALSE;
8778 			}
8779 
8780 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8781 		}
8782 		if (!dst_page->vmp_busy) {
8783 			/*
8784 			 * Specify that we're wiring the page for I/O, which also means
8785 			 * that the delayed work handler may return KERN_PROTECTION_FAILURE
8786 			 * on certain configs if a page's mapping state doesn't allow I/O
8787 			 * wiring.  For the specifc case in which we're creating an IOPL
8788 			 * against an executable mapping, the buffer copy performed by
8789 			 * vm_map_create_upl() should prevent failure here, but we still
8790 			 * want to gracefully fail here if someone attempts to I/O-wire
8791 			 * an executable page through a named entry or non-executable
8792 			 * alias mapping.
8793 			 */
8794 			dwp->dw_mask |= (DW_vm_page_wire | DW_vm_page_iopl_wire);
8795 			if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8796 				dwp->dw_mask |= DW_vm_page_iopl_wire_write;
8797 			}
8798 		}
8799 
8800 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
8801 			/*
8802 			 * Mark the page "busy" to block any future page fault
8803 			 * on this page in addition to wiring it.
8804 			 * We'll also remove the mapping
8805 			 * of all these pages before leaving this routine.
8806 			 */
8807 			assert(!vm_page_is_fictitious(dst_page));
8808 			dst_page->vmp_busy = TRUE;
8809 		}
8810 		/*
8811 		 * expect the page to be used
8812 		 * page queues lock must be held to set 'reference'
8813 		 */
8814 		dwp->dw_mask |= DW_set_reference;
8815 
8816 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8817 			SET_PAGE_DIRTY(dst_page, TRUE);
8818 			/*
8819 			 * Page belonging to a code-signed object is about to
8820 			 * be written. Mark it tainted and disconnect it from
8821 			 * all pmaps so processes have to fault it back in and
8822 			 * deal with the tainted bit.
8823 			 */
8824 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8825 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8826 				vm_page_iopl_tainted++;
8827 				if (dst_page->vmp_pmapped) {
8828 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8829 					if (refmod & VM_MEM_REFERENCED) {
8830 						dst_page->vmp_reference = TRUE;
8831 					}
8832 				}
8833 			}
8834 		}
8835 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8836 			pmap_sync_page_attributes_phys(phys_page);
8837 			dst_page->vmp_written_by_kernel = FALSE;
8838 		}
8839 
8840 record_phys_addr:
8841 		if (dst_page->vmp_busy) {
8842 			upl->flags |= UPL_HAS_BUSY;
8843 		}
8844 
8845 		bitmap_set(upl->lite_list, entry);
8846 
8847 		if (phys_page > upl->highest_page) {
8848 			upl->highest_page = phys_page;
8849 		}
8850 
8851 		if (user_page_list) {
8852 			user_page_list[entry].phys_addr = phys_page;
8853 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
8854 			user_page_list[entry].absent    = dst_page->vmp_absent;
8855 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8856 			user_page_list[entry].precious  = dst_page->vmp_precious;
8857 			user_page_list[entry].device    = FALSE;
8858 			user_page_list[entry].needed    = FALSE;
8859 			if (dst_page->vmp_clustered == TRUE) {
8860 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8861 			} else {
8862 				user_page_list[entry].speculative = FALSE;
8863 			}
8864 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8865 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8866 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8867 			user_page_list[entry].mark      = FALSE;
8868 		}
8869 		if (!is_kernel_object(object) && object != compressor_object) {
8870 			/*
8871 			 * someone is explicitly grabbing this page...
8872 			 * update clustered and speculative state
8873 			 *
8874 			 */
8875 			if (dst_page->vmp_clustered) {
8876 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
8877 			}
8878 		}
8879 skip_page:
8880 		entry++;
8881 		dst_offset += PAGE_SIZE_64;
8882 		xfer_size -= PAGE_SIZE;
8883 
8884 		if (dwp->dw_mask) {
8885 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8886 
8887 			if (dw_count >= dw_limit) {
8888 				ret = vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8889 
8890 				dwp = dwp_start;
8891 				dw_count = 0;
8892 				if (ret != KERN_SUCCESS) {
8893 					goto return_err;
8894 				}
8895 			}
8896 		}
8897 	}
8898 	assert(entry == size_in_pages);
8899 
8900 	if (dw_count) {
8901 		ret = vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8902 		dwp = dwp_start;
8903 		dw_count = 0;
8904 		if (ret != KERN_SUCCESS) {
8905 			goto return_err;
8906 		}
8907 	}
8908 finish:
8909 	if (user_page_list && set_cache_attr_needed == TRUE) {
8910 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8911 	}
8912 
8913 	if (page_list_count != NULL) {
8914 		if (upl->flags & UPL_INTERNAL) {
8915 			*page_list_count = 0;
8916 		} else if (*page_list_count > size_in_pages) {
8917 			*page_list_count = size_in_pages;
8918 		}
8919 	}
8920 	vm_object_unlock(object);
8921 
8922 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8923 		/*
8924 		 * We've marked all the pages "busy" so that future
8925 		 * page faults will block.
8926 		 * Now remove the mapping for these pages, so that they
8927 		 * can't be accessed without causing a page fault.
8928 		 */
8929 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8930 		    PMAP_NULL,
8931 		    PAGE_SIZE,
8932 		    0, VM_PROT_NONE);
8933 		vm_object_lock(object);
8934 		assert(!object->blocked_access);
8935 		object->blocked_access = TRUE;
8936 		vm_object_unlock(object);
8937 	}
8938 
8939 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8940 	if (task != NULL) {
8941 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
8942 	}
8943 	counter_add(&vm_page_grab_count_iopl, page_grab_count);
8944 
8945 	if (dwp_start && dwp_finish_ctx) {
8946 		vm_page_delayed_work_finish_ctx(dwp_start);
8947 		dwp_start = dwp = NULL;
8948 	}
8949 
8950 	return KERN_SUCCESS;
8951 
8952 return_err:
8953 	dw_index = 0;
8954 
8955 	for (; offset < dst_offset; offset += PAGE_SIZE) {
8956 		boolean_t need_unwire;
8957 		bool need_wakeup;
8958 
8959 		dst_page = vm_page_lookup(object, offset);
8960 
8961 		if (dst_page == VM_PAGE_NULL) {
8962 			panic("vm_object_iopl_request: Wired page missing.");
8963 		}
8964 
8965 		/*
8966 		 * if we've already processed this page in an earlier
8967 		 * dw_do_work, we need to undo the wiring... we will
8968 		 * leave the dirty and reference bits on if they
8969 		 * were set, since we don't have a good way of knowing
8970 		 * what the previous state was and we won't get here
8971 		 * under any normal circumstances...  we will always
8972 		 * clear BUSY and wakeup any waiters via vm_page_free
8973 		 * or PAGE_WAKEUP_DONE
8974 		 */
8975 		need_unwire = TRUE;
8976 
8977 		need_wakeup = false;
8978 		if (dw_count) {
8979 			if ((dwp_start)[dw_index].dw_m == dst_page) {
8980 				/*
8981 				 * still in the deferred work list
8982 				 * which means we haven't yet called
8983 				 * vm_page_wire on this page
8984 				 */
8985 				need_unwire = FALSE;
8986 
8987 				if (dst_page->vmp_busy &&
8988 				    ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8989 					/*
8990 					 * It's our own "busy" bit, so we need to clear it
8991 					 * now and wake up waiters below.
8992 					 */
8993 					dst_page->vmp_busy = false;
8994 					need_wakeup = true;
8995 				}
8996 
8997 				dw_index++;
8998 				dw_count--;
8999 			}
9000 		}
9001 		vm_page_lock_queues();
9002 
9003 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9004 			vm_page_free(dst_page);
9005 
9006 			need_unwire = FALSE;
9007 		} else {
9008 			if (need_unwire == TRUE) {
9009 				vm_page_unwire(dst_page, TRUE);
9010 			}
9011 			if (dst_page->vmp_busy) {
9012 				/* not our "busy" or we would have cleared it above */
9013 				assert(!need_wakeup);
9014 			}
9015 			if (need_wakeup) {
9016 				assert(!dst_page->vmp_busy);
9017 				vm_page_wakeup(object, dst_page);
9018 			}
9019 		}
9020 		vm_page_unlock_queues();
9021 
9022 		if (need_unwire == TRUE) {
9023 			counter_inc(&vm_statistics_reactivations);
9024 		}
9025 	}
9026 #if UPL_DEBUG
9027 	upl->upl_state = 2;
9028 #endif
9029 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9030 		vm_object_activity_end(object);
9031 		vm_object_collapse(object, 0, TRUE);
9032 	}
9033 	vm_object_unlock(object);
9034 	upl_destroy(upl);
9035 	*upl_ptr = NULL;
9036 
9037 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9038 	if (task != NULL) {
9039 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9040 	}
9041 	counter_add(&vm_page_grab_count_iopl, page_grab_count);
9042 
9043 	if (dwp_start && dwp_finish_ctx) {
9044 		vm_page_delayed_work_finish_ctx(dwp_start);
9045 		dwp_start = dwp = NULL;
9046 	}
9047 	return ret;
9048 }
9049 
9050 kern_return_t
9051 upl_transpose(
9052 	upl_t           upl1,
9053 	upl_t           upl2)
9054 {
9055 	kern_return_t           retval;
9056 	boolean_t               upls_locked;
9057 	vm_object_t             object1, object2;
9058 
9059 	/* LD: Should mapped UPLs be eligible for a transpose? */
9060 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9061 		return KERN_INVALID_ARGUMENT;
9062 	}
9063 
9064 	upls_locked = FALSE;
9065 
9066 	/*
9067 	 * Since we need to lock both UPLs at the same time,
9068 	 * avoid deadlocks by always taking locks in the same order.
9069 	 */
9070 	if (upl1 < upl2) {
9071 		upl_lock(upl1);
9072 		upl_lock(upl2);
9073 	} else {
9074 		upl_lock(upl2);
9075 		upl_lock(upl1);
9076 	}
9077 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
9078 
9079 	object1 = upl1->map_object;
9080 	object2 = upl2->map_object;
9081 
9082 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9083 	    upl1->u_size != upl2->u_size) {
9084 		/*
9085 		 * We deal only with full objects, not subsets.
9086 		 * That's because we exchange the entire backing store info
9087 		 * for the objects: pager, resident pages, etc...  We can't do
9088 		 * only part of it.
9089 		 */
9090 		retval = KERN_INVALID_VALUE;
9091 		goto done;
9092 	}
9093 
9094 	/*
9095 	 * Tranpose the VM objects' backing store.
9096 	 */
9097 	retval = vm_object_transpose(object1, object2,
9098 	    upl_adjusted_size(upl1, PAGE_MASK));
9099 
9100 	if (retval == KERN_SUCCESS) {
9101 		/*
9102 		 * Make each UPL point to the correct VM object, i.e. the
9103 		 * object holding the pages that the UPL refers to...
9104 		 */
9105 #if CONFIG_IOSCHED || UPL_DEBUG
9106 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9107 			vm_object_lock(object1);
9108 			vm_object_lock(object2);
9109 		}
9110 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9111 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
9112 		}
9113 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9114 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
9115 		}
9116 #endif
9117 		upl1->map_object = object2;
9118 		upl2->map_object = object1;
9119 
9120 #if CONFIG_IOSCHED || UPL_DEBUG
9121 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9122 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
9123 		}
9124 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9125 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
9126 		}
9127 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9128 			vm_object_unlock(object2);
9129 			vm_object_unlock(object1);
9130 		}
9131 #endif
9132 	}
9133 
9134 done:
9135 	/*
9136 	 * Cleanup.
9137 	 */
9138 	if (upls_locked) {
9139 		upl_unlock(upl1);
9140 		upl_unlock(upl2);
9141 		upls_locked = FALSE;
9142 	}
9143 
9144 	return retval;
9145 }
9146 
9147 void
9148 upl_range_needed(
9149 	upl_t           upl,
9150 	int             index,
9151 	int             count)
9152 {
9153 	int             size_in_pages;
9154 
9155 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
9156 		return;
9157 	}
9158 
9159 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
9160 
9161 	while (count-- && index < size_in_pages) {
9162 		upl->page_list[index++].needed = TRUE;
9163 	}
9164 }
9165 
9166 
9167 /*
9168  * Reserve of virtual addresses in the kernel address space.
9169  * We need to map the physical pages in the kernel, so that we
9170  * can call the code-signing or slide routines with a kernel
9171  * virtual address.  We keep this pool of pre-allocated kernel
9172  * virtual addresses so that we don't have to scan the kernel's
9173  * virtaul address space each time we need to work with
9174  * a physical page.
9175  */
9176 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
9177 #define VM_PAGING_NUM_PAGES     64
9178 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
9179 bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9180 int             vm_paging_max_index = 0;
9181 int             vm_paging_page_waiter = 0;
9182 int             vm_paging_page_waiter_total = 0;
9183 
9184 unsigned long   vm_paging_no_kernel_page = 0;
9185 unsigned long   vm_paging_objects_mapped = 0;
9186 unsigned long   vm_paging_pages_mapped = 0;
9187 unsigned long   vm_paging_objects_mapped_slow = 0;
9188 unsigned long   vm_paging_pages_mapped_slow = 0;
9189 
9190 __startup_func
9191 static void
9192 vm_paging_map_init(void)
9193 {
9194 	kmem_alloc(kernel_map, &vm_paging_base_address,
9195 	    ptoa(VM_PAGING_NUM_PAGES),
9196 	    KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
9197 	    VM_KERN_MEMORY_NONE);
9198 }
9199 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
9200 
9201 /*
9202  * vm_paging_map_object:
9203  *	Maps part of a VM object's pages in the kernel
9204  *      virtual address space, using the pre-allocated
9205  *	kernel virtual addresses, if possible.
9206  * Context:
9207  *      The VM object is locked.  This lock will get
9208  *      dropped and re-acquired though, so the caller
9209  *      must make sure the VM object is kept alive
9210  *	(by holding a VM map that has a reference
9211  *      on it, for example, or taking an extra reference).
9212  *      The page should also be kept busy to prevent
9213  *	it from being reclaimed.
9214  */
9215 kern_return_t
9216 vm_paging_map_object(
9217 	vm_page_t               page,
9218 	vm_object_t             object,
9219 	vm_object_offset_t      offset,
9220 	vm_prot_t               protection,
9221 	boolean_t               can_unlock_object,
9222 	vm_map_size_t           *size,          /* IN/OUT */
9223 	vm_map_offset_t         *address,       /* OUT */
9224 	boolean_t               *need_unmap)    /* OUT */
9225 {
9226 	kern_return_t           kr;
9227 	vm_map_offset_t         page_map_offset;
9228 	vm_map_size_t           map_size;
9229 	vm_object_offset_t      object_offset;
9230 	int                     i;
9231 
9232 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9233 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
9234 		*address = (vm_map_offset_t)
9235 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9236 		*need_unmap = FALSE;
9237 		return KERN_SUCCESS;
9238 
9239 		assert(page->vmp_busy);
9240 		/*
9241 		 * Use one of the pre-allocated kernel virtual addresses
9242 		 * and just enter the VM page in the kernel address space
9243 		 * at that virtual address.
9244 		 */
9245 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9246 
9247 		/*
9248 		 * Try and find an available kernel virtual address
9249 		 * from our pre-allocated pool.
9250 		 */
9251 		page_map_offset = 0;
9252 		for (;;) {
9253 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9254 				if (vm_paging_page_inuse[i] == FALSE) {
9255 					page_map_offset =
9256 					    vm_paging_base_address +
9257 					    (i * PAGE_SIZE);
9258 					break;
9259 				}
9260 			}
9261 			if (page_map_offset != 0) {
9262 				/* found a space to map our page ! */
9263 				break;
9264 			}
9265 
9266 			if (can_unlock_object) {
9267 				/*
9268 				 * If we can afford to unlock the VM object,
9269 				 * let's take the slow path now...
9270 				 */
9271 				break;
9272 			}
9273 			/*
9274 			 * We can't afford to unlock the VM object, so
9275 			 * let's wait for a space to become available...
9276 			 */
9277 			vm_paging_page_waiter_total++;
9278 			vm_paging_page_waiter++;
9279 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9280 			if (kr == THREAD_WAITING) {
9281 				simple_unlock(&vm_paging_lock);
9282 				kr = thread_block(THREAD_CONTINUE_NULL);
9283 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9284 			}
9285 			vm_paging_page_waiter--;
9286 			/* ... and try again */
9287 		}
9288 
9289 		if (page_map_offset != 0) {
9290 			/*
9291 			 * We found a kernel virtual address;
9292 			 * map the physical page to that virtual address.
9293 			 */
9294 			if (i > vm_paging_max_index) {
9295 				vm_paging_max_index = i;
9296 			}
9297 			vm_paging_page_inuse[i] = TRUE;
9298 			simple_unlock(&vm_paging_lock);
9299 
9300 			page->vmp_pmapped = TRUE;
9301 
9302 			/*
9303 			 * Keep the VM object locked over the PMAP_ENTER
9304 			 * and the actual use of the page by the kernel,
9305 			 * or this pmap mapping might get undone by a
9306 			 * vm_object_pmap_protect() call...
9307 			 */
9308 			kr = pmap_enter_check(kernel_pmap,
9309 			    page_map_offset,
9310 			    page,
9311 			    protection,
9312 			    VM_PROT_NONE,
9313 			    TRUE);
9314 			assert(kr == KERN_SUCCESS);
9315 			vm_paging_objects_mapped++;
9316 			vm_paging_pages_mapped++;
9317 			*address = page_map_offset;
9318 			*need_unmap = TRUE;
9319 
9320 #if KASAN
9321 			kasan_notify_address(page_map_offset, PAGE_SIZE);
9322 #endif
9323 
9324 			/* all done and mapped, ready to use ! */
9325 			return KERN_SUCCESS;
9326 		}
9327 
9328 		/*
9329 		 * We ran out of pre-allocated kernel virtual
9330 		 * addresses.  Just map the page in the kernel
9331 		 * the slow and regular way.
9332 		 */
9333 		vm_paging_no_kernel_page++;
9334 		simple_unlock(&vm_paging_lock);
9335 	}
9336 
9337 	if (!can_unlock_object) {
9338 		*address = 0;
9339 		*size = 0;
9340 		*need_unmap = FALSE;
9341 		return KERN_NOT_SUPPORTED;
9342 	}
9343 
9344 	object_offset = vm_object_trunc_page(offset);
9345 	map_size = vm_map_round_page(*size,
9346 	    VM_MAP_PAGE_MASK(kernel_map));
9347 
9348 	/*
9349 	 * Try and map the required range of the object
9350 	 * in the kernel_map. Given that allocation is
9351 	 * for pageable memory, it shouldn't contain
9352 	 * pointers and is mapped into the data range.
9353 	 */
9354 
9355 	vm_object_reference_locked(object);     /* for the map entry */
9356 	vm_object_unlock(object);
9357 
9358 	kr = vm_map_enter(kernel_map,
9359 	    address,
9360 	    map_size,
9361 	    0,
9362 	    VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(),
9363 	    object,
9364 	    object_offset,
9365 	    FALSE,
9366 	    protection,
9367 	    VM_PROT_ALL,
9368 	    VM_INHERIT_NONE);
9369 	if (kr != KERN_SUCCESS) {
9370 		*address = 0;
9371 		*size = 0;
9372 		*need_unmap = FALSE;
9373 		vm_object_deallocate(object);   /* for the map entry */
9374 		vm_object_lock(object);
9375 		return kr;
9376 	}
9377 
9378 	*size = map_size;
9379 
9380 	/*
9381 	 * Enter the mapped pages in the page table now.
9382 	 */
9383 	vm_object_lock(object);
9384 	/*
9385 	 * VM object must be kept locked from before PMAP_ENTER()
9386 	 * until after the kernel is done accessing the page(s).
9387 	 * Otherwise, the pmap mappings in the kernel could be
9388 	 * undone by a call to vm_object_pmap_protect().
9389 	 */
9390 
9391 	for (page_map_offset = 0;
9392 	    map_size != 0;
9393 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9394 		page = vm_page_lookup(object, offset + page_map_offset);
9395 		if (page == VM_PAGE_NULL) {
9396 			printf("vm_paging_map_object: no page !?");
9397 			vm_object_unlock(object);
9398 			vm_map_remove(kernel_map, *address, *size);
9399 			*address = 0;
9400 			*size = 0;
9401 			*need_unmap = FALSE;
9402 			vm_object_lock(object);
9403 			return KERN_MEMORY_ERROR;
9404 		}
9405 		page->vmp_pmapped = TRUE;
9406 
9407 		kr = pmap_enter_check(kernel_pmap,
9408 		    *address + page_map_offset,
9409 		    page,
9410 		    protection,
9411 		    VM_PROT_NONE,
9412 		    TRUE);
9413 		assert(kr == KERN_SUCCESS);
9414 #if KASAN
9415 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9416 #endif
9417 	}
9418 
9419 	vm_paging_objects_mapped_slow++;
9420 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9421 
9422 	*need_unmap = TRUE;
9423 
9424 	return KERN_SUCCESS;
9425 }
9426 
9427 /*
9428  * vm_paging_unmap_object:
9429  *	Unmaps part of a VM object's pages from the kernel
9430  *      virtual address space.
9431  * Context:
9432  *      The VM object is locked.  This lock will get
9433  *      dropped and re-acquired though.
9434  */
9435 void
9436 vm_paging_unmap_object(
9437 	vm_object_t     object,
9438 	vm_map_offset_t start,
9439 	vm_map_offset_t end)
9440 {
9441 	int             i;
9442 
9443 	if ((vm_paging_base_address == 0) ||
9444 	    (start < vm_paging_base_address) ||
9445 	    (end > (vm_paging_base_address
9446 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9447 		/*
9448 		 * We didn't use our pre-allocated pool of
9449 		 * kernel virtual address.  Deallocate the
9450 		 * virtual memory.
9451 		 */
9452 		if (object != VM_OBJECT_NULL) {
9453 			vm_object_unlock(object);
9454 		}
9455 		vm_map_remove(kernel_map, start, end);
9456 		if (object != VM_OBJECT_NULL) {
9457 			vm_object_lock(object);
9458 		}
9459 	} else {
9460 		/*
9461 		 * We used a kernel virtual address from our
9462 		 * pre-allocated pool.  Put it back in the pool
9463 		 * for next time.
9464 		 */
9465 		assert(end - start == PAGE_SIZE);
9466 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9467 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9468 
9469 		/* undo the pmap mapping */
9470 		pmap_remove(kernel_pmap, start, end);
9471 
9472 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9473 		vm_paging_page_inuse[i] = FALSE;
9474 		if (vm_paging_page_waiter) {
9475 			thread_wakeup(&vm_paging_page_waiter);
9476 		}
9477 		simple_unlock(&vm_paging_lock);
9478 	}
9479 }
9480 
9481 
9482 /*
9483  * page->vmp_object must be locked
9484  */
9485 void
9486 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9487 {
9488 	if (!queues_locked) {
9489 		vm_page_lockspin_queues();
9490 	}
9491 
9492 	page->vmp_free_when_done = FALSE;
9493 	/*
9494 	 * need to drop the laundry count...
9495 	 * we may also need to remove it
9496 	 * from the I/O paging queue...
9497 	 * vm_pageout_throttle_up handles both cases
9498 	 *
9499 	 * the laundry and pageout_queue flags are cleared...
9500 	 */
9501 	vm_pageout_throttle_up(page);
9502 
9503 	if (!queues_locked) {
9504 		vm_page_unlock_queues();
9505 	}
9506 }
9507 
9508 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9509 
9510 upl_t
9511 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9512 {
9513 	upl_t   upl;
9514 
9515 	assert(max_upls > 0);
9516 	if (max_upls == 0) {
9517 		return NULL;
9518 	}
9519 
9520 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9521 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9522 	}
9523 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL | Z_ZERO);
9524 
9525 	upl = upl_create(0, UPL_VECTOR, 0);
9526 	upl->vector_upl = vector_upl;
9527 	upl->u_offset = upl_offset;
9528 	vector_upl->offset = upl_offset;
9529 	vector_upl->max_upls = max_upls;
9530 
9531 	return upl;
9532 }
9533 
9534 upl_size_t
9535 vector_upl_get_size(const upl_t upl)
9536 {
9537 	if (!vector_upl_is_valid(upl)) {
9538 		return upl_get_size(upl);
9539 	} else {
9540 		return round_page_32(upl->vector_upl->size);
9541 	}
9542 }
9543 
9544 uint32_t
9545 vector_upl_max_upls(const upl_t upl)
9546 {
9547 	if (!vector_upl_is_valid(upl)) {
9548 		return 0;
9549 	}
9550 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
9551 }
9552 
9553 void
9554 vector_upl_deallocate(upl_t upl)
9555 {
9556 	vector_upl_t vector_upl = upl->vector_upl;
9557 
9558 	assert(vector_upl_is_valid(upl));
9559 
9560 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
9561 		panic("Deallocating non-empty Vectored UPL");
9562 	}
9563 	uint32_t max_upls = vector_upl->max_upls;
9564 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9565 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9566 	upl->vector_upl = NULL;
9567 }
9568 
9569 boolean_t
9570 vector_upl_is_valid(upl_t upl)
9571 {
9572 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9573 }
9574 
9575 boolean_t
9576 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9577 {
9578 	if (vector_upl_is_valid(upl)) {
9579 		vector_upl_t vector_upl = upl->vector_upl;
9580 
9581 		if (vector_upl) {
9582 			if (subupl) {
9583 				if (io_size) {
9584 					if (io_size < PAGE_SIZE) {
9585 						io_size = PAGE_SIZE;
9586 					}
9587 					subupl->vector_upl = (void*)vector_upl;
9588 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9589 					vector_upl->size += io_size;
9590 					upl->u_size += io_size;
9591 				} else {
9592 					uint32_t i = 0, invalid_upls = 0;
9593 					for (i = 0; i < vector_upl->num_upls; i++) {
9594 						if (vector_upl->upls[i].elem == subupl) {
9595 							break;
9596 						}
9597 					}
9598 					if (i == vector_upl->num_upls) {
9599 						panic("Trying to remove sub-upl when none exists");
9600 					}
9601 
9602 					vector_upl->upls[i].elem = NULL;
9603 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9604 					    relaxed);
9605 					if (invalid_upls == vector_upl->num_upls) {
9606 						return TRUE;
9607 					} else {
9608 						return FALSE;
9609 					}
9610 				}
9611 			} else {
9612 				panic("vector_upl_set_subupl was passed a NULL upl element");
9613 			}
9614 		} else {
9615 			panic("vector_upl_set_subupl was passed a non-vectored upl");
9616 		}
9617 	} else {
9618 		panic("vector_upl_set_subupl was passed a NULL upl");
9619 	}
9620 
9621 	return FALSE;
9622 }
9623 
9624 void
9625 vector_upl_set_pagelist(upl_t upl)
9626 {
9627 	if (vector_upl_is_valid(upl)) {
9628 		uint32_t i = 0;
9629 		vector_upl_t vector_upl = upl->vector_upl;
9630 
9631 		if (vector_upl) {
9632 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9633 
9634 			vector_upl->pagelist = kalloc_type(struct upl_page_info,
9635 			    atop(vector_upl->size), Z_WAITOK);
9636 
9637 			for (i = 0; i < vector_upl->num_upls; i++) {
9638 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9639 				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9640 				pagelist_size += cur_upl_pagelist_size;
9641 				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9642 					upl->highest_page = vector_upl->upls[i].elem->highest_page;
9643 				}
9644 			}
9645 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9646 		} else {
9647 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
9648 		}
9649 	} else {
9650 		panic("vector_upl_set_pagelist was passed a NULL upl");
9651 	}
9652 }
9653 
9654 upl_t
9655 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9656 {
9657 	if (vector_upl_is_valid(upl)) {
9658 		vector_upl_t vector_upl = upl->vector_upl;
9659 		if (vector_upl) {
9660 			if (index < vector_upl->num_upls) {
9661 				return vector_upl->upls[index].elem;
9662 			}
9663 		} else {
9664 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9665 		}
9666 	}
9667 	return NULL;
9668 }
9669 
9670 upl_t
9671 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9672 {
9673 	if (vector_upl_is_valid(upl)) {
9674 		uint32_t i = 0;
9675 		vector_upl_t vector_upl = upl->vector_upl;
9676 
9677 		if (vector_upl) {
9678 			upl_t subupl = NULL;
9679 			vector_upl_iostates_t subupl_state;
9680 
9681 			for (i = 0; i < vector_upl->num_upls; i++) {
9682 				subupl = vector_upl->upls[i].elem;
9683 				subupl_state = vector_upl->upls[i].iostate;
9684 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9685 					/* We could have been passed an offset/size pair that belongs
9686 					 * to an UPL element that has already been committed/aborted.
9687 					 * If so, return NULL.
9688 					 */
9689 					if (subupl == NULL) {
9690 						return NULL;
9691 					}
9692 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9693 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9694 						if (*upl_size > subupl_state.size) {
9695 							*upl_size = subupl_state.size;
9696 						}
9697 					}
9698 					if (*upl_offset >= subupl_state.offset) {
9699 						*upl_offset -= subupl_state.offset;
9700 					} else if (i) {
9701 						panic("Vector UPL offset miscalculation");
9702 					}
9703 					return subupl;
9704 				}
9705 			}
9706 		} else {
9707 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
9708 		}
9709 	}
9710 	return NULL;
9711 }
9712 
9713 void
9714 vector_upl_get_addr(upl_t upl, vm_offset_t *dst_addr)
9715 {
9716 	if (vector_upl_is_valid(upl)) {
9717 		vector_upl_t vector_upl = upl->vector_upl;
9718 		if (vector_upl) {
9719 			assert(vector_upl->dst_addr != 0);
9720 			*dst_addr = vector_upl->dst_addr;
9721 		} else {
9722 			panic("%s was passed a non-vectored UPL", __func__);
9723 		}
9724 	} else {
9725 		panic("%s was passed a null UPL", __func__);
9726 	}
9727 }
9728 
9729 void
9730 vector_upl_set_addr(upl_t upl, vm_offset_t dst_addr)
9731 {
9732 	if (vector_upl_is_valid(upl)) {
9733 		vector_upl_t vector_upl = upl->vector_upl;
9734 		if (vector_upl) {
9735 			if (dst_addr) {
9736 				/* setting a new value: do not overwrite an old one */
9737 				assert(vector_upl->dst_addr == 0);
9738 			} else {
9739 				/* resetting: make sure there was an old value */
9740 				assert(vector_upl->dst_addr != 0);
9741 			}
9742 			vector_upl->dst_addr = dst_addr;
9743 		} else {
9744 			panic("%s was passed a non-vectored UPL", __func__);
9745 		}
9746 	} else {
9747 		panic("%s was passed a NULL UPL", __func__);
9748 	}
9749 }
9750 
9751 void
9752 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9753 {
9754 	if (vector_upl_is_valid(upl)) {
9755 		uint32_t i = 0;
9756 		vector_upl_t vector_upl = upl->vector_upl;
9757 
9758 		if (vector_upl) {
9759 			for (i = 0; i < vector_upl->num_upls; i++) {
9760 				if (vector_upl->upls[i].elem == subupl) {
9761 					break;
9762 				}
9763 			}
9764 
9765 			if (i == vector_upl->num_upls) {
9766 				panic("setting sub-upl iostate when none exists");
9767 			}
9768 
9769 			vector_upl->upls[i].iostate.offset = offset;
9770 			if (size < PAGE_SIZE) {
9771 				size = PAGE_SIZE;
9772 			}
9773 			vector_upl->upls[i].iostate.size = size;
9774 		} else {
9775 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
9776 		}
9777 	} else {
9778 		panic("vector_upl_set_iostate was passed a NULL UPL");
9779 	}
9780 }
9781 
9782 void
9783 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9784 {
9785 	if (vector_upl_is_valid(upl)) {
9786 		uint32_t i = 0;
9787 		vector_upl_t vector_upl = upl->vector_upl;
9788 
9789 		if (vector_upl) {
9790 			for (i = 0; i < vector_upl->num_upls; i++) {
9791 				if (vector_upl->upls[i].elem == subupl) {
9792 					break;
9793 				}
9794 			}
9795 
9796 			if (i == vector_upl->num_upls) {
9797 				panic("getting sub-upl iostate when none exists");
9798 			}
9799 
9800 			*offset = vector_upl->upls[i].iostate.offset;
9801 			*size = vector_upl->upls[i].iostate.size;
9802 		} else {
9803 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
9804 		}
9805 	} else {
9806 		panic("vector_upl_get_iostate was passed a NULL UPL");
9807 	}
9808 }
9809 
9810 void
9811 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9812 {
9813 	if (vector_upl_is_valid(upl)) {
9814 		vector_upl_t vector_upl = upl->vector_upl;
9815 		if (vector_upl) {
9816 			if (index < vector_upl->num_upls) {
9817 				*offset = vector_upl->upls[index].iostate.offset;
9818 				*size = vector_upl->upls[index].iostate.size;
9819 			} else {
9820 				*offset = *size = 0;
9821 			}
9822 		} else {
9823 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9824 		}
9825 	} else {
9826 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9827 	}
9828 }
9829 
9830 void *
9831 upl_get_internal_vectorupl(upl_t upl)
9832 {
9833 	return upl->vector_upl;
9834 }
9835 
9836 upl_page_info_t *
9837 upl_get_internal_vectorupl_pagelist(upl_t upl)
9838 {
9839 	return upl->vector_upl->pagelist;
9840 }
9841 
9842 upl_page_info_t *
9843 upl_get_internal_page_list(upl_t upl)
9844 {
9845 	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
9846 }
9847 
9848 void
9849 upl_clear_dirty(
9850 	upl_t           upl,
9851 	boolean_t       value)
9852 {
9853 	if (value) {
9854 		upl->flags |= UPL_CLEAR_DIRTY;
9855 	} else {
9856 		upl->flags &= ~UPL_CLEAR_DIRTY;
9857 	}
9858 }
9859 
9860 void
9861 upl_set_referenced(
9862 	upl_t           upl,
9863 	boolean_t       value)
9864 {
9865 	upl_lock(upl);
9866 	if (value) {
9867 		upl->ext_ref_count++;
9868 	} else {
9869 		if (!upl->ext_ref_count) {
9870 			panic("upl_set_referenced not %p", upl);
9871 		}
9872 		upl->ext_ref_count--;
9873 	}
9874 	upl_unlock(upl);
9875 }
9876 
9877 void
9878 upl_set_map_exclusive(upl_t upl)
9879 {
9880 	upl_lock(upl);
9881 	while (upl->map_addr_owner) {
9882 		upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
9883 		upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
9884 	}
9885 	upl->map_addr_owner = thread_get_ctid(current_thread());
9886 	upl_unlock(upl);
9887 }
9888 
9889 void
9890 upl_clear_map_exclusive(upl_t upl)
9891 {
9892 	assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
9893 	upl_lock(upl);
9894 	if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
9895 		upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
9896 		upl_wakeup(&upl->map_addr_owner);
9897 	}
9898 	upl->map_addr_owner = 0;
9899 	upl_unlock(upl);
9900 }
9901 
9902 #if CONFIG_IOSCHED
9903 void
9904 upl_set_blkno(
9905 	upl_t           upl,
9906 	vm_offset_t     upl_offset,
9907 	int             io_size,
9908 	int64_t         blkno)
9909 {
9910 	int i, j;
9911 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9912 		return;
9913 	}
9914 
9915 	assert(upl->upl_reprio_info != 0);
9916 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9917 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9918 	}
9919 }
9920 #endif
9921 
9922 void inline
9923 memoryshot(unsigned int event, unsigned int control)
9924 {
9925 	if (vm_debug_events) {
9926 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9927 		    vm_page_active_count, vm_page_inactive_count,
9928 		    vm_page_free_count, vm_page_speculative_count,
9929 		    vm_page_throttled_count);
9930 	} else {
9931 		(void) event;
9932 		(void) control;
9933 	}
9934 }
9935 
9936 #ifdef MACH_BSD
9937 
9938 boolean_t
9939 upl_device_page(upl_page_info_t *upl)
9940 {
9941 	return UPL_DEVICE_PAGE(upl);
9942 }
9943 boolean_t
9944 upl_page_present(upl_page_info_t *upl, int index)
9945 {
9946 	return UPL_PAGE_PRESENT(upl, index);
9947 }
9948 boolean_t
9949 upl_speculative_page(upl_page_info_t *upl, int index)
9950 {
9951 	return UPL_SPECULATIVE_PAGE(upl, index);
9952 }
9953 boolean_t
9954 upl_dirty_page(upl_page_info_t *upl, int index)
9955 {
9956 	return UPL_DIRTY_PAGE(upl, index);
9957 }
9958 boolean_t
9959 upl_valid_page(upl_page_info_t *upl, int index)
9960 {
9961 	return UPL_VALID_PAGE(upl, index);
9962 }
9963 ppnum_t
9964 upl_phys_page(upl_page_info_t *upl, int index)
9965 {
9966 	return UPL_PHYS_PAGE(upl, index);
9967 }
9968 
9969 void
9970 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9971 {
9972 	upl[index].mark = v;
9973 }
9974 
9975 boolean_t
9976 upl_page_get_mark(upl_page_info_t *upl, int index)
9977 {
9978 	return upl[index].mark;
9979 }
9980 
9981 boolean_t
9982 upl_page_is_needed(upl_page_info_t *upl, int index)
9983 {
9984 	return upl[index].needed;
9985 }
9986 
9987 void
9988 vm_countdirtypages(void)
9989 {
9990 	vm_page_t m;
9991 	int dpages;
9992 	int pgopages;
9993 	int precpages;
9994 
9995 
9996 	dpages = 0;
9997 	pgopages = 0;
9998 	precpages = 0;
9999 
10000 	vm_page_lock_queues();
10001 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10002 	do {
10003 		if (m == (vm_page_t)0) {
10004 			break;
10005 		}
10006 
10007 		if (m->vmp_dirty) {
10008 			dpages++;
10009 		}
10010 		if (m->vmp_free_when_done) {
10011 			pgopages++;
10012 		}
10013 		if (m->vmp_precious) {
10014 			precpages++;
10015 		}
10016 
10017 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10018 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10019 		if (m == (vm_page_t)0) {
10020 			break;
10021 		}
10022 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10023 	vm_page_unlock_queues();
10024 
10025 	vm_page_lock_queues();
10026 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10027 	do {
10028 		if (m == (vm_page_t)0) {
10029 			break;
10030 		}
10031 
10032 		dpages++;
10033 		assert(m->vmp_dirty);
10034 		assert(!m->vmp_free_when_done);
10035 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10036 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10037 		if (m == (vm_page_t)0) {
10038 			break;
10039 		}
10040 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10041 	vm_page_unlock_queues();
10042 
10043 	vm_page_lock_queues();
10044 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10045 	do {
10046 		if (m == (vm_page_t)0) {
10047 			break;
10048 		}
10049 
10050 		if (m->vmp_dirty) {
10051 			dpages++;
10052 		}
10053 		if (m->vmp_free_when_done) {
10054 			pgopages++;
10055 		}
10056 		if (m->vmp_precious) {
10057 			precpages++;
10058 		}
10059 
10060 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10061 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10062 		if (m == (vm_page_t)0) {
10063 			break;
10064 		}
10065 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10066 	vm_page_unlock_queues();
10067 
10068 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10069 
10070 	dpages = 0;
10071 	pgopages = 0;
10072 	precpages = 0;
10073 
10074 	vm_page_lock_queues();
10075 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10076 
10077 	do {
10078 		if (m == (vm_page_t)0) {
10079 			break;
10080 		}
10081 		if (m->vmp_dirty) {
10082 			dpages++;
10083 		}
10084 		if (m->vmp_free_when_done) {
10085 			pgopages++;
10086 		}
10087 		if (m->vmp_precious) {
10088 			precpages++;
10089 		}
10090 
10091 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10092 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10093 		if (m == (vm_page_t)0) {
10094 			break;
10095 		}
10096 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10097 	vm_page_unlock_queues();
10098 
10099 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10100 }
10101 #endif /* MACH_BSD */
10102 
10103 
10104 #if CONFIG_IOSCHED
10105 int
10106 upl_get_cached_tier(upl_t  upl)
10107 {
10108 	assert(upl);
10109 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10110 		return upl->upl_priority;
10111 	}
10112 	return -1;
10113 }
10114 #endif /* CONFIG_IOSCHED */
10115 
10116 
10117 void
10118 upl_callout_iodone(upl_t upl)
10119 {
10120 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
10121 
10122 	if (upl_ctx) {
10123 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
10124 
10125 		assert(upl_ctx->io_done);
10126 
10127 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
10128 	}
10129 }
10130 
10131 void
10132 upl_set_iodone(upl_t upl, void *upl_iodone)
10133 {
10134 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
10135 }
10136 
10137 void
10138 upl_set_iodone_error(upl_t upl, int error)
10139 {
10140 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
10141 
10142 	if (upl_ctx) {
10143 		upl_ctx->io_error = error;
10144 	}
10145 }
10146 
10147 
10148 ppnum_t
10149 upl_get_highest_page(
10150 	upl_t                      upl)
10151 {
10152 	return upl->highest_page;
10153 }
10154 
10155 upl_size_t
10156 upl_get_size(
10157 	upl_t                      upl)
10158 {
10159 	return upl_adjusted_size(upl, PAGE_MASK);
10160 }
10161 
10162 upl_size_t
10163 upl_adjusted_size(
10164 	upl_t upl,
10165 	vm_map_offset_t pgmask)
10166 {
10167 	vm_object_offset_t start_offset, end_offset;
10168 
10169 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
10170 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
10171 
10172 	return (upl_size_t)(end_offset - start_offset);
10173 }
10174 
10175 vm_object_offset_t
10176 upl_adjusted_offset(
10177 	upl_t upl,
10178 	vm_map_offset_t pgmask)
10179 {
10180 	return trunc_page_mask_64(upl->u_offset, pgmask);
10181 }
10182 
10183 vm_object_offset_t
10184 upl_get_data_offset(
10185 	upl_t upl)
10186 {
10187 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
10188 }
10189 
10190 upl_t
10191 upl_associated_upl(upl_t upl)
10192 {
10193 	if (!(upl->flags & UPL_HAS_FS_VERIFY_INFO)) {
10194 		return upl->u_fs_un.associated_upl;
10195 	}
10196 	return NULL;
10197 }
10198 
10199 void
10200 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
10201 {
10202 	assert(!(upl->flags & UPL_HAS_FS_VERIFY_INFO));
10203 	upl->u_fs_un.associated_upl = associated_upl;
10204 }
10205 
10206 bool
10207 upl_has_fs_verify_info(upl_t upl)
10208 {
10209 	return upl->flags & UPL_HAS_FS_VERIFY_INFO;
10210 }
10211 
10212 void
10213 upl_set_fs_verify_info(upl_t upl, uint32_t size)
10214 {
10215 	struct upl_fs_verify_info *fs_verify_infop;
10216 
10217 	if (upl->flags & UPL_HAS_FS_VERIFY_INFO || !size) {
10218 		return;
10219 	}
10220 
10221 	fs_verify_infop = kalloc_type(struct upl_fs_verify_info, Z_WAITOK);
10222 	fs_verify_infop->verify_data_ptr = kalloc_data(size, Z_WAITOK);
10223 	fs_verify_infop->verify_data_len = size;
10224 
10225 	upl_lock(upl);
10226 	if (upl->flags & UPL_HAS_FS_VERIFY_INFO) {
10227 		upl_unlock(upl);
10228 
10229 		assert(upl->u_fs_un.verify_info &&
10230 		    upl->u_fs_un.verify_info->verify_data_len > 0 &&
10231 		    upl->u_fs_un.verify_info->verify_data_len <= upl_adjusted_size(upl, PAGE_MASK));
10232 
10233 		kfree_data(fs_verify_infop->verify_data_ptr, size);
10234 		kfree_type(struct upl_fs_verify_info, fs_verify_infop);
10235 	} else {
10236 		upl->flags |= UPL_HAS_FS_VERIFY_INFO;
10237 		upl->u_fs_un.verify_info = fs_verify_infop;
10238 
10239 		upl_unlock(upl);
10240 	}
10241 }
10242 
10243 uint8_t *
10244 upl_fs_verify_buf(upl_t upl, uint32_t *size)
10245 {
10246 	assert(size);
10247 
10248 	if (!(upl->flags & UPL_HAS_FS_VERIFY_INFO)) {
10249 		*size = 0;
10250 		return NULL;
10251 	}
10252 
10253 	*size = upl->u_fs_un.verify_info->verify_data_len;
10254 	return upl->u_fs_un.verify_info->verify_data_ptr;
10255 }
10256 
10257 struct vnode *
10258 upl_lookup_vnode(upl_t upl)
10259 {
10260 	if (!upl->map_object->internal) {
10261 		return vnode_pager_lookup_vnode(upl->map_object->pager);
10262 	} else {
10263 		return NULL;
10264 	}
10265 }
10266 
10267 boolean_t
10268 upl_has_wired_pages(upl_t upl)
10269 {
10270 	return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
10271 }
10272 
10273 #if UPL_DEBUG
10274 kern_return_t
10275 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10276 {
10277 	upl->ubc_alias1 = alias1;
10278 	upl->ubc_alias2 = alias2;
10279 	return KERN_SUCCESS;
10280 }
10281 int
10282 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10283 {
10284 	if (al) {
10285 		*al = upl->ubc_alias1;
10286 	}
10287 	if (al2) {
10288 		*al2 = upl->ubc_alias2;
10289 	}
10290 	return KERN_SUCCESS;
10291 }
10292 #endif /* UPL_DEBUG */
10293 
10294 #if VM_PRESSURE_EVENTS
10295 /*
10296  * Upward trajectory.
10297  */
10298 
10299 boolean_t
10300 VM_PRESSURE_NORMAL_TO_WARNING(void)
10301 {
10302 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10303 		/* Available pages below our threshold */
10304 		uint32_t available_pages = memorystatus_get_available_page_count();
10305 		if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
10306 #if CONFIG_FREEZE
10307 			/* No frozen processes to kill */
10308 			if (memorystatus_frozen_count == 0) {
10309 				/* Not enough suspended processes available. */
10310 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10311 					return TRUE;
10312 				}
10313 			}
10314 #else /* CONFIG_FREEZE */
10315 			return TRUE;
10316 #endif /* CONFIG_FREEZE */
10317 		}
10318 		return FALSE;
10319 	} else {
10320 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10321 	}
10322 }
10323 
10324 boolean_t
10325 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10326 {
10327 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10328 		/* Available pages below our threshold */
10329 		uint32_t available_pages = memorystatus_get_available_page_count();
10330 		return available_pages < memorystatus_get_critical_page_shortage_threshold();
10331 	} else {
10332 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10333 	}
10334 }
10335 
10336 /*
10337  * Downward trajectory.
10338  */
10339 boolean_t
10340 VM_PRESSURE_WARNING_TO_NORMAL(void)
10341 {
10342 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10343 		/* Available pages above our threshold */
10344 		uint32_t available_pages = memorystatus_get_available_page_count();
10345 		uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
10346 		return available_pages > target_threshold;
10347 	} else {
10348 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10349 	}
10350 }
10351 
10352 boolean_t
10353 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10354 {
10355 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10356 		uint32_t available_pages = memorystatus_get_available_page_count();
10357 		uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
10358 		return available_pages > target_threshold;
10359 	} else {
10360 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10361 	}
10362 }
10363 #endif /* VM_PRESSURE_EVENTS */
10364 
10365 #if DEVELOPMENT || DEBUG
10366 bool compressor_running_perf_test;
10367 uint64_t compressor_perf_test_pages_processed;
10368 
10369 static kern_return_t
10370 move_pages_to_queue(
10371 	vm_map_t map,
10372 	user_addr_t start_addr,
10373 	size_t buffer_size,
10374 	vm_page_queue_head_t *queue,
10375 	size_t *pages_moved)
10376 {
10377 	kern_return_t err = KERN_SUCCESS;
10378 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10379 	boolean_t addr_in_map = FALSE;
10380 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10381 	vm_object_t curr_object = VM_OBJECT_NULL;
10382 	*pages_moved = 0;
10383 
10384 	vmlp_api_start(MOVE_PAGES_TO_QUEUE);
10385 
10386 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10387 		/*
10388 		 * We don't currently support benchmarking maps with a different page size
10389 		 * than the kernel.
10390 		 */
10391 		vmlp_api_end(MOVE_PAGES_TO_QUEUE, KERN_INVALID_ARGUMENT);
10392 		return KERN_INVALID_ARGUMENT;
10393 	}
10394 
10395 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10396 		vmlp_api_end(MOVE_PAGES_TO_QUEUE, KERN_INVALID_ARGUMENT);
10397 		return KERN_INVALID_ARGUMENT;
10398 	}
10399 
10400 	vm_map_lock_read(map);
10401 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10402 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10403 
10404 
10405 	while (curr_addr < end_addr) {
10406 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10407 		if (!addr_in_map) {
10408 			err = KERN_INVALID_ARGUMENT;
10409 			break;
10410 		}
10411 
10412 		vmlp_range_event_entry(map, curr_entry);
10413 
10414 		curr_object = VME_OBJECT(curr_entry);
10415 		if (curr_object) {
10416 			vm_object_lock(curr_object);
10417 			/* We really only want anonymous memory that's in the top level map and object here. */
10418 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10419 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10420 				err = KERN_INVALID_ARGUMENT;
10421 				vm_object_unlock(curr_object);
10422 				break;
10423 			}
10424 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10425 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10426 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
10427 			vm_map_offset_t curr_offset = start_offset;
10428 			vm_page_t curr_page;
10429 			while (curr_offset < end_offset) {
10430 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10431 				if (curr_page != VM_PAGE_NULL) {
10432 					vm_page_lock_queues();
10433 					if (curr_page->vmp_laundry) {
10434 						vm_pageout_steal_laundry(curr_page, TRUE);
10435 					}
10436 					/*
10437 					 * we've already factored out pages in the laundry which
10438 					 * means this page can't be on the pageout queue so it's
10439 					 * safe to do the vm_page_queues_remove
10440 					 */
10441 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10442 					vm_page_queues_remove(curr_page, TRUE);
10443 					if (donate) {
10444 						/*
10445 						 * The compressor needs to see this bit to know
10446 						 * where this page needs to land. Also if stolen,
10447 						 * this bit helps put the page back in the right
10448 						 * special queue where it belongs.
10449 						 */
10450 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10451 					}
10452 					// Clear the referenced bit so we ensure this gets paged out
10453 					curr_page->vmp_reference = false;
10454 					if (curr_page->vmp_pmapped) {
10455 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10456 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10457 					}
10458 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
10459 					vm_page_unlock_queues();
10460 					*pages_moved += 1;
10461 				}
10462 				curr_offset += PAGE_SIZE_64;
10463 				curr_addr += PAGE_SIZE_64;
10464 			}
10465 		}
10466 		vm_object_unlock(curr_object);
10467 	}
10468 	vm_map_unlock_read(map);
10469 	vmlp_api_end(MOVE_PAGES_TO_QUEUE, err);
10470 	return err;
10471 }
10472 
10473 /*
10474  * Local queue for processing benchmark pages.
10475  * Can't be allocated on the stack because the pointer has to
10476  * be packable.
10477  */
10478 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
10479 kern_return_t
10480 run_compressor_perf_test(
10481 	user_addr_t buf,
10482 	size_t buffer_size,
10483 	uint64_t *time,
10484 	uint64_t *bytes_compressed,
10485 	uint64_t *compressor_growth)
10486 {
10487 	kern_return_t err = KERN_SUCCESS;
10488 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10489 		return KERN_NOT_SUPPORTED;
10490 	}
10491 	if (current_task() == kernel_task) {
10492 		return KERN_INVALID_ARGUMENT;
10493 	}
10494 	vm_page_lock_queues();
10495 	if (compressor_running_perf_test) {
10496 		/* Only run one instance of the benchmark at a time. */
10497 		vm_page_unlock_queues();
10498 		return KERN_RESOURCE_SHORTAGE;
10499 	}
10500 	vm_page_unlock_queues();
10501 	size_t page_count = 0;
10502 	vm_map_t map;
10503 	vm_page_t p, next;
10504 	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10505 	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10506 	*bytes_compressed = *compressor_growth = 0;
10507 
10508 	vm_page_queue_init(&compressor_perf_test_queue);
10509 	map = current_task()->map;
10510 	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10511 	if (err != KERN_SUCCESS) {
10512 		goto out;
10513 	}
10514 
10515 	vm_page_lock_queues();
10516 	compressor_running_perf_test = true;
10517 	compressor_perf_test_pages_processed = 0;
10518 	/*
10519 	 * At this point the compressor threads should only process the benchmark queue
10520 	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
10521 	 * to determine how many compressed bytes we ended up using.
10522 	 */
10523 	compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10524 	vm_page_unlock_queues();
10525 
10526 	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10527 
10528 	vm_page_lock_queues();
10529 	compressor_perf_test_start = mach_absolute_time();
10530 
10531 	// Wake up the compressor thread(s)
10532 	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10533 	    pgo_iothread_internal_state[0].pgo_iothread);
10534 
10535 	/*
10536 	 * Depending on when this test is run we could overshoot or be right on the mark
10537 	 * with our page_count. So the comparison is of the _less than_ variety.
10538 	 */
10539 	while (compressor_perf_test_pages_processed < page_count) {
10540 		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10541 		vm_page_unlock_queues();
10542 		thread_block(THREAD_CONTINUE_NULL);
10543 		vm_page_lock_queues();
10544 	}
10545 	compressor_perf_test_end = mach_absolute_time();
10546 	compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10547 	vm_page_unlock_queues();
10548 
10549 
10550 out:
10551 	/*
10552 	 * If we errored out above, then we could still have some pages
10553 	 * on the local queue. Make sure to put them back on the active queue before
10554 	 * returning so they're not orphaned.
10555 	 */
10556 	vm_page_lock_queues();
10557 	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10558 	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10559 	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10560 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10561 
10562 		vm_page_enqueue_active(p, FALSE);
10563 		p = next;
10564 	}
10565 
10566 	compressor_running_perf_test = false;
10567 	vm_page_unlock_queues();
10568 	if (err == KERN_SUCCESS) {
10569 		*bytes_compressed = page_count * PAGE_SIZE_64;
10570 		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
10571 	}
10572 
10573 	/*
10574 	 * pageout_scan will consider waking the compactor swapper
10575 	 * before it blocks. Do the same thing here before we return
10576 	 * to ensure that back to back benchmark runs can't overly fragment the
10577 	 * compressor pool.
10578 	 */
10579 	vm_consider_waking_compactor_swapper();
10580 	return err;
10581 }
10582 #endif /* DEVELOPMENT || DEBUG */
10583