xref: /xnu-11417.140.69/osfmk/vm/vm_pageout.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69 
70 #include <debug.h>
71 
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80 
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92 
93 #include <os/log.h>
94 
95 #include <sys/kdebug_triage.h>
96 
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99 
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115 #include <vm/vm_reclaim_xnu.h>
116 
117 #include <san/kasan.h>
118 #include <sys/kern_memorystatus_xnu.h>
119 
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123 
124 
125 #if UPL_DEBUG
126 #include <libkern/OSDebug.h>
127 #endif
128 
129 extern int cs_debug;
130 
131 #if CONFIG_MBUF_MCACHE
132 extern void mbuf_drain(boolean_t);
133 #endif /* CONFIG_MBUF_MCACHE */
134 
135 #if CONFIG_FREEZE
136 extern unsigned int memorystatus_frozen_count;
137 extern unsigned int memorystatus_suspended_count;
138 #endif /* CONFIG_FREEZE */
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 
141 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 extern uint32_t memorystatus_jetsam_bg_band_waiters;
144 
145 void vm_pressure_response(void);
146 extern void consider_vm_pressure_events(void);
147 
148 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
149 
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 sched_cond_atomic_t vm_pageout_gc_cond;
153 #if CONFIG_VPS_DYNAMIC_PRIO
154 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
155 #else
156 const bool vps_dynamic_priority_enabled = false;
157 #endif
158 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
159 
160 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
161 #if !XNU_TARGET_OS_OSX
162 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
163 #else /* !XNU_TARGET_OS_OSX */
164 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
165 #endif /* !XNU_TARGET_OS_OSX */
166 #endif
167 
168 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
169 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
170 #endif
171 
172 #ifndef VM_PAGE_LAUNDRY_MAX
173 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
174 #endif  /* VM_PAGE_LAUNDRY_MAX */
175 
176 #ifndef VM_PAGEOUT_BURST_WAIT
177 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
178 #endif  /* VM_PAGEOUT_BURST_WAIT */
179 
180 #ifndef VM_PAGEOUT_EMPTY_WAIT
181 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
182 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
183 
184 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
185 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
186 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
187 
188 #ifndef VM_PAGEOUT_IDLE_WAIT
189 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
190 #endif  /* VM_PAGEOUT_IDLE_WAIT */
191 
192 #ifndef VM_PAGEOUT_SWAP_WAIT
193 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
194 #endif  /* VM_PAGEOUT_SWAP_WAIT */
195 
196 /*
197  * vm_page_max_speculative_age_q should be less than or equal to
198  * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q which is number of allocated
199  * vm_page_queue_speculative entries.
200  */
201 
202 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
203 #ifndef VM_PAGE_SPECULATIVE_TARGET
204 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
205 #endif /* VM_PAGE_SPECULATIVE_TARGET */
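
/*
 * Illustrative arithmetic (hypothetical percentage, for exposition only):
 * if vm_pageout_state.vm_page_speculative_percentage were 5, then
 * VM_PAGE_SPECULATIVE_TARGET(total) == (total) * 1 / (100 / 5) == total / 20,
 * i.e. roughly 5% of "total".  Because of the integer division, percentages
 * that do not divide 100 evenly are only approximated.
 */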
206 
207 
208 /*
209  *	To obtain a reasonable LRU approximation, the inactive queue
210  *	needs to be large enough to give pages on it a chance to be
211  *	referenced a second time.  This macro defines the fraction
212  *	of active+inactive pages that should be inactive.
213  *	The pageout daemon uses it to update vm_page_inactive_target.
214  *
215  *	If vm_page_free_count falls below vm_page_free_target and
216  *	vm_page_inactive_count is below vm_page_inactive_target,
217  *	then the pageout daemon starts running.
218  */
219 
220 #ifndef VM_PAGE_INACTIVE_TARGET
221 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
222 #endif  /* VM_PAGE_INACTIVE_TARGET */
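
/*
 * Illustrative arithmetic: with an argument of 1,000,000 active+inactive
 * pages, VM_PAGE_INACTIVE_TARGET(avail) == 500,000, i.e. the daemon aims
 * to keep about half of those pages on the inactive queue.
 */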
223 
224 /*
225  *	Once the pageout daemon starts running, it keeps going
226  *	until vm_page_free_count meets or exceeds vm_page_free_target.
227  */
228 
229 #ifndef VM_PAGE_FREE_TARGET
230 #if !XNU_TARGET_OS_OSX
231 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
232 #else /* !XNU_TARGET_OS_OSX */
233 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
234 #endif /* !XNU_TARGET_OS_OSX */
235 #endif  /* VM_PAGE_FREE_TARGET */
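
/*
 * Illustrative arithmetic: with an argument of 100,000 pages,
 * VM_PAGE_FREE_TARGET(free) == 15 + 100,000 / 100 == 1,015 pages on
 * non-macOS targets and 15 + 100,000 / 80 == 1,265 pages on macOS,
 * so the desktop configuration keeps a somewhat larger free cushion.
 */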
236 
237 
238 /*
239  *	The pageout daemon always starts running once vm_page_free_count
240  *	falls below vm_page_free_min.
241  */
242 
243 #ifndef VM_PAGE_FREE_MIN
244 #if !XNU_TARGET_OS_OSX
245 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
246 #else /* !XNU_TARGET_OS_OSX */
247 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
248 #endif /* !XNU_TARGET_OS_OSX */
249 #endif  /* VM_PAGE_FREE_MIN */
250 
251 #if !XNU_TARGET_OS_OSX
252 #define VM_PAGE_FREE_RESERVED_LIMIT     100
253 #define VM_PAGE_FREE_MIN_LIMIT          1500
254 #define VM_PAGE_FREE_TARGET_LIMIT       2000
255 #else /* !XNU_TARGET_OS_OSX */
256 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
257 #define VM_PAGE_FREE_MIN_LIMIT          3500
258 #define VM_PAGE_FREE_TARGET_LIMIT       4000
259 #endif /* !XNU_TARGET_OS_OSX */
260 
261 /*
262  *	When vm_page_free_count falls below vm_page_free_reserved,
263  *	only vm-privileged threads can allocate pages.  vm-privilege
264  *	allows the pageout daemon and default pager (and any other
265  *	associated threads needed for default pageout) to continue
266  *	operation by dipping into the reserved pool of pages.
267  */
268 
269 #ifndef VM_PAGE_FREE_RESERVED
270 #define VM_PAGE_FREE_RESERVED(n)        \
271 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
272 #endif  /* VM_PAGE_FREE_RESERVED */
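
/*
 * Illustrative arithmetic: with the VM_PAGE_LAUNDRY_MAX of 128 defined
 * above, VM_PAGE_FREE_RESERVED(n) == 6 * 128 + (n) == 768 + (n) pages,
 * i.e. the reserved pool covers several full laundry batches plus
 * whatever extra "n" the caller asks for.
 */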
273 
274 /*
275  *	When we dequeue pages from the inactive list, they are
276  *	reactivated (ie, put back on the active queue) if referenced.
277  *	However, it is possible to starve the free list if other
278  *	processors are referencing pages faster than we can turn off
279  *	the referenced bit.  So we limit the number of reactivations
280  *	we will make per call of vm_pageout_scan().
281  */
282 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
283 
284 #ifndef VM_PAGE_REACTIVATE_LIMIT
285 #if !XNU_TARGET_OS_OSX
286 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
287 #else /* !XNU_TARGET_OS_OSX */
288 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
289 #endif /* !XNU_TARGET_OS_OSX */
290 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
291 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
292 
293 int vm_pageout_protect_realtime = true;
294 
295 extern boolean_t hibernate_cleaning_in_progress;
296 
297 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
298 struct pgo_iothread_state pgo_iothread_external_state;
299 
300 #if VM_PRESSURE_EVENTS
301 void vm_pressure_thread(void);
302 
303 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
304 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
305 
306 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
307 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
308 #endif
309 
310 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
311 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
312 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
313 
314 extern void vm_pageout_continue(void);
315 extern void vm_pageout_scan(void);
316 
317 boolean_t vm_pageout_running = FALSE;
318 
319 uint32_t vm_page_upl_tainted = 0;
320 uint32_t vm_page_iopl_tainted = 0;
321 
322 #if XNU_TARGET_OS_OSX
323 static boolean_t vm_pageout_waiter  = FALSE;
324 #endif /* XNU_TARGET_OS_OSX */
325 
326 
327 #if DEVELOPMENT || DEBUG
328 struct vm_pageout_debug vm_pageout_debug;
329 #endif
330 struct vm_pageout_vminfo vm_pageout_vminfo;
331 struct vm_pageout_state  vm_pageout_state;
332 struct vm_config         vm_config;
333 
334 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
335 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
336 #if DEVELOPMENT || DEBUG
337 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
338 #endif /* DEVELOPMENT || DEBUG */
339 
340 int         vm_upl_wait_for_pages = 0;
341 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
342 
343 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
344 
345 int     vm_debug_events = 0;
346 
347 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
348 
349 #if CONFIG_MEMORYSTATUS
350 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
351 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
352 #endif
353 
354 #if __AMP__
355 
356 
357 /*
358  * Bind compressor threads to e-cores unless there are multiple non-e clusters
359  */
360 #if (MAX_CPU_CLUSTERS > 2)
361 #define VM_COMPRESSOR_EBOUND_DEFAULT false
362 #elif defined(XNU_TARGET_OS_XR)
363 #define VM_COMPRESSOR_EBOUND_DEFAULT false
364 #else
365 #define VM_COMPRESSOR_EBOUND_DEFAULT true
366 #endif
367 
368 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
369 int vm_pgo_pbound = 0;
370 extern void thread_soft_bind_cluster_type(thread_t, char);
371 
372 #endif /* __AMP__ */
373 
374 
375 /*
376  *	Routine:	vm_pageout_object_terminate
377  *	Purpose:
378  *		Destroy the pageout_object, and perform all of the
379  *		required cleanup actions.
380  *
381  *	In/Out conditions:
382  *		The object must be locked, and will be returned locked.
383  */
384 void
385 vm_pageout_object_terminate(
386 	vm_object_t     object)
387 {
388 	vm_object_t     shadow_object;
389 
390 	/*
391 	 * Deal with the deallocation (last reference) of a pageout object
392 	 * (used for cleaning-in-place) by dropping the paging references/
393 	 * freeing pages in the original object.
394 	 */
395 
396 	assert(object->pageout);
397 	shadow_object = object->shadow;
398 	vm_object_lock(shadow_object);
399 
400 	while (!vm_page_queue_empty(&object->memq)) {
401 		vm_page_t               p, m;
402 		vm_object_offset_t      offset;
403 
404 		p = (vm_page_t) vm_page_queue_first(&object->memq);
405 
406 		assert(vm_page_is_private(p));
407 		assert(p->vmp_free_when_done);
408 		p->vmp_free_when_done = FALSE;
409 		assert(!p->vmp_cleaning);
410 		assert(!p->vmp_laundry);
411 
412 		offset = p->vmp_offset;
413 		VM_PAGE_FREE(p);
414 		p = VM_PAGE_NULL;
415 
416 		m = vm_page_lookup(shadow_object,
417 		    offset + object->vo_shadow_offset);
418 
419 		if (m == VM_PAGE_NULL) {
420 			continue;
421 		}
422 
423 		assert((m->vmp_dirty) || (m->vmp_precious) ||
424 		    (m->vmp_busy && m->vmp_cleaning));
425 
426 		/*
427 		 * Handle the trusted pager throttle.
428 		 * Also decrement the burst throttle (if external).
429 		 */
430 		vm_page_lock_queues();
431 		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
432 			vm_pageout_throttle_up(m);
433 		}
434 
435 		/*
436 		 * Handle the "target" page(s). These pages are to be freed if
437 		 * successfully cleaned. Target pages are always busy, and are
438 		 * wired exactly once. The initial target pages are not mapped
439 		 * (so cannot be referenced or modified), but converted target
440 		 * pages may have been modified between the selection as an
441 		 * adjacent page and conversion to a target.
442 		 */
443 		if (m->vmp_free_when_done) {
444 			assert(m->vmp_busy);
445 			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
446 			assert(m->vmp_wire_count == 1);
447 			m->vmp_cleaning = FALSE;
448 			m->vmp_free_when_done = FALSE;
449 			/*
450 			 * Revoke all access to the page. Since the object is
451 			 * locked, and the page is busy, this prevents the page
452 			 * from being dirtied after the pmap_disconnect() call
453 			 * returns.
454 			 *
455 			 * Since the page is left "dirty" but "not modified", we
456 			 * can detect whether the page was redirtied during
457 			 * pageout by checking the modify state.
458 			 */
459 			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
460 				SET_PAGE_DIRTY(m, FALSE);
461 			} else {
462 				m->vmp_dirty = FALSE;
463 			}
464 
465 			if (m->vmp_dirty) {
466 				vm_page_unwire(m, TRUE);        /* reactivates */
467 				counter_inc(&vm_statistics_reactivations);
468 				vm_page_wakeup_done(object, m);
469 			} else {
470 				vm_page_free(m);  /* clears busy, etc. */
471 			}
472 			vm_page_unlock_queues();
473 			continue;
474 		}
475 		/*
476 		 * Handle the "adjacent" pages. These pages were cleaned in
477 		 * place, and should be left alone.
478 		 * If prep_pin_count is nonzero, then someone is using the
479 		 * page, so make it active.
480 		 */
481 		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
482 			if (m->vmp_reference) {
483 				vm_page_activate(m);
484 			} else {
485 				vm_page_deactivate(m);
486 			}
487 		}
488 		if (m->vmp_overwriting) {
489 			/*
490 			 * the (COPY_OUT_FROM == FALSE) request_page_list case
491 			 */
492 			if (m->vmp_busy) {
493 				/*
494 				 * We do not re-set m->vmp_dirty!
495 				 * The page was busy, so no extraneous activity
496 				 * could have occurred. COPY_INTO is a read into the
497 				 * new pages. CLEAN_IN_PLACE does actually write
498 				 * out the pages, but handling outside of this code
499 				 * will take care of resetting dirty. We do clear the
500 				 * modify bit, however, for the Programmed I/O case.
501 				 */
502 				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
503 
504 				m->vmp_busy = FALSE;
505 				m->vmp_absent = FALSE;
506 			} else {
507 				/*
508 				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
509 				 * Occurs when the original page was wired
510 				 * at the time of the list request
511 				 */
512 				assert(VM_PAGE_WIRED(m));
513 				vm_page_unwire(m, TRUE);        /* reactivates */
514 			}
515 			m->vmp_overwriting = FALSE;
516 		} else {
517 			m->vmp_dirty = FALSE;
518 		}
519 		m->vmp_cleaning = FALSE;
520 
521 		/*
522 		 * Wakeup any thread waiting for the page to be un-cleaning.
523 		 */
524 		vm_page_wakeup(object, m);
525 		vm_page_unlock_queues();
526 	}
527 	/*
528 	 * Account for the paging reference taken in vm_paging_object_allocate.
529 	 */
530 	vm_object_activity_end(shadow_object);
531 	vm_object_unlock(shadow_object);
532 
533 	assert(os_ref_get_count_raw(&object->ref_count) == 0);
534 	assert(object->paging_in_progress == 0);
535 	assert(object->activity_in_progress == 0);
536 	assert(object->resident_page_count == 0);
537 	return;
538 }
539 
540 /*
541  * Routine:	vm_pageclean_setup
542  *
543  * Purpose:	set up a page to be cleaned (made non-dirty), but not
544  *		necessarily flushed from the VM page cache.
545  *		This is accomplished by cleaning in place.
546  *
547  *		The page must not be busy, and new_object
548  *		must be locked.
549  *
550  */
551 static void
552 vm_pageclean_setup(
553 	vm_page_t               m,
554 	vm_page_t               new_m,
555 	vm_object_t             new_object,
556 	vm_object_offset_t      new_offset)
557 {
558 	assert(!m->vmp_busy);
559 #if 0
560 	assert(!m->vmp_cleaning);
561 #endif
562 
563 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
564 
565 	/*
566 	 * Mark original page as cleaning in place.
567 	 */
568 	m->vmp_cleaning = TRUE;
569 	SET_PAGE_DIRTY(m, FALSE);
570 	m->vmp_precious = FALSE;
571 
572 	/*
573 	 * Convert the fictitious page to a private shadow of
574 	 * the real page.
575 	 */
576 	new_m->vmp_free_when_done = TRUE;
577 
578 	vm_page_lockspin_queues();
579 	vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
580 	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 	vm_page_unlock_queues();
582 
583 	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 	assert(!new_m->vmp_wanted);
585 	new_m->vmp_busy = FALSE;
586 }
587 
588 /*
589  *	Routine:	vm_pageout_initialize_page
590  *	Purpose:
591  *		Causes the specified page to be initialized in
592  *		the appropriate memory object. This routine is used to push
593  *		pages into a copy-object when they are modified in the
594  *		permanent object.
595  *
596  *		The page is moved to a temporary object and paged out.
597  *
598  *	In/out conditions:
599  *		The page in question must not be on any pageout queues.
600  *		The object to which it belongs must be locked.
601  *		The page must be busy, but not hold a paging reference.
602  *
603  *	Implementation:
604  *		Move this page to a completely new object.
605  */
606 void
607 vm_pageout_initialize_page(
608 	vm_page_t       m)
609 {
610 	vm_object_t             object;
611 	vm_object_offset_t      paging_offset;
612 	memory_object_t         pager;
613 
614 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615 
616 	object = VM_PAGE_OBJECT(m);
617 
618 	assert(m->vmp_busy);
619 	assert(object->internal);
620 
621 	/*
622 	 *	Verify that we really want to clean this page
623 	 */
624 	assert(!m->vmp_absent);
625 	assert(m->vmp_dirty);
626 
627 	/*
628 	 *	Create a paging reference to let us play with the object.
629 	 */
630 	paging_offset = m->vmp_offset + object->paging_offset;
631 
632 	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 		panic("reservation without pageout?"); /* alan */
634 
635 		VM_PAGE_FREE(m);
636 		vm_object_unlock(object);
637 
638 		return;
639 	}
640 
641 	/*
642 	 * If there's no pager, then we can't clean the page.  This should
643 	 * never happen since this should be a copy object and therefore not
644 	 * an external object, so the pager should always be there.
645 	 */
646 
647 	pager = object->pager;
648 
649 	if (pager == MEMORY_OBJECT_NULL) {
650 		panic("missing pager for copy object");
651 
652 		VM_PAGE_FREE(m);
653 		return;
654 	}
655 
656 	/*
657 	 * set the page for future call to vm_fault_list_request
658 	 */
659 	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 	SET_PAGE_DIRTY(m, FALSE);
661 
662 	/*
663 	 * keep the object from collapsing or terminating
664 	 */
665 	vm_object_paging_begin(object);
666 	vm_object_unlock(object);
667 
668 	/*
669 	 *	Write the data to its pager.
670 	 *	Note that the data is passed by naming the new object,
671 	 *	not a virtual address; the pager interface has been
672 	 *	manipulated to use the "internal memory" data type.
673 	 *	[The object reference from its allocation is donated
674 	 *	to the eventual recipient.]
675 	 */
676 	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677 
678 	vm_object_lock(object);
679 	vm_object_paging_end(object);
680 }
681 
682 
683 /*
684  * vm_pageout_cluster:
685  *
686  * Given a page, queue it to the appropriate I/O thread,
687  * which will page it out and attempt to clean adjacent pages
688  * in the same operation.
689  *
690  * The object and queues must be locked. We will take a
691  * paging reference to prevent deallocation or collapse when we
692  * release the object lock back at the call site.  The I/O thread
693  * is responsible for consuming this reference.
694  *
695  * The page must not be on any pageout queue.
696  */
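
/*
 * Caller-side sketch (illustrative only, not an actual call site): per the
 * comment above and the assertions in vm_pageout_cluster_to_queue() below,
 * a caller holds the object lock and the page queue lock and hands over a
 * dirty or precious page that is neither wired nor already on a pageout
 * queue:
 *
 *	vm_object_lock(object);
 *	vm_page_lock_queues();
 *	if ((m->vmp_dirty || m->vmp_precious) && !VM_PAGE_WIRED(m)) {
 *		vm_pageout_cluster(m);
 *	}
 *	vm_page_unlock_queues();
 *	vm_object_unlock(object);
 */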
697 #if DEVELOPMENT || DEBUG
698 vmct_stats_t vmct_stats;
699 
700 int32_t vmct_active = 0;
701 uint64_t vm_compressor_epoch_start = 0;
702 uint64_t vm_compressor_epoch_stop = 0;
703 
704 typedef enum vmct_state_t {
705 	VMCT_IDLE,
706 	VMCT_AWAKENED,
707 	VMCT_ACTIVE,
708 } vmct_state_t;
709 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710 #endif
711 
712 
713 
714 static void
715 vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716 {
717 	vm_object_t object = VM_PAGE_OBJECT(m);
718 
719 	VM_PAGE_CHECK(m);
720 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 	vm_object_lock_assert_exclusive(object);
722 
723 	/*
724 	 * Make sure it's OK to page this out.
725 	 */
726 	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 	assert(!m->vmp_cleaning && !m->vmp_laundry);
728 	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729 
730 	/*
731 	 * protect the object from collapse or termination
732 	 */
733 	vm_object_activity_begin(object);
734 
735 
736 	/*
737 	 * pgo_laundry count is tied to the laundry bit
738 	 */
739 	m->vmp_laundry = TRUE;
740 	q->pgo_laundry++;
741 
742 	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744 
745 	if (object->internal == TRUE) {
746 		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
747 		m->vmp_busy = TRUE;
748 #if DEVELOPMENT || DEBUG
749 		/*
750 		 * The benchmark queue will be woken up independently by the benchmark
751 		 * itself.
752 		 */
753 		if (q != &vm_pageout_queue_benchmark) {
754 #else /* DEVELOPMENT || DEBUG */
755 		if (true) {
756 #endif /* DEVELOPMENT || DEBUG */
757 			/*
758 			 * Wake up the first compressor thread. It will wake subsequent
759 			 * threads if necessary.
760 			 */
761 			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
762 			    pgo_iothread_internal_state[0].pgo_iothread);
763 		}
764 	} else {
765 		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
766 	}
767 	VM_PAGE_CHECK(m);
768 }
769 
770 void
771 vm_pageout_cluster(vm_page_t m)
772 {
773 	struct          vm_pageout_queue *q;
774 	vm_object_t     object = VM_PAGE_OBJECT(m);
775 	if (object->internal) {
776 		q = &vm_pageout_queue_internal;
777 	} else {
778 		q = &vm_pageout_queue_external;
779 	}
780 	vm_pageout_cluster_to_queue(m, q);
781 }
782 
783 
784 /*
785  * A page is back from laundry or we are stealing it back from
786  * the laundering state.  See if there are some pages waiting to
787  * go to laundry and if we can let some of them go now.
788  *
789  * Object and page queues must be locked.
790  */
791 void
792 vm_pageout_throttle_up(
793 	vm_page_t       m)
794 {
795 	struct vm_pageout_queue *q;
796 	vm_object_t      m_object;
797 
798 	m_object = VM_PAGE_OBJECT(m);
799 
800 	assert(m_object != VM_OBJECT_NULL);
801 	assert(!is_kernel_object(m_object));
802 
803 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
804 	vm_object_lock_assert_exclusive(m_object);
805 
806 	if (m_object->internal == TRUE) {
807 		q = &vm_pageout_queue_internal;
808 	} else {
809 		q = &vm_pageout_queue_external;
810 	}
811 
812 	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
813 		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
814 		m->vmp_q_state = VM_PAGE_NOT_ON_Q;
815 
816 		VM_PAGE_ZERO_PAGEQ_ENTRY(m);
817 
818 		vm_object_activity_end(m_object);
819 
820 		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
821 	}
822 	if (m->vmp_laundry == TRUE) {
823 		m->vmp_laundry = FALSE;
824 		q->pgo_laundry--;
825 
826 		if (q->pgo_throttled == TRUE) {
827 			q->pgo_throttled = FALSE;
828 			thread_wakeup((event_t) &q->pgo_laundry);
829 		}
830 		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
831 			q->pgo_draining = FALSE;
832 			thread_wakeup((event_t) (&q->pgo_laundry + 1));
833 		}
834 		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
835 	}
836 }
837 
838 
839 static void
840 vm_pageout_throttle_up_batch(
841 	struct vm_pageout_queue *q,
842 	int             batch_cnt)
843 {
844 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
845 
846 	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
847 
848 	q->pgo_laundry -= batch_cnt;
849 
850 	if (q->pgo_throttled == TRUE) {
851 		q->pgo_throttled = FALSE;
852 		thread_wakeup((event_t) &q->pgo_laundry);
853 	}
854 	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
855 		q->pgo_draining = FALSE;
856 		thread_wakeup((event_t) (&q->pgo_laundry + 1));
857 	}
858 }
859 
860 
861 
862 /*
863  * VM memory pressure monitoring.
864  *
865  * vm_pageout_scan() keeps track of the number of pages it considers and
866  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
867  *
868  * compute_memory_pressure() is called every second from compute_averages()
869  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
870  * of reclaimed pages in a new vm_pageout_stat[] bucket.
871  *
872  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
873  * The caller provides the number of seconds ("nsecs") worth of statistics
874  * it wants, up to 30 seconds.
875  * It computes the number of pages reclaimed in the past "nsecs" seconds and
876  * also returns the number of pages the system still needs to reclaim at this
877  * moment in time.
878  */
879 #if DEVELOPMENT || DEBUG
880 #define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
881 #else
882 #define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
883 #endif
884 struct vm_pageout_stat {
885 	unsigned long vm_page_active_count;
886 	unsigned long vm_page_speculative_count;
887 	unsigned long vm_page_inactive_count;
888 	unsigned long vm_page_anonymous_count;
889 
890 	unsigned long vm_page_free_count;
891 	unsigned long vm_page_wire_count;
892 	unsigned long vm_page_compressor_count;
893 
894 	unsigned long vm_page_pages_compressed;
895 	unsigned long vm_page_pageable_internal_count;
896 	unsigned long vm_page_pageable_external_count;
897 	unsigned long vm_page_xpmapped_external_count;
898 
899 	unsigned int pages_grabbed;
900 	unsigned int pages_freed;
901 
902 	unsigned int pages_compressed;
903 	unsigned int pages_grabbed_by_compressor;
904 	unsigned int failed_compressions;
905 
906 	unsigned int pages_evicted;
907 	unsigned int pages_purged;
908 
909 	unsigned int considered;
910 	unsigned int considered_bq_internal;
911 	unsigned int considered_bq_external;
912 
913 	unsigned int skipped_external;
914 	unsigned int skipped_internal;
915 	unsigned int filecache_min_reactivations;
916 
917 	unsigned int freed_speculative;
918 	unsigned int freed_cleaned;
919 	unsigned int freed_internal;
920 	unsigned int freed_external;
921 
922 	unsigned int cleaned_dirty_external;
923 	unsigned int cleaned_dirty_internal;
924 
925 	unsigned int inactive_referenced;
926 	unsigned int inactive_nolock;
927 	unsigned int reactivation_limit_exceeded;
928 	unsigned int forced_inactive_reclaim;
929 
930 	unsigned int throttled_internal_q;
931 	unsigned int throttled_external_q;
932 
933 	unsigned int phantom_ghosts_found;
934 	unsigned int phantom_ghosts_added;
935 
936 	unsigned int vm_page_realtime_count;
937 	unsigned int forcereclaimed_sharedcache;
938 	unsigned int forcereclaimed_realtime;
939 	unsigned int protected_sharedcache;
940 	unsigned int protected_realtime;
941 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
942 
943 unsigned int vm_pageout_stat_now = 0;
944 
945 #define VM_PAGEOUT_STAT_BEFORE(i) \
946 	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
947 #define VM_PAGEOUT_STAT_AFTER(i) \
948 	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
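
/*
 * The stats above form a ring buffer: VM_PAGEOUT_STAT_SIZE provides 8
 * buckets per second of history (30 seconds on DEVELOPMENT/DEBUG, 1 second
 * otherwise) plus one bucket that is currently being filled, and
 * mach_vm_pressure_monitor() below converts seconds to buckets with
 * "units_of_monitor = 8 * nsecs_monitored".  A minimal sketch of walking
 * the ring backwards in the same style (illustrative only; "nsecs" is an
 * assumed input):
 *
 *	unsigned int now = vm_pageout_stat_now;   // snapshot first
 *	unsigned int buckets = 8 * nsecs;
 *	unsigned long freed = 0;
 *	unsigned int i;
 *
 *	for (i = VM_PAGEOUT_STAT_BEFORE(now);
 *	    i != now && buckets-- != 0;
 *	    i = VM_PAGEOUT_STAT_BEFORE(i)) {
 *		freed += vm_pageout_stats[i].pages_freed;
 *	}
 */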
949 
950 #if VM_PAGE_BUCKETS_CHECK
951 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
952 #endif /* VM_PAGE_BUCKETS_CHECK */
953 
954 
955 void
956 record_memory_pressure(void);
957 void
958 record_memory_pressure(void)
959 {
960 	unsigned int vm_pageout_next;
961 
962 #if VM_PAGE_BUCKETS_CHECK
963 	/* check the consistency of VM page buckets at regular interval */
964 	static int counter = 0;
965 	if ((++counter % vm_page_buckets_check_interval) == 0) {
966 		vm_page_buckets_check();
967 	}
968 #endif /* VM_PAGE_BUCKETS_CHECK */
969 
970 	vm_pageout_state.vm_memory_pressure =
971 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
972 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
973 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
974 	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
975 
976 	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
977 
978 	/* move "now" forward */
979 	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
980 
981 	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
982 
983 	vm_pageout_stat_now = vm_pageout_next;
984 }
985 
986 
987 /*
988  * IMPORTANT
989  * mach_vm_ctl_page_free_wanted() is called indirectly, via
990  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
991  * it must be safe in the restricted stackshot context. Locks and/or
992  * blocking are not allowable.
993  */
994 unsigned int
995 mach_vm_ctl_page_free_wanted(void)
996 {
997 	unsigned int page_free_target, page_free_count, page_free_wanted;
998 
999 	page_free_target = vm_page_free_target;
1000 	page_free_count = vm_page_free_count;
1001 	if (page_free_target > page_free_count) {
1002 		page_free_wanted = page_free_target - page_free_count;
1003 	} else {
1004 		page_free_wanted = 0;
1005 	}
1006 
1007 	return page_free_wanted;
1008 }
1009 
1010 
1011 /*
1012  * IMPORTANT:
1013  * mach_vm_pressure_monitor() is called when taking a stackshot, with
1014  * wait_for_pressure FALSE, so that code path must remain safe in the
1015  * restricted stackshot context. No blocking or locks are allowable
1016  * on that code path.
1017  */
1018 
1019 kern_return_t
1020 mach_vm_pressure_monitor(
1021 	boolean_t       wait_for_pressure,
1022 	unsigned int    nsecs_monitored,
1023 	unsigned int    *pages_reclaimed_p,
1024 	unsigned int    *pages_wanted_p)
1025 {
1026 	wait_result_t   wr;
1027 	unsigned int    vm_pageout_then, vm_pageout_now;
1028 	unsigned int    pages_reclaimed;
1029 	unsigned int    units_of_monitor;
1030 
1031 	units_of_monitor = 8 * nsecs_monitored;
1032 	/*
1033 	 * We don't take the vm_page_queue_lock here because we don't want
1034 	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1035 	 * thread when it's trying to reclaim memory.  We don't need fully
1036 	 * accurate monitoring anyway...
1037 	 */
1038 
1039 	if (wait_for_pressure) {
1040 		/* wait until there's memory pressure */
1041 		while (vm_page_free_count >= vm_page_free_target) {
1042 			wr = assert_wait((event_t) &vm_page_free_wanted,
1043 			    THREAD_INTERRUPTIBLE);
1044 			if (wr == THREAD_WAITING) {
1045 				wr = thread_block(THREAD_CONTINUE_NULL);
1046 			}
1047 			if (wr == THREAD_INTERRUPTED) {
1048 				return KERN_ABORTED;
1049 			}
1050 			if (wr == THREAD_AWAKENED) {
1051 				/*
1052 				 * The memory pressure might have already
1053 				 * been relieved but let's not block again
1054 				 * and let's report that there was memory
1055 				 * pressure at some point.
1056 				 */
1057 				break;
1058 			}
1059 		}
1060 	}
1061 
1062 	/* provide the number of pages the system wants to reclaim */
1063 	if (pages_wanted_p != NULL) {
1064 		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1065 	}
1066 
1067 	if (pages_reclaimed_p == NULL) {
1068 		return KERN_SUCCESS;
1069 	}
1070 
1071 	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1072 	vm_pageout_now = vm_pageout_stat_now;
1073 	pages_reclaimed = 0;
1074 	for (vm_pageout_then =
1075 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1076 	    vm_pageout_then != vm_pageout_now &&
1077 	    units_of_monitor-- != 0;
1078 	    vm_pageout_then =
1079 	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1080 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1081 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1082 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1083 		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1084 	}
1085 	*pages_reclaimed_p = pages_reclaimed;
1086 
1087 	return KERN_SUCCESS;
1088 }
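
/*
 * Usage sketch (illustrative only): an in-kernel caller that wants a
 * point-in-time reading, as the stackshot path does, passes
 * wait_for_pressure == FALSE so the call cannot block:
 *
 *	unsigned int reclaimed, wanted;
 *	kern_return_t kr;
 *
 *	kr = mach_vm_pressure_monitor(FALSE, 10, &reclaimed, &wanted);
 *	if (kr == KERN_SUCCESS) {
 *		// "reclaimed" covers roughly the last 10 seconds of frees;
 *		// "wanted" is how far vm_page_free_count currently sits
 *		// below vm_page_free_target (0 if we're at or above it).
 *	}
 */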
1089 
1090 
1091 
1092 #if DEVELOPMENT || DEBUG
1093 
1094 static void
1095 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1096 
1097 /*
1098  * condition variable used to make sure there is
1099  * only a single sweep going on at a time
1100  */
1101 bool vm_pageout_disconnect_all_pages_active = false;
1102 
1103 void
1104 vm_pageout_disconnect_all_pages()
1105 {
1106 	vm_page_lock_queues();
1107 
1108 	if (vm_pageout_disconnect_all_pages_active) {
1109 		vm_page_unlock_queues();
1110 		return;
1111 	}
1112 	vm_pageout_disconnect_all_pages_active = true;
1113 
1114 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1115 	    vm_page_throttled_count);
1116 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1117 	    vm_page_anonymous_count);
1118 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1119 	    (vm_page_inactive_count - vm_page_anonymous_count));
1120 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1121 	    vm_page_active_count);
1122 #ifdef CONFIG_SECLUDED_MEMORY
1123 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1124 	    vm_page_secluded_count);
1125 #endif /* CONFIG_SECLUDED_MEMORY */
1126 	vm_page_unlock_queues();
1127 
1128 	vm_pageout_disconnect_all_pages_active = false;
1129 }
1130 
1131 /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1132 void
1133 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1134 {
1135 	vm_page_t       m;
1136 	vm_object_t     t_object = NULL;
1137 	vm_object_t     l_object = NULL;
1138 	vm_object_t     m_object = NULL;
1139 	int             delayed_unlock = 0;
1140 	int             try_failed_count = 0;
1141 	int             disconnected_count = 0;
1142 	int             paused_count = 0;
1143 	int             object_locked_count = 0;
1144 
1145 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1146 	    DBG_FUNC_START),
1147 	    q, qcount);
1148 
1149 	while (qcount && !vm_page_queue_empty(q)) {
1150 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1151 
1152 		m = (vm_page_t) vm_page_queue_first(q);
1153 		m_object = VM_PAGE_OBJECT(m);
1154 
1155 		if (m_object == VM_OBJECT_NULL) {
1156 			/*
1157 			 * Bumped into a free page. This should only happen on the
1158 			 * secluded queue.
1159 			 */
1160 #if CONFIG_SECLUDED_MEMORY
1161 			assert(q == &vm_page_queue_secluded);
1162 #endif /* CONFIG_SECLUDED_MEMORY */
1163 			goto reenter_pg_on_q;
1164 		}
1165 
1166 		/*
1167 		 * check to see if we currently are working
1168 		 * with the same object... if so, we've
1169 		 * already got the lock
1170 		 */
1171 		if (m_object != l_object) {
1172 			/*
1173 			 * the object associated with candidate page is
1174 			 * different from the one we were just working
1175 			 * with... dump the lock if we still own it
1176 			 */
1177 			if (l_object != NULL) {
1178 				vm_object_unlock(l_object);
1179 				l_object = NULL;
1180 			}
1181 			if (m_object != t_object) {
1182 				try_failed_count = 0;
1183 			}
1184 
1185 			/*
1186 			 * Try to lock object; since we've already got the
1187 			 * page queues lock, we can only 'try' for this one.
1188 			 * if the 'try' fails, we need to do a mutex_pause
1189 			 * to allow the owner of the object lock a chance to
1190 			 * run...
1191 			 */
1192 			if (!vm_object_lock_try_scan(m_object)) {
1193 				if (try_failed_count > 20) {
1194 					goto reenter_pg_on_q;
1195 				}
1196 				vm_page_unlock_queues();
1197 				mutex_pause(try_failed_count++);
1198 				vm_page_lock_queues();
1199 				delayed_unlock = 0;
1200 
1201 				paused_count++;
1202 
1203 				t_object = m_object;
1204 				continue;
1205 			}
1206 			object_locked_count++;
1207 
1208 			l_object = m_object;
1209 		}
1210 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1211 		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1212 		    m->vmp_free_when_done) {
1213 			/*
1214 			 * put it back on the head of its queue
1215 			 */
1216 			goto reenter_pg_on_q;
1217 		}
1218 		if (m->vmp_pmapped == TRUE) {
1219 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1220 
1221 			disconnected_count++;
1222 		}
1223 reenter_pg_on_q:
1224 		vm_page_queue_remove(q, m, vmp_pageq);
1225 		vm_page_queue_enter(q, m, vmp_pageq);
1226 
1227 		qcount--;
1228 		try_failed_count = 0;
1229 
1230 		if (delayed_unlock++ > 128) {
1231 			if (l_object != NULL) {
1232 				vm_object_unlock(l_object);
1233 				l_object = NULL;
1234 			}
1235 			lck_mtx_yield(&vm_page_queue_lock);
1236 			delayed_unlock = 0;
1237 		}
1238 	}
1239 	if (l_object != NULL) {
1240 		vm_object_unlock(l_object);
1241 		l_object = NULL;
1242 	}
1243 
1244 	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1245 	    DBG_FUNC_END),
1246 	    q, disconnected_count, object_locked_count, paused_count);
1247 }
1248 
1249 extern const char *proc_best_name(struct proc* proc);
1250 
1251 int
1252 vm_toggle_task_selfdonate_pages(task_t task)
1253 {
1254 	int state = 0;
1255 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1256 		printf("VM Donation mode is OFF on the system\n");
1257 		return state;
1258 	}
1259 	if (task != kernel_task) {
1260 		task_lock(task);
1261 		if (!task->donates_own_pages) {
1262 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1263 			task->donates_own_pages = true;
1264 			state = 1;
1265 		} else if (task->donates_own_pages) {
1266 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1267 			task->donates_own_pages = false;
1268 			state = 0;
1269 		}
1270 		task_unlock(task);
1271 	}
1272 	return state;
1273 }
1274 #endif /* DEVELOPMENT || DEBUG */
1275 
1276 void
1277 vm_task_set_selfdonate_pages(task_t task, bool donate)
1278 {
1279 	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1280 	assert(task != kernel_task);
1281 
1282 	task_lock(task);
1283 	task->donates_own_pages = donate;
1284 	task_unlock(task);
1285 }
1286 
1287 
1288 
1289 static size_t
1290 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1291 
1292 /*
1293  * condition variable used to make sure there is
1294  * only a single sweep going on at a time
1295  */
1296 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1297 
1298 
1299 kern_return_t
1300 vm_pageout_anonymous_pages()
1301 {
1302 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1303 		size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1304 		vm_page_lock_queues();
1305 
1306 		if (vm_pageout_anonymous_pages_active == TRUE) {
1307 			vm_page_unlock_queues();
1308 			return KERN_RESOURCE_SHORTAGE;
1309 		}
1310 		vm_pageout_anonymous_pages_active = TRUE;
1311 		vm_page_unlock_queues();
1312 
1313 		throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1314 		anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1315 		active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1316 
1317 		os_log(OS_LOG_DEFAULT,
1318 		    "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1319 		    __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1320 
1321 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1322 			vm_consider_swapping();
1323 		}
1324 
1325 		vm_page_lock_queues();
1326 		vm_pageout_anonymous_pages_active = FALSE;
1327 		vm_page_unlock_queues();
1328 		return KERN_SUCCESS;
1329 	} else {
1330 		return KERN_NOT_SUPPORTED;
1331 	}
1332 }
1333 
1334 
1335 size_t
1336 vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1337 {
1338 	vm_page_t       m;
1339 	vm_object_t     t_object = NULL;
1340 	vm_object_t     l_object = NULL;
1341 	vm_object_t     m_object = NULL;
1342 	int             delayed_unlock = 0;
1343 	int             try_failed_count = 0;
1344 	int             refmod_state;
1345 	int             pmap_options;
1346 	struct          vm_pageout_queue *iq;
1347 	ppnum_t         phys_page;
1348 	size_t          pages_moved = 0;
1349 
1350 
1351 	iq = &vm_pageout_queue_internal;
1352 
1353 	vm_page_lock_queues();
1354 
1355 #if DEVELOPMENT || DEBUG
1356 	if (perf_test) {
1357 		iq = &vm_pageout_queue_benchmark;
1358 		// ensure the benchmark queue isn't throttled
1359 		iq->pgo_maxlaundry = (unsigned int) qcount;
1360 	}
1361 #endif /* DEVELOPMENT || DEBUG */
1362 
1363 	while (qcount && !vm_page_queue_empty(q)) {
1364 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1365 
1366 		if (VM_PAGE_Q_THROTTLED(iq)) {
1367 			if (l_object != NULL) {
1368 				vm_object_unlock(l_object);
1369 				l_object = NULL;
1370 			}
1371 			iq->pgo_draining = TRUE;
1372 
1373 			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1374 			vm_page_unlock_queues();
1375 
1376 			thread_block(THREAD_CONTINUE_NULL);
1377 
1378 			vm_page_lock_queues();
1379 			delayed_unlock = 0;
1380 			continue;
1381 		}
1382 		m = (vm_page_t) vm_page_queue_first(q);
1383 		m_object = VM_PAGE_OBJECT(m);
1384 
1385 		/*
1386 		 * check to see if we currently are working
1387 		 * with the same object... if so, we've
1388 		 * already got the lock
1389 		 */
1390 		if (m_object != l_object) {
1391 			if (!m_object->internal) {
1392 				goto reenter_pg_on_q;
1393 			}
1394 
1395 			/*
1396 			 * the object associated with candidate page is
1397 			 * different from the one we were just working
1398 			 * with... dump the lock if we still own it
1399 			 */
1400 			if (l_object != NULL) {
1401 				vm_object_unlock(l_object);
1402 				l_object = NULL;
1403 			}
1404 			if (m_object != t_object) {
1405 				try_failed_count = 0;
1406 			}
1407 
1408 			/*
1409 			 * Try to lock object; since we've already got the
1410 			 * page queues lock, we can only 'try' for this one.
1411 			 * if the 'try' fails, we need to do a mutex_pause
1412 			 * to allow the owner of the object lock a chance to
1413 			 * run...
1414 			 */
1415 			if (!vm_object_lock_try_scan(m_object)) {
1416 				if (try_failed_count > 20) {
1417 					goto reenter_pg_on_q;
1418 				}
1419 				vm_page_unlock_queues();
1420 				mutex_pause(try_failed_count++);
1421 				vm_page_lock_queues();
1422 				delayed_unlock = 0;
1423 
1424 				t_object = m_object;
1425 				continue;
1426 			}
1427 			l_object = m_object;
1428 		}
1429 		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1430 			/*
1431 			 * page is not to be cleaned
1432 			 * put it back on the head of its queue
1433 			 */
1434 			goto reenter_pg_on_q;
1435 		}
1436 		phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1437 
1438 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
1439 			refmod_state = pmap_get_refmod(phys_page);
1440 
1441 			if (refmod_state & VM_MEM_REFERENCED) {
1442 				m->vmp_reference = TRUE;
1443 			}
1444 			if (refmod_state & VM_MEM_MODIFIED) {
1445 				SET_PAGE_DIRTY(m, FALSE);
1446 			}
1447 		}
1448 		if (m->vmp_reference == TRUE) {
1449 			m->vmp_reference = FALSE;
1450 			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1451 			goto reenter_pg_on_q;
1452 		}
1453 		if (m->vmp_pmapped == TRUE) {
1454 			if (m->vmp_dirty || m->vmp_precious) {
1455 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
1456 			} else {
1457 				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1458 			}
1459 			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1460 			if (refmod_state & VM_MEM_MODIFIED) {
1461 				SET_PAGE_DIRTY(m, FALSE);
1462 			}
1463 		}
1464 
1465 		if (!m->vmp_dirty && !m->vmp_precious) {
1466 			vm_page_unlock_queues();
1467 			VM_PAGE_FREE(m);
1468 			vm_page_lock_queues();
1469 			delayed_unlock = 0;
1470 
1471 			goto next_pg;
1472 		}
1473 		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1474 			if (!m_object->pager_initialized) {
1475 				vm_page_unlock_queues();
1476 
1477 				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1478 
1479 				if (!m_object->pager_initialized) {
1480 					vm_object_compressor_pager_create(m_object);
1481 				}
1482 
1483 				vm_page_lock_queues();
1484 				delayed_unlock = 0;
1485 			}
1486 			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1487 				/*
1488 				 * We dropped the page queues lock above, so
1489 				 * "m" might no longer be on this queue...
1490 				 */
1491 				if (m != (vm_page_t) vm_page_queue_first(q)) {
1492 					continue;
1493 				}
1494 				goto reenter_pg_on_q;
1495 			}
1496 			/*
1497 			 * vm_object_compressor_pager_create will drop the object lock
1498 			 * which means 'm' may no longer be valid to use
1499 			 */
1500 			continue;
1501 		}
1502 
1503 		if (!perf_test) {
1504 			/*
1505 			 * we've already factored out pages in the laundry which
1506 			 * means this page can't be on the pageout queue so it's
1507 			 * safe to do the vm_page_queues_remove
1508 			 */
1509 			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
1510 			vm_page_queues_remove(m, TRUE);
1511 			if (donate) {
1512 				/*
1513 				 * The compressor needs to see this bit to know
1514 				 * where this page needs to land. Also if stolen,
1515 				 * this bit helps put the page back in the right
1516 				 * special queue where it belongs.
1517 				 */
1518 				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1519 			}
1520 		} else {
1521 			vm_page_queue_remove(q, m, vmp_pageq);
1522 		}
1523 
1524 		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1525 
1526 		vm_pageout_cluster_to_queue(m, iq);
1527 
1528 		pages_moved++;
1529 		goto next_pg;
1530 
1531 reenter_pg_on_q:
1532 		vm_page_queue_remove(q, m, vmp_pageq);
1533 		vm_page_queue_enter(q, m, vmp_pageq);
1534 next_pg:
1535 		qcount--;
1536 		try_failed_count = 0;
1537 
1538 		if (delayed_unlock++ > 128) {
1539 			if (l_object != NULL) {
1540 				vm_object_unlock(l_object);
1541 				l_object = NULL;
1542 			}
1543 			lck_mtx_yield(&vm_page_queue_lock);
1544 			delayed_unlock = 0;
1545 		}
1546 	}
1547 	if (l_object != NULL) {
1548 		vm_object_unlock(l_object);
1549 		l_object = NULL;
1550 	}
1551 	vm_page_unlock_queues();
1552 	return pages_moved;
1553 }
1554 
1555 
1556 
1557 /*
1558  * function in BSD to apply I/O throttle to the pageout thread
1559  */
1560 extern void vm_pageout_io_throttle(void);
1561 
1562 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
1563 	MACRO_BEGIN                                                     \
1564 	/* \
1565 	 * If a "reusable" page somehow made it back into \
1566 	 * the active queue, it's been re-used and is not \
1567 	 * quite re-usable. \
1568 	 * If the VM object was "all_reusable", consider it \
1569 	 * as "all re-used" instead of converting it to \
1570 	 * "partially re-used", which could be expensive. \
1571 	 */                                                             \
1572 	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
1573 	if ((m)->vmp_reusable ||                                        \
1574 	    (obj)->all_reusable) {                                      \
1575 	        vm_object_reuse_pages((obj),                            \
1576 	                              (m)->vmp_offset,                  \
1577 	                              (m)->vmp_offset + PAGE_SIZE_64,   \
1578 	                              FALSE);                           \
1579 	}                                                               \
1580 	MACRO_END
1581 
1582 
1583 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024
1585 
1586 #define FCS_IDLE                0
1587 #define FCS_DELAYED             1
1588 #define FCS_DEADLOCK_DETECTED   2
1589 
1590 struct flow_control {
1591 	int             state;
1592 	mach_timespec_t ts;
1593 };
1594 
1595 
1596 uint64_t vm_pageout_rejected_bq_internal = 0;
1597 uint64_t vm_pageout_rejected_bq_external = 0;
1598 uint64_t vm_pageout_skipped_bq_internal = 0;
1599 uint64_t vm_pageout_skipped_bq_external = 0;
1600 
1601 #define ANONS_GRABBED_LIMIT     2
1602 
1603 
1604 #if 0
1605 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1606 #endif
1607 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1608 
1609 #define VM_PAGEOUT_PB_NO_ACTION                         0
1610 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1611 #define VM_PAGEOUT_PB_THREAD_YIELD                      2
1612 
1613 
1614 #if 0
1615 static void
1616 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1617 {
1618 	if (*local_freeq) {
1619 		vm_page_unlock_queues();
1620 
1621 		VM_DEBUG_CONSTANT_EVENT(
1622 			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1623 			vm_page_free_count, 0, 0, 1);
1624 
1625 		vm_page_free_list(*local_freeq, TRUE);
1626 
1627 		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1628 		    vm_page_free_count, *local_freed, 0, 1);
1629 
1630 		*local_freeq = NULL;
1631 		*local_freed = 0;
1632 
1633 		vm_page_lock_queues();
1634 	} else {
1635 		lck_mtx_yield(&vm_page_queue_lock);
1636 	}
1637 	*delayed_unlock = 1;
1638 }
1639 #endif
1640 
1641 
1642 static void
1643 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1644     vm_page_t *local_freeq, int *local_freed, int action)
1645 {
1646 	vm_page_unlock_queues();
1647 
1648 	if (*object != NULL) {
1649 		vm_object_unlock(*object);
1650 		*object = NULL;
1651 	}
1652 	if (*local_freeq) {
1653 		vm_page_free_list(*local_freeq, TRUE);
1654 
1655 		*local_freeq = NULL;
1656 		*local_freed = 0;
1657 	}
1658 	*delayed_unlock = 1;
1659 
1660 	switch (action) {
1661 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1662 		vm_consider_waking_compactor_swapper();
1663 		break;
1664 	case VM_PAGEOUT_PB_THREAD_YIELD:
1665 		thread_yield_internal(1);
1666 		break;
1667 	case VM_PAGEOUT_PB_NO_ACTION:
1668 	default:
1669 		break;
1670 	}
1671 	vm_page_lock_queues();
1672 }
1673 
1674 
1675 static struct vm_pageout_vminfo last;
1676 
1677 uint64_t last_vm_page_pages_grabbed = 0;
1678 
1679 extern  uint32_t c_segment_pages_compressed;
1680 
1681 extern uint64_t shared_region_pager_reclaimed;
1682 extern struct memory_object_pager_ops shared_region_pager_ops;
1683 
1684 void
1685 update_vm_info(void)
1686 {
1687 	unsigned long tmp;
1688 	uint64_t tmp64;
1689 
1690 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1691 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1692 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1693 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1694 
1695 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1696 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1697 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1698 
1699 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1700 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1701 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1702 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1703 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1704 
1705 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1706 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1707 	last.vm_pageout_considered_page = tmp;
1708 
1709 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1710 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1711 	last.vm_pageout_compressions = tmp64;
1712 
1713 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1714 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1715 	last.vm_compressor_failed = tmp;
1716 
1717 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1718 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1719 	last.vm_compressor_pages_grabbed = tmp64;
1720 
1721 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1722 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1723 	last.vm_phantom_cache_found_ghost = tmp;
1724 
1725 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1726 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1727 	last.vm_phantom_cache_added_ghost = tmp;
1728 
1729 	tmp64 = counter_load(&vm_page_grab_count);
1730 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1731 	last_vm_page_pages_grabbed = tmp64;
1732 
1733 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1734 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1735 	last.vm_page_pages_freed = tmp;
1736 
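	/*
	 * The detailed per-sample deltas below are only meaningful (and only
	 * updated) when at least one page was considered during this interval.
	 */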
1737 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1738 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1739 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1740 		last.vm_pageout_pages_evicted = tmp;
1741 
1742 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1743 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1744 		last.vm_pageout_pages_purged = tmp;
1745 
1746 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1747 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1748 		last.vm_pageout_freed_speculative = tmp;
1749 
1750 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1751 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1752 		last.vm_pageout_freed_external = tmp;
1753 
1754 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1755 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1756 		last.vm_pageout_inactive_referenced = tmp;
1757 
1758 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1759 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1760 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1761 
1762 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1763 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1764 		last.vm_pageout_inactive_dirty_external = tmp;
1765 
1766 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1767 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1768 		last.vm_pageout_freed_cleaned = tmp;
1769 
1770 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1771 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1772 		last.vm_pageout_inactive_nolock = tmp;
1773 
1774 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1775 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1776 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1777 
1778 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1779 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1780 		last.vm_pageout_skipped_external = tmp;
1781 
1782 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1783 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1784 		last.vm_pageout_skipped_internal = tmp;
1785 
1786 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1787 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1788 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1789 
1790 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1791 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1792 		last.vm_pageout_inactive_force_reclaim = tmp;
1793 
1794 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1795 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1796 		last.vm_pageout_freed_internal = tmp;
1797 
1798 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1799 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1800 		last.vm_pageout_considered_bq_internal = tmp;
1801 
1802 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1803 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1804 		last.vm_pageout_considered_bq_external = tmp;
1805 
1806 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1807 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1808 		last.vm_pageout_filecache_min_reactivated = tmp;
1809 
1810 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1811 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1812 		last.vm_pageout_inactive_dirty_internal = tmp;
1813 
1814 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1815 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1816 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1817 
1818 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1819 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1820 		last.vm_pageout_forcereclaimed_realtime = tmp;
1821 
1822 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1823 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1824 		last.vm_pageout_protected_sharedcache = tmp;
1825 
1826 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1827 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1828 		last.vm_pageout_protected_realtime = tmp;
1829 	}
1830 
1831 	KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
1832 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1833 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1834 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1835 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1836 
1837 	KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
1838 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1839 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1840 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1841 
1842 	KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
1843 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1844 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1845 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1846 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1847 
1848 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1849 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1850 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1851 		KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1852 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1853 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1854 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1855 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1856 
1857 		KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1858 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1859 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1860 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1861 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1862 
1863 		KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1864 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1865 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1866 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1867 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1868 
1869 		KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1870 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1871 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1872 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1873 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1874 
1875 		KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1876 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1877 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1878 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1879 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1880 
1881 		KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1882 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1883 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1884 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1885 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1886 	}
1887 	KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1888 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1889 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1890 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1891 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1892 
1893 	record_memory_pressure();
1894 }
1895 
1896 extern boolean_t hibernation_vmqueues_inspection;
1897 
1898 /*
1899  * Return values for functions called by vm_pageout_scan
1900  * that control its flow.
1901  *
1902  * PROCEED -- vm_pageout_scan will keep making forward progress.
1903  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1904  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1905  */
1906 
1907 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1908 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1909 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1910 
1911 /*
1912  * This function is called only from vm_pageout_scan and
1913  * it moves overflow secluded pages (one-at-a-time) to the
1914  * batched 'local' free Q or active Q.
1915  */
1916 static void
1917 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1918 {
1919 #if CONFIG_SECLUDED_MEMORY
1920 	/*
1921 	 * Deal with secluded_q overflow.
1922 	 */
1923 	if (vm_page_secluded_count > vm_page_secluded_target) {
1924 		vm_page_t secluded_page;
1925 
1926 		/*
1927 		 * SECLUDED_AGING_BEFORE_ACTIVE:
1928 		 * Excess secluded pages go to the active queue and
1929 		 * will later go to the inactive queue.
1930 		 */
1931 		assert((vm_page_secluded_count_free +
1932 		    vm_page_secluded_count_inuse) ==
1933 		    vm_page_secluded_count);
1934 		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1935 		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1936 
1937 		vm_page_queues_remove(secluded_page, FALSE);
1938 		assert(!vm_page_is_fictitious(secluded_page));
1939 		assert(!VM_PAGE_WIRED(secluded_page));
1940 
1941 		if (secluded_page->vmp_object == 0) {
1942 			/* transfer to free queue */
1943 			assert(secluded_page->vmp_busy);
1944 			secluded_page->vmp_snext = *local_freeq;
1945 			*local_freeq = secluded_page;
1946 			*local_freed += 1;
1947 		} else {
1948 			/* transfer to head of active queue */
1949 			vm_page_enqueue_active(secluded_page, FALSE);
1950 			secluded_page = VM_PAGE_NULL;
1951 		}
1952 	}
1953 #else /* CONFIG_SECLUDED_MEMORY */
1954 
1955 #pragma unused(local_freeq)
1956 #pragma unused(local_freed)
1957 
1958 	return;
1959 
1960 #endif /* CONFIG_SECLUDED_MEMORY */
1961 }
1962 
1963 /*
1964  * This function is called only from vm_pageout_scan and
1965  * it initializes the loop targets for vm_pageout_scan().
1966  */
1967 static void
1968 vps_init_page_targets(void)
1969 {
1970 	/*
1971 	 * LD TODO: Other page targets should be calculated here too.
1972 	 */
1973 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1974 
1975 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1976 		vm_pageout_state.vm_page_speculative_percentage = 50;
1977 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1978 		vm_pageout_state.vm_page_speculative_percentage = 1;
1979 	}
1980 
1981 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1982 	    vm_page_inactive_count);
1983 }
1984 
1985 /*
1986  * This function is called only from vm_pageout_scan and
1987  * it purges a single VM object at a time and will either
1988  * make vm_pageout_scan() restart the loop or keep moving forward.
1989  */
1990 static int
1991 vps_purge_object()
1992 {
1993 	int             force_purge;
1994 
1995 	assert(available_for_purge >= 0);
1996 	force_purge = 0; /* no force-purging */
1997 
1998 #if VM_PRESSURE_EVENTS
1999 	vm_pressure_level_t pressure_level;
2000 
2001 	pressure_level = memorystatus_vm_pressure_level;
2002 
2003 	if (pressure_level > kVMPressureNormal) {
2004 		if (pressure_level >= kVMPressureCritical) {
2005 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2006 		} else if (pressure_level >= kVMPressureUrgent) {
2007 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2008 		} else if (pressure_level >= kVMPressureWarning) {
2009 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2010 		}
2011 	}
2012 #endif /* VM_PRESSURE_EVENTS */
2013 
2014 	if (available_for_purge || force_purge) {
2015 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2016 
2017 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2018 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2019 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2020 			VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2021 			memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2022 
2023 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2024 		}
2025 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2026 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2027 	}
2028 
2029 	return VM_PAGEOUT_SCAN_PROCEED;
2030 }
2031 
2032 /*
2033  * This function is called only from vm_pageout_scan and
2034  * it will try to age the next speculative Q if the oldest
2035  * one is empty.
2036  */
2037 static int
2038 vps_age_speculative_queue(boolean_t force_speculative_aging)
2039 {
2040 #define DELAY_SPECULATIVE_AGE   1000
2041 
2042 	/*
2043 	 * try to pull pages from the aging bins...
2044 	 * see vm_page_internal.h for an explanation of how
2045 	 * this mechanism works
2046 	 */
2047 	boolean_t                       can_steal = FALSE;
2048 	int                             num_scanned_queues;
2049 	static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2050 	mach_timespec_t                 ts;
2051 	struct vm_speculative_age_q     *aq;
2052 	struct vm_speculative_age_q     *sq;
2053 
2054 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2055 
2056 	aq = &vm_page_queue_speculative[speculative_steal_index];
2057 
2058 	num_scanned_queues = 0;
2059 	while (vm_page_queue_empty(&aq->age_q) &&
2060 	    num_scanned_queues++ != vm_page_max_speculative_age_q) {
2061 		speculative_steal_index++;
2062 
2063 		if (speculative_steal_index > vm_page_max_speculative_age_q) {
2064 			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2065 		}
2066 
2067 		aq = &vm_page_queue_speculative[speculative_steal_index];
2068 	}
2069 
2070 	if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
2071 		/*
2072 		 * XXX We've scanned all the speculative
2073 		 * queues but still haven't found one
2074 		 * that is not empty, even though
2075 		 * vm_page_speculative_count is not 0.
2076 		 */
2077 		if (!vm_page_queue_empty(&sq->age_q)) {
2078 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2079 		}
2080 #if DEVELOPMENT || DEBUG
2081 		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2082 #endif
2083 		/* readjust... */
2084 		vm_page_speculative_count = 0;
2085 		/* ... and continue */
2086 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2087 	}
2088 
2089 	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2090 		can_steal = TRUE;
2091 	} else {
2092 		if (!delay_speculative_age) {
2093 			mach_timespec_t ts_fully_aged;
2094 
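			/*
			 * Compute the time at which the current aging bin is fully
			 * aged: the bin's timestamp plus the full aging interval
			 * (max queue count * per-queue age in ms).  If that time has
			 * passed we can steal from it now; otherwise bump the delay
			 * counter and try again on a later pass.
			 */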
2095 			ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2096 			ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2097 			    * 1000 * NSEC_PER_USEC;
2098 
2099 			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2100 
2101 			clock_sec_t sec;
2102 			clock_nsec_t nsec;
2103 			clock_get_system_nanotime(&sec, &nsec);
2104 			ts.tv_sec = (unsigned int) sec;
2105 			ts.tv_nsec = nsec;
2106 
2107 			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2108 				can_steal = TRUE;
2109 			} else {
2110 				delay_speculative_age++;
2111 			}
2112 		} else {
2113 			delay_speculative_age++;
2114 			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2115 				delay_speculative_age = 0;
2116 			}
2117 		}
2118 	}
2119 	if (can_steal == TRUE) {
2120 		vm_page_speculate_ageit(aq);
2121 	}
2122 
2123 	return VM_PAGEOUT_SCAN_PROCEED;
2124 }
2125 
2126 /*
2127  * This function is called only from vm_pageout_scan and
2128  * it evicts a single VM object from the cache.
2129  */
2130 static int inline
2131 vps_object_cache_evict(vm_object_t *object_to_unlock)
2132 {
2133 	static int                      cache_evict_throttle = 0;
2134 	struct vm_speculative_age_q     *sq;
2135 
2136 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2137 
2138 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2139 		int     pages_evicted;
2140 
2141 		if (*object_to_unlock != NULL) {
2142 			vm_object_unlock(*object_to_unlock);
2143 			*object_to_unlock = NULL;
2144 		}
2145 		KDBG(0x13001ec | DBG_FUNC_START);
2146 
2147 		pages_evicted = vm_object_cache_evict(100, 10);
2148 
2149 		KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2150 
2151 		if (pages_evicted) {
2152 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2153 
2154 			VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2155 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2156 			memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2157 
2158 			/*
2159 			 * we just freed up to 100 pages,
2160 			 * so go back to the top of the main loop
2161 			 * and re-evaluate the memory situation
2162 			 */
2163 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2164 		} else {
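			/*
			 * Nothing was evicted: back off and skip object-cache
			 * eviction for the next 1000 passes through this path.
			 */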
2165 			cache_evict_throttle = 1000;
2166 		}
2167 	}
2168 	if (cache_evict_throttle) {
2169 		cache_evict_throttle--;
2170 	}
2171 
2172 	return VM_PAGEOUT_SCAN_PROCEED;
2173 }
2174 
2175 
2176 /*
2177  * This function is called only from vm_pageout_scan and
2178  * it calculates the filecache min. that needs to be maintained
2179  * as we start to steal pages.
2180  */
2181 static void
2182 vps_calculate_filecache_min(void)
2183 {
2184 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2185 
2186 #if CONFIG_JETSAM
2187 	/*
2188 	 * don't let the filecache_min fall below 15% of available memory
2189 	 * on systems with an active compressor that isn't nearing its
2190 	 * limits w/r to accepting new data
2191 	 *
2192 	 * on systems w/o the compressor/swapper, the filecache is always
2193 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2194 	 * since most (if not all) of the anonymous pages are in the
2195 	 * throttled queue (which isn't counted as available) which
2196 	 * effectively disables this filter
2197 	 */
2198 	if (vm_compressor_low_on_space() || divisor == 0) {
2199 		vm_pageout_state.vm_page_filecache_min = 0;
2200 	} else {
2201 		vm_pageout_state.vm_page_filecache_min =
2202 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2203 	}
2204 #else
2205 	if (vm_compressor_out_of_space() || divisor == 0) {
2206 		vm_pageout_state.vm_page_filecache_min = 0;
2207 	} else {
2208 		/*
2209 		 * don't let the filecache_min fall below the specified critical level
2210 		 */
2211 		vm_pageout_state.vm_page_filecache_min =
2212 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2213 	}
2214 #endif
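	/*
	 * If free memory is critically low, drop the filecache floor entirely
	 * so that file-backed pages remain eligible for reclaim.
	 */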
2215 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2216 		vm_pageout_state.vm_page_filecache_min = 0;
2217 	}
2218 }
2219 
2220 /*
2221  * This function is called only from vm_pageout_scan and
2222  * it updates the flow control time to detect if VM pageoutscan
2223  * it updates the flow control time to detect if VM pageout scan
2224  */
2225 static void
2226 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2227 {
2228 	mach_timespec_t ts;
2229 	clock_sec_t sec;
2230 	clock_nsec_t nsec;
2231 
2232 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2233 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2234 	clock_get_system_nanotime(&sec, &nsec);
2235 	flow_control->ts.tv_sec = (unsigned int) sec;
2236 	flow_control->ts.tv_nsec = nsec;
2237 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2238 
2239 	flow_control->state = FCS_DELAYED;
2240 
2241 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2242 }
2243 
2244 /*
2245  * This function is called only from vm_pageout_scan and
2246  * it is the flow control logic of VM pageout scan which
2247  * controls if it should block and for how long.
2248  * Any blocking of vm_pageout_scan happens ONLY in this function.
2249  */
2250 static int
2251 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2252     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2253 {
2254 	boolean_t       exceeded_burst_throttle = FALSE;
2255 	unsigned int    msecs = 0;
2256 	uint32_t        inactive_external_count;
2257 	mach_timespec_t ts;
2258 	struct  vm_pageout_queue *iq;
2259 	struct  vm_pageout_queue *eq;
2260 	struct  vm_speculative_age_q *sq;
2261 
2262 	iq = &vm_pageout_queue_internal;
2263 	eq = &vm_pageout_queue_external;
2264 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2265 
2266 	/*
2267 	 * Sometimes we have to pause:
2268 	 *	1) No inactive pages - nothing to do.
2269 	 *	2) Loop control - no acceptable pages found on the inactive queue
2270 	 *         within the last vm_pageout_burst_inactive_throttle iterations
2271 	 *	3) Flow control - default pageout queue is full
2272 	 */
2273 	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2274 	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
2275 	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
2276 	    vm_page_queue_empty(&sq->age_q)) {
2277 		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2278 		msecs = vm_pageout_state.vm_pageout_empty_wait;
2279 	} else if (inactive_burst_count >=
2280 	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2281 	    (vm_page_inactive_count +
2282 	    vm_page_speculative_count))) {
2283 		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2284 		msecs = vm_pageout_state.vm_pageout_burst_wait;
2285 
2286 		exceeded_burst_throttle = TRUE;
2287 	} else if (VM_PAGE_Q_THROTTLED(iq) &&
2288 	    VM_DYNAMIC_PAGING_ENABLED()) {
2289 		clock_sec_t sec;
2290 		clock_nsec_t nsec;
2291 
2292 		switch (flow_control->state) {
2293 		case FCS_IDLE:
2294 			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2295 			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2296 				/*
2297 				 * since the compressor is running independently of vm_pageout_scan
2298 				 * let's not wait for it just yet... as long as we have a healthy supply
2299 				 * of filecache pages to work with, let's keep stealing those.
2300 				 */
2301 				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2302 
2303 				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2304 				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2305 					*anons_grabbed = ANONS_GRABBED_LIMIT;
2306 					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2307 					return VM_PAGEOUT_SCAN_PROCEED;
2308 				}
2309 			}
2310 
2311 			vps_flow_control_reset_deadlock_timer(flow_control);
2312 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2313 
2314 			break;
2315 
2316 		case FCS_DELAYED:
2317 			clock_get_system_nanotime(&sec, &nsec);
2318 			ts.tv_sec = (unsigned int) sec;
2319 			ts.tv_nsec = nsec;
2320 
2321 			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2322 				/*
2323 				 * the pageout thread for the default pager is potentially
2324 				 * deadlocked since the
2325 				 * default pager queue has been throttled for more than the
2326 				 * allowable time... we need to move some clean pages or dirty
2327 				 * pages belonging to the external pagers if they aren't throttled
2328 				 * vm_page_free_wanted represents the number of threads currently
2329 				 * blocked waiting for pages... we'll move one page for each of
2330 				 * these plus a fixed amount to break the logjam... once we're done
2331 				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2332 				 * with a new timeout target since we have no way of knowing
2333 				 * whether we've broken the deadlock except through observation
2334 				 * of the queue associated with the default pager... we need to
2335 				 * stop moving pages and allow the system to run to see what
2336 				 * state it settles into.
2337 				 */
2338 
2339 				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2340 				    vm_page_free_wanted + vm_page_free_wanted_privileged;
2341 				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2342 				flow_control->state = FCS_DEADLOCK_DETECTED;
2343 				sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
2344 				return VM_PAGEOUT_SCAN_PROCEED;
2345 			}
2346 			/*
2347 			 * just resniff instead of trying
2348 			 * to compute a new delay time... we're going to be
2349 			 * awakened immediately upon a laundry completion,
2350 			 * so we won't wait any longer than necessary
2351 			 */
2352 			msecs = vm_pageout_state.vm_pageout_idle_wait;
2353 			break;
2354 
2355 		case FCS_DEADLOCK_DETECTED:
2356 			if (*vm_pageout_deadlock_target) {
2357 				return VM_PAGEOUT_SCAN_PROCEED;
2358 			}
2359 
2360 			vps_flow_control_reset_deadlock_timer(flow_control);
2361 			msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2362 
2363 			break;
2364 		}
2365 	} else {
2366 		/*
2367 		 * No need to pause...
2368 		 */
2369 		return VM_PAGEOUT_SCAN_PROCEED;
2370 	}
2371 
2372 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2373 
2374 	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2375 	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2376 
2377 	if (vm_page_free_count >= vm_page_free_target) {
2378 		/*
2379 		 * we're here because
2380 		 *  1) someone else freed up some pages while we had
2381 		 *     the queues unlocked above
2382 		 * and we've hit one of the 3 conditions that
2383 		 * cause us to pause the pageout scan thread
2384 		 *
2385 		 * since we already have enough free pages,
2386 		 * let's avoid stalling and return normally
2387 		 *
2388 		 * before we return, make sure the pageout I/O threads
2389 		 * are running throttled in case there are still requests
2390 		 * in the laundry... since we have enough free pages
2391 		 * we don't need the laundry to be cleaned in a timely
2392 		 * fashion... so let's avoid interfering with foreground
2393 		 * activity
2394 		 *
2395 		 * we don't want to hold vm_page_queue_free_lock when
2396 		 * calling vm_pageout_adjust_eq_iothrottle (since it
2397 		 * may cause other locks to be taken), we do the initial
2398 		 * check outside of the lock.  Once we take the lock,
2399 		 * we recheck the condition since it may have changed.
2400 		 * if it has, no problem, we will make the threads
2401 		 * non-throttled before actually blocking
2402 		 */
2403 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2404 	}
2405 	vm_free_page_lock();
2406 
2407 	if (vm_page_free_count >= vm_page_free_target &&
2408 	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2409 		return VM_PAGEOUT_SCAN_DONE_RETURN;
2410 	}
2411 	vm_free_page_unlock();
2412 
2413 	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2414 		/*
2415 		 * we're most likely about to block due to one of
2416 		 * the 3 conditions that cause vm_pageout_scan to
2417 		 * not be able to make forward progress w/r
2418 		 * to providing new pages to the free queue,
2419 		 * so unthrottle the I/O threads in case we
2420 		 * have laundry to be cleaned... it needs
2421 		 * to be completed ASAP.
2422 		 *
2423 		 * even if we don't block, we want the io threads
2424 		 * running unthrottled since the sum of free +
2425 		 * clean pages is still under our free target
2426 		 */
2427 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2428 	}
2429 	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2430 		/*
2431 		 * if we get here we're below our free target and
2432 		 * we're stalling due to a full laundry queue or
2433 		 * we don't have any inactive pages other than
2434 		 * those in the clean queue...
2435 		 * however, we have pages on the clean queue that
2436 		 * can be moved to the free queue, so let's not
2437 		 * stall the pageout scan
2438 		 */
2439 		flow_control->state = FCS_IDLE;
2440 		return VM_PAGEOUT_SCAN_PROCEED;
2441 	}
2442 	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2443 		flow_control->state = FCS_IDLE;
2444 		return VM_PAGEOUT_SCAN_PROCEED;
2445 	}
2446 
2447 	VM_CHECK_MEMORYSTATUS;
2448 
2449 	if (flow_control->state != FCS_IDLE) {
2450 		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2451 	}
2452 
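	/*
	 * Mark the internal pageout queue as throttled and arm a timed wait
	 * (msecs) on its laundry event; a laundry completion will wake us
	 * earlier than the timeout if the queue drains.
	 */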
2453 	iq->pgo_throttled = TRUE;
2454 	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2455 
2456 	vm_page_unlock_queues();
2457 
2458 	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2459 
2460 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2461 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2462 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2463 
2464 	thread_block(THREAD_CONTINUE_NULL);
2465 
2466 	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2467 	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2468 	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2469 
2470 	vm_page_lock_queues();
2471 
2472 	iq->pgo_throttled = FALSE;
2473 
2474 	vps_init_page_targets();
2475 
2476 	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2477 }
2478 
2479 extern boolean_t vm_darkwake_mode;
2480 /*
2481  * This function is called only from vm_pageout_scan and
2482  * it will find and return the most appropriate page to be
2483  * reclaimed.
2484  */
2485 static int
2486 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2487     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2488 {
2489 	vm_page_t                       m = NULL;
2490 	vm_object_t                     m_object = VM_OBJECT_NULL;
2491 	uint32_t                        inactive_external_count;
2492 	struct vm_speculative_age_q     *sq;
2493 	struct vm_pageout_queue         *iq;
2494 	int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2495 
2496 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2497 	iq = &vm_pageout_queue_internal;
2498 
2499 	*is_page_from_bg_q = FALSE;
2500 
2501 	m = NULL;
2502 	m_object = VM_OBJECT_NULL;
2503 
2504 	if (VM_DYNAMIC_PAGING_ENABLED()) {
2505 		assert(vm_page_throttled_count == 0);
2506 		assert(vm_page_queue_empty(&vm_page_queue_throttled));
2507 	}
2508 
2509 	/*
2510 	 * Try for a clean-queue inactive page.
2511 	 * These are pages that vm_pageout_scan tried to steal earlier, but
2512 	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
2513 	 */
2514 	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2515 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2516 
2517 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2518 
2519 		goto found_page;
2520 	}
2521 
2522 	/*
2523 	 * The next most eligible pages are ones we paged in speculatively,
2524 	 * but which have not yet been touched and have been aged out.
2525 	 */
2526 	if (!vm_page_queue_empty(&sq->age_q)) {
2527 		m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2528 
2529 		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2530 
2531 		if (!m->vmp_dirty || force_anonymous == FALSE) {
2532 			goto found_page;
2533 		} else {
2534 			m = NULL;
2535 		}
2536 	}
2537 
2538 #if !CONFIG_JETSAM
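	/*
	 * When page donation is enabled, consider the donate queue next:
	 * if it has ripened and is non-empty, take its first page.
	 */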
2539 	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2540 		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2541 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2542 			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2543 			goto found_page;
2544 		}
2545 	}
2546 #endif /* !CONFIG_JETSAM */
2547 
2548 	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2549 		vm_object_t     bg_m_object = NULL;
2550 
2551 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2552 
2553 		bg_m_object = VM_PAGE_OBJECT(m);
2554 
2555 		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2556 			/*
2557 			 * This page is on the background queue
2558 			 * but not on a pageable queue OR is busy during
2559 			 * darkwake mode when the target is artificially lowered.
2560 			 * If it is busy during darkwake mode, and we don't skip it,
2561 			 * we will just swing back around and try again with the same
2562 			 * queue and might hit the same page or its neighbor in a
2563 			 * similar state. Both of these are transient states and will
2564 			 * get resolved, but, at this point let's ignore this page.
2565 			 */
2566 			if (vm_darkwake_mode && m->vmp_busy) {
2567 				if (bg_m_object->internal) {
2568 					vm_pageout_skipped_bq_internal++;
2569 				} else {
2570 					vm_pageout_skipped_bq_external++;
2571 				}
2572 			}
2573 		} else if (force_anonymous == FALSE || bg_m_object->internal) {
2574 			if (bg_m_object->internal &&
2575 			    (VM_PAGE_Q_THROTTLED(iq) ||
2576 			    vm_compressor_out_of_space() == TRUE ||
2577 			    vm_page_free_count < (vm_page_free_reserved / 4))) {
2578 				vm_pageout_skipped_bq_internal++;
2579 			} else {
2580 				*is_page_from_bg_q = TRUE;
2581 
2582 				if (bg_m_object->internal) {
2583 					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2584 				} else {
2585 					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2586 				}
2587 				goto found_page;
2588 			}
2589 		}
2590 	}
2591 
2592 	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2593 
2594 	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2595 	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2596 		*grab_anonymous = TRUE;
2597 		*anons_grabbed = 0;
2598 
2599 		if (VM_CONFIG_SWAP_IS_ACTIVE) {
2600 			vm_pageout_vminfo.vm_pageout_skipped_external++;
2601 		} else {
2602 			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2603 				/*
2604 				 * No swap and we are in dangerously low levels of free memory.
2605 				 * If we keep going ahead with anonymous pages, we are going to run into a situation
2606 				 * where the compressor will be stuck waiting for free pages (if it isn't already).
2607 				 *
2608 				 * So, pick a file backed page...
2609 				 */
2610 				*grab_anonymous = FALSE;
2611 				*anons_grabbed = ANONS_GRABBED_LIMIT;
2612 				vm_pageout_vminfo.vm_pageout_skipped_internal++;
2613 			}
2614 		}
2615 		goto want_anonymous;
2616 	}
2617 	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2618 
2619 #if CONFIG_JETSAM
2620 	/* If the file-backed pool has accumulated
2621 	 * significantly more pages than the jetsam
2622 	 * threshold, prefer to reclaim those
2623 	 * inline to minimise compute overhead of reclaiming
2624 	 * anonymous pages.
2625 	 * This calculation does not account for the CPU local
2626 	 * external page queues, as those are expected to be
2627 	 * much smaller relative to the global pools.
2628 	 */
2629 
2630 	struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2631 
2632 	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2633 		if (vm_page_pageable_external_count >
2634 		    vm_pageout_state.vm_page_filecache_min) {
2635 			if ((vm_page_pageable_external_count *
2636 			    vm_pageout_memorystatus_fb_factor_dr) >
2637 			    (memorystatus_get_critical_page_shortage_threshold() *
2638 			    vm_pageout_memorystatus_fb_factor_nr)) {
2639 				*grab_anonymous = FALSE;
2640 
2641 				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2642 			}
2643 		}
2644 		if (*grab_anonymous) {
2645 			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2646 		}
2647 	}
2648 #endif /* CONFIG_JETSAM */
2649 
2650 want_anonymous:
2651 	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2652 		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2653 			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2654 
2655 			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2656 			*anons_grabbed = 0;
2657 
2658 			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2659 				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2660 					if ((++(*reactivated_this_call) % 100)) {
2661 						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2662 
2663 						vm_page_activate(m);
2664 						counter_inc(&vm_statistics_reactivations);
2665 #if DEVELOPMENT || DEBUG
2666 						if (*is_page_from_bg_q == TRUE) {
2667 							if (m_object->internal) {
2668 								vm_pageout_rejected_bq_internal++;
2669 							} else {
2670 								vm_pageout_rejected_bq_external++;
2671 							}
2672 						}
2673 #endif /* DEVELOPMENT || DEBUG */
2674 						vm_pageout_state.vm_pageout_inactive_used++;
2675 
2676 						m = NULL;
2677 						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2678 
2679 						goto found_page;
2680 					}
2681 
2682 					/*
2683 					 * steal 1 of the file backed pages even if
2684 					 * we are under the limit that has been set
2685 					 * for a healthy filecache
2686 					 */
2687 				}
2688 			}
2689 			goto found_page;
2690 		}
2691 	}
2692 	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2693 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2694 
2695 		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2696 		*anons_grabbed += 1;
2697 
2698 		goto found_page;
2699 	}
2700 
2701 	m = NULL;
2702 
2703 found_page:
2704 	*victim_page = m;
2705 
2706 	return retval;
2707 }
2708 
2709 /*
2710  * This function is called only from vm_pageout_scan and
2711  * it will put a page back on the active/inactive queue
2712  * if we can't reclaim it for some reason.
2713  */
2714 static void
2715 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2716 {
2717 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2718 		vm_page_enqueue_inactive(m, FALSE);
2719 	} else {
2720 		vm_page_activate(m);
2721 	}
2722 
2723 #if DEVELOPMENT || DEBUG
2724 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2725 
2726 	if (page_from_bg_q == TRUE) {
2727 		if (m_object->internal) {
2728 			vm_pageout_rejected_bq_internal++;
2729 		} else {
2730 			vm_pageout_rejected_bq_external++;
2731 		}
2732 	}
2733 #endif /* DEVELOPMENT || DEBUG */
2734 }
2735 
2736 /*
2737  * This function is called only from vm_pageout_scan and
2738  * it will try to grab the victim page's VM object (m_object)
2739  * which differs from the previous victim page's object (object).
2740  */
2741 static int
2742 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2743 {
2744 	struct vm_speculative_age_q *sq;
2745 
2746 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2747 
2748 	/*
2749 	 * the object associated with the candidate page is
2750 	 * different from the one we were just working
2751 	 * with... dump the lock if we still own it
2752 	 */
2753 	if (*object != NULL) {
2754 		vm_object_unlock(*object);
2755 		*object = NULL;
2756 	}
2757 	/*
2758 	 * Try to lock object; since we've already got the
2759 	 * page queues lock, we can only 'try' for this one.
2760 	 * if the 'try' fails, we need to do a mutex_pause
2761 	 * to allow the owner of the object lock a chance to
2762 	 * run... otherwise, we're likely to trip over this
2763 	 * object in the same state as we work our way through
2764 	 * the queue... clumps of pages associated with the same
2765 	 * object are fairly typical on the inactive and active queues
2766 	 */
2767 	if (!vm_object_lock_try_scan(m_object)) {
2768 		vm_page_t m_want = NULL;
2769 
2770 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2771 
2772 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2773 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2774 		}
2775 
2776 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2777 
2778 		m->vmp_reference = FALSE;
2779 
2780 		if (!m_object->object_is_shared_cache) {
2781 			/*
2782 			 * don't apply this optimization if this is the shared cache
2783 			 * object, it's too easy to get rid of very hot and important
2784 			 * pages...
2785 			 * m->vmp_object must be stable since we hold the page queues lock...
2786 			 * we can update the scan_collisions field sans the object lock
2787 			 * since it is a separate field and this is the only spot that does
2788 			 * a read-modify-write operation and it is never executed concurrently...
2789 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2790 		 * is possible for the value to be a bit non-deterministic, but that's ok
2791 			 * since it's only used as a hint
2792 			 */
2793 			m_object->scan_collisions = 1;
2794 		}
2795 		if (page_from_bg_q) {
2796 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2797 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2798 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2799 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2800 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2801 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2802 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2803 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2804 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2805 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2806 		}
2807 
2808 		/*
2809 		 * this is the next object we're going to be interested in
2810 		 * try to make sure it's available after the mutex_pause
2811 		 * returns control
2812 		 */
2813 		if (m_want) {
2814 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2815 		}
2816 
2817 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2818 
2819 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2820 	} else {
2821 		*object = m_object;
2822 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2823 	}
2824 
2825 	return VM_PAGEOUT_SCAN_PROCEED;
2826 }
2827 
2828 /*
2829  * This function is called only from vm_pageout_scan and
2830  * it notices that pageout scan may be rendered ineffective
2831  * due to a FS deadlock and will jetsam a process if possible.
2832  * If jetsam isn't supported, it'll move the page to the active
2833  * queue to try and get some different pages pushed onwards so
2834  * we can try to get out of this scenario.
2835  */
2836 static void
2837 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2838     boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2839 {
2840 	struct  vm_pageout_queue *eq;
2841 	vm_object_t cur_object = VM_OBJECT_NULL;
2842 
2843 	cur_object = *object;
2844 
2845 	eq = &vm_pageout_queue_external;
2846 
2847 	if (cur_object->internal == FALSE) {
2848 		/*
2849 		 * we need to break up the following potential deadlock case...
2850 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2851 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2852 		 *  c) Most of the pages in the inactive queue belong to this file.
2853 		 *
2854 		 * we are potentially in this deadlock because...
2855 		 *  a) the external pageout queue is throttled
2856 		 *  b) we're done with the active queue and moved on to the inactive queue
2857 		 *  c) we've got a dirty external page
2858 		 *
2859 		 * since we don't know the reason for the external pageout queue being throttled we
2860 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2861 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2862 		 *
2863 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2864 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2865 		 * pool the next time we select a victim page... if we can make enough new free pages,
2866 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2867 		 * be throttled
2868 		 *
2869 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2870 		 * that we can try to find clean pages in the active/inactive queues before
2871 		 * deciding to jetsam a process
2872 		 */
2873 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2874 
2875 		vm_page_check_pageable_safe(m);
2876 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2877 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2878 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2879 		vm_page_active_count++;
2880 		vm_page_pageable_external_count++;
2881 
2882 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2883 
2884 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2885 
2886 #pragma unused(force_anonymous)
2887 
2888 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2889 
2890 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2891 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2892 			/*
2893 			 * Possible deadlock scenario so request jetsam action
2894 			 */
2895 			memorystatus_kill_on_vps_starvation();
2896 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2897 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2898 		}
2899 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2900 
2901 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2902 
2903 		*force_anonymous = TRUE;
2904 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2905 	} else {
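		/*
		 * Internal (anonymous) page whose pageout path is throttled:
		 * reactivate it so the scan can move on to a different page.
		 */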
2906 		vm_page_activate(m);
2907 		counter_inc(&vm_statistics_reactivations);
2908 
2909 #if DEVELOPMENT || DEBUG
2910 		if (is_page_from_bg_q == TRUE) {
2911 			if (cur_object->internal) {
2912 				vm_pageout_rejected_bq_internal++;
2913 			} else {
2914 				vm_pageout_rejected_bq_external++;
2915 			}
2916 		}
2917 #endif /* DEVELOPMENT || DEBUG */
2918 
2919 		vm_pageout_state.vm_pageout_inactive_used++;
2920 	}
2921 }
2922 
2923 
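/*
 * Move up to 'max_to_move' pages from the head of the active queue to the
 * inactive queue, until inactive + speculative reaches the inactive target.
 * The caller must hold the page-queue lock.  Rebalancing is skipped while
 * hibernation is inspecting or cleaning these queues.
 */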
2924 void
2925 vm_page_balance_inactive(int max_to_move)
2926 {
2927 	vm_page_t m;
2928 
2929 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2930 
2931 	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2932 		/*
2933 		 * It is likely that the hibernation code path is
2934 		 * dealing with these very queues as we are about
2935 		 * to move pages around in/from them and completely
2936 		 * change the linkage of the pages.
2937 		 *
2938 		 * And so we skip the rebalancing of these queues.
2939 		 */
2940 		return;
2941 	}
2942 	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2943 	    vm_page_inactive_count +
2944 	    vm_page_speculative_count);
2945 
2946 	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2947 		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2948 
2949 		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2950 
2951 		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2952 		assert(!m->vmp_laundry);
2953 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2954 		assert(!vm_page_is_guard(m));
2955 
2956 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2957 
2958 		/*
2959 		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2960 		 *
2961 		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2962 		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2963 		 * new reference happens. If no further references happen on the page after that remote TLB flushes
2964 		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2965 		 * by pageout_scan, which is just fine since the last reference would have happened quite far
2966 		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2967 		 * have happened before we moved the page
2968 		 */
2969 		if (m->vmp_pmapped == TRUE) {
2970 			/*
2971 			 * We might be holding the page queue lock as a
2972 			 * spin lock and clearing the "referenced" bit could
2973 			 * take a while if there are lots of mappings of
2974 			 * that page, so make sure we acquire the lock as
2975 		 * a mutex to avoid a spinlock timeout.
2976 			 */
2977 			vm_page_lockconvert_queues();
2978 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2979 		}
2980 
2981 		/*
2982 		 * The page might be absent or busy,
2983 		 * but vm_page_deactivate can handle that.
2984 		 * FALSE indicates that we don't want a H/W clear reference
2985 		 */
2986 		vm_page_deactivate_internal(m, FALSE);
2987 	}
2988 }
2989 
2990 /*
2991  *	vm_pageout_scan does the dirty work for the pageout daemon.
2992  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2993  *	held and vm_page_free_wanted == 0.
2994  */
2995 void
2996 vm_pageout_scan(void)
2997 {
2998 	unsigned int loop_count = 0;
2999 	unsigned int inactive_burst_count = 0;
3000 	unsigned int reactivated_this_call;
3001 	unsigned int reactivate_limit;
3002 	vm_page_t   local_freeq = NULL;
3003 	int         local_freed = 0;
3004 	int         delayed_unlock;
3005 	int         delayed_unlock_limit = 0;
3006 	int         refmod_state = 0;
3007 	int     vm_pageout_deadlock_target = 0;
3008 	struct  vm_pageout_queue *iq;
3009 	struct  vm_pageout_queue *eq;
3010 	struct  vm_speculative_age_q *sq;
3011 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3012 	boolean_t inactive_throttled = FALSE;
3013 	vm_object_t     object = NULL;
3014 	uint32_t        inactive_reclaim_run;
3015 	boolean_t       grab_anonymous = FALSE;
3016 	boolean_t       force_anonymous = FALSE;
3017 	boolean_t       force_speculative_aging = FALSE;
3018 	int             anons_grabbed = 0;
3019 	int             page_prev_q_state = 0;
3020 	boolean_t       page_from_bg_q = FALSE;
3021 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3022 	vm_object_t     m_object = VM_OBJECT_NULL;
3023 	int             retval = 0;
3024 	boolean_t       lock_yield_check = FALSE;
3025 
3026 
3027 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3028 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3029 	    vm_pageout_state.vm_pageout_inactive_clean,
3030 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3031 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3032 
3033 	flow_control.state = FCS_IDLE;
3034 	iq = &vm_pageout_queue_internal;
3035 	eq = &vm_pageout_queue_external;
3036 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3037 
3038 	/* Ask the pmap layer to return any pages it no longer needs. */
3039 	pmap_release_pages_fast();
3040 
3041 	vm_page_lock_queues();
3042 
3043 	delayed_unlock = 1;
3044 
3045 	/*
3046 	 *	Calculate the max number of referenced pages on the inactive
3047 	 *	queue that we will reactivate.
3048 	 */
3049 	reactivated_this_call = 0;
3050 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3051 	    vm_page_inactive_count);
3052 	inactive_reclaim_run = 0;
3053 
3054 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3055 
3056 	/*
3057 	 *	We must limit the rate at which we send pages to the pagers
3058 	 *	so that we don't tie up too many pages in the I/O queues.
3059 	 *	We implement a throttling mechanism using the laundry count
3060 	 *      to limit the number of pages outstanding to the default
3061 	 *	and external pagers.  We can bypass the throttles and look
3062 	 *	for clean pages if the pageout queues don't drain in a timely
3063 	 *	fashion since this may indicate that the pageout paths are
3064 	 *	stalled waiting for memory, which only we can provide.
3065 	 */
3066 
3067 	vps_init_page_targets();
3068 	assert(object == NULL);
3069 	assert(delayed_unlock != 0);
3070 
3071 	for (;;) {
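	/*
	 * Main reclamation loop: each pass may briefly drop the queue lock,
	 * rebalances the active/inactive queues, returns if the free target
	 * has been met, then tries (in order) purging a volatile object,
	 * aging the speculative queues, and evicting from the object cache
	 * before selecting a victim page to reclaim.
	 */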
3072 		vm_page_t m;
3073 
3074 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3075 
3076 		if (lock_yield_check) {
3077 			lock_yield_check = FALSE;
3078 
3079 			if (delayed_unlock++ > delayed_unlock_limit) {
3080 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3081 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3082 			} else if (vm_pageout_scan_wants_object) {
3083 				vm_page_unlock_queues();
3084 				mutex_pause(0);
3085 				vm_page_lock_queues();
3086 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3087 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3088 			}
3089 		}
3090 
3091 		if (vm_upl_wait_for_pages < 0) {
3092 			vm_upl_wait_for_pages = 0;
3093 		}
3094 
3095 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3096 
3097 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3098 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3099 		}
3100 
3101 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3102 
3103 		assert(delayed_unlock);
3104 
3105 		/*
3106 		 * maintain our balance
3107 		 */
3108 		vm_page_balance_inactive(1);
3109 
3110 
3111 		/**********************************************************************
3112 		* above this point we're playing with the active and secluded queues
3113 		* below this point we're playing with the throttling mechanisms
3114 		* and the inactive queue
3115 		**********************************************************************/
3116 
3117 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3118 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3119 
3120 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3121 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3122 			/*
3123 			 * make sure the pageout I/O threads are running
3124 			 * throttled in case there are still requests
3125 			 * in the laundry... since we have met our targets
3126 			 * we don't need the laundry to be cleaned in a timely
3127 			 * fashion... so let's avoid interfering with foreground
3128 			 * activity
3129 			 */
3130 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3131 
3132 			vm_free_page_lock();
3133 
3134 			if ((vm_page_free_count >= vm_page_free_target) &&
3135 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3136 				/*
3137 				 * done - we have met our target *and*
3138 				 * there is no one waiting for a page.
3139 				 */
3140 return_from_scan:
3141 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3142 
3143 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3144 				    vm_pageout_state.vm_pageout_inactive,
3145 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3146 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3147 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3148 				    vm_pageout_state.vm_pageout_inactive_clean,
3149 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3150 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3151 
3152 				return;
3153 			}
3154 			vm_free_page_unlock();
3155 		}
3156 
3157 		/*
3158 		 * Before anything, we check if we have any ripe volatile
3159 		 * objects around. If so, try to purge the first object.
3160 		 * If the purge fails, fall through to reclaim a page instead.
3161 		 * If the purge succeeds, go back to the top and reevaluate
3162 		 * the new memory situation.
3163 		 */
3164 		retval = vps_purge_object();
3165 
3166 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3167 			/*
3168 			 * Success
3169 			 */
3170 			if (object != NULL) {
3171 				vm_object_unlock(object);
3172 				object = NULL;
3173 			}
3174 
3175 			lock_yield_check = FALSE;
3176 			continue;
3177 		}
3178 
3179 		/*
3180 		 * If our 'aged' queue is empty and we have some speculative pages
3181 		 * in the other queues, let's go through and see if we need to age
3182 		 * them.
3183 		 *
3184 		 * If we succeeded in aging a speculative Q or just that everything
3185 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3186 		 *
3187 		 * If, for some reason, we seem to have a mismatch between the spec.
3188 		 * page count and the page queues, we reset those variables and
3189 		 * restart the loop (LD TODO: Track this better?).
3190 		 */
3191 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3192 			retval = vps_age_speculative_queue(force_speculative_aging);
3193 
3194 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3195 				lock_yield_check = FALSE;
3196 				continue;
3197 			}
3198 		}
3199 		force_speculative_aging = FALSE;
3200 
3201 		/*
3202 		 * Check to see if we need to evict objects from the cache.
3203 		 *
3204 		 * Note: 'object' here doesn't have anything to do with
3205 		 * the eviction part. We just need to make sure we have dropped
3206 		 * any object lock we might be holding if we need to go down
3207 		 * into the eviction logic.
3208 		 */
3209 		retval = vps_object_cache_evict(&object);
3210 
3211 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3212 			lock_yield_check = FALSE;
3213 			continue;
3214 		}
3215 
3216 
3217 		/*
3218 		 * Calculate our filecache_min that will affect the loop
3219 		 * going forward.
3220 		 */
3221 		vps_calculate_filecache_min();
3222 
3223 		/*
3224 		 * LD TODO: Use a structure to hold all state variables for a single
3225 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3226 		 */
3227 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3228 		    &delayed_unlock, &local_freeq, &local_freed,
3229 		    &vm_pageout_deadlock_target, inactive_burst_count);
3230 
3231 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3232 			if (loop_count >= vm_page_inactive_count) {
3233 				loop_count = 0;
3234 			}
3235 
3236 			inactive_burst_count = 0;
3237 
3238 			assert(object == NULL);
3239 			assert(delayed_unlock != 0);
3240 
3241 			lock_yield_check = FALSE;
3242 			continue;
3243 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3244 			goto return_from_scan;
3245 		}
3246 
3247 		flow_control.state = FCS_IDLE;
3248 
3249 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3250 		    vm_pageout_inactive_external_forced_reactivate_limit);
3251 		loop_count++;
3252 		inactive_burst_count++;
3253 		vm_pageout_state.vm_pageout_inactive++;
3254 
3255 		/*
3256 		 * Choose a victim.
3257 		 */
3258 
3259 		m = NULL;
3260 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3261 
3262 		if (m == NULL) {
3263 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3264 				inactive_burst_count = 0;
3265 
3266 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3267 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3268 				}
3269 
3270 				lock_yield_check = TRUE;
3271 				continue;
3272 			}
3273 
3274 			/*
3275 			 * if we've gotten here, we have no victim page.
3276 			 * check to see if we've not finished balancing the queues
3277 			 * or we have a page on the aged speculative queue that we
3278 		 * skipped due to force_anonymous == TRUE... or we have
3279 		 * speculative pages that we can prematurely age... in any of
3280 		 * these cases we'll keep going, else panic
3281 			 */
3282 			force_anonymous = FALSE;
3283 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3284 
3285 			if (!vm_page_queue_empty(&sq->age_q)) {
3286 				lock_yield_check = TRUE;
3287 				continue;
3288 			}
3289 
3290 			if (vm_page_speculative_count) {
3291 				force_speculative_aging = TRUE;
3292 				lock_yield_check = TRUE;
3293 				continue;
3294 			}
3295 			panic("vm_pageout: no victim");
3296 
3297 			/* NOTREACHED */
3298 		}
3299 
3300 		assert(VM_PAGE_PAGEABLE(m));
3301 		m_object = VM_PAGE_OBJECT(m);
3302 		force_anonymous = FALSE;
3303 
3304 		page_prev_q_state = m->vmp_q_state;
3305 		/*
3306 		 * we just found this page on one of our queues...
3307 		 * it can't also be on the pageout queue, so safe
3308 		 * to call vm_page_queues_remove
3309 		 */
3310 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3311 		vm_page_queues_remove(m, TRUE);
3312 		if (donate) {
3313 			/*
3314 			 * The compressor needs to see this bit to know
3315 			 * where this page needs to land. Also if stolen,
3316 			 * this bit helps put the page back in the right
3317 			 * special queue where it belongs.
3318 			 */
3319 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3320 		}
3321 
3322 		assert(!m->vmp_laundry);
3323 		assert(vm_page_is_canonical(m));
3324 		assert(!is_kernel_object(m_object));
3325 
3326 		vm_pageout_vminfo.vm_pageout_considered_page++;
3327 
3328 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3329 
3330 		/*
3331 		 * check to see if we currently are working
3332 		 * with the same object... if so, we've
3333 		 * already got the lock
3334 		 */
3335 		if (m_object != object) {
3336 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3337 
3338 			/*
3339 			 * vps_switch_object() will always drop the 'object' lock first
3340 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3341 			 * either 'm_object' or NULL.
3342 			 */
3343 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3344 
3345 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3346 				lock_yield_check = TRUE;
3347 				continue;
3348 			}
3349 		}
3350 		assert(m_object == object);
3351 		assert(VM_PAGE_OBJECT(m) == m_object);
3352 
3353 		if (m->vmp_busy) {
3354 			/*
3355 			 *	Somebody is already playing with this page.
3356 			 *	Put it back on the appropriate queue
3357 			 *
3358 			 */
3359 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3360 
3361 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3362 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3363 			}
3364 
3365 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3366 
3367 			lock_yield_check = TRUE;
3368 			continue;
3369 		}
3370 
3371 		/*
3372 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3373 		 *	If already cleaning this page in place
3374 		 *	just leave it off the paging queues.
3375 		 *	We can leave the page mapped, and upl_commit_range
3376 		 *	will put it on the clean queue.
3377 		 *
3378 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3379 		 *	an msync INVALIDATE is in progress...
3380 		 *	this page has been marked for destruction
3381 		 *      after it has been cleaned,
3382 		 *      but not yet gathered into a UPL
3383 		 *	where 'cleaning' will be set...
3384 		 *	just leave it off the paging queues
3385 		 *
3386 		 *   if (m->vmp_free_when_done && m->vmp_cleaning)
3387 		 *	an msync INVALIDATE is in progress
3388 		 *	and the UPL has already gathered this page...
3389 		 *	just leave it off the paging queues
3390 		 */
3391 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3392 			lock_yield_check = TRUE;
3393 			continue;
3394 		}
3395 
3396 
3397 		/*
3398 		 *	If it's absent, in error or the object is no longer alive,
3399 		 *	we can reclaim the page... in the no longer alive case,
3400 		 *	there are 2 states the page can be in that preclude us
3401 		 *	from reclaiming it - busy or cleaning - that we've already
3402 		 *	dealt with
3403 		 */
3404 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3405 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3406 			if (m->vmp_absent) {
3407 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3408 			} else if (!object->alive ||
3409 			    (!object->internal &&
3410 			    object->pager == MEMORY_OBJECT_NULL)) {
3411 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3412 			} else {
3413 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3414 			}
3415 			if (m->vmp_pmapped) {
3416 				int refmod;
3417 
3418 				/*
3419 				 * If this page was file-backed and wired while its pager
3420 				 * was lost (during a forced unmount, for example), there
3421 				 * could still be some pmap mappings that need to be
3422 				 * cleaned up before we can free the page.
3423 				 */
3424 				refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3425 				if ((refmod & VM_MEM_MODIFIED) &&
3426 				    !m->vmp_dirty) {
3427 					SET_PAGE_DIRTY(m, FALSE);
3428 				}
3429 			}
3430 reclaim_page:
3431 			if (vm_pageout_deadlock_target) {
3432 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3433 				vm_pageout_deadlock_target--;
3434 			}
3435 
3436 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3437 
3438 			if (object->internal) {
3439 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3440 			} else {
3441 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3442 			}
3443 			assert(!m->vmp_cleaning);
3444 			assert(!m->vmp_laundry);
3445 
3446 			if (!object->internal &&
3447 			    object->pager != NULL &&
3448 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3449 				shared_region_pager_reclaimed++;
3450 			}
3451 
3452 			m->vmp_busy = TRUE;
3453 
3454 			/*
3455 			 * remove page from object here since we're already
3456 			 * behind the object lock... defer the rest of the work
3457 			 * we'd normally do in vm_page_free_prepare_object
3458 			 * until 'vm_page_free_list' is called
3459 			 */
3460 			if (m->vmp_tabled) {
3461 				vm_page_remove(m, TRUE);
3462 			}
3463 
3464 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3465 			m->vmp_snext = local_freeq;
3466 			local_freeq = m;
3467 			local_freed++;
3468 
3469 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3470 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3471 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3472 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3473 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3474 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3475 			} else {
3476 				vm_pageout_vminfo.vm_pageout_freed_external++;
3477 			}
3478 
3479 			inactive_burst_count = 0;
3480 
3481 			lock_yield_check = TRUE;
3482 			continue;
3483 		}
3484 		if (object->vo_copy == VM_OBJECT_NULL) {
3485 			/*
3486 			 * No one else can have any interest in this page.
3487 			 * If this is an empty purgable object, the page can be
3488 			 * reclaimed even if dirty.
3489 			 * If the page belongs to a volatile purgable object, we
3490 			 * reactivate it if the compressor isn't active.
3491 			 */
3492 			if (object->purgable == VM_PURGABLE_EMPTY) {
3493 				if (m->vmp_pmapped == TRUE) {
3494 					/* unmap the page */
3495 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3496 					if (refmod_state & VM_MEM_MODIFIED) {
3497 						SET_PAGE_DIRTY(m, FALSE);
3498 					}
3499 				}
3500 				if (m->vmp_dirty || m->vmp_precious) {
3501 					/* we saved the cost of cleaning this page ! */
3502 					vm_page_purged_count++;
3503 				}
3504 				goto reclaim_page;
3505 			}
3506 
3507 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3508 				/*
3509 				 * With the VM compressor, the cost of
3510 				 * reclaiming a page is much lower (no I/O),
3511 				 * so if we find a "volatile" page, it's better
3512 				 * to let it get compressed rather than letting
3513 				 * it occupy a full page until it gets purged.
3514 				 * So no need to check for "volatile" here.
3515 				 */
3516 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3517 				/*
3518 				 * Avoid cleaning a "volatile" page which might
3519 				 * be purged soon.
3520 				 */
3521 
3522 				/* if it's wired, we can't put it on our queue */
3523 				assert(!VM_PAGE_WIRED(m));
3524 
3525 				/* just stick it back on! */
3526 				reactivated_this_call++;
3527 
3528 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3529 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3530 				}
3531 
3532 				goto reactivate_page;
3533 			}
3534 		} /* vo_copy NULL */
3535 		/*
3536 		 *	If it's being used, reactivate.
3537 		 *	(Fictitious pages are either busy or absent.)
3538 		 *	First, update the reference and dirty bits
3539 		 *	to make sure the page is unreferenced.
3540 		 */
3541 		refmod_state = -1;
3542 
3543 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3544 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3545 
3546 			if (refmod_state & VM_MEM_REFERENCED) {
3547 				m->vmp_reference = TRUE;
3548 			}
3549 			if (refmod_state & VM_MEM_MODIFIED) {
3550 				SET_PAGE_DIRTY(m, FALSE);
3551 			}
3552 		}
3553 
3554 		if (m->vmp_reference || m->vmp_dirty) {
3555 			/* deal with a rogue "reusable" page */
3556 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3557 		}
3558 
3559 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3560 			vm_pageout_state.vm_page_xpmapped_min = 0;
3561 		} else {
3562 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3563 			    vm_pageout_state.vm_page_xpmapped_min_divisor;
3564 		}
3565 
3566 		if (!m->vmp_no_cache &&
3567 		    page_from_bg_q == FALSE &&
3568 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3569 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3570 			/*
3571 			 * The page we pulled off the inactive list has
3572 			 * been referenced.  It is possible for other
3573 			 * processors to be touching pages faster than we
3574 			 * can clear the referenced bit and traverse the
3575 			 * inactive queue, so we limit the number of
3576 			 * reactivations.
3577 			 */
3578 			if (++reactivated_this_call >= reactivate_limit &&
3579 			    !object->object_is_shared_cache &&
3580 			    !((m->vmp_realtime ||
3581 			    object->for_realtime) &&
3582 			    vm_pageout_protect_realtime)) {
3583 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3584 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3585 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3586 				if (object->object_is_shared_cache) {
3587 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3588 				} else if (m->vmp_realtime ||
3589 				    object->for_realtime) {
3590 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3591 				}
3592 			} else {
3593 				uint32_t isinuse;
3594 
3595 				if (reactivated_this_call >= reactivate_limit) {
3596 					if (object->object_is_shared_cache) {
3597 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3598 					} else if ((m->vmp_realtime ||
3599 					    object->for_realtime) &&
3600 					    vm_pageout_protect_realtime) {
3601 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3602 					}
3603 				}
3604 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3605 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3606 				}
3607 
3608 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3609 reactivate_page:
3610 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3611 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3612 					/*
3613 					 * no explicit mappings of this object exist
3614 					 * and it's not open via the filesystem
3615 					 */
3616 					vm_page_deactivate(m);
3617 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3618 				} else {
3619 					/*
3620 					 * The page was/is being used, so put back on active list.
3621 					 */
3622 					vm_page_activate(m);
3623 					counter_inc(&vm_statistics_reactivations);
3624 					inactive_burst_count = 0;
3625 				}
3626 #if DEVELOPMENT || DEBUG
3627 				if (page_from_bg_q == TRUE) {
3628 					if (m_object->internal) {
3629 						vm_pageout_rejected_bq_internal++;
3630 					} else {
3631 						vm_pageout_rejected_bq_external++;
3632 					}
3633 				}
3634 #endif /* DEVELOPMENT || DEBUG */
3635 
3636 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3637 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3638 				}
3639 				vm_pageout_state.vm_pageout_inactive_used++;
3640 
3641 				lock_yield_check = TRUE;
3642 				continue;
3643 			}
3644 			/*
3645 			 * Make sure we call pmap_get_refmod() if it
3646 			 * wasn't already called just above, to update
3647 			 * the dirty bit.
3648 			 */
3649 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3650 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3651 				if (refmod_state & VM_MEM_MODIFIED) {
3652 					SET_PAGE_DIRTY(m, FALSE);
3653 				}
3654 			}
3655 		}
3656 
3657 		/*
3658 		 * we've got a candidate page to steal...
3659 		 *
3660 		 * m->vmp_dirty is up to date courtesy of the
3661 		 * preceding check for m->vmp_reference... if
3662 		 * we get here, then m->vmp_reference had to be
3663 		 * FALSE (or possibly "reactivate_limit" was
3664 		 * exceeded), but in either case we called
3665 		 * pmap_get_refmod() and updated both
3666 		 * m->vmp_reference and m->vmp_dirty
3667 		 *
3668 		 * if it's dirty or precious we need to
3669 		 * see if the target queue is throttled
3670 		 * if it is, we need to skip over it by moving it back
3671 		 * to the end of the inactive queue
3672 		 */
3673 
3674 		inactive_throttled = FALSE;
3675 
3676 		if (m->vmp_dirty || m->vmp_precious) {
3677 			if (object->internal) {
3678 				if (VM_PAGE_Q_THROTTLED(iq)) {
3679 					inactive_throttled = TRUE;
3680 				}
3681 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3682 				inactive_throttled = TRUE;
3683 			}
3684 		}
3685 throttle_inactive:
3686 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3687 		    object->internal && m->vmp_dirty &&
3688 		    (object->purgable == VM_PURGABLE_DENY ||
3689 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3690 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3691 			vm_page_check_pageable_safe(m);
3692 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3693 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3694 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3695 			vm_page_throttled_count++;
3696 
3697 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3698 
3699 			inactive_burst_count = 0;
3700 
3701 			lock_yield_check = TRUE;
3702 			continue;
3703 		}
3704 		if (inactive_throttled == TRUE) {
3705 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3706 			    &force_anonymous, page_from_bg_q);
3707 
3708 			inactive_burst_count = 0;
3709 
3710 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3711 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3712 			}
3713 
3714 			lock_yield_check = TRUE;
3715 			continue;
3716 		}
3717 
3718 		/*
3719 		 * we've got a page that we can steal...
3720 		 * eliminate all mappings and make sure
3721 		 * we have the up-to-date modified state
3722 		 *
3723 		 * if we need to do a pmap_disconnect then we
3724 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3725 		 * provides the true state atomically... the
3726 		 * page was still mapped up to the pmap_disconnect
3727 		 * and may have been dirtied at the last microsecond
3728 		 *
3729 		 * Note that if 'pmapped' is FALSE then the page is not
3730 		 * and has not been in any map, so there is no point calling
3731 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3732 		 * of likely usage of the page.
3733 		 */
3734 		if (m->vmp_pmapped == TRUE) {
3735 			int pmap_options;
3736 
3737 			/*
3738 			 * Don't count this page as going into the compressor
3739 			 * if any of these are true:
3740 			 * 1) compressed pager isn't enabled
3741 			 * 2) Freezer enabled device with compressed pager
3742 			 *    backend (exclusive use) i.e. most of the VM system
3743 			 *    (including vm_pageout_scan) has no knowledge of
3744 			 *    the compressor
3745 			 * 3) This page belongs to a file and hence will not be
3746 			 *    sent into the compressor
3747 			 */
3748 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3749 			    object->internal == FALSE) {
3750 				pmap_options = 0;
3751 			} else if (m->vmp_dirty || m->vmp_precious) {
3752 				/*
3753 				 * VM knows that this page is dirty (or
3754 				 * precious) and needs to be compressed
3755 				 * rather than freed.
3756 				 * Tell the pmap layer to count this page
3757 				 * as "compressed".
3758 				 */
3759 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3760 			} else {
3761 				/*
3762 				 * VM does not know if the page needs to
3763 				 * be preserved but the pmap layer might tell
3764 				 * us if any mapping has "modified" it.
3765 				 * Let the pmap layer count this page
3766 				 * as compressed if and only if it has been
3767 				 * modified.
3768 				 */
3769 				pmap_options =
3770 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3771 			}
3772 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3773 			    pmap_options,
3774 			    NULL);
3775 			if (refmod_state & VM_MEM_MODIFIED) {
3776 				SET_PAGE_DIRTY(m, FALSE);
3777 			}
3778 		}
3779 
3780 		/*
3781 		 * reset our count of pages that have been reclaimed
3782 		 * since the last page was 'stolen'
3783 		 */
3784 		inactive_reclaim_run = 0;
3785 
3786 		/*
3787 		 *	If it's clean and not precious, we can free the page.
3788 		 */
3789 		if (!m->vmp_dirty && !m->vmp_precious) {
3790 			vm_pageout_state.vm_pageout_inactive_clean++;
3791 
3792 			/*
3793 			 * OK, at this point we have found a page we are going to free.
3794 			 */
3795 #if CONFIG_PHANTOM_CACHE
3796 			if (!object->internal) {
3797 				vm_phantom_cache_add_ghost(m);
3798 			}
3799 #endif
3800 			goto reclaim_page;
3801 		}
3802 
3803 		/*
3804 		 * The page may have been dirtied since the last check
3805 		 * for a throttled target queue (which may have been skipped
3806 		 * if the page was clean then).  With the dirty page
3807 		 * disconnected here, we can make one final check.
3808 		 */
3809 		if (object->internal) {
3810 			if (VM_PAGE_Q_THROTTLED(iq)) {
3811 				inactive_throttled = TRUE;
3812 			}
3813 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3814 			inactive_throttled = TRUE;
3815 		}
3816 
3817 		if (inactive_throttled == TRUE) {
3818 			goto throttle_inactive;
3819 		}
3820 #if !CONFIG_JETSAM
3821 		memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3822 #endif /* !CONFIG_JETSAM */
3823 
3824 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3825 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3826 		}
3827 
3828 		if (object->internal) {
3829 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3830 		} else {
3831 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3832 		}
3833 
3834 		/*
3835 		 * internal pages will go to the compressor...
3836 		 * external pages will go to the appropriate pager to be cleaned
3837 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3838 		 * is a preferred queue to steal from
3839 		 */
3840 		vm_pageout_cluster(m);
3841 		inactive_burst_count = 0;
3842 
3843 		/*
3844 		 * back to top of pageout scan loop
3845 		 */
3846 	}
3847 }
3848 
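/*
 * Illustrative sketch (not part of the original file, not compiled): the
 * "inactive_throttled" tests above reduce to comparing a pageout queue's
 * outstanding laundry against its maximum, which is what VM_PAGE_Q_THROTTLED()
 * is assumed to express.  The hypothetical helper below shows that shape for
 * a queue such as vm_pageout_queue_internal or vm_pageout_queue_external.
 */
#if 0
static inline boolean_t
example_pageout_queue_is_throttled(struct vm_pageout_queue *q)
{
	/* too many pages already handed to this pager to queue another one */
	return (q->pgo_laundry >= q->pgo_maxlaundry) ? TRUE : FALSE;
}
#endif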
3849 
3850 void
3851 vm_page_free_reserve(
3852 	int pages)
3853 {
3854 	int             free_after_reserve;
3855 
3856 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3857 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3858 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3859 		} else {
3860 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3861 		}
3862 	} else {
3863 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3864 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3865 		} else {
3866 			vm_page_free_reserved += pages;
3867 		}
3868 	}
3869 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3870 
3871 	vm_page_free_min = vm_page_free_reserved +
3872 	    VM_PAGE_FREE_MIN(free_after_reserve);
3873 
3874 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3875 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3876 	}
3877 
3878 	vm_page_free_target = vm_page_free_reserved +
3879 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3880 
3881 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3882 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3883 	}
3884 
3885 	if (vm_page_free_target < vm_page_free_min + 5) {
3886 		vm_page_free_target = vm_page_free_min + 5;
3887 	}
3888 
3889 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3890 }
3891 
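/*
 * Illustrative sketch (not part of the original file, not compiled): after
 * vm_page_free_reserve() above runs, the free-page target is kept at least
 * 5 pages above the minimum and the throttle limit sits at half of the
 * target.  A hypothetical sanity check for those two relationships:
 */
#if 0
static void
example_check_free_thresholds(void)
{
	assert(vm_page_free_target >= vm_page_free_min + 5);
	assert(vm_page_throttle_limit == vm_page_free_target - (vm_page_free_target / 2));
}
#endif
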
3892 /*
3893  *	vm_pageout is the high level pageout daemon.
3894  */
3895 
3896 void
3897 vm_pageout_continue(void)
3898 {
3899 	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3900 	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3901 
3902 	vm_free_page_lock();
3903 	vm_pageout_running = TRUE;
3904 	vm_free_page_unlock();
3905 
3906 	vm_pageout_scan();
3907 	/*
3908 	 * we hold both the vm_page_queue_free_lock
3909 	 * and the vm_page_queues_lock at this point
3910 	 */
3911 	assert(vm_page_free_wanted == 0);
3912 	assert(vm_page_free_wanted_privileged == 0);
3913 	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3914 
3915 	vm_pageout_running = FALSE;
3916 #if XNU_TARGET_OS_OSX
3917 	if (vm_pageout_waiter) {
3918 		vm_pageout_waiter = FALSE;
3919 		thread_wakeup((event_t)&vm_pageout_waiter);
3920 	}
3921 #endif /* XNU_TARGET_OS_OSX */
3922 
3923 	vm_free_page_unlock();
3924 	vm_page_unlock_queues();
3925 
3926 	thread_block((thread_continue_t)vm_pageout_continue);
3927 	/*NOTREACHED*/
3928 }
3929 
3930 #if XNU_TARGET_OS_OSX
3931 kern_return_t
3932 vm_pageout_wait(uint64_t deadline)
3933 {
3934 	kern_return_t kr;
3935 
3936 	vm_free_page_lock();
3937 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3938 		vm_pageout_waiter = TRUE;
3939 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3940 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3941 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3942 			kr = KERN_OPERATION_TIMED_OUT;
3943 		}
3944 	}
3945 	vm_free_page_unlock();
3946 
3947 	return kr;
3948 }
3949 #endif /* XNU_TARGET_OS_OSX */
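
/*
 * Illustrative sketch (not part of the original file, not compiled): a macOS
 * caller of vm_pageout_wait() would typically build an absolute deadline and
 * treat KERN_OPERATION_TIMED_OUT as "the daemon is still running".  The
 * helper name and the one-second interval are hypothetical.
 */
#if 0
static kern_return_t
example_wait_for_pageout_idle(void)
{
	uint64_t deadline;

	/* one second from now, expressed as an absolute-time deadline */
	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);

	return vm_pageout_wait(deadline);
}
#endif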
3950 
3951 OS_NORETURN
3952 static void
3953 vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3954 {
3955 	vm_page_t       m = NULL;
3956 	vm_object_t     object;
3957 	vm_object_offset_t offset;
3958 	memory_object_t pager;
3959 	struct vm_pageout_queue *q = ethr->q;
3960 
3961 	/* On systems with a compressor, the external IO thread clears its
3962 	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3963 	 * creation)
3964 	 */
3965 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3966 		current_thread()->options &= ~TH_OPT_VMPRIV;
3967 	}
3968 
3969 	sched_cond_ack(&(ethr->pgo_wakeup));
3970 
3971 	while (true) {
3972 		vm_page_lockspin_queues();
3973 
3974 		while (!vm_page_queue_empty(&q->pgo_pending)) {
3975 			q->pgo_busy = TRUE;
3976 			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3977 
3978 			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3979 			VM_PAGE_CHECK(m);
3980 			/*
3981 			 * grab a snapshot of the object and offset this
3982 			 * page is tabled in so that we can relookup this
3983 			 * page after we've taken the object lock - these
3984 			 * fields are stable while we hold the page queues lock
3985 			 * but as soon as we drop it, there is nothing to keep
3986 			 * this page in this object... we hold an activity_in_progress
3987 			 * on this object which will keep it from terminating
3988 			 */
3989 			object = VM_PAGE_OBJECT(m);
3990 			offset = m->vmp_offset;
3991 
3992 			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3993 			VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3994 
3995 			vm_page_unlock_queues();
3996 
3997 			vm_object_lock(object);
3998 
3999 			m = vm_page_lookup(object, offset);
4000 
4001 			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
4002 			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
4003 				/*
4004 				 * it's either the same page that someone else has
4005 				 * started cleaning (or it's finished cleaning or
4006 				 * been put back on the pageout queue), or
4007 				 * the page has been freed or we have found a
4008 				 * new page at this offset... in all of these cases
4009 				 * we merely need to release the activity_in_progress
4010 				 * we took when we put the page on the pageout queue
4011 				 */
4012 				vm_object_activity_end(object);
4013 				vm_object_unlock(object);
4014 
4015 				vm_page_lockspin_queues();
4016 				continue;
4017 			}
4018 			pager = object->pager;
4019 
4020 			if (pager == MEMORY_OBJECT_NULL) {
4021 				/*
4022 				 * This pager has been destroyed by either
4023 				 * memory_object_destroy or vm_object_destroy, and
4024 				 * so there is nowhere for the page to go.
4025 				 */
4026 				if (m->vmp_free_when_done) {
4027 					/*
4028 					 * Just free the page... VM_PAGE_FREE takes
4029 					 * care of cleaning up all the state...
4030 					 * including doing the vm_pageout_throttle_up
4031 					 */
4032 					VM_PAGE_FREE(m);
4033 				} else {
4034 					vm_page_lockspin_queues();
4035 
4036 					vm_pageout_throttle_up(m);
4037 					vm_page_activate(m);
4038 
4039 					vm_page_unlock_queues();
4040 
4041 					/*
4042 					 *	And we are done with it.
4043 					 */
4044 				}
4045 				vm_object_activity_end(object);
4046 				vm_object_unlock(object);
4047 
4048 				vm_page_lockspin_queues();
4049 				continue;
4050 			}
4051 	#if 0
4052 			/*
4053 			 * we don't hold the page queue lock
4054 			 * so this check isn't safe to make
4055 			 */
4056 			VM_PAGE_CHECK(m);
4057 	#endif
4058 			/*
4059 			 * give back the activity_in_progress reference we
4060 			 * took when we queued up this page and replace it
4061 			 * with a paging_in_progress reference that will
4062 			 * also keep the paging offset from changing and
4063 			 * prevent the object from terminating
4064 			 */
4065 			vm_object_activity_end(object);
4066 			vm_object_paging_begin(object);
4067 			vm_object_unlock(object);
4068 
4069 			/*
4070 			 * Send the data to the pager.
4071 			 * any pageout clustering happens there
4072 			 */
4073 			memory_object_data_return(pager,
4074 			    m->vmp_offset + object->paging_offset,
4075 			    PAGE_SIZE,
4076 			    NULL,
4077 			    NULL,
4078 			    FALSE,
4079 			    FALSE,
4080 			    0);
4081 
4082 			vm_object_lock(object);
4083 			vm_object_paging_end(object);
4084 			vm_object_unlock(object);
4085 
4086 			vm_pageout_io_throttle();
4087 
4088 			vm_page_lockspin_queues();
4089 		}
4090 		q->pgo_busy = FALSE;
4091 
4092 		vm_page_unlock_queues();
4093 		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4094 	}
4095 	/*NOTREACHED*/
4096 }
4097 
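/*
 * Illustrative sketch (not part of the original file, not compiled): the
 * snapshot-and-relookup pattern used in the loop above.  (object, offset) are
 * only stable while the page queues lock is held, so once the object lock is
 * taken the page must be looked up again and revalidated before use.  The
 * helper name is hypothetical.
 */
#if 0
static vm_page_t
example_relookup_page(vm_object_t object, vm_object_offset_t offset)
{
	vm_page_t p;

	vm_object_lock(object);

	p = vm_page_lookup(object, offset);
	if (p == VM_PAGE_NULL || p->vmp_busy || p->vmp_cleaning) {
		/* freed, replaced, or claimed by someone else... caller must bail */
		vm_object_unlock(object);
		return VM_PAGE_NULL;
	}
	return p;       /* returned with the object lock still held */
}
#endif
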
4098 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4099 
4100 #if DEVELOPMENT || DEBUG
4101 static void
4102 vm_pageout_record_thread_time(int cqid, int ncomps)
4103 {
4104 	if (__improbable(vm_compressor_time_thread)) {
4105 		vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4106 		vmct_stats.vmct_pages[cqid] += ncomps;
4107 		vmct_stats.vmct_iterations[cqid]++;
4108 		if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4109 			vmct_stats.vmct_maxpages[cqid] = ncomps;
4110 		}
4111 		if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4112 			vmct_stats.vmct_minpages[cqid] = ncomps;
4113 		}
4114 	}
4115 }
4116 #endif
4117 
4118 static void *
4119 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4120 {
4121 	/*
4122 	 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4123 	 * However, this page has been removed from all queues and is only
4124 	 * known to this compressor thread dealing with this local queue.
4125 	 *
4126 	 * TODO: Add a second localq that is the early localq and
4127 	 * put special pages like this one on that queue in the block above
4128 	 * under the pageq lock to avoid this 'works but not clean' logic.
4129 	 */
4130 	void *donate_queue_head;
4131 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4132 	donate_queue_head = &cq->current_early_swapout_chead;
4133 #else /* XNU_TARGET_OS_OSX */
4134 	donate_queue_head = &cq->current_late_swapout_chead;
4135 #endif /* XNU_TARGET_OS_OSX */
4136 	if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4137 		m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4138 		return donate_queue_head;
4139 	} else {
4140 		return &cq->current_regular_swapout_chead;
4141 	}
4142 }
4143 
4144 #define         MAX_FREE_BATCH          32
4145 
4146 OS_NORETURN
4147 static void
4148 vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4149 {
4150 	struct vm_pageout_queue *q;
4151 	vm_page_t       m = NULL;
4152 	boolean_t       pgo_draining;
4153 	vm_page_t   local_q;
4154 	int         local_cnt;
4155 	vm_page_t   local_freeq = NULL;
4156 	int         local_freed = 0;
4157 	int         local_batch_size;
4158 #if DEVELOPMENT || DEBUG
4159 	int       ncomps = 0;
4160 	boolean_t marked_active = FALSE;
4161 	int       num_pages_processed = 0;
4162 #endif
4163 	void *chead = NULL;
4164 
4165 	KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
4166 
4167 	sched_cond_ack(&(cq->pgo_wakeup));
4168 
4169 	q = cq->q;
4170 
4171 	while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
4172 #if DEVELOPMENT || DEBUG
4173 		bool benchmark_accounting = false;
4174 		/* If we're running the compressor perf test, only process the benchmark pages.
4175 		 * We'll get back to our regular queue once the benchmark is done */
4176 		if (compressor_running_perf_test) {
4177 			q = cq->benchmark_q;
4178 			if (!vm_page_queue_empty(&q->pgo_pending)) {
4179 				benchmark_accounting = true;
4180 			} else {
4181 				q = cq->q;
4182 				benchmark_accounting = false;
4183 			}
4184 		}
4185 #endif /* DEVELOPMENT || DEBUG */
4186 
4187 #if __AMP__
4188 		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4189 			local_batch_size = (q->pgo_maxlaundry >> 3);
4190 			local_batch_size = MAX(local_batch_size, 16);
4191 		} else {
4192 			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4193 		}
4194 #else
4195 		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4196 #endif
4197 
4198 #if RECORD_THE_COMPRESSED_DATA
4199 		if (q->pgo_laundry) {
4200 			c_compressed_record_init();
4201 		}
4202 #endif
4203 		while (true) { /* this loop is for working through all the pages in the pending queue */
4204 			int     pages_left_on_q = 0;
4205 
4206 			local_cnt = 0;
4207 			local_q = NULL;
4208 
4209 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
4210 
4211 			vm_page_lock_queues();
4212 #if DEVELOPMENT || DEBUG
4213 			if (marked_active == FALSE) {
4214 				vmct_active++;
4215 				vmct_state[cq->id] = VMCT_ACTIVE;
4216 				marked_active = TRUE;
4217 				if (vmct_active == 1) {
4218 					vm_compressor_epoch_start = mach_absolute_time();
4219 				}
4220 			}
4221 #endif
4222 			KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
4223 
4224 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
4225 
4226 			/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
4227 			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4228 				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4229 				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4230 				VM_PAGE_CHECK(m);
4231 
4232 				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4233 				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4234 				m->vmp_laundry = FALSE;
4235 
4236 				m->vmp_snext = local_q;
4237 				local_q = m;
4238 				local_cnt++;
4239 			}
4240 			if (local_q == NULL) {
4241 				break;
4242 			}
4243 
4244 			q->pgo_busy = TRUE;
4245 
4246 			if ((pgo_draining = q->pgo_draining) == FALSE) {
4247 				vm_pageout_throttle_up_batch(q, local_cnt);
4248 				pages_left_on_q = q->pgo_laundry;
4249 			} else {
4250 				pages_left_on_q = q->pgo_laundry - local_cnt;
4251 			}
4252 
4253 			vm_page_unlock_queues();
4254 
4255 #if !RECORD_THE_COMPRESSED_DATA
4256 			/* if we have lots to compress, wake up the other thread to help.
4257 			 * disabled when recording data since the recorded data is not protected by a mutex, which could cause races */
4258 			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4259 				// wake up the next compressor thread
4260 				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4261 				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4262 			}
4263 #endif
4264 			KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
4265 
4266 			while (local_q) {
4267 				KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
4268 
4269 				m = local_q;
4270 				local_q = m->vmp_snext;
4271 				m->vmp_snext = NULL;
4272 
4273 
4274 				chead = vm_pageout_select_filling_chead(cq, m);
4275 
4276 				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4277 #if DEVELOPMENT || DEBUG
4278 					ncomps++;
4279 #endif
4280 					KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
4281 
4282 					m->vmp_snext = local_freeq;
4283 					local_freeq = m;
4284 					local_freed++;
4285 
4286 					/* if we gathered enough free pages, free them now */
4287 					if (local_freed >= MAX_FREE_BATCH) {
4288 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4289 
4290 						vm_page_free_list(local_freeq, TRUE);
4291 
4292 						local_freeq = NULL;
4293 						local_freed = 0;
4294 					}
4295 				}
4296 #if DEVELOPMENT || DEBUG
4297 				num_pages_processed++;
4298 #endif /* DEVELOPMENT || DEBUG */
4299 #if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
4300 				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4301 					kern_return_t   wait_result;
4302 					int             need_wakeup = 0;
4303 
4304 					if (local_freeq) {
4305 						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4306 
4307 						vm_page_free_list(local_freeq, TRUE);
4308 						local_freeq = NULL;
4309 						local_freed = 0;
4310 
4311 						continue;
4312 					}
4313 					vm_free_page_lock_spin();
4314 
4315 					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4316 						if (vm_page_free_wanted_privileged++ == 0) {
4317 							need_wakeup = 1;
4318 						}
4319 						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4320 
4321 						vm_free_page_unlock();
4322 
4323 						if (need_wakeup) {
4324 							thread_wakeup((event_t)&vm_page_free_wanted);
4325 						}
4326 
4327 						if (wait_result == THREAD_WAITING) {
4328 							thread_block(THREAD_CONTINUE_NULL);
4329 						}
4330 					} else {
4331 						vm_free_page_unlock();
4332 					}
4333 				}
4334 #endif
4335 			}  /* while (local_q) */
4336 			/* free any leftovers in the freeq */
4337 			if (local_freeq) {
4338 				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4339 
4340 				vm_page_free_list(local_freeq, TRUE);
4341 				local_freeq = NULL;
4342 				local_freed = 0;
4343 			}
4344 			if (pgo_draining == TRUE) {
4345 				vm_page_lockspin_queues();
4346 				vm_pageout_throttle_up_batch(q, local_cnt);
4347 				vm_page_unlock_queues();
4348 			}
4349 		}
4350 		KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
4351 
4352 		/*
4353 		 * queue lock is held and our q is empty
4354 		 */
4355 		q->pgo_busy = FALSE;
4356 #if DEVELOPMENT || DEBUG
4357 		if (marked_active == TRUE) {
4358 			vmct_active--;
4359 			vmct_state[cq->id] = VMCT_IDLE;
4360 
4361 			if (vmct_active == 0) {
4362 				vm_compressor_epoch_stop = mach_absolute_time();
4363 				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4364 				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4365 				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
4366 				/* This interval includes intervals where one or more
4367 				 * compressor threads were pre-empted
4368 				 */
4369 				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4370 			}
4371 		}
4372 		if (compressor_running_perf_test && benchmark_accounting) {
4373 			/*
4374 			 * We could turn ON compressor_running_perf_test while still processing
4375 			 * regular non-benchmark pages. We shouldn't count them here else we
4376 			 * could overshoot. We might also still be populating that benchmark Q
4377 			 * and be under pressure. So we will go back to the regular queues. And
4378 			 * benchmark accounting will be off for that case too.
4379 			 */
4380 			compressor_perf_test_pages_processed += num_pages_processed;
4381 			thread_wakeup(&compressor_perf_test_pages_processed);
4382 		}
4383 #endif
4384 		vm_page_unlock_queues();
4385 #if DEVELOPMENT || DEBUG
4386 		vm_pageout_record_thread_time(cq->id, ncomps);
4387 #endif
4388 
4389 		KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
4390 #if DEVELOPMENT || DEBUG
4391 		if (compressor_running_perf_test && benchmark_accounting) {
4392 			/*
4393 			 * We've been exclusively compressing pages from the benchmark queue,
4394 			 * do 1 pass over the internal queue before blocking.
4395 			 */
4396 			continue;
4397 		}
4398 #endif
4399 
4400 		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4401 	}
4402 	/*NOTREACHED*/
4403 }
4404 
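/*
 * Illustrative sketch (not part of the original file, not compiled): the
 * batched-free pattern used above.  Compressed pages are chained locally
 * through vmp_snext and handed to vm_page_free_list() in groups (up to
 * MAX_FREE_BATCH) so the free-list locks are taken once per batch rather
 * than once per page.  The helper name is hypothetical.
 */
#if 0
static void
example_flush_local_freeq(vm_page_t *freeq, int *freed)
{
	if (*freeq != VM_PAGE_NULL) {
		vm_page_free_list(*freeq, TRUE);
		*freeq = VM_PAGE_NULL;
		*freed = 0;
	}
}
#endif
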
4405 /* resolves the pager and maintains stats in the pager and in the vm_object */
4406 kern_return_t
4407 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4408 {
4409 	vm_object_t     object;
4410 	memory_object_t pager;
4411 	int             compressed_count_delta;
4412 	kern_return_t   retval;
4413 
4414 	object = VM_PAGE_OBJECT(m);
4415 
4416 	assert(!m->vmp_free_when_done);
4417 	assert(!m->vmp_laundry);
4418 
4419 	pager = object->pager;
4420 
4421 	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4422 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
4423 
4424 		vm_object_lock(object);
4425 
4426 		/*
4427 		 * If there is no memory object for the page, create
4428 		 * one and hand it to the compression pager.
4429 		 */
4430 
4431 		if (!object->pager_initialized) {
4432 			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4433 		}
4434 		if (!object->pager_initialized) {
4435 			vm_object_compressor_pager_create(object);
4436 		}
4437 
4438 		pager = object->pager;
4439 
4440 		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4441 			/*
4442 			 * Still no pager for the object,
4443 			 * or the pager has been destroyed.
4444 			 * Reactivate the page.
4445 			 *
4446 			 * Should only happen if there is no
4447 			 * compression pager
4448 			 */
4449 			vm_page_wakeup_done(object, m);
4450 
4451 			vm_page_lockspin_queues();
4452 			vm_page_activate(m);
4453 			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4454 			vm_page_unlock_queues();
4455 
4456 			/*
4457 			 *	And we are done with it.
4458 			 */
4459 			vm_object_activity_end(object);
4460 			vm_object_unlock(object);
4461 
4462 			return KERN_FAILURE;
4463 		}
4464 		vm_object_unlock(object);
4465 
4466 		KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
4467 	}
4468 	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4469 	assert(object->activity_in_progress > 0);
4470 
4471 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4472 	if (m->vmp_unmodified_ro == true) {
4473 		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4474 	}
4475 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4476 
4477 	vm_compressor_options_t flags = 0;
4478 
4479 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4480 	if (m->vmp_unmodified_ro) {
4481 		flags |= C_PAGE_UNMODIFIED;
4482 	}
4483 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4484 
4485 
4486 	retval = vm_compressor_pager_put(
4487 		pager,
4488 		m->vmp_offset + object->paging_offset,
4489 		VM_PAGE_GET_PHYS_PAGE(m),
4490 		current_chead,
4491 		scratch_buf,
4492 		&compressed_count_delta,
4493 		flags);
4494 
4495 	vm_object_lock(object);
4496 
4497 	assert(object->activity_in_progress > 0);
4498 	assert(VM_PAGE_OBJECT(m) == object);
4499 	assert( !VM_PAGE_WIRED(m));
4500 
4501 	vm_compressor_pager_count(pager,
4502 	    compressed_count_delta,
4503 	    FALSE,                       /* shared_lock */
4504 	    object);
4505 
4506 	if (retval == KERN_SUCCESS) {
4507 		/*
4508 		 * If the object is purgeable, its owner's
4509 		 * purgeable ledgers will be updated in
4510 		 * vm_page_remove() but the page still
4511 		 * contributes to the owner's memory footprint,
4512 		 * so account for it as such.
4513 		 */
4514 		if (m->vmp_tabled) {
4515 			vm_page_remove(m, TRUE);
4516 		}
4517 		if ((object->purgable != VM_PURGABLE_DENY ||
4518 		    object->vo_ledger_tag) &&
4519 		    object->vo_owner != NULL) {
4520 			/* one more compressed purgeable/tagged page */
4521 			vm_object_owner_compressed_update(object,
4522 			    compressed_count_delta);
4523 		}
4524 		counter_inc(&vm_statistics_compressions);
4525 	} else {
4526 		vm_page_wakeup_done(object, m);
4527 
4528 		vm_page_lockspin_queues();
4529 
4530 		vm_page_activate(m);
4531 		vm_pageout_vminfo.vm_compressor_failed++;
4532 
4533 		vm_page_unlock_queues();
4534 	}
4535 	vm_object_activity_end(object);
4536 	vm_object_unlock(object);
4537 
4538 	return retval;
4539 }
4540 
4541 
4542 static void
4543 vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4544 {
4545 	uint32_t        policy;
4546 
4547 	if (hibernate_cleaning_in_progress == TRUE) {
4548 		req_lowpriority = FALSE;
4549 	}
4550 
4551 	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4552 		vm_page_unlock_queues();
4553 
4554 		if (req_lowpriority == TRUE) {
4555 			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4556 			DTRACE_VM(laundrythrottle);
4557 		} else {
4558 			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4559 			DTRACE_VM(laundryunthrottle);
4560 		}
4561 		proc_set_thread_policy(ethr->pgo_iothread,
4562 		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4563 
4564 		vm_page_lock_queues();
4565 		ethr->q->pgo_lowpriority = req_lowpriority;
4566 	}
4567 }
4568 
4569 OS_NORETURN
4570 static void
4571 vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4572 {
4573 	thread_t        self = current_thread();
4574 
4575 	self->options |= TH_OPT_VMPRIV;
4576 
4577 	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4578 
4579 	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4580 	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4581 
4582 	vm_page_lock_queues();
4583 
4584 	vm_pageout_queue_external.pgo_lowpriority = TRUE;
4585 	vm_pageout_queue_external.pgo_inited = TRUE;
4586 
4587 	vm_page_unlock_queues();
4588 
4589 #if CONFIG_THREAD_GROUPS
4590 	thread_group_vm_add();
4591 #endif /* CONFIG_THREAD_GROUPS */
4592 
4593 	vm_pageout_iothread_external_continue(ethr, 0);
4594 	/*NOTREACHED*/
4595 }
4596 
4597 
4598 OS_NORETURN
4599 static void
4600 vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4601 {
4602 	thread_t        self = current_thread();
4603 
4604 	self->options |= TH_OPT_VMPRIV;
4605 
4606 	vm_page_lock_queues();
4607 
4608 	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4609 	vm_pageout_queue_internal.pgo_inited = TRUE;
4610 
4611 #if DEVELOPMENT || DEBUG
4612 	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4613 	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4614 	vm_pageout_queue_benchmark.pgo_busy = FALSE;
4615 #endif /* DEVELOPMENT || DEBUG */
4616 
4617 	vm_page_unlock_queues();
4618 
4619 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4620 		thread_vm_bind_group_add();
4621 	}
4622 
4623 #if CONFIG_THREAD_GROUPS
4624 	thread_group_vm_add();
4625 #endif /* CONFIG_THREAD_GROUPS */
4626 
4627 #if __AMP__
4628 	if (vm_compressor_ebound) {
4629 		/*
4630 		 * Use the soft bound option for vm_compressor to allow it to run on
4631 		 * P-cores if E-cluster is unavailable.
4632 		 */
4633 		thread_soft_bind_cluster_type(self, 'E');
4634 	}
4635 #endif /* __AMP__ */
4636 
4637 	thread_set_thread_name(current_thread(), "VM_compressor");
4638 #if DEVELOPMENT || DEBUG
4639 	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4640 #endif
4641 	vm_pageout_iothread_internal_continue(cthr, 0);
4642 
4643 	/*NOTREACHED*/
4644 }
4645 
4646 kern_return_t
4647 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4648 {
4649 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4650 		return KERN_SUCCESS;
4651 	} else {
4652 		return KERN_FAILURE; /* Already set */
4653 	}
4654 }
4655 
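/*
 * Illustrative sketch (not part of the original file, not compiled): a buffer
 * cache layer would register its reclaim hook once at startup; a second call
 * fails because the callout pointer above is installed with a one-shot
 * compare-and-swap.  'example_buffer_cache_collect' is hypothetical.
 */
#if 0
static boolean_t example_buffer_cache_collect(int);

static void
example_register_buffer_cleanup(void)
{
	if (vm_set_buffer_cleanup_callout(example_buffer_cache_collect) != KERN_SUCCESS) {
		/* KERN_FAILURE: a callout was already registered */
	}
}
#endif
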
4656 extern boolean_t        memorystatus_manual_testing_on;
4657 extern unsigned int     memorystatus_level;
4658 
4659 
4660 #if VM_PRESSURE_EVENTS
4661 
4662 boolean_t vm_pressure_events_enabled = FALSE;
4663 
4664 extern uint64_t next_warning_notification_sent_at_ts;
4665 extern uint64_t next_critical_notification_sent_at_ts;
4666 
4667 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4668 
4669 /*
4670  * The last time there was a change in pressure level OR we forced a check
4671  * because the system is stuck in a non-normal pressure level.
4672  */
4673 uint64_t  vm_pressure_last_level_transition_abs = 0;
4674 
4675 /*
4676  *  This is how long the system waits 'stuck' in an unchanged non-normal pressure
4677  * level before resending notifications for that level again.
4678  */
4679 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4680 
4681 void
4682 vm_pressure_response(void)
4683 {
4684 	vm_pressure_level_t     old_level = kVMPressureNormal;
4685 	int                     new_level = -1;
4686 	unsigned int            total_pages;
4687 	uint64_t                available_memory = 0;
4688 	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
4689 	bool                    force_check = false;
4690 	int                     time_in_mins;
4691 
4692 
4693 	if (vm_pressure_events_enabled == FALSE) {
4694 		return;
4695 	}
4696 
4697 	available_memory = (uint64_t) memorystatus_get_available_page_count();
4698 
4699 	total_pages = (unsigned int) atop_64(max_mem);
4700 #if CONFIG_SECLUDED_MEMORY
4701 	total_pages -= vm_page_secluded_count;
4702 #endif /* CONFIG_SECLUDED_MEMORY */
4703 	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4704 
4705 	if (memorystatus_manual_testing_on) {
4706 		return;
4707 	}
4708 
4709 	curr_ts = mach_absolute_time();
4710 	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4711 
4712 	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4713 	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4714 	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4715 
4716 	old_level = memorystatus_vm_pressure_level;
4717 
4718 	switch (memorystatus_vm_pressure_level) {
4719 	case kVMPressureNormal:
4720 	{
4721 		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4722 			new_level = kVMPressureCritical;
4723 		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4724 			new_level = kVMPressureWarning;
4725 		}
4726 		break;
4727 	}
4728 
4729 	case kVMPressureWarning:
4730 	case kVMPressureUrgent:
4731 	{
4732 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4733 			new_level = kVMPressureNormal;
4734 		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4735 			new_level = kVMPressureCritical;
4736 		} else if (force_check) {
4737 			new_level = kVMPressureWarning;
4738 			next_warning_notification_sent_at_ts = curr_ts;
4739 		}
4740 		break;
4741 	}
4742 
4743 	case kVMPressureCritical:
4744 	{
4745 		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4746 			new_level = kVMPressureNormal;
4747 		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4748 			new_level = kVMPressureWarning;
4749 		} else if (force_check) {
4750 			new_level = kVMPressureCritical;
4751 			next_critical_notification_sent_at_ts = curr_ts;
4752 		}
4753 		break;
4754 	}
4755 
4756 	default:
4757 		return;
4758 	}
4759 
4760 	if (new_level != -1 || force_check) {
4761 		if (new_level != -1) {
4762 			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4763 
4764 			if (new_level != (int) old_level) {
4765 				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4766 				    new_level, old_level, 0, 0);
4767 			}
4768 		} else {
4769 			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4770 			    new_level, old_level, force_check, 0);
4771 		}
4772 
4773 		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4774 			/*
4775 			 * We don't want to schedule a wakeup while hibernation is in progress
4776 			 * because that could collide with checks for non-monotonicity in the scheduler.
4777 			 * We do however do all the updates to memorystatus_vm_pressure_level because
4778 			 * we _might_ want to use that for decisions regarding which pages or how
4779 			 * many pages we want to dump in hibernation.
4780 			 */
4781 			return;
4782 		}
4783 
4784 		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4785 			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4786 				thread_wakeup(&vm_pressure_thread);
4787 			}
4788 
4789 			if (old_level != memorystatus_vm_pressure_level) {
4790 				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4791 			}
4792 			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4793 		}
4794 	}
4795 }
4796 #endif /* VM_PRESSURE_EVENTS */
4797 
4798 
4799 /**
4800  * Called by a kernel thread to ask if a number of pages may be wired.
4801  */
4802 kern_return_t
4803 mach_vm_wire_level_monitor(int64_t requested_pages)
4804 {
4805 	if (requested_pages <= 0) {
4806 		return KERN_INVALID_ARGUMENT;
4807 	}
4808 
4809 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4810 	/**
4811 	 * Available pages can be negative in the case where more system memory is
4812 	 * wired than the threshold, so we must use a signed integer.
4813 	 */
4814 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4815 
4816 	if (requested_pages > available_pages) {
4817 		return KERN_RESOURCE_SHORTAGE;
4818 	}
4819 	return KERN_SUCCESS;
4820 }
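/*
 * Hypothetical usage sketch (not an existing caller): a subsystem about to
 * wire "npages" pages could probe for headroom first and back off instead of
 * pushing the system past its global user-wire limit:
 *
 *	if (mach_vm_wire_level_monitor(npages) != KERN_SUCCESS) {
 *		return KERN_RESOURCE_SHORTAGE;
 *	}
 */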
4821 
4822 /*
4823  * Function called by a kernel thread to either get the current pressure level or
4824  * wait until memory pressure changes from a given level.
4825  */
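/*
 * Two wait modes are supported when wait_for_pressure is TRUE:
 *  - kVMPressureForegroundJetsam / kVMPressureBackgroundJetsam: block on the
 *    corresponding jetsam-band broadcast event and return once it fires.
 *  - kVMPressureNormal/Warning/Urgent/Critical: block until
 *    memorystatus_vm_pressure_level moves away from the given level.
 * For level waits (and for the non-waiting case), the current level is
 * returned through *pressure_level.
 */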
4826 kern_return_t
4827 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4828 {
4829 #if !VM_PRESSURE_EVENTS
4830 	(void)wait_for_pressure;
4831 	(void)pressure_level;
4832 	return KERN_NOT_SUPPORTED;
4833 #else /* VM_PRESSURE_EVENTS */
4834 
4835 	uint32_t *waiters = NULL;
4836 	wait_result_t wr = 0;
4837 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4838 
4839 	if (pressure_level == NULL) {
4840 		return KERN_INVALID_ARGUMENT;
4841 	}
4842 	if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4843 	    *pressure_level == kVMPressureForegroundJetsam)) {
4844 		return KERN_INVALID_ARGUMENT;
4845 	}
4846 
4847 	if (wait_for_pressure) {
4848 		switch (*pressure_level) {
4849 		case kVMPressureForegroundJetsam:
4850 		case kVMPressureBackgroundJetsam:
4851 
4852 			if (*pressure_level == kVMPressureForegroundJetsam) {
4853 				waiters = &memorystatus_jetsam_fg_band_waiters;
4854 			} else {
4855 				/* kVMPressureBackgroundJetsam */
4856 				waiters = &memorystatus_jetsam_bg_band_waiters;
4857 			}
4858 
4859 			lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4860 			wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4861 			if (wr == THREAD_WAITING) {
4862 				*waiters += 1;
4863 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4864 				wr = thread_block(THREAD_CONTINUE_NULL);
4865 			} else {
4866 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4867 			}
4868 
4869 			if (wr != THREAD_AWAKENED) {
4870 				return KERN_ABORTED;
4871 			}
4872 
4873 			return KERN_SUCCESS;
4874 		case kVMPressureNormal:
4875 		case kVMPressureWarning:
4876 		case kVMPressureUrgent:
4877 		case kVMPressureCritical:
4878 			while (old_level == *pressure_level) {
4879 				wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4880 				    THREAD_INTERRUPTIBLE);
4881 				if (wr == THREAD_WAITING) {
4882 					wr = thread_block(THREAD_CONTINUE_NULL);
4883 				}
4884 				if (wr == THREAD_INTERRUPTED) {
4885 					return KERN_ABORTED;
4886 				}
4887 
4888 				if (wr == THREAD_AWAKENED) {
4889 					old_level = memorystatus_vm_pressure_level;
4890 				}
4891 			}
4892 			break;
4893 		default:
4894 			return KERN_INVALID_ARGUMENT;
4895 		}
4896 	}
4897 
4898 	*pressure_level = old_level;
4899 	return KERN_SUCCESS;
4900 #endif /* VM_PRESSURE_EVENTS */
4901 }
4902 
4903 #if VM_PRESSURE_EVENTS
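/*
 * Body of the dedicated VM pressure thread.  The first invocation skips event
 * processing and just performs setup; because this function is used as its own
 * thread_block() continuation, every subsequent wakeup restarts at the top,
 * runs consider_vm_pressure_events(), and then blocks again.
 */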
4904 void
4905 vm_pressure_thread(void)
4906 {
4907 	static boolean_t thread_initialized = FALSE;
4908 
4909 	if (thread_initialized == TRUE) {
4910 		vm_pageout_state.vm_pressure_thread_running = TRUE;
4911 		consider_vm_pressure_events();
4912 		vm_pageout_state.vm_pressure_thread_running = FALSE;
4913 	}
4914 
4915 #if CONFIG_THREAD_GROUPS
4916 	thread_group_vm_add();
4917 #endif /* CONFIG_THREAD_GROUPS */
4918 
4919 	thread_set_thread_name(current_thread(), "VM_pressure");
4920 	thread_initialized = TRUE;
4921 	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4922 	thread_block((thread_continue_t)vm_pressure_thread);
4923 }
4924 #endif /* VM_PRESSURE_EVENTS */
4925 
4926 
4927 /*
4928  * called once per second via "compute_averages"; only signals the garbage-collect thread when vm_pageout has considered new pages since the last sample
4929  */
4930 void
4931 compute_pageout_gc_throttle(__unused void *arg)
4932 {
4933 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4934 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4935 		sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4936 	}
4937 }
4938 
4939 /*
4940  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4941  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4942  * jetsams. We need to check if the zone map size is above its jetsam limit to
4943  * decide if this was indeed the case.
4944  *
4945  * We need to do this on a different thread for the following reasons:
4946  *
4947  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4948  * itself causing the system to hang. We perform synchronous jetsams if we're
4949  * leaking in the VM map entries zone, so the leaking process could be doing a
4950  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4951  * jetsam itself. We also need the vm_map lock on the process termination path,
4952  * which would now lead the dying process to deadlock against itself.
4953  *
4954  * 2. The jetsam path might need to allocate zone memory itself. We could try
4955  * using the non-blocking variant of zalloc for this path, but we can still
4956  * end up trying to do a kmem_alloc when the zone maps are almost full.
4957  */
4958 __dead2
4959 void
4960 vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4961 {
4962 	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4963 
4964 	if (step != VM_PAGEOUT_GC_INIT) {
4965 		sched_cond_ack(&vm_pageout_gc_cond);
4966 	}
4967 
4968 	while (true) {
4969 		if (step == VM_PAGEOUT_GC_INIT) {
4970 			/* first time being called is not about GC */
4971 #if CONFIG_THREAD_GROUPS
4972 			thread_group_vm_add();
4973 #endif /* CONFIG_THREAD_GROUPS */
4974 			step = VM_PAGEOUT_GC_COLLECT;
4975 		} else if (zone_map_nearing_exhaustion()) {
4976 			/*
4977 			 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4978 			 *
4979 			 * Bail out after calling zone_gc (which triggers the
4980 			 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4981 			 * operations that clear out a bunch of caches might allocate zone
4982 			 * memory themselves (for eg. vm_map operations would need VM map
4983  * memory themselves (e.g. vm_map operations would need VM map
4984 			 * could end up with a panic. We just need to quickly jetsam a
4985 			 * process and exit here.
4986 			 *
4987 			 * It could so happen that we were woken up to relieve memory
4988 			 * pressure and the zone map also happened to be near its limit at
4989 			 * the time, in which case we'll skip out early. But that should be
4990 			 * ok; if memory pressure persists, the thread will simply be woken
4991 			 * up again.
4992 			 */
4993 
4994 			zone_gc(ZONE_GC_JETSAM);
4995 		} else {
4996 			/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4997 			boolean_t buf_large_zfree = FALSE;
4998 			boolean_t first_try = TRUE;
4999 
5000 			stack_collect();
5001 
5002 			consider_machine_collect();
5003 #if CONFIG_DEFERRED_RECLAIM
5004 			vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE);
5005 #endif /* CONFIG_DEFERRED_RECLAIM */
5006 #if CONFIG_MBUF_MCACHE
5007 			mbuf_drain(FALSE);
5008 #endif /* CONFIG_MBUF_MCACHE */
5009 
5010 			do {
5011 				if (consider_buffer_cache_collect != NULL) {
5012 					buf_large_zfree = (*consider_buffer_cache_collect)(0);
5013 				}
5014 				if (first_try == TRUE || buf_large_zfree == TRUE) {
5015 					/*
5016 					 * zone_gc should be last, because the other operations
5017 					 * might return memory to zones.
5018 					 */
5019 					zone_gc(ZONE_GC_TRIM);
5020 				}
5021 				first_try = FALSE;
5022 			} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
5023 
5024 			consider_machine_adjust();
5025 		}
5026 
5027 		sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
5028 	}
5029 	__builtin_unreachable();
5030 }
5031 
5032 
5033 #if VM_PAGE_BUCKETS_CHECK
5034 #if VM_PAGE_FAKE_BUCKETS
5035 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5036 #endif /* VM_PAGE_FAKE_BUCKETS */
5037 #endif /* VM_PAGE_BUCKETS_CHECK */
5038 
5039 
5040 
5041 void
5042 vm_set_restrictions(unsigned int num_cpus)
5043 {
5044 	int vm_restricted_to_single_processor = 0;
5045 
5046 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5047 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5048 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5049 	} else {
5050 		assert(num_cpus > 0);
5051 
5052 		if (num_cpus <= 3) {
5053 			/*
5054 			 * on systems with a limited number of CPUs, bind the
5055 			 * 4 major threads that can free memory and that tend to use
5056 			 * a fair bit of CPU under pressured conditions to a single processor.
5057 			 * This ensures that these threads don't hog all of the available CPUs
5058 			 * (important for camera launch), while allowing them to run independently
5059 			 * with respect to locks... the 4 threads are
5060 			 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5061 			 * vm_compressor_swap_trigger_thread (minor and major compactions), and
5062 			 * memorystatus_thread (jetsams).
5063 			 *
5064 			 * the first time each of these threads runs, it is responsible for checking
5065 			 * the state of vm_restricted_to_single_processor, and if TRUE it calls
5066 			 * thread_bind_master...  someday this should be replaced with a group
5067 			 * scheduling mechanism and KPI.
5068 			 */
5069 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5070 		} else {
5071 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5072 		}
5073 	}
5074 }
5075 
5076 /*
5077  * Set up vm_config based on the vm_compressor_mode.
5078  * Must run BEFORE the pageout thread starts up.
5079  */
5080 __startup_func
5081 void
5082 vm_config_init(void)
5083 {
5084 	bzero(&vm_config, sizeof(vm_config));
5085 
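	/*
	 * Roughly: the "*_is_present" flags mean the facility is available at all
	 * (e.g. for freezer-only use), while the "*_is_active" flags mean it takes
	 * part in normal pageout; the freezer-only modes below set "present"
	 * without "active".
	 */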
5086 	switch (vm_compressor_mode) {
5087 	case VM_PAGER_DEFAULT:
5088 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5089 		OS_FALLTHROUGH;
5090 
5091 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5092 		vm_config.compressor_is_present = TRUE;
5093 		vm_config.swap_is_present = TRUE;
5094 		vm_config.compressor_is_active = TRUE;
5095 		vm_config.swap_is_active = TRUE;
5096 		break;
5097 
5098 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5099 		vm_config.compressor_is_present = TRUE;
5100 		vm_config.swap_is_present = TRUE;
5101 		vm_config.compressor_is_active = TRUE;
5102 		break;
5103 
5104 	case VM_PAGER_FREEZER_DEFAULT:
5105 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5106 		OS_FALLTHROUGH;
5107 
5108 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5109 		vm_config.compressor_is_present = TRUE;
5110 		vm_config.swap_is_present = TRUE;
5111 		break;
5112 
5113 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5114 		vm_config.compressor_is_present = TRUE;
5115 		vm_config.swap_is_present = TRUE;
5116 		vm_config.compressor_is_active = TRUE;
5117 		vm_config.freezer_swap_is_active = TRUE;
5118 		break;
5119 
5120 	case VM_PAGER_NOT_CONFIGURED:
5121 		break;
5122 
5123 	default:
5124 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5125 		break;
5126 	}
5127 }
5128 
5129 __startup_func
5130 static void
5131 vm_pageout_create_gc_thread(void)
5132 {
5133 	thread_t thread;
5134 
5135 	sched_cond_init(&vm_pageout_gc_cond);
5136 	if (kernel_thread_create(vm_pageout_garbage_collect,
5137 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5138 		panic("vm_pageout_garbage_collect: create failed");
5139 	}
5140 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
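	/*
	 * Keep this thread's current kernel stack reserved so it shouldn't ever
	 * have to wait on a stack allocation when it is woken up to free memory
	 * (vm_pageout() applies the same pattern to itself below).
	 */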
5141 	if (thread->reserved_stack == 0) {
5142 		assert(thread->kernel_stack);
5143 		thread->reserved_stack = thread->kernel_stack;
5144 	}
5145 
5146 	/* thread is started in vm_pageout() */
5147 	vm_pageout_gc_thread = thread;
5148 }
5149 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5150 
5151 void
5152 vm_pageout(void)
5153 {
5154 	thread_t        self = current_thread();
5155 	thread_t        thread;
5156 	kern_return_t   result;
5157 	spl_t           s;
5158 
5159 	/*
5160 	 * Set thread privileges.
5161 	 */
5162 	s = splsched();
5163 
5164 #if CONFIG_VPS_DYNAMIC_PRIO
5165 	if (vps_dynamic_priority_enabled) {
5166 		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5167 		thread_set_eager_preempt(self);
5168 	} else {
5169 		sched_set_kernel_thread_priority(self, BASEPRI_VM);
5170 	}
5171 #else /* CONFIG_VPS_DYNAMIC_PRIO */
5172 	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5173 #endif /* CONFIG_VPS_DYNAMIC_PRIO */
5174 
5175 	thread_lock(self);
5176 	self->options |= TH_OPT_VMPRIV;
5177 	thread_unlock(self);
5178 
5179 	if (!self->reserved_stack) {
5180 		self->reserved_stack = self->kernel_stack;
5181 	}
5182 
5183 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5184 	    !vps_dynamic_priority_enabled) {
5185 		thread_vm_bind_group_add();
5186 	}
5187 
5188 
5189 #if CONFIG_THREAD_GROUPS
5190 	thread_group_vm_add();
5191 #endif /* CONFIG_THREAD_GROUPS */
5192 
5193 #if __AMP__
5194 	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5195 	if (vm_pgo_pbound) {
5196 		/*
5197 		 * Use the soft bound option for vm pageout to allow it to run on
5198 		 * E-cores if P-cluster is unavailable.
5199 		 */
5200 		thread_soft_bind_cluster_type(self, 'P');
5201 	}
5202 #endif /* __AMP__ */
5203 
5204 	PE_parse_boot_argn("vmpgo_protect_realtime",
5205 	    &vm_pageout_protect_realtime,
5206 	    sizeof(vm_pageout_protect_realtime));
5207 	splx(s);
5208 
5209 	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5210 
5211 	/*
5212 	 *	Initialize some paging parameters.
5213 	 */
5214 
5215 	vm_pageout_state.vm_pressure_thread_running = FALSE;
5216 	vm_pageout_state.vm_pressure_changed = FALSE;
5217 	vm_pageout_state.memorystatus_purge_on_warning = 2;
5218 	vm_pageout_state.memorystatus_purge_on_urgent = 5;
5219 	vm_pageout_state.memorystatus_purge_on_critical = 8;
5220 	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5221 	vm_pageout_state.vm_page_speculative_percentage = 5;
5222 	vm_pageout_state.vm_page_speculative_target = 0;
5223 
5224 	vm_pageout_state.vm_pageout_swap_wait = 0;
5225 	vm_pageout_state.vm_pageout_idle_wait = 0;
5226 	vm_pageout_state.vm_pageout_empty_wait = 0;
5227 	vm_pageout_state.vm_pageout_burst_wait = 0;
5228 	vm_pageout_state.vm_pageout_deadlock_wait = 0;
5229 	vm_pageout_state.vm_pageout_deadlock_relief = 0;
5230 	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5231 
5232 	vm_pageout_state.vm_pageout_inactive = 0;
5233 	vm_pageout_state.vm_pageout_inactive_used = 0;
5234 	vm_pageout_state.vm_pageout_inactive_clean = 0;
5235 
5236 	vm_pageout_state.vm_memory_pressure = 0;
5237 	vm_pageout_state.vm_page_filecache_min = 0;
5238 #if CONFIG_JETSAM
5239 	vm_pageout_state.vm_page_filecache_min_divisor = 70;
5240 	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5241 #else
5242 	vm_pageout_state.vm_page_filecache_min_divisor = 27;
5243 	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5244 #endif
5245 	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5246 
5247 	vm_pageout_state.vm_pageout_considered_page_last = 0;
5248 
5249 	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5250 		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5251 	}
5252 
5253 	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5254 		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5255 	}
5256 
5257 	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5258 		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5259 	}
5260 
5261 	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5262 		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5263 	}
5264 
5265 	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5266 		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5267 	}
5268 
5269 	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5270 		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5271 	}
5272 
5273 	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5274 		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5275 	}
5276 	/*
5277 	 * Even if we've already called vm_page_free_reserve,
5278 	 * call it again here to ensure that the targets are
5279 	 * accurately calculated (it uses vm_page_free_count_init).
5280 	 * Calling it with an arg of 0 will not change the reserve
5281 	 * but will re-calculate free_min and free_target.
5282 	 */
5283 	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5284 		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5285 	} else {
5286 		vm_page_free_reserve(0);
5287 	}
5288 
5289 	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5290 	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5291 
5292 	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5293 	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5294 
5295 	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5296 
5297 #if DEVELOPMENT || DEBUG
5298 	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5299 	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5300 #endif /* DEVELOPMENT || DEBUG */
5301 
5302 
5303 	/* internal pageout thread started when default pager registered first time */
5304 	/* the internal pageout thread is started when the default pager is registered for the first time */
5305 	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5306 	ethr->id = 0;
5307 	ethr->q = &vm_pageout_queue_external;
5308 	/* in external_state these cheads are never used, they are used only in internal_state for te compressor */
5309 	/* in external_state these cheads are never used; they are used only in internal_state for the compressor */
5310 	ethr->current_regular_swapout_chead = NULL;
5311 	ethr->current_late_swapout_chead = NULL;
5312 	ethr->scratch_buf = NULL;
5313 #if DEVELOPMENT || DEBUG
5314 	ethr->benchmark_q = NULL;
5315 #endif /* DEVELOPMENT || DEBUG */
5316 	sched_cond_init(&(ethr->pgo_wakeup));
5317 
5318 	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5319 	    (void *)ethr, BASEPRI_VM,
5320 	    &(ethr->pgo_iothread));
5321 	if (result != KERN_SUCCESS) {
5322 		panic("vm_pageout: Unable to create external thread (%d)\n", result);
5323 	}
5324 	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5325 
5326 	thread_mtx_lock(vm_pageout_gc_thread);
5327 	thread_start(vm_pageout_gc_thread);
5328 	thread_mtx_unlock(vm_pageout_gc_thread);
5329 
5330 #if VM_PRESSURE_EVENTS
5331 	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5332 	    BASEPRI_DEFAULT,
5333 	    &thread);
5334 
5335 	if (result != KERN_SUCCESS) {
5336 		panic("vm_pressure_thread: create failed");
5337 	}
5338 
5339 	thread_deallocate(thread);
5340 #endif
5341 
5342 	vm_object_reaper_init();
5343 
5344 
5345 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5346 		vm_compressor_init();
5347 	}
5348 
5349 #if VM_PRESSURE_EVENTS
5350 	vm_pressure_events_enabled = TRUE;
5351 #endif /* VM_PRESSURE_EVENTS */
5352 
5353 #if CONFIG_PHANTOM_CACHE
5354 	vm_phantom_cache_init();
5355 #endif
5356 #if VM_PAGE_BUCKETS_CHECK
5357 #if VM_PAGE_FAKE_BUCKETS
5358 	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5359 	    (uint64_t) vm_page_fake_buckets_start,
5360 	    (uint64_t) vm_page_fake_buckets_end);
5361 	pmap_protect(kernel_pmap,
5362 	    vm_page_fake_buckets_start,
5363 	    vm_page_fake_buckets_end,
5364 	    VM_PROT_READ);
5365 //	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
5366 #endif /* VM_PAGE_FAKE_BUCKETS */
5367 #endif /* VM_PAGE_BUCKETS_CHECK */
5368 
5369 #if VM_OBJECT_TRACKING
5370 	vm_object_tracking_init();
5371 #endif /* VM_OBJECT_TRACKING */
5372 
5373 #if __arm64__
5374 //	vm_tests();
5375 #endif /* __arm64__ */
5376 
5377 	vm_pageout_continue();
5378 
5379 	/*
5380 	 * Unreached code!
5381 	 *
5382 	 * The vm_pageout_continue() call above never returns, so the code below is never
5383 	 * executed.  We take advantage of this to declare several DTrace VM related probe
5384 	 * points that our kernel doesn't have an analog for.  These are probe points that
5385 	 * exist in Solaris and are in the DTrace documentation, so people may have written
5386 	 * scripts that use them.  Declaring the probe points here means their scripts will
5387 	 * compile and execute which we want for portability of the scripts, but since this
5388 	 * section of code is never reached, the probe points will simply never fire.  Yes,
5389 	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
5390 	 * Solaris specific VM events in mind, not portability to different VM implementations.
5391 	 */
5392 
5393 	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5394 	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5395 	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5396 	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5397 	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5398 	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5399 	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5400 	/*NOTREACHED*/
5401 }
5402 
5403 
5404 
5405 kern_return_t
5406 vm_pageout_internal_start(void)
5407 {
5408 	kern_return_t   result = KERN_SUCCESS;
5409 	host_basic_info_data_t hinfo;
5410 	vm_offset_t     buf, bufsize;
5411 
5412 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5413 
5414 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5415 #define BSD_HOST 1
5416 	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5417 
5418 	assert(hinfo.max_cpus > 0);
5419 
5420 #if !XNU_TARGET_OS_OSX
5421 	vm_pageout_state.vm_compressor_thread_count = 1;
5422 #else /* !XNU_TARGET_OS_OSX */
5423 	if (hinfo.max_cpus > 4) {
5424 		vm_pageout_state.vm_compressor_thread_count = 2;
5425 	} else {
5426 		vm_pageout_state.vm_compressor_thread_count = 1;
5427 	}
5428 #endif /* !XNU_TARGET_OS_OSX */
5429 #if     __AMP__
5430 	if (vm_compressor_ebound) {
5431 		vm_pageout_state.vm_compressor_thread_count = 2;
5432 	}
5433 #endif
5434 	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5435 	    sizeof(vm_pageout_state.vm_compressor_thread_count));
5436 
5437 	/* did we get an unreasonable number from the boot-args? */
5438 	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5439 		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5440 	}
5441 	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5442 		vm_pageout_state.vm_compressor_thread_count = 1;
5443 	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5444 		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5445 	}
5446 
5447 	vm_pageout_queue_internal.pgo_maxlaundry =
5448 	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5449 
5450 	PE_parse_boot_argn("vmpgoi_maxlaundry",
5451 	    &vm_pageout_queue_internal.pgo_maxlaundry,
5452 	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5453 
5454 #if DEVELOPMENT || DEBUG
5455 	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5456 	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5457 #endif /* DEVELOPMENT || DEBUG */
5458 
5459 	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5460 
5461 	kmem_alloc(kernel_map, &buf,
5462 	    bufsize * vm_pageout_state.vm_compressor_thread_count,
5463 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5464 	    VM_KERN_MEMORY_COMPRESSOR);
5465 
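	/*
	 * The single kmem_alloc above provides one COMPRESSOR_SCRATCH_BUF_SIZE
	 * scratch buffer per compressor thread; each iteration below hands the
	 * i-th slice to its thread.
	 */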
5466 	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5467 		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5468 		iq->id = i;
5469 		iq->q = &vm_pageout_queue_internal;
5470 		iq->current_early_swapout_chead = NULL;
5471 		iq->current_regular_swapout_chead = NULL;
5472 		iq->current_late_swapout_chead = NULL;
5473 		iq->scratch_buf = (char *)(buf + i * bufsize);
5474 #if DEVELOPMENT || DEBUG
5475 		iq->benchmark_q = &vm_pageout_queue_benchmark;
5476 #endif /* DEVELOPMENT || DEBUG */
5477 		sched_cond_init(&(iq->pgo_wakeup));
5478 		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5479 		    (void *)iq, BASEPRI_VM,
5480 		    &(iq->pgo_iothread));
5481 
5482 		if (result != KERN_SUCCESS) {
5483 			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5484 		}
5485 	}
5486 	return result;
5487 }
5488 
5489 #if CONFIG_IOSCHED
5490 /*
5491  * To support I/O Expedite for compressed files we mark the upls with special flags.
5492  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5493  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5494  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5495  * then issues smaller I/Os for the compressed data, decompresses it and puts the data into the pages
5496  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5497  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5498  * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5499  * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5500  */
5501 
5502 
5503 static void
5504 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5505 {
5506 	assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5507 
5508 	upl_lock(src_upl);
5509 	if (src_upl->decmp_io_upl) {
5510 		/*
5511 		 * If there is already an alive real I/O UPL, ignore this new UPL.
5512 		 * This case should rarely happen and even if it does, it just means
5513 		 * that we might issue a spurious expedite which the driver is expected
5514 		 * to handle.
5515 		 */
5516 		upl_unlock(src_upl);
5517 		return;
5518 	}
5519 	src_upl->decmp_io_upl = (void *)upl;
5520 	src_upl->ref_count++;
5521 
5522 	upl->flags |= UPL_DECMP_REAL_IO;
5523 	upl->decmp_io_upl = (void *)src_upl;
5524 	upl_unlock(src_upl);
5525 }
5526 #endif /* CONFIG_IOSCHED */
5527 
5528 #if UPL_DEBUG
5529 int     upl_debug_enabled = 1;
5530 #else
5531 int     upl_debug_enabled = 0;
5532 #endif
5533 
5534 static upl_t
5535 upl_create(int type, int flags, upl_size_t size)
5536 {
5537 	uint32_t pages = (uint32_t)atop(round_page_32(size));
5538 	upl_t    upl;
5539 
5540 	assert(page_aligned(size));
5541 
5542 	/*
5543 	 * FIXME: this code assumes the allocation always succeeds,
5544 	 *        however `pages` can be up to MAX_UPL_SIZE.
5545 	 *
5546 	 *        The allocation size is above 32k (resp. 128k)
5547 	 *        on 16k pages (resp. 4k), which kalloc might fail
5548 	 *        to allocate.
5549 	 */
5550 	upl = kalloc_type(struct upl, struct upl_page_info,
5551 	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5552 	if (type & UPL_CREATE_INTERNAL) {
5553 		flags |= UPL_INTERNAL;
5554 	}
5555 
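	/*
	 * A "lite" UPL tracks the pages it covers with a bitmap (one bit per
	 * page) rather than through alias pages in a separate shadow map_object,
	 * so only the bitmap needs to be allocated here.
	 */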
5556 	if (type & UPL_CREATE_LITE) {
5557 		flags |= UPL_LITE;
5558 		if (pages) {
5559 			upl->lite_list = bitmap_alloc(pages);
5560 		}
5561 	}
5562 
5563 	upl->flags = flags;
5564 	upl->ref_count = 1;
5565 	upl_lock_init(upl);
5566 #if CONFIG_IOSCHED
5567 	if (type & UPL_CREATE_IO_TRACKING) {
5568 		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5569 	}
5570 
5571 	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5572 		/* Only support expedite on internal UPLs */
5573 		thread_t        curthread = current_thread();
5574 		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5575 		    Z_WAITOK | Z_ZERO);
5576 		upl->flags |= UPL_EXPEDITE_SUPPORTED;
5577 		if (curthread->decmp_upl != NULL) {
5578 			upl_set_decmp_info(upl, curthread->decmp_upl);
5579 		}
5580 	}
5581 #endif
5582 #if CONFIG_IOSCHED || UPL_DEBUG
5583 	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5584 		upl->upl_creator = current_thread();
5585 		upl->flags |= UPL_TRACKED_BY_OBJECT;
5586 	}
5587 #endif
5588 
5589 #if UPL_DEBUG
5590 	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
5591 #endif /* UPL_DEBUG */
5592 
5593 	return upl;
5594 }
5595 
5596 static void
5597 upl_destroy(upl_t upl)
5598 {
5599 	uint32_t pages;
5600 
5601 //	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5602 
5603 	if (upl->ext_ref_count) {
5604 		panic("upl(%p) ext_ref_count", upl);
5605 	}
5606 
5607 #if CONFIG_IOSCHED
5608 	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5609 		upl_t src_upl;
5610 		src_upl = upl->decmp_io_upl;
5611 		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5612 		upl_lock(src_upl);
5613 		src_upl->decmp_io_upl = NULL;
5614 		upl_unlock(src_upl);
5615 		upl_deallocate(src_upl);
5616 	}
5617 #endif /* CONFIG_IOSCHED */
5618 
5619 #if CONFIG_IOSCHED || UPL_DEBUG
5620 	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5621 	    !(upl->flags & UPL_VECTOR)) {
5622 		vm_object_t     object;
5623 
5624 		if (upl->flags & UPL_SHADOWED) {
5625 			object = upl->map_object->shadow;
5626 		} else {
5627 			object = upl->map_object;
5628 		}
5629 
5630 		vm_object_lock(object);
5631 		queue_remove(&object->uplq, upl, upl_t, uplq);
5632 		vm_object_activity_end(object);
5633 		vm_object_collapse(object, 0, TRUE);
5634 		vm_object_unlock(object);
5635 	}
5636 #endif
5637 	/*
5638 	 * drop a reference on the map_object whether or
5639 	 * not a pageout object is inserted
5640 	 */
5641 	if (upl->flags & UPL_SHADOWED) {
5642 		vm_object_deallocate(upl->map_object);
5643 	}
5644 
5645 	if (upl->flags & UPL_DEVICE_MEMORY) {
5646 		pages = 1;
5647 	} else {
5648 		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5649 	}
5650 
5651 	upl_lock_destroy(upl);
5652 
5653 #if CONFIG_IOSCHED
5654 	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5655 		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5656 	}
5657 #endif
5658 
5659 #if UPL_DEBUG
5660 	for (int i = 0; i < upl->upl_commit_index; i++) {
5661 		btref_put(upl->upl_commit_records[i].c_btref);
5662 	}
5663 	btref_put(upl->upl_create_btref);
5664 #endif /* UPL_DEBUG */
5665 
5666 	if ((upl->flags & UPL_LITE) && pages) {
5667 		bitmap_free(upl->lite_list, pages);
5668 	}
5669 	kfree_type(struct upl, struct upl_page_info,
5670 	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5671 }
5672 
5673 void
5674 upl_deallocate(upl_t upl)
5675 {
5676 	upl_lock(upl);
5677 
5678 	if (--upl->ref_count == 0) {
5679 		if (vector_upl_is_valid(upl)) {
5680 			vector_upl_deallocate(upl);
5681 		}
5682 		upl_unlock(upl);
5683 
5684 		if (upl->upl_iodone) {
5685 			upl_callout_iodone(upl);
5686 		}
5687 
5688 		upl_destroy(upl);
5689 	} else {
5690 		upl_unlock(upl);
5691 	}
5692 }
5693 
5694 #if CONFIG_IOSCHED
5695 void
5696 upl_mark_decmp(upl_t upl)
5697 {
5698 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5699 		upl->flags |= UPL_DECMP_REQ;
5700 		upl->upl_creator->decmp_upl = (void *)upl;
5701 	}
5702 }
5703 
5704 void
5705 upl_unmark_decmp(upl_t upl)
5706 {
5707 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5708 		upl->upl_creator->decmp_upl = NULL;
5709 	}
5710 }
5711 
5712 #endif /* CONFIG_IOSCHED */
5713 
5714 #define VM_PAGE_Q_BACKING_UP(q)         \
5715 	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5716 
5717 boolean_t must_throttle_writes(void);
5718 
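/*
 * Heuristic: throttle new writes once the external pageout queue's laundry is
 * at 80% or more of its maximum (VM_PAGE_Q_BACKING_UP) and external pageable
 * pages make up more than 60% of the available non-compressed memory.
 */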
5719 boolean_t
5720 must_throttle_writes()
5721 {
5722 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5723 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5724 		return TRUE;
5725 	}
5726 
5727 	return FALSE;
5728 }
5729 
5730 int vm_page_delayed_work_ctx_needed = 0;
5731 KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5732 
5733 __startup_func
5734 static void
5735 vm_page_delayed_work_init_ctx(void)
5736 {
5737 	uint16_t min_delayed_work_ctx_allocated = 16;
5738 
5739 	/*
5740 	 * try really hard to always keep NCPU elements around in the zone
5741 	 * in order for the UPL code to almost always get an element.
5742 	 */
5743 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5744 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5745 	}
5746 
5747 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5748 }
5749 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5750 
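/*
 * Grab a delayed-work context without blocking (Z_NOWAIT).  On failure the
 * miss is counted in vm_page_delayed_work_ctx_needed and callers such as
 * vm_object_upl_request() fall back to a single-entry on-stack context.
 */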
5751 struct vm_page_delayed_work*
5752 vm_page_delayed_work_get_ctx(void)
5753 {
5754 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5755 
5756 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5757 
5758 	if (__probable(dw_ctx)) {
5759 		dw_ctx->delayed_owner = current_thread();
5760 	} else {
5761 		vm_page_delayed_work_ctx_needed++;
5762 	}
5763 	return dw_ctx ? dw_ctx->dwp : NULL;
5764 }
5765 
5766 void
5767 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5768 {
5769 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5770 
5771 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5772 	ldw_ctx->delayed_owner = NULL;
5773 
5774 	zfree(dw_ctx_zone, ldw_ctx);
5775 }
5776 
5777 /*
5778  *	Routine:	vm_object_upl_request
5779  *	Purpose:
5780  *		Cause the population of a portion of a vm_object.
5781  *		Depending on the nature of the request, the pages
5782  *		returned may be contain valid data or be uninitialized.
5783  *		returned may contain valid data or be uninitialized.
5784  *		A page list structure, listing the physical pages,
5785  *		will be returned upon request.
5786  *		supplier of backing store to a pager.
5787  *		IMPORTANT NOTE: The caller must still respect the relationship
5788  *		between the vm_object and its backing memory object.  The
5789  *		caller MUST NOT substitute changes in the backing file
5790  *		without first doing a memory_object_lock_request on the
5791  *		target range unless it is know that the pages are not
5792  *		target range unless it is known that the pages are not
5793  *		Copy_in_to:
5794  *			if a page list structure is present
5795  *			return the mapped physical pages, where a
5796  *			page is not present, return a non-initialized
5797  *			one.  If the no_sync bit is turned on, don't
5798  *			call the pager unlock to synchronize with other
5799  *			possible copies of the page. Leave pages busy
5800  *			in the original object, if a page list structure
5801  *			was specified.  When a commit of the page list
5802  *			pages is done, the dirty bit will be set for each one.
5803  *		Copy_out_from:
5804  *			If a page list structure is present, return
5805  *			all mapped pages.  Where a page does not exist
5806  *			map a zero filled one. Leave pages busy in
5807  *			the original object.  If a page list structure
5808  *			is not specified, this call is a no-op.
5809  *
5810  *		Note:  access of default pager objects has a rather interesting
5811  *		twist.  The caller of this routine, presumably the file system
5812  *		page cache handling code, will never actually make a request
5813  *		against a default pager backed object.  Only the default
5814  *		pager will make requests on backing store related vm_objects.
5815  *		In this way the default pager can maintain the relationship
5816  *		between backing store files (abstract memory objects) and
5817  *		the vm_objects (cache objects) they support.
5818  *
5819  */
5820 
5821 __private_extern__ kern_return_t
5822 vm_object_upl_request(
5823 	vm_object_t             object,
5824 	vm_object_offset_t      offset,
5825 	upl_size_t              size,
5826 	upl_t                   *upl_ptr,
5827 	upl_page_info_array_t   user_page_list,
5828 	unsigned int            *page_list_count,
5829 	upl_control_flags_t     cntrl_flags,
5830 	vm_tag_t                tag)
5831 {
5832 	vm_page_t               dst_page = VM_PAGE_NULL;
5833 	vm_object_offset_t      dst_offset;
5834 	upl_size_t              xfer_size;
5835 	unsigned int            size_in_pages;
5836 	boolean_t               dirty;
5837 	boolean_t               hw_dirty;
5838 	upl_t                   upl = NULL;
5839 	unsigned int            entry;
5840 	vm_page_t               alias_page = NULL;
5841 	int                     refmod_state = 0;
5842 	vm_object_t             last_copy_object;
5843 	uint32_t                last_copy_version;
5844 	struct  vm_page_delayed_work    dw_array;
5845 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5846 	bool                    dwp_finish_ctx = TRUE;
5847 	int                     dw_count;
5848 	int                     dw_limit;
5849 	int                     io_tracking_flag = 0;
5850 	vm_grab_options_t       grab_options;
5851 	int                     page_grab_count = 0;
5852 	ppnum_t                 phys_page;
5853 	pmap_flush_context      pmap_flush_context_storage;
5854 	boolean_t               pmap_flushes_delayed = FALSE;
5855 	task_t                  task = current_task();
5856 
5857 	dwp_start = dwp = NULL;
5858 
5859 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5860 		/*
5861 		 * For forward compatibility's sake,
5862 		 * reject any unknown flag.
5863 		 */
5864 		return KERN_INVALID_VALUE;
5865 	}
5866 	if ((!object->internal) && (object->paging_offset != 0)) {
5867 		panic("vm_object_upl_request: external object with non-zero paging offset");
5868 	}
5869 	if (object->phys_contiguous) {
5870 		panic("vm_object_upl_request: contiguous object specified");
5871 	}
5872 
5873 	assertf(page_aligned(offset) && page_aligned(size),
5874 	    "offset 0x%llx size 0x%x",
5875 	    offset, size);
5876 
5877 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5878 
5879 	dw_count = 0;
5880 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5881 	dwp_start = vm_page_delayed_work_get_ctx();
5882 	if (dwp_start == NULL) {
5883 		dwp_start = &dw_array;
5884 		dw_limit = 1;
5885 		dwp_finish_ctx = FALSE;
5886 	}
5887 
5888 	dwp = dwp_start;
5889 
5890 	if (size > MAX_UPL_SIZE_BYTES) {
5891 		size = MAX_UPL_SIZE_BYTES;
5892 	}
5893 
5894 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5895 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5896 	}
5897 
5898 #if CONFIG_IOSCHED || UPL_DEBUG
5899 	if (object->io_tracking || upl_debug_enabled) {
5900 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5901 	}
5902 #endif
5903 #if CONFIG_IOSCHED
5904 	if (object->io_tracking) {
5905 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5906 	}
5907 #endif
5908 
5909 	if (cntrl_flags & UPL_SET_INTERNAL) {
5910 		if (cntrl_flags & UPL_SET_LITE) {
5911 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5912 		} else {
5913 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5914 		}
5915 		user_page_list = size ? upl->page_list : NULL;
5916 	} else {
5917 		if (cntrl_flags & UPL_SET_LITE) {
5918 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5919 		} else {
5920 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5921 		}
5922 	}
5923 	*upl_ptr = upl;
5924 
5925 	if (user_page_list) {
5926 		user_page_list[0].device = FALSE;
5927 	}
5928 
5929 	if (cntrl_flags & UPL_SET_LITE) {
5930 		upl->map_object = object;
5931 	} else {
5932 		upl->map_object = vm_object_allocate(size, object->vmo_provenance);
5933 		vm_object_lock(upl->map_object);
5934 		/*
5935 		 * No neeed to lock the new object: nobody else knows
5936 		 * No need to lock the new object: nobody else knows
5937 		 */
5938 		upl->map_object->shadow = object;
5939 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5940 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5941 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5942 		upl->map_object->vo_shadow_offset = offset;
5943 		upl->map_object->wimg_bits = object->wimg_bits;
5944 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5945 		    "object %p shadow_offset 0x%llx",
5946 		    upl->map_object, upl->map_object->vo_shadow_offset);
5947 		vm_object_unlock(upl->map_object);
5948 
5949 		alias_page = vm_page_create_fictitious();
5950 
5951 		upl->flags |= UPL_SHADOWED;
5952 	}
5953 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5954 		upl->flags |= UPL_PAGEOUT;
5955 	}
5956 
5957 	vm_object_lock(object);
5958 	vm_object_activity_begin(object);
5959 
5960 	grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
5961 #if CONFIG_SECLUDED_MEMORY
5962 	if (object->can_grab_secluded) {
5963 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5964 	}
5965 #endif /* CONFIG_SECLUDED_MEMORY */
5966 
5967 	/*
5968 	 * we can lock in the paging_offset once paging_in_progress is set
5969 	 */
5970 	upl->u_size = size;
5971 	upl->u_offset = offset + object->paging_offset;
5972 
5973 #if CONFIG_IOSCHED || UPL_DEBUG
5974 	if (object->io_tracking || upl_debug_enabled) {
5975 		vm_object_activity_begin(object);
5976 		queue_enter(&object->uplq, upl, upl_t, uplq);
5977 	}
5978 #endif
5979 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5980 		/*
5981 		 * Honor copy-on-write obligations
5982 		 *
5983 		 * The caller is gathering these pages and
5984 		 * might modify their contents.  We need to
5985 		 * make sure that the copy object has its own
5986 		 * private copies of these pages before we let
5987 		 * the caller modify them.
5988 		 */
5989 		vm_object_update(object,
5990 		    offset,
5991 		    size,
5992 		    NULL,
5993 		    NULL,
5994 		    FALSE,              /* should_return */
5995 		    MEMORY_OBJECT_COPY_SYNC,
5996 		    VM_PROT_NO_CHANGE);
5997 
5998 		VM_PAGEOUT_DEBUG(upl_cow, 1);
5999 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
6000 	}
6001 	/*
6002 	 * remember which copy object we synchronized with
6003 	 */
6004 	last_copy_object = object->vo_copy;
6005 	last_copy_version = object->vo_copy_version;
6006 	entry = 0;
6007 
6008 	xfer_size = size;
6009 	dst_offset = offset;
6010 	size_in_pages = size / PAGE_SIZE;
6011 
6012 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6013 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6014 		object->scan_collisions = 0;
6015 	}
6016 
6017 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6018 		boolean_t       isSSD = FALSE;
6019 
6020 #if !XNU_TARGET_OS_OSX
6021 		isSSD = TRUE;
6022 #else /* !XNU_TARGET_OS_OSX */
6023 		vnode_pager_get_isSSD(object->pager, &isSSD);
6024 #endif /* !XNU_TARGET_OS_OSX */
6025 		vm_object_unlock(object);
6026 
6027 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6028 
6029 		if (isSSD == TRUE) {
6030 			delay(1000 * size_in_pages);
6031 		} else {
6032 			delay(5000 * size_in_pages);
6033 		}
6034 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6035 
6036 		vm_object_lock(object);
6037 	}
6038 
6039 	while (xfer_size) {
6040 		dwp->dw_mask = 0;
6041 
6042 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6043 			vm_object_unlock(object);
6044 			alias_page = vm_page_create_fictitious();
6045 			vm_object_lock(object);
6046 		}
6047 		if (cntrl_flags & UPL_COPYOUT_FROM) {
6048 			upl->flags |= UPL_PAGE_SYNC_DONE;
6049 
6050 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6051 			    vm_page_is_fictitious(dst_page) ||
6052 			    dst_page->vmp_absent ||
6053 			    VMP_ERROR_GET(dst_page) ||
6054 			    dst_page->vmp_cleaning ||
6055 			    (VM_PAGE_WIRED(dst_page))) {
6056 				if (user_page_list) {
6057 					user_page_list[entry].phys_addr = 0;
6058 				}
6059 
6060 				goto try_next_page;
6061 			}
6062 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6063 
6064 			/*
6065 			 * grab this up front...
6066 			 * a high percentange of the time we're going to
6067 			 * a high percentage of the time we're going to
6068 			 * anyway... so we can eliminate an extra call into
6069 			 * the pmap layer by grabbing it here and recording it
6070 			 */
6071 			if (dst_page->vmp_pmapped) {
6072 				refmod_state = pmap_get_refmod(phys_page);
6073 			} else {
6074 				refmod_state = 0;
6075 			}
6076 
6077 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6078 				/*
6079 				 * page is on inactive list and referenced...
6080 				 * reactivate it now... this gets it out of the
6081 				 * way of vm_pageout_scan which would have to
6082 				 * reactivate it upon tripping over it
6083 				 */
6084 				dwp->dw_mask |= DW_vm_page_activate;
6085 			}
6086 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6087 				/*
6088 				 * we're only asking for DIRTY pages to be returned
6089 				 */
6090 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6091 					/*
6092 					 * if we were the page stolen by vm_pageout_scan to be
6093 					 * cleaned (as opposed to a buddy being clustered in),
6094 					 * or if this request is not being driven by a PAGEOUT cluster,
6095 					 * then we only need to check for the page being dirty or
6096 					 * precious to decide whether to return it
6097 					 */
6098 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6099 						goto check_busy;
6100 					}
6101 					goto dont_return;
6102 				}
6103 				/*
6104 				 * this is a request for a PAGEOUT cluster and this page
6105 				 * is merely along for the ride as a 'buddy'... not only
6106 				 * does it have to be dirty to be returned, but it also
6107 				 * can't have been referenced recently...
6108 				 */
6109 				if ((hibernate_cleaning_in_progress == TRUE ||
6110 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6111 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6112 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6113 					goto check_busy;
6114 				}
6115 dont_return:
6116 				/*
6117 				 * if we reach here, we're not to return
6118 				 * the page... go on to the next one
6119 				 */
6120 				if (dst_page->vmp_laundry == TRUE) {
6121 					/*
6122 					 * if we get here, the page is not 'cleaning' (filtered out above).
6123 					 * since it has been referenced, remove it from the laundry
6124 					 * so we don't pay the cost of an I/O to clean a page
6125 					 * we're just going to take back
6126 					 */
6127 					vm_page_lockspin_queues();
6128 
6129 					vm_pageout_steal_laundry(dst_page, TRUE);
6130 					vm_page_activate(dst_page);
6131 
6132 					vm_page_unlock_queues();
6133 				}
6134 				if (user_page_list) {
6135 					user_page_list[entry].phys_addr = 0;
6136 				}
6137 
6138 				goto try_next_page;
6139 			}
6140 check_busy:
6141 			if (dst_page->vmp_busy) {
6142 				if (cntrl_flags & UPL_NOBLOCK) {
6143 					if (user_page_list) {
6144 						user_page_list[entry].phys_addr = 0;
6145 					}
6146 					dwp->dw_mask = 0;
6147 
6148 					goto try_next_page;
6149 				}
6150 				/*
6151 				 * someone else is playing with the
6152 				 * page.  We will have to wait.
6153 				 */
6154 				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6155 
6156 				continue;
6157 			}
6158 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6159 				vm_page_lockspin_queues();
6160 
6161 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6162 					/*
6163 					 * we've buddied up a page for a clustered pageout
6164 					 * that has already been moved to the pageout
6165 					 * queue by pageout_scan... we need to remove
6166 					 * it from the queue and drop the laundry count
6167 					 * on that queue
6168 					 */
6169 					vm_pageout_throttle_up(dst_page);
6170 				}
6171 				vm_page_unlock_queues();
6172 			}
6173 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6174 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6175 
6176 			if (phys_page > upl->highest_page) {
6177 				upl->highest_page = phys_page;
6178 			}
6179 
6180 			assert(!pmap_is_noencrypt(phys_page));
6181 
6182 			if (cntrl_flags & UPL_SET_LITE) {
6183 				unsigned int    pg_num;
6184 
6185 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6186 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6187 				bitmap_set(upl->lite_list, pg_num);
6188 
6189 				if (hw_dirty) {
6190 					if (pmap_flushes_delayed == FALSE) {
6191 						pmap_flush_context_init(&pmap_flush_context_storage);
6192 						pmap_flushes_delayed = TRUE;
6193 					}
6194 					pmap_clear_refmod_options(phys_page,
6195 					    VM_MEM_MODIFIED,
6196 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6197 					    &pmap_flush_context_storage);
6198 				}
6199 
6200 				/*
6201 				 * Mark original page as cleaning
6202 				 * in place.
6203 				 */
6204 				dst_page->vmp_cleaning = TRUE;
6205 				dst_page->vmp_precious = FALSE;
6206 			} else {
6207 				/*
6208 				 * use pageclean setup, it is more
6209 				 * convenient even for the pageout
6210 				 * cases here
6211 				 */
6212 				vm_object_lock(upl->map_object);
6213 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6214 				vm_object_unlock(upl->map_object);
6215 
6216 				alias_page->vmp_absent = FALSE;
6217 				alias_page = NULL;
6218 			}
6219 			if (dirty) {
6220 				SET_PAGE_DIRTY(dst_page, FALSE);
6221 			} else {
6222 				dst_page->vmp_dirty = FALSE;
6223 			}
6224 
6225 			if (!dirty) {
6226 				dst_page->vmp_precious = TRUE;
6227 			}
6228 
6229 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6230 				if (!VM_PAGE_WIRED(dst_page)) {
6231 					dst_page->vmp_free_when_done = TRUE;
6232 				}
6233 			}
6234 		} else {
6235 			if ((cntrl_flags & UPL_WILL_MODIFY) &&
6236 			    (object->vo_copy != last_copy_object ||
6237 			    object->vo_copy_version != last_copy_version)) {
6238 				/*
6239 				 * Honor copy-on-write obligations
6240 				 *
6241 				 * The copy object has changed since we
6242 				 * last synchronized for copy-on-write.
6243 				 * Another copy object might have been
6244 				 * inserted while we released the object's
6245 				 * lock.  Since someone could have seen the
6246 				 * original contents of the remaining pages
6247 				 * through that new object, we have to
6248 				 * synchronize with it again for the remaining
6249 				 * pages only.  The previous pages are "busy"
6250 				 * so they can not be seen through the new
6251 				 * mapping.  The new mapping will see our
6252 				 * upcoming changes for those previous pages,
6253 				 * but that's OK since they couldn't see what
6254 				 * was there before.  It's just a race anyway
6255 				 * and there's no guarantee of consistency or
6256 				 * atomicity.  We just don't want new mappings
6257 				 * to see both the *before* and *after* pages.
6258 				 */
6259 				if (object->vo_copy != VM_OBJECT_NULL) {
6260 					vm_object_update(
6261 						object,
6262 						dst_offset,/* current offset */
6263 						xfer_size, /* remaining size */
6264 						NULL,
6265 						NULL,
6266 						FALSE,     /* should_return */
6267 						MEMORY_OBJECT_COPY_SYNC,
6268 						VM_PROT_NO_CHANGE);
6269 
6270 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6271 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6272 				}
6273 				/*
6274 				 * remember the copy object we synced with
6275 				 */
6276 				last_copy_object = object->vo_copy;
6277 				last_copy_version = object->vo_copy_version;
6278 			}
6279 			dst_page = vm_page_lookup(object, dst_offset);
6280 
6281 			if (dst_page != VM_PAGE_NULL) {
6282 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6283 					/*
6284 					 * skip over pages already present in the cache
6285 					 */
6286 					if (user_page_list) {
6287 						user_page_list[entry].phys_addr = 0;
6288 					}
6289 
6290 					goto try_next_page;
6291 				}
6292 				if (vm_page_is_fictitious(dst_page)) {
6293 					panic("need corner case for fictitious page");
6294 				}
6295 
6296 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6297 					/*
6298 					 * someone else is playing with the
6299 					 * page.  We will have to wait.
6300 					 */
6301 					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6302 
6303 					continue;
6304 				}
6305 				if (dst_page->vmp_laundry) {
6306 					vm_pageout_steal_laundry(dst_page, FALSE);
6307 				}
6308 			} else {
6309 				if (object->private) {
6310 					/*
6311 					 * This is a nasty wrinkle for users
6312 					 * This is a nasty wrinkle for users
6313 					 * of UPLs who encounter device or
6314 					 * private memory; however, it is
6315 					 * unavoidable: only a fault can
6316 					 * resolve the actual backing
6317 					 * physical page by asking the
6318 					 * backing device.
6319 					if (user_page_list) {
6320 						user_page_list[entry].phys_addr = 0;
6321 					}
6322 
6323 					goto try_next_page;
6324 				}
6325 				if (object->scan_collisions) {
6326 					/*
6327 					 * the pageout_scan thread is trying to steal
6328 					 * pages from this object, but has run into our
6329 					 * lock... grab 2 pages from the head of the object...
6330 					 * the first is freed on behalf of pageout_scan, the
6331 					 * 2nd is for our own use... we use vm_object_page_grab
6332 					 * in both cases to avoid taking pages from the free
6333 					 * list since we are under memory pressure and our
6334 					 * lock on this object is getting in the way of
6335 					 * relieving it
6336 					 */
6337 					dst_page = vm_object_page_grab(object);
6338 
6339 					if (dst_page != VM_PAGE_NULL) {
6340 						vm_page_release(dst_page,
6341 						    VMP_RELEASE_NONE);
6342 					}
6343 
6344 					dst_page = vm_object_page_grab(object);
6345 				}
6346 				if (dst_page == VM_PAGE_NULL) {
6347 					/*
6348 					 * need to allocate a page
6349 					 */
6350 					dst_page = vm_page_grab_options(grab_options);
6351 					if (dst_page != VM_PAGE_NULL) {
6352 						page_grab_count++;
6353 					}
6354 				}
6355 				if (dst_page == VM_PAGE_NULL) {
6356 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6357 						/*
6358 						 * we don't want to stall waiting for pages to come onto the free list
6359 						 * while we're already holding absent pages in this UPL;
6360 						 * the caller will deal with the empty slots
6361 						 */
6362 						if (user_page_list) {
6363 							user_page_list[entry].phys_addr = 0;
6364 						}
6365 
6366 						goto try_next_page;
6367 					}
6368 					/*
6369 					 * no pages available... wait
6370 					 * then try again for the same
6371 					 * offset...
6372 					 */
6373 					vm_object_unlock(object);
6374 
6375 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6376 
6377 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6378 
6379 					VM_PAGE_WAIT();
6380 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6381 
6382 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6383 
6384 					vm_object_lock(object);
6385 
6386 					continue;
6387 				}
6388 				vm_page_insert(dst_page, object, dst_offset);
6389 
6390 				dst_page->vmp_absent = TRUE;
6391 				dst_page->vmp_busy = FALSE;
6392 
6393 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6394 					/*
6395 					 * if UPL_RET_ONLY_ABSENT was specified,
6396 					 * then we're definitely setting up a
6397 					 * UPL for a clustered read/pagein
6398 					 * operation... mark the pages as clustered
6399 					 * so upl_commit_range can put them on the
6400 					 * speculative list
6401 					 */
6402 					dst_page->vmp_clustered = TRUE;
6403 
6404 					if (!(cntrl_flags & UPL_FILE_IO)) {
6405 						counter_inc(&vm_statistics_pageins);
6406 					}
6407 				}
6408 			}
6409 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6410 
6411 			dst_page->vmp_overwriting = TRUE;
6412 
6413 			if (dst_page->vmp_pmapped) {
6414 				if (!(cntrl_flags & UPL_FILE_IO)) {
6415 					/*
6416 					 * eliminate all mappings from the
6417 					 * original object and its progeny
6418 					 */
6419 					refmod_state = pmap_disconnect(phys_page);
6420 				} else {
6421 					refmod_state = pmap_get_refmod(phys_page);
6422 				}
6423 			} else {
6424 				refmod_state = 0;
6425 			}
6426 
6427 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6428 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6429 
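			/*
			 * For a "lite" UPL we just record the page's index in
			 * upl->lite_list and clean it in place in the original object;
			 * otherwise vm_pageclean_setup() aliases the page into
			 * upl->map_object.
			 */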
6430 			if (cntrl_flags & UPL_SET_LITE) {
6431 				unsigned int    pg_num;
6432 
6433 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6434 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6435 				bitmap_set(upl->lite_list, pg_num);
6436 
6437 				if (hw_dirty) {
6438 					pmap_clear_modify(phys_page);
6439 				}
6440 
6441 				/*
6442 				 * Mark original page as cleaning
6443 				 * in place.
6444 				 */
6445 				dst_page->vmp_cleaning = TRUE;
6446 				dst_page->vmp_precious = FALSE;
6447 			} else {
6448 				/*
6449 				 * use pageclean setup; it is more
6450 				 * convenient even for the pageout
6451 				 * cases here
6452 				 */
6453 				vm_object_lock(upl->map_object);
6454 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6455 				vm_object_unlock(upl->map_object);
6456 
6457 				alias_page->vmp_absent = FALSE;
6458 				alias_page = NULL;
6459 			}
6460 
6461 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6462 				upl->flags &= ~UPL_CLEAR_DIRTY;
6463 				upl->flags |= UPL_SET_DIRTY;
6464 				dirty = TRUE;
6465 				/*
6466 				 * Page belonging to a code-signed object is about to
6467 				 * be written. Mark it tainted and disconnect it from
6468 				 * all pmaps so processes have to fault it back in and
6469 				 * deal with the tainted bit.
6470 				 */
6471 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6472 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6473 					vm_page_upl_tainted++;
6474 					if (dst_page->vmp_pmapped) {
6475 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6476 						if (refmod_state & VM_MEM_REFERENCED) {
6477 							dst_page->vmp_reference = TRUE;
6478 						}
6479 					}
6480 				}
6481 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6482 				/*
6483 				 * clean in place for read implies
6484 				 * that a write will be done on all
6485 				 * the pages that are dirty before
6486 				 * a UPL commit is done.  The caller
6487 				 * is obligated to preserve the
6488 				 * contents of all pages marked dirty
6489 				 */
6490 				upl->flags |= UPL_CLEAR_DIRTY;
6491 			}
6492 			dst_page->vmp_dirty = dirty;
6493 
6494 			if (!dirty) {
6495 				dst_page->vmp_precious = TRUE;
6496 			}
6497 
6498 			if (!VM_PAGE_WIRED(dst_page)) {
6499 				/*
6500 				 * deny access to the target page while
6501 				 * it is being worked on
6502 				 */
6503 				dst_page->vmp_busy = TRUE;
6504 			} else {
6505 				dwp->dw_mask |= DW_vm_page_wire;
6506 			}
6507 
6508 			/*
6509 			 * We might be about to satisfy a fault which has been
6510 			 * requested. So no need for the "restart" bit.
6511 			 */
6512 			dst_page->vmp_restart = FALSE;
6513 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6514 				/*
6515 				 * expect the page to be used
6516 				 */
6517 				dwp->dw_mask |= DW_set_reference;
6518 			}
6519 			if (cntrl_flags & UPL_PRECIOUS) {
6520 				if (object->internal) {
6521 					SET_PAGE_DIRTY(dst_page, FALSE);
6522 					dst_page->vmp_precious = FALSE;
6523 				} else {
6524 					dst_page->vmp_precious = TRUE;
6525 				}
6526 			} else {
6527 				dst_page->vmp_precious = FALSE;
6528 			}
6529 		}
6530 		if (dst_page->vmp_busy) {
6531 			upl->flags |= UPL_HAS_BUSY;
6532 		}
6533 		if (VM_PAGE_WIRED(dst_page)) {
6534 			upl->flags |= UPL_HAS_WIRED;
6535 		}
6536 
6537 		if (phys_page > upl->highest_page) {
6538 			upl->highest_page = phys_page;
6539 		}
6540 		assert(!pmap_is_noencrypt(phys_page));
6541 		if (user_page_list) {
6542 			user_page_list[entry].phys_addr = phys_page;
6543 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6544 			user_page_list[entry].absent    = dst_page->vmp_absent;
6545 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6546 			user_page_list[entry].precious  = dst_page->vmp_precious;
6547 			user_page_list[entry].device    = FALSE;
6548 			user_page_list[entry].needed    = FALSE;
6549 			if (dst_page->vmp_clustered == TRUE) {
6550 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6551 			} else {
6552 				user_page_list[entry].speculative = FALSE;
6553 			}
6554 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6555 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6556 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6557 			user_page_list[entry].mark      = FALSE;
6558 		}
6559 		/*
6560 		 * if UPL_RET_ONLY_ABSENT is set, then
6561 		 * we are working with a fresh page and we've
6562 		 * just set the clustered flag on it to
6563 		 * indicate that it was dragged in as part of a
6564 		 * speculative cluster... so leave it alone
6565 		 */
6566 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6567 			/*
6568 			 * someone is explicitly grabbing this page...
6569 			 * update clustered and speculative state
6570 			 *
6571 			 */
6572 			if (dst_page->vmp_clustered) {
6573 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6574 			}
6575 		}
6576 try_next_page:
6577 		if (dwp->dw_mask) {
6578 			if (dwp->dw_mask & DW_vm_page_activate) {
6579 				counter_inc(&vm_statistics_reactivations);
6580 			}
6581 
6582 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6583 
6584 			if (dw_count >= dw_limit) {
6585 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6586 
6587 				dwp = dwp_start;
6588 				dw_count = 0;
6589 			}
6590 		}
6591 		entry++;
6592 		dst_offset += PAGE_SIZE_64;
6593 		xfer_size -= PAGE_SIZE;
6594 	}
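	/*
	 * flush any delayed work still batched from the tail of the loop
	 */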
6595 	if (dw_count) {
6596 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6597 		dwp = dwp_start;
6598 		dw_count = 0;
6599 	}
6600 
6601 	if (alias_page != NULL) {
6602 		VM_PAGE_FREE(alias_page);
6603 	}
6604 	if (pmap_flushes_delayed == TRUE) {
6605 		pmap_flush(&pmap_flush_context_storage);
6606 	}
6607 
6608 	if (page_list_count != NULL) {
6609 		if (upl->flags & UPL_INTERNAL) {
6610 			*page_list_count = 0;
6611 		} else if (*page_list_count > entry) {
6612 			*page_list_count = entry;
6613 		}
6614 	}
6615 #if UPL_DEBUG
6616 	upl->upl_state = 1;
6617 #endif
6618 	vm_object_unlock(object);
6619 
6620 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6621 	if (task != NULL) {
6622 		counter_add(&task->pages_grabbed_upl, page_grab_count);
6623 	}
6624 
6625 	if (dwp_start && dwp_finish_ctx) {
6626 		vm_page_delayed_work_finish_ctx(dwp_start);
6627 		dwp_start = dwp = NULL;
6628 	}
6629 
6630 	return KERN_SUCCESS;
6631 }
6632 
6633 int cs_executable_create_upl = 0;
6634 extern int proc_selfpid(void);
6635 extern char *proc_name_address(void *p);
6636 
6637 kern_return_t
6638 vm_map_create_upl(
6639 	vm_map_t                map,
6640 	vm_map_address_t        offset,
6641 	upl_size_t              *upl_size,
6642 	upl_t                   *upl,
6643 	upl_page_info_array_t   page_list,
6644 	unsigned int            *count,
6645 	upl_control_flags_t     *flags,
6646 	vm_tag_t                tag)
6647 {
6648 	vm_map_entry_t          entry;
6649 	upl_control_flags_t     caller_flags;
6650 	int                     force_data_sync;
6651 	int                     sync_cow_data;
6652 	vm_object_t             local_object;
6653 	vm_map_offset_t         local_offset;
6654 	vm_map_offset_t         local_start;
6655 	kern_return_t           ret;
6656 	vm_map_address_t        original_offset;
6657 	vm_map_size_t           original_size, adjusted_size;
6658 	vm_map_offset_t         local_entry_start;
6659 	vm_object_offset_t      local_entry_offset;
6660 	vm_object_offset_t      offset_in_mapped_page;
6661 	boolean_t               release_map = FALSE;
6662 
6663 start_with_map:
6664 	caller_flags = *flags;
6665 
6666 	if (caller_flags & ~UPL_VALID_FLAGS) {
6667 		/*
6668 		 * For forward compatibility's sake,
6669 		 * reject any unknown flag.
6670 		 */
6671 		ret = KERN_INVALID_VALUE;
6672 		goto done;
6673 	}
6674 
6675 	if (upl == NULL) {
6676 		ret = KERN_INVALID_ARGUMENT;
6677 		goto done;
6678 	}
6679 
6680 
6681 	original_offset = offset;
6682 	original_size = *upl_size;
6683 	adjusted_size = original_size;
6684 
6685 	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6686 	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6687 
6688 REDISCOVER_ENTRY:
6689 	vm_map_lock_read(map);
6690 
6691 	if (!vm_map_lookup_entry(map, offset, &entry)) {
6692 		vm_map_unlock_read(map);
6693 		ret = KERN_FAILURE;
6694 		goto done;
6695 	}
6696 
6697 	local_entry_start = entry->vme_start;
6698 	local_entry_offset = VME_OFFSET(entry);
6699 
6700 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6701 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6702 	}
6703 
6704 	if (entry->vme_end - original_offset < adjusted_size) {
6705 		adjusted_size = entry->vme_end - original_offset;
6706 		assert(adjusted_size > 0);
6707 		*upl_size = (upl_size_t) adjusted_size;
6708 		assert(*upl_size == adjusted_size);
6709 	}
6710 
6711 	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6712 		*flags = 0;
6713 
6714 		if (!entry->is_sub_map &&
6715 		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
6716 			if (VME_OBJECT(entry)->private) {
6717 				*flags = UPL_DEV_MEMORY;
6718 			}
6719 
6720 			if (VME_OBJECT(entry)->phys_contiguous) {
6721 				*flags |= UPL_PHYS_CONTIG;
6722 			}
6723 		}
6724 		vm_map_unlock_read(map);
6725 		ret = KERN_SUCCESS;
6726 		goto done;
6727 	}
6728 
6729 	offset_in_mapped_page = 0;
6730 	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6731 		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6732 		*upl_size = (upl_size_t)
6733 		    (vm_map_round_page(original_offset + adjusted_size,
6734 		    VM_MAP_PAGE_MASK(map))
6735 		    - offset);
6736 
6737 		offset_in_mapped_page = original_offset - offset;
6738 		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6739 
6740 		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6741 	}
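	/*
	 * Illustrative only (hypothetical values): with a 4K submap on a
	 * 16K-page kernel, a request at original_offset 0x6800 for 0x1000 bytes
	 * becomes offset 0x6000, *upl_size 0x2000 and offset_in_mapped_page 0x800.
	 */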
6742 
6743 	if (!entry->is_sub_map) {
6744 		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6745 		    !VME_OBJECT(entry)->phys_contiguous) {
6746 			if (*upl_size > MAX_UPL_SIZE_BYTES) {
6747 				*upl_size = MAX_UPL_SIZE_BYTES;
6748 			}
6749 		}
6750 
6751 		/*
6752 		 *      Create an object if necessary.
6753 		 */
6754 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6755 			if (entry->max_protection == VM_PROT_NONE) {
6756 				/* don't create an object for a reserved range */
6757 				vm_map_unlock_read(map);
6758 				ret = KERN_PROTECTION_FAILURE;
6759 				goto done;
6760 			}
6761 
6762 			if (vm_map_lock_read_to_write(map)) {
6763 				goto REDISCOVER_ENTRY;
6764 			}
6765 
6766 			VME_OBJECT_SET(entry,
6767 			    vm_object_allocate((vm_size_t)
6768 			    vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id),
6769 			    false, 0);
6770 			VME_OFFSET_SET(entry, 0);
6771 			assert(entry->use_pmap);
6772 
6773 			vm_map_lock_write_to_read(map);
6774 		}
6775 
6776 		if (!(caller_flags & UPL_COPYOUT_FROM) &&
6777 		    !(entry->protection & VM_PROT_WRITE)) {
6778 			vm_map_unlock_read(map);
6779 			ret = KERN_PROTECTION_FAILURE;
6780 			goto done;
6781 		}
6782 	}
6783 
6784 #if !XNU_TARGET_OS_OSX
6785 	if (map->pmap != kernel_pmap &&
6786 	    (caller_flags & UPL_COPYOUT_FROM) &&
6787 	    (entry->protection & VM_PROT_EXECUTE) &&
6788 	    !(entry->protection & VM_PROT_WRITE)) {
6789 		vm_offset_t     kaddr;
6790 		vm_size_t       ksize;
6791 
6792 		/*
6793 		 * We're about to create a read-only UPL backed by
6794 		 * memory from an executable mapping.
6795 		 * Wiring the pages would result in the pages being copied
6796 		 * (due to the "MAP_PRIVATE" mapping) and no longer
6797 		 * code-signed, so no longer eligible for execution.
6798 		 * Instead, let's copy the data into a kernel buffer and
6799 		 * create the UPL from this kernel buffer.
6800 		 * The kernel buffer is then freed, leaving the UPL holding
6801 		 * the last reference on the VM object, so the memory will
6802 		 * be released when the UPL is committed.
6803 		 */
6804 
6805 		vm_map_unlock_read(map);
6806 		entry = VM_MAP_ENTRY_NULL;
6807 		/* allocate kernel buffer */
6808 		ksize = round_page(*upl_size);
6809 		kaddr = 0;
6810 		ret = kmem_alloc(kernel_map, &kaddr, ksize,
6811 		    KMA_PAGEABLE | KMA_DATA, tag);
6812 		if (ret == KERN_SUCCESS) {
6813 			/* copyin the user data */
6814 			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6815 		}
6816 		if (ret == KERN_SUCCESS) {
6817 			if (ksize > *upl_size) {
6818 				/* zero out the extra space in kernel buffer */
6819 				memset((void *)(kaddr + *upl_size),
6820 				    0,
6821 				    ksize - *upl_size);
6822 			}
6823 			/* create the UPL from the kernel buffer */
6824 			vm_object_offset_t      offset_in_object;
6825 			vm_object_offset_t      offset_in_object_page;
6826 
6827 			offset_in_object = offset - local_entry_start + local_entry_offset;
6828 			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6829 			assert(offset_in_object_page < PAGE_SIZE);
6830 			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6831 			*upl_size -= offset_in_object_page + offset_in_mapped_page;
6832 			ret = vm_map_create_upl(kernel_map,
6833 			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6834 			    upl_size, upl, page_list, count, flags, tag);
6835 		}
6836 		if (kaddr != 0) {
6837 			/* free the kernel buffer */
6838 			kmem_free(kernel_map, kaddr, ksize);
6839 			kaddr = 0;
6840 			ksize = 0;
6841 		}
6842 #if DEVELOPMENT || DEBUG
6843 		DTRACE_VM4(create_upl_from_executable,
6844 		    vm_map_t, map,
6845 		    vm_map_address_t, offset,
6846 		    upl_size_t, *upl_size,
6847 		    kern_return_t, ret);
6848 #endif /* DEVELOPMENT || DEBUG */
6849 		goto done;
6850 	}
6851 #endif /* !XNU_TARGET_OS_OSX */
6852 
6853 	if (!entry->is_sub_map) {
6854 		local_object = VME_OBJECT(entry);
6855 		assert(local_object != VM_OBJECT_NULL);
6856 	}
6857 
6858 	if (!entry->is_sub_map &&
6859 	    !entry->needs_copy &&
6860 	    *upl_size != 0 &&
6861 	    local_object->vo_size > *upl_size && /* partial UPL */
6862 	    entry->wired_count == 0 && /* No COW for entries that are wired */
6863 	    (map->pmap != kernel_pmap) && /* alias checks */
6864 	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6865 	    ||
6866 	    ( /* case 2 */
6867 		    local_object->internal &&
6868 		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6869 		    os_ref_get_count_raw(&local_object->ref_count) > 1))) {
6870 		vm_prot_t       prot;
6871 
6872 		/*
6873 		 * Case 1:
6874 		 * Set up the targeted range for copy-on-write to avoid
6875 		 * applying true_share/copy_delay to the entire object.
6876 		 *
6877 		 * Case 2:
6878 		 * This map entry covers only part of an internal
6879 		 * object.  There could be other map entries covering
6880 		 * other areas of this object and some of these map
6881 		 * entries could be marked as "needs_copy", which
6882 		 * assumes that the object is COPY_SYMMETRIC.
6883 		 * To avoid marking this object as COPY_DELAY and
6884 		 * "true_share", let's shadow it and mark the new
6885 		 * (smaller) object as "true_share" and COPY_DELAY.
6886 		 */
6887 
6888 		if (vm_map_lock_read_to_write(map)) {
6889 			goto REDISCOVER_ENTRY;
6890 		}
6891 		vm_map_lock_assert_exclusive(map);
6892 		assert(VME_OBJECT(entry) == local_object);
6893 
6894 		vm_map_clip_start(map,
6895 		    entry,
6896 		    vm_map_trunc_page(offset,
6897 		    VM_MAP_PAGE_MASK(map)));
6898 		vm_map_clip_end(map,
6899 		    entry,
6900 		    vm_map_round_page(offset + *upl_size,
6901 		    VM_MAP_PAGE_MASK(map)));
6902 		if ((entry->vme_end - offset) < *upl_size) {
6903 			*upl_size = (upl_size_t) (entry->vme_end - offset);
6904 			assert(*upl_size == entry->vme_end - offset);
6905 		}
6906 
6907 		prot = entry->protection & ~VM_PROT_WRITE;
6908 		if (override_nx(map, VME_ALIAS(entry)) && prot) {
6909 			prot |= VM_PROT_EXECUTE;
6910 		}
6911 		vm_object_pmap_protect(local_object,
6912 		    VME_OFFSET(entry),
6913 		    entry->vme_end - entry->vme_start,
6914 		    ((entry->is_shared ||
6915 		    map->mapped_in_other_pmaps)
6916 		    ? PMAP_NULL
6917 		    : map->pmap),
6918 		    VM_MAP_PAGE_SIZE(map),
6919 		    entry->vme_start,
6920 		    prot);
6921 
6922 		assert(entry->wired_count == 0);
6923 
6924 		/*
6925 		 * Lock the VM object and re-check its status: if it's mapped
6926 		 * in another address space, we could still be racing with
6927 		 * another thread holding that other VM map exclusively.
6928 		 */
6929 		vm_object_lock(local_object);
6930 		if (local_object->true_share) {
6931 			/* object is already in proper state: no COW needed */
6932 			assert(local_object->copy_strategy !=
6933 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6934 		} else {
6935 			/* not true_share: ask for copy-on-write below */
6936 			assert(local_object->copy_strategy ==
6937 			    MEMORY_OBJECT_COPY_SYMMETRIC);
6938 			entry->needs_copy = TRUE;
6939 		}
6940 		vm_object_unlock(local_object);
6941 
6942 		vm_map_lock_write_to_read(map);
6943 	}
6944 
6945 	if (entry->needs_copy) {
6946 		/*
6947 		 * Honor copy-on-write for COPY_SYMMETRIC
6948 		 * strategy.
6949 		 */
6950 		vm_map_t                local_map;
6951 		vm_object_t             object;
6952 		vm_object_offset_t      new_offset;
6953 		vm_prot_t               prot;
6954 		boolean_t               wired;
6955 		vm_map_version_t        version;
6956 		vm_map_t                real_map;
6957 		vm_prot_t               fault_type;
6958 
6959 		local_map = map;
6960 
6961 		if (caller_flags & UPL_COPYOUT_FROM) {
6962 			fault_type = VM_PROT_READ | VM_PROT_COPY;
6963 			vm_counters.create_upl_extra_cow++;
6964 			vm_counters.create_upl_extra_cow_pages +=
6965 			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6966 		} else {
6967 			fault_type = VM_PROT_WRITE;
6968 		}
6969 		if (vm_map_lookup_and_lock_object(&local_map,
6970 		    offset, fault_type,
6971 		    OBJECT_LOCK_EXCLUSIVE,
6972 		    &version, &object,
6973 		    &new_offset, &prot, &wired,
6974 		    NULL,
6975 		    &real_map, NULL) != KERN_SUCCESS) {
6976 			if (fault_type == VM_PROT_WRITE) {
6977 				vm_counters.create_upl_lookup_failure_write++;
6978 			} else {
6979 				vm_counters.create_upl_lookup_failure_copy++;
6980 			}
6981 			vm_map_unlock_read(local_map);
6982 			ret = KERN_FAILURE;
6983 			goto done;
6984 		}
6985 		if (real_map != local_map) {
6986 			vm_map_unlock(real_map);
6987 		}
6988 		vm_map_unlock_read(local_map);
6989 
6990 		vm_object_unlock(object);
6991 
6992 		goto REDISCOVER_ENTRY;
6993 	}
6994 
6995 	if (entry->is_sub_map) {
6996 		vm_map_t        submap;
6997 
6998 		submap = VME_SUBMAP(entry);
6999 		local_start = entry->vme_start;
7000 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7001 
7002 		vm_map_reference(submap);
7003 		vm_map_unlock_read(map);
7004 
7005 		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7006 		offset += offset_in_mapped_page;
7007 		*upl_size -= offset_in_mapped_page;
7008 
7009 		if (release_map) {
7010 			vm_map_deallocate(map);
7011 		}
7012 		map = submap;
7013 		release_map = TRUE;
7014 		offset = local_offset + (offset - local_start);
7015 		goto start_with_map;
7016 	}
7017 
7018 	if (sync_cow_data &&
7019 	    (VME_OBJECT(entry)->shadow ||
7020 	    VME_OBJECT(entry)->vo_copy)) {
7021 		local_object = VME_OBJECT(entry);
7022 		local_start = entry->vme_start;
7023 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7024 
7025 		vm_object_reference(local_object);
7026 		vm_map_unlock_read(map);
7027 
7028 		if (local_object->shadow && local_object->vo_copy) {
7029 			vm_object_lock_request(local_object->shadow,
7030 			    ((vm_object_offset_t)
7031 			    ((offset - local_start) +
7032 			    local_offset) +
7033 			    local_object->vo_shadow_offset),
7034 			    *upl_size, FALSE,
7035 			    MEMORY_OBJECT_DATA_SYNC,
7036 			    VM_PROT_NO_CHANGE);
7037 		}
7038 		sync_cow_data = FALSE;
7039 		vm_object_deallocate(local_object);
7040 
7041 		goto REDISCOVER_ENTRY;
7042 	}
7043 	if (force_data_sync) {
7044 		local_object = VME_OBJECT(entry);
7045 		local_start = entry->vme_start;
7046 		local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7047 
7048 		vm_object_reference(local_object);
7049 		vm_map_unlock_read(map);
7050 
7051 		vm_object_lock_request(local_object,
7052 		    ((vm_object_offset_t)
7053 		    ((offset - local_start) +
7054 		    local_offset)),
7055 		    (vm_object_size_t)*upl_size,
7056 		    FALSE,
7057 		    MEMORY_OBJECT_DATA_SYNC,
7058 		    VM_PROT_NO_CHANGE);
7059 
7060 		force_data_sync = FALSE;
7061 		vm_object_deallocate(local_object);
7062 
7063 		goto REDISCOVER_ENTRY;
7064 	}
7065 	if (VME_OBJECT(entry)->private) {
7066 		*flags = UPL_DEV_MEMORY;
7067 	} else {
7068 		*flags = 0;
7069 	}
7070 
7071 	if (VME_OBJECT(entry)->phys_contiguous) {
7072 		*flags |= UPL_PHYS_CONTIG;
7073 	}
7074 
7075 	local_object = VME_OBJECT(entry);
7076 	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7077 	local_start = entry->vme_start;
7078 
7079 
7080 	/*
7081 	 * Wiring will copy the pages to the shadow object.
7082 	 * The shadow object will not be code-signed so
7083 	 * attempting to execute code from these copied pages
7084 	 * would trigger a code-signing violation.
7085 	 */
7086 	if (entry->protection & VM_PROT_EXECUTE) {
7087 #if MACH_ASSERT
7088 		printf("pid %d[%s] create_upl out of executable range from "
7089 		    "0x%llx to 0x%llx: side effects may include "
7090 		    "code-signing violations later on\n",
7091 		    proc_selfpid(),
7092 		    (get_bsdtask_info(current_task())
7093 		    ? proc_name_address(get_bsdtask_info(current_task()))
7094 		    : "?"),
7095 		    (uint64_t) entry->vme_start,
7096 		    (uint64_t) entry->vme_end);
7097 #endif /* MACH_ASSERT */
7098 		DTRACE_VM2(cs_executable_create_upl,
7099 		    uint64_t, (uint64_t)entry->vme_start,
7100 		    uint64_t, (uint64_t)entry->vme_end);
7101 		cs_executable_create_upl++;
7102 	}
7103 
7104 	vm_object_lock(local_object);
7105 
7106 	/*
7107 	 * Ensure that this object is "true_share" and "copy_delay" now,
7108 	 * while we're still holding the VM map lock.  After we unlock the map,
7109 	 * anything could happen to that mapping, including some copy-on-write
7110 	 * activity.  We need to make sure that the IOPL will point at the
7111 	 * same memory as the mapping.
7112 	 */
7113 	if (local_object->true_share) {
7114 		assert(local_object->copy_strategy !=
7115 		    MEMORY_OBJECT_COPY_SYMMETRIC);
7116 	} else if (!is_kernel_object(local_object) &&
7117 	    local_object != compressor_object &&
7118 	    !local_object->phys_contiguous) {
7119 #if VM_OBJECT_TRACKING_OP_TRUESHARE
7120 		if (!local_object->true_share &&
7121 		    vm_object_tracking_btlog) {
7122 			btlog_record(vm_object_tracking_btlog, local_object,
7123 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
7124 			    btref_get(__builtin_frame_address(0), 0));
7125 		}
7126 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7127 		VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7128 		if (local_object->copy_strategy ==
7129 		    MEMORY_OBJECT_COPY_SYMMETRIC) {
7130 			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7131 		}
7132 	}
7133 
7134 	vm_object_reference_locked(local_object);
7135 	vm_object_unlock(local_object);
7136 
7137 	vm_map_unlock_read(map);
7138 
7139 	offset += offset_in_mapped_page;
7140 	assert(*upl_size > offset_in_mapped_page);
7141 	*upl_size -= offset_in_mapped_page;
7142 
7143 	ret = vm_object_iopl_request(local_object,
7144 	    ((vm_object_offset_t)
7145 	    ((offset - local_start) + local_offset)),
7146 	    *upl_size,
7147 	    upl,
7148 	    page_list,
7149 	    count,
7150 	    caller_flags,
7151 	    tag);
7152 	vm_object_deallocate(local_object);
7153 
7154 done:
7155 	if (release_map) {
7156 		vm_map_deallocate(map);
7157 	}
7158 
7159 	return ret;
7160 }
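
/*
 * Illustrative only: a rough sketch (not buildable as-is) of how a kernel
 * client might drive the UPL routines in this file.  Error handling and
 * flag selection are elided, and the variable names and tag are just
 * placeholders.
 *
 *	upl_t                   upl = NULL;
 *	upl_size_t              upl_size = ...;    // size of the window to wire
 *	unsigned int            count = 0;
 *	upl_control_flags_t     flags = ...;       // e.g. UPL_SET_INTERNAL | UPL_SET_LITE
 *	vm_map_offset_t         kaddr;
 *	boolean_t               empty;
 *
 *	if (vm_map_create_upl(map, uaddr, &upl_size, &upl, NULL, &count,
 *	        &flags, VM_KERN_MEMORY_NONE) == KERN_SUCCESS) {
 *		vm_map_enter_upl(kernel_map, upl, &kaddr);   // map the wired pages
 *		// ... perform the I/O against kaddr ...
 *		vm_map_remove_upl(kernel_map, upl);
 *		upl_commit_range(upl, 0, upl_size, 0, NULL, 0, &empty);
 *		upl_deallocate(upl);
 *	}
 */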
7161 
7162 /*
7163  * Internal routine to enter a UPL into a VM map.
7164  *
7165  * JMM - This should just be doable through the standard
7166  * vm_map_enter() API.
7167  */
7168 kern_return_t
7169 vm_map_enter_upl_range(
7170 	vm_map_t                map,
7171 	upl_t                   upl,
7172 	vm_object_offset_t      offset_to_map,
7173 	vm_size_t               size_to_map,
7174 	vm_prot_t               prot_to_map,
7175 	vm_map_offset_t         *dst_addr)
7176 {
7177 	vm_map_size_t           size;
7178 	vm_object_offset_t      offset;
7179 	vm_map_offset_t         addr;
7180 	vm_page_t               m;
7181 	kern_return_t           kr;
7182 	int                     isVectorUPL = 0, curr_upl = 0;
7183 	upl_t                   vector_upl = NULL;
7184 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7185 	vm_map_t                vector_upl_submap = NULL;
7186 	upl_offset_t            subupl_offset = 0;
7187 	upl_size_t              subupl_size = 0;
7188 
7189 	if (upl == UPL_NULL) {
7190 		return KERN_INVALID_ARGUMENT;
7191 	}
7192 
7193 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7194 	assert(map == kernel_map);
7195 
7196 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7197 		int mapped = 0, valid_upls = 0;
7198 		vector_upl = upl;
7199 
7200 		upl_lock(vector_upl);
7201 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7202 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7203 			if (upl == NULL) {
7204 				continue;
7205 			}
7206 			valid_upls++;
7207 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7208 				mapped++;
7209 			}
7210 		}
7211 
7212 		if (mapped) {
7213 			if (mapped != valid_upls) {
7214 				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped", mapped, valid_upls);
7215 			} else {
7216 				upl_unlock(vector_upl);
7217 				return KERN_FAILURE;
7218 			}
7219 		}
7220 
7221 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7222 			panic("TODO4K: vector UPL not implemented");
7223 		}
7224 
7225 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7226 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7227 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7228 		    VM_KERN_MEMORY_NONE).kmr_submap;
7229 		map = vector_upl_submap;
7230 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7231 		curr_upl = 0;
7232 	} else {
7233 		upl_lock(upl);
7234 	}
7235 
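/*
 * For a vector UPL, the sub-UPLs are mapped back-to-back into the dedicated
 * submap allocated above; each pass through process_upl_to_enter handles one
 * sub-UPL, and we return once they have all been entered.
 */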
7236 process_upl_to_enter:
7237 	if (isVectorUPL) {
7238 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7239 			*dst_addr = vector_upl_dst_addr;
7240 			upl_unlock(vector_upl);
7241 			return KERN_SUCCESS;
7242 		}
7243 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7244 		if (upl == NULL) {
7245 			goto process_upl_to_enter;
7246 		}
7247 
7248 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7249 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7250 	} else {
7251 		/*
7252 		 * check to see if already mapped
7253 		 */
7254 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7255 			upl_unlock(upl);
7256 			return KERN_FAILURE;
7257 		}
7258 	}
7259 
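	/*
	 * If this UPL has not been shadowed yet and it either tracks busy pages
	 * or is not device memory / an IO-wired physically-contiguous object,
	 * build a shadow object (upl->map_object) populated with fictitious
	 * "alias" pages that share the physical pages of the original object;
	 * it is this shadow object that gets entered into the map below.
	 */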
7260 	if ((!(upl->flags & UPL_SHADOWED)) &&
7261 	    ((upl->flags & UPL_HAS_BUSY) ||
7262 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7263 		vm_object_t             object;
7264 		vm_page_t               alias_page;
7265 		vm_object_offset_t      new_offset;
7266 		unsigned int            pg_num;
7267 
7268 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7269 		object = upl->map_object;
7270 		upl->map_object = vm_object_allocate(
7271 			vm_object_round_page(size),
7272 			/* Provenance is copied from the object we're shadowing */
7273 			object->vmo_provenance);
7274 
7275 		vm_object_lock(upl->map_object);
7276 
7277 		upl->map_object->shadow = object;
7278 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7279 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7280 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7281 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7282 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7283 		    "object %p shadow_offset 0x%llx",
7284 		    upl->map_object,
7285 		    (uint64_t)upl->map_object->vo_shadow_offset);
7286 		upl->map_object->wimg_bits = object->wimg_bits;
7287 		offset = upl->map_object->vo_shadow_offset;
7288 		new_offset = 0;
7289 
7290 		upl->flags |= UPL_SHADOWED;
7291 
7292 		while (size) {
7293 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7294 			assert(pg_num == new_offset / PAGE_SIZE);
7295 
7296 			if (bitmap_test(upl->lite_list, pg_num)) {
7297 				alias_page = vm_page_create_fictitious();
7298 
7299 				vm_object_lock(object);
7300 
7301 				m = vm_page_lookup(object, offset);
7302 				if (m == VM_PAGE_NULL) {
7303 					panic("vm_upl_map: page missing");
7304 				}
7305 
7306 				/*
7307 				 * Convert the fictitious page to a private
7308 				 * shadow of the real page.
7309 				 */
7310 				alias_page->vmp_free_when_done = TRUE;
7311 				/*
7312 				 * since m is a page in the upl it must
7313 				 * already be wired or BUSY, so it's
7314 				 * safe to assign the underlying physical
7315 				 * page to the alias
7316 				 */
7317 
7318 				vm_object_unlock(object);
7319 
7320 				vm_page_lockspin_queues();
7321 				vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7322 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7323 				vm_page_unlock_queues();
7324 
7325 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7326 
7327 				assert(!alias_page->vmp_wanted);
7328 				alias_page->vmp_busy = FALSE;
7329 				alias_page->vmp_absent = FALSE;
7330 			}
7331 			size -= PAGE_SIZE;
7332 			offset += PAGE_SIZE_64;
7333 			new_offset += PAGE_SIZE_64;
7334 		}
7335 		vm_object_unlock(upl->map_object);
7336 	}
7337 	if (upl->flags & UPL_SHADOWED) {
7338 		if (isVectorUPL) {
7339 			offset = 0;
7340 		} else {
7341 			offset = offset_to_map;
7342 		}
7343 	} else {
7344 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7345 		if (!isVectorUPL) {
7346 			offset += offset_to_map;
7347 		}
7348 	}
7349 
7350 	if (isVectorUPL) {
7351 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7352 	} else {
7353 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7354 	}
7355 
7356 	vm_object_reference(upl->map_object);
7357 
7358 	if (!isVectorUPL) {
7359 		*dst_addr = 0;
7360 		/*
7361 		 * NEED A UPL_MAP ALIAS
7362 		 */
7363 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7364 		    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7365 		    upl->map_object, offset, FALSE,
7366 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7367 
7368 		if (kr != KERN_SUCCESS) {
7369 			vm_object_deallocate(upl->map_object);
7370 			upl_unlock(upl);
7371 			return kr;
7372 		}
7373 	} else {
7374 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7375 		    VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7376 		    upl->map_object, offset, FALSE,
7377 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7378 		if (kr) {
7379 			panic("vm_map_enter failed for a Vector UPL");
7380 		}
7381 	}
7382 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7383 	                                        /* this will have to be an increment rather than */
7384 	                                        /* an assignment. */
7385 	vm_object_lock(upl->map_object);
7386 
7387 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7388 		m = vm_page_lookup(upl->map_object, offset);
7389 
7390 		if (m) {
7391 			m->vmp_pmapped = TRUE;
7392 
7393 			/*
7394 			 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7395 			 * but only in kernel space. If this was on a user map,
7396 			 * we'd have to set the wpmapped bit.
7397 			 */
7398 			/* m->vmp_wpmapped = TRUE; */
7399 			assert(map->pmap == kernel_pmap);
7400 
7401 			kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7402 
7403 			assert(kr == KERN_SUCCESS);
7404 #if KASAN
7405 			kasan_notify_address(addr, PAGE_SIZE_64);
7406 #endif
7407 		}
7408 		offset += PAGE_SIZE_64;
7409 	}
7410 	vm_object_unlock(upl->map_object);
7411 
7412 	/*
7413 	 * hold a reference for the mapping
7414 	 */
7415 	upl->ref_count++;
7416 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7417 	upl->kaddr = (vm_offset_t) *dst_addr;
7418 	assert(upl->kaddr == *dst_addr);
7419 
7420 	if (isVectorUPL) {
7421 		goto process_upl_to_enter;
7422 	}
7423 
7424 	if (!isVectorUPL) {
7425 		vm_map_offset_t addr_adjustment;
7426 
7427 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7428 		if (addr_adjustment) {
7429 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7430 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7431 			*dst_addr += addr_adjustment;
7432 		}
7433 	}
7434 
7435 	upl_unlock(upl);
7436 
7437 	return KERN_SUCCESS;
7438 }
7439 
7440 kern_return_t
7441 vm_map_enter_upl(
7442 	vm_map_t                map,
7443 	upl_t                   upl,
7444 	vm_map_offset_t         *dst_addr)
7445 {
7446 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7447 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7448 }
7449 
7450 /*
7451  * Internal routine to remove a UPL mapping from a VM map.
7452  *
7453  * XXX - This should just be doable through a standard
7454  * vm_map_remove() operation.  Otherwise, implicit clean-up
7455  * of the target map won't be able to correctly remove
7456  * these (and release the reference on the UPL).  Having
7457  * to do this means we can't map these into user-space
7458  * maps yet.
7459  */
7460 kern_return_t
7461 vm_map_remove_upl_range(
7462 	vm_map_t        map,
7463 	upl_t           upl,
7464 	__unused vm_object_offset_t    offset_to_unmap,
7465 	__unused vm_size_t             size_to_unmap)
7466 {
7467 	vm_address_t    addr;
7468 	upl_size_t      size;
7469 	int             isVectorUPL = 0, curr_upl = 0;
7470 	upl_t           vector_upl = NULL;
7471 
7472 	if (upl == UPL_NULL) {
7473 		return KERN_INVALID_ARGUMENT;
7474 	}
7475 
7476 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7477 		int     unmapped = 0, valid_upls = 0;
7478 		vector_upl = upl;
7479 		upl_lock(vector_upl);
7480 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7481 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7482 			if (upl == NULL) {
7483 				continue;
7484 			}
7485 			valid_upls++;
7486 			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7487 				unmapped++;
7488 			}
7489 		}
7490 
7491 		if (unmapped) {
7492 			if (unmapped != valid_upls) {
7493 				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7494 			} else {
7495 				upl_unlock(vector_upl);
7496 				return KERN_FAILURE;
7497 			}
7498 		}
7499 		curr_upl = 0;
7500 	} else {
7501 		upl_lock(upl);
7502 	}
7503 
7504 process_upl_to_remove:
7505 	if (isVectorUPL) {
7506 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7507 			vm_map_t v_upl_submap;
7508 			vm_offset_t v_upl_submap_dst_addr;
7509 			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7510 
7511 			kmem_free_guard(map, v_upl_submap_dst_addr,
7512 			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
7513 			vm_map_deallocate(v_upl_submap);
7514 			upl_unlock(vector_upl);
7515 			return KERN_SUCCESS;
7516 		}
7517 
7518 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7519 		if (upl == NULL) {
7520 			goto process_upl_to_remove;
7521 		}
7522 	}
7523 
7524 	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7525 		addr = upl->kaddr;
7526 		size = upl->u_mapped_size;
7527 
7528 		assert(upl->ref_count > 1);
7529 		upl->ref_count--;               /* removing mapping ref */
7530 
7531 		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7532 		upl->kaddr = (vm_offset_t) 0;
7533 		upl->u_mapped_size = 0;
7534 
7535 		if (isVectorUPL) {
7536 			/*
7537 			 * If it's a Vectored UPL, we'll be removing the entire
7538 			 * submap anyway, so no need to remove individual UPL
7539 			 * element mappings from within the submap
7540 			 */
7541 			goto process_upl_to_remove;
7542 		}
7543 
7544 		upl_unlock(upl);
7545 
7546 		vm_map_remove(map,
7547 		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7548 		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7549 		return KERN_SUCCESS;
7550 	}
7551 	upl_unlock(upl);
7552 
7553 	return KERN_FAILURE;
7554 }
7555 
7556 kern_return_t
7557 vm_map_remove_upl(
7558 	vm_map_t        map,
7559 	upl_t           upl)
7560 {
7561 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7562 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7563 }
7564 
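/*
 * Called once I/O has populated pages that an IO-wired UPL left "absent"
 * (pages grabbed without zero-fill): mark them valid and dirty, wire them,
 * and update the object's and the global wired-page counts.  Only plain,
 * non-shadowed UPL_IO_WIRE UPLs are accepted here.
 */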
7565 void
7566 iopl_valid_data(
7567 	upl_t    upl,
7568 	vm_tag_t tag)
7569 {
7570 	vm_object_t     object;
7571 	vm_offset_t     offset;
7572 	vm_page_t       m, nxt_page = VM_PAGE_NULL;
7573 	upl_size_t      size;
7574 	int             wired_count = 0;
7575 
7576 	if (upl == NULL) {
7577 		panic("iopl_valid_data: NULL upl");
7578 	}
7579 	if (vector_upl_is_valid(upl)) {
7580 		panic("iopl_valid_data: vector upl");
7581 	}
7582 	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
7583 		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7584 	}
7585 
7586 	object = upl->map_object;
7587 
7588 	if (is_kernel_object(object) || object == compressor_object) {
7589 		panic("iopl_valid_data: object == kernel or compressor");
7590 	}
7591 
7592 	if (object->purgable == VM_PURGABLE_VOLATILE ||
7593 	    object->purgable == VM_PURGABLE_EMPTY) {
7594 		panic("iopl_valid_data: object %p purgable %d",
7595 		    object, object->purgable);
7596 	}
7597 
7598 	size = upl_adjusted_size(upl, PAGE_MASK);
7599 
7600 	vm_object_lock(object);
7601 	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7602 
7603 	bool whole_object;
7604 
7605 	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
7606 		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7607 		whole_object = true;
7608 	} else {
7609 		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
7610 		whole_object = false;
7611 	}
7612 
7613 	while (size) {
7614 		if (whole_object) {
7615 			if (nxt_page != VM_PAGE_NULL) {
7616 				m = nxt_page;
7617 				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7618 			}
7619 		} else {
7620 			m = vm_page_lookup(object, offset);
7621 			offset += PAGE_SIZE;
7622 
7623 			if (m == VM_PAGE_NULL) {
7624 				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7625 			}
7626 		}
7627 		if (m->vmp_busy) {
7628 			if (!m->vmp_absent) {
7629 				panic("iopl_valid_data: busy page w/o absent");
7630 			}
7631 
7632 			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
7633 				panic("iopl_valid_data: busy+absent page on page queue");
7634 			}
7635 			if (m->vmp_reusable) {
7636 				panic("iopl_valid_data: %p is reusable", m);
7637 			}
7638 
7639 			m->vmp_absent = FALSE;
7640 			m->vmp_dirty = TRUE;
7641 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7642 			assert(m->vmp_wire_count == 0);
7643 			m->vmp_wire_count++;
7644 			assert(m->vmp_wire_count);
7645 			if (m->vmp_wire_count == 1) {
7646 				m->vmp_q_state = VM_PAGE_IS_WIRED;
7647 				wired_count++;
7648 			} else {
7649 				panic("iopl_valid_data: %p already wired", m);
7650 			}
7651 
7652 
7653 			vm_page_wakeup_done(object, m);
7654 		}
7655 		size -= PAGE_SIZE;
7656 	}
7657 	if (wired_count) {
7658 		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7659 		assert(object->resident_page_count >= object->wired_page_count);
7660 
7661 		/* no need to adjust purgeable accounting for this object: */
7662 		assert(object->purgable != VM_PURGABLE_VOLATILE);
7663 		assert(object->purgable != VM_PURGABLE_EMPTY);
7664 
7665 		vm_page_lockspin_queues();
7666 		vm_page_wire_count += wired_count;
7667 		vm_page_unlock_queues();
7668 	}
7669 	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7670 	vm_object_unlock(object);
7671 }
7672 
7673 
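/*
 * Propagate the object's WIMG cache attribute to the pages named in
 * user_page_list, using a single batched pmap operation when the attribute
 * is not the default cacheable setting.
 */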
7674 void
7675 vm_object_set_pmap_cache_attr(
7676 	vm_object_t             object,
7677 	upl_page_info_array_t   user_page_list,
7678 	unsigned int            num_pages,
7679 	boolean_t               batch_pmap_op)
7680 {
7681 	unsigned int    cache_attr = 0;
7682 
7683 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
7684 	assert(user_page_list);
7685 	if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7686 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7687 	}
7688 }
7689 
7690 
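/*
 * Fast path for vm_object_iopl_request() when the object is fully resident
 * and the request covers all of it: walk the resident page queue, wire each
 * page and fill in the lite list / page list directly.  Returns FALSE (so
 * the caller falls back to the general path) if any page is busy, fictitious,
 * absent, in error, being cleaned, marked for restart or being laundered.
 */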
7691 static bool
7692 vm_object_iopl_wire_full(
7693 	vm_object_t             object,
7694 	upl_t                   upl,
7695 	upl_page_info_array_t   user_page_list,
7696 	upl_control_flags_t     cntrl_flags,
7697 	vm_tag_t                tag)
7698 {
7699 	vm_page_t       dst_page;
7700 	unsigned int    entry;
7701 	int             page_count;
7702 	int             delayed_unlock = 0;
7703 	boolean_t       retval = TRUE;
7704 	ppnum_t         phys_page;
7705 
7706 	vm_object_lock_assert_exclusive(object);
7707 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7708 	assert(object->purgable != VM_PURGABLE_EMPTY);
7709 	assert(object->pager == NULL);
7710 	assert(object->vo_copy == NULL);
7711 	assert(object->shadow == NULL);
7712 
7713 	page_count = object->resident_page_count;
7714 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7715 
7716 	vm_page_lock_queues();
7717 
7718 	while (page_count--) {
7719 		if (dst_page->vmp_busy ||
7720 		    vm_page_is_fictitious(dst_page) ||
7721 		    dst_page->vmp_absent ||
7722 		    VMP_ERROR_GET(dst_page) ||
7723 		    dst_page->vmp_cleaning ||
7724 		    dst_page->vmp_restart ||
7725 		    dst_page->vmp_laundry) {
7726 			retval = FALSE;
7727 			goto done;
7728 		}
7729 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7730 			retval = FALSE;
7731 			goto done;
7732 		}
7733 		dst_page->vmp_reference = TRUE;
7734 
7735 		vm_page_wire(dst_page, tag, FALSE);
7736 
7737 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7738 			SET_PAGE_DIRTY(dst_page, FALSE);
7739 		}
7740 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7741 		assert(entry >= 0 && entry < object->resident_page_count);
7742 		bitmap_set(upl->lite_list, entry);
7743 
7744 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7745 
7746 		if (phys_page > upl->highest_page) {
7747 			upl->highest_page = phys_page;
7748 		}
7749 
7750 		if (user_page_list) {
7751 			user_page_list[entry].phys_addr = phys_page;
7752 			user_page_list[entry].absent    = dst_page->vmp_absent;
7753 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7754 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
7755 			user_page_list[entry].precious  = dst_page->vmp_precious;
7756 			user_page_list[entry].device    = FALSE;
7757 			user_page_list[entry].speculative = FALSE;
7758 			user_page_list[entry].cs_validated = FALSE;
7759 			user_page_list[entry].cs_tainted = FALSE;
7760 			user_page_list[entry].cs_nx     = FALSE;
7761 			user_page_list[entry].needed    = FALSE;
7762 			user_page_list[entry].mark      = FALSE;
7763 		}
7764 		if (delayed_unlock++ > 256) {
7765 			delayed_unlock = 0;
7766 			lck_mtx_yield(&vm_page_queue_lock);
7767 
7768 			VM_CHECK_MEMORYSTATUS;
7769 		}
7770 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7771 	}
7772 done:
7773 	vm_page_unlock_queues();
7774 
7775 	VM_CHECK_MEMORYSTATUS;
7776 
7777 	return retval;
7778 }
7779 
7780 
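/*
 * Fast path for vm_object_iopl_request() when the object has no resident
 * pages: grab fresh pages (zero-filling them unless UPL_NOZEROFILL or
 * UPL_NOZEROFILLIO was passed), insert them starting at *dst_offset, wire
 * the ones that are not left "absent", and batch the ledger updates through
 * delayed_ledger_update.  Returns MACH_SEND_INTERRUPTED if the page grab
 * has to wait and the request is interruptible.
 */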
7781 static kern_return_t
7782 vm_object_iopl_wire_empty(
7783 	vm_object_t             object,
7784 	upl_t                   upl,
7785 	upl_page_info_array_t   user_page_list,
7786 	upl_control_flags_t     cntrl_flags,
7787 	vm_tag_t                tag,
7788 	vm_object_offset_t     *dst_offset,
7789 	int                     page_count,
7790 	int                    *page_grab_count)
7791 {
7792 	vm_page_t         dst_page;
7793 	boolean_t         no_zero_fill = FALSE;
7794 	int               interruptible;
7795 	int               pages_wired = 0;
7796 	int               pages_inserted = 0;
7797 	int               entry = 0;
7798 	uint64_t          delayed_ledger_update = 0;
7799 	kern_return_t     ret = KERN_SUCCESS;
7800 	vm_grab_options_t grab_options;
7801 	ppnum_t           phys_page;
7802 
7803 	vm_object_lock_assert_exclusive(object);
7804 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7805 	assert(object->purgable != VM_PURGABLE_EMPTY);
7806 	assert(object->pager == NULL);
7807 	assert(object->vo_copy == NULL);
7808 	assert(object->shadow == NULL);
7809 
7810 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
7811 		interruptible = THREAD_ABORTSAFE;
7812 	} else {
7813 		interruptible = THREAD_UNINT;
7814 	}
7815 
7816 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
7817 		no_zero_fill = TRUE;
7818 	}
7819 
7820 	grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
7821 #if CONFIG_SECLUDED_MEMORY
7822 	if (object->can_grab_secluded) {
7823 		grab_options |= VM_PAGE_GRAB_SECLUDED;
7824 	}
7825 #endif /* CONFIG_SECLUDED_MEMORY */
7826 
7827 	while (page_count--) {
7828 		while ((dst_page = vm_page_grab_options(grab_options))
7829 		    == VM_PAGE_NULL) {
7830 			OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7831 
7832 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7833 
7834 			if (vm_page_wait(interruptible) == FALSE) {
7835 				/*
7836 				 * interrupted case
7837 				 */
7838 				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7839 
7840 				VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7841 
7842 				ret = MACH_SEND_INTERRUPTED;
7843 				goto done;
7844 			}
7845 			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7846 
7847 			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7848 		}
7849 
7850 		dst_page->vmp_absent = no_zero_fill;
7851 		dst_page->vmp_reference = TRUE;
7852 
7853 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7854 			SET_PAGE_DIRTY(dst_page, FALSE);
7855 		}
7856 		if (dst_page->vmp_absent == FALSE) {
7857 			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
7858 			assert(dst_page->vmp_wire_count == 0);
7859 			dst_page->vmp_wire_count++;
7860 			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
7861 			assert(dst_page->vmp_wire_count);
7862 			pages_wired++;
7863 
7864 
7865 			vm_page_wakeup_done(object, dst_page);
7866 		}
7867 		pages_inserted++;
7868 
7869 		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
7870 
7871 		if (no_zero_fill == FALSE) {
7872 			vm_page_zero_fill(
7873 				dst_page
7874 				);
7875 		}
7876 
7877 		bitmap_set(upl->lite_list, entry);
7878 
7879 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7880 
7881 		if (phys_page > upl->highest_page) {
7882 			upl->highest_page = phys_page;
7883 		}
7884 
7885 		if (user_page_list) {
7886 			user_page_list[entry].phys_addr = phys_page;
7887 			user_page_list[entry].absent    = dst_page->vmp_absent;
7888 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7889 			user_page_list[entry].free_when_done    = FALSE;
7890 			user_page_list[entry].precious  = FALSE;
7891 			user_page_list[entry].device    = FALSE;
7892 			user_page_list[entry].speculative = FALSE;
7893 			user_page_list[entry].cs_validated = FALSE;
7894 			user_page_list[entry].cs_tainted = FALSE;
7895 			user_page_list[entry].cs_nx     = FALSE;
7896 			user_page_list[entry].needed    = FALSE;
7897 			user_page_list[entry].mark      = FALSE;
7898 		}
7899 		entry++;
7900 		*dst_offset += PAGE_SIZE_64;
7901 	}
7902 done:
7903 	if (pages_wired) {
7904 		vm_page_lockspin_queues();
7905 		vm_page_wire_count += pages_wired;
7906 		vm_page_unlock_queues();
7907 	}
7908 	if (pages_inserted) {
7909 		if (object->internal) {
7910 			OSAddAtomic(pages_inserted, &vm_page_internal_count);
7911 		} else {
7912 			OSAddAtomic(pages_inserted, &vm_page_external_count);
7913 		}
7914 	}
7915 	if (delayed_ledger_update) {
7916 		task_t          owner;
7917 		int             ledger_idx_volatile;
7918 		int             ledger_idx_nonvolatile;
7919 		int             ledger_idx_volatile_compressed;
7920 		int             ledger_idx_nonvolatile_compressed;
7921 		int             ledger_idx_composite;
7922 		int             ledger_idx_external_wired;
7923 		boolean_t       do_footprint;
7924 
7925 		owner = VM_OBJECT_OWNER(object);
7926 		assert(owner);
7927 
7928 		vm_object_ledger_tag_ledgers(object,
7929 		    &ledger_idx_volatile,
7930 		    &ledger_idx_nonvolatile,
7931 		    &ledger_idx_volatile_compressed,
7932 		    &ledger_idx_nonvolatile_compressed,
7933 		    &ledger_idx_composite,
7934 		    &ledger_idx_external_wired,
7935 		    &do_footprint);
7936 
7937 		if (object->internal) {
7938 			/* more non-volatile bytes */
7939 			ledger_credit(owner->ledger,
7940 			    ledger_idx_nonvolatile,
7941 			    delayed_ledger_update);
7942 			if (do_footprint) {
7943 				/* more footprint */
7944 				ledger_credit(owner->ledger,
7945 				    task_ledgers.phys_footprint,
7946 				    delayed_ledger_update);
7947 			} else if (ledger_idx_composite != -1) {
7948 				ledger_credit(owner->ledger,
7949 				    ledger_idx_composite,
7950 				    delayed_ledger_update);
7951 			}
7952 		} else {
7953 			/* more external wired bytes */
7954 			ledger_credit(owner->ledger,
7955 			    ledger_idx_external_wired,
7956 			    delayed_ledger_update);
7957 			if (do_footprint) {
7958 				/* more footprint */
7959 				ledger_credit(owner->ledger,
7960 				    task_ledgers.phys_footprint,
7961 				    delayed_ledger_update);
7962 			} else if (ledger_idx_composite != -1) {
7963 				ledger_credit(owner->ledger,
7964 				    ledger_idx_composite,
7965 				    delayed_ledger_update);
7966 			}
7967 		}
7968 	}
7969 
7970 	assert(page_grab_count);
7971 	*page_grab_count = pages_inserted;
7972 
7973 	return ret;
7974 }
7975 
7976 
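/*
 * Create an I/O-wired UPL directly against a VM object: wire the pages
 * covering [offset, offset + size) (grabbing or zero-filling them as needed)
 * and describe them in the returned UPL and optional page list so the caller
 * can do I/O against them.  Physically contiguous objects are returned as
 * UPL_DEVICE_MEMORY without wiring individual pages.
 */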
7977 kern_return_t
7978 vm_object_iopl_request(
7979 	vm_object_t             object,
7980 	vm_object_offset_t      offset,
7981 	upl_size_t              size,
7982 	upl_t                   *upl_ptr,
7983 	upl_page_info_array_t   user_page_list,
7984 	unsigned int            *page_list_count,
7985 	upl_control_flags_t     cntrl_flags,
7986 	vm_tag_t                tag)
7987 {
7988 	vm_page_t               dst_page;
7989 	vm_object_offset_t      dst_offset;
7990 	upl_size_t              xfer_size;
7991 	upl_t                   upl = NULL;
7992 	unsigned int            entry;
7993 	int                     no_zero_fill = FALSE;
7994 	unsigned int            size_in_pages;
7995 	int                     page_grab_count = 0;
7996 	u_int32_t               psize;
7997 	kern_return_t           ret;
7998 	vm_prot_t               prot;
7999 	struct vm_object_fault_info fault_info = {};
8000 	struct  vm_page_delayed_work    dw_array;
8001 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8002 	bool                    dwp_finish_ctx = TRUE;
8003 	int                     dw_count;
8004 	int                     dw_limit;
8005 	int                     dw_index;
8006 	boolean_t               caller_lookup;
8007 	int                     io_tracking_flag = 0;
8008 	int                     interruptible;
8009 	ppnum_t                 phys_page;
8010 
8011 	boolean_t               set_cache_attr_needed = FALSE;
8012 	boolean_t               free_wired_pages = FALSE;
8013 	boolean_t               fast_path_empty_req = FALSE;
8014 	boolean_t               fast_path_full_req = FALSE;
8015 
8016 	task_t                  task = current_task();
8017 
8018 	dwp_start = dwp = NULL;
8019 
8020 	vm_object_offset_t original_offset = offset;
8021 	upl_size_t original_size = size;
8022 
8023 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8024 
8025 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8026 	offset = vm_object_trunc_page(offset);
8027 	if (size != original_size || offset != original_offset) {
8028 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8029 	}
8030 
8031 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
8032 		/*
8033 		 * For forward compatibility's sake,
8034 		 * reject any unknown flag.
8035 		 */
8036 		return KERN_INVALID_VALUE;
8037 	}
8038 	if (!vm_lopage_needed) {
8039 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8040 	}
8041 
8042 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8043 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8044 			return KERN_INVALID_VALUE;
8045 		}
8046 
8047 		if (object->phys_contiguous) {
8048 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8049 				return KERN_INVALID_ADDRESS;
8050 			}
8051 
8052 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8053 				return KERN_INVALID_ADDRESS;
8054 			}
8055 		}
8056 	}
8057 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8058 		no_zero_fill = TRUE;
8059 	}
8060 
8061 	if (cntrl_flags & UPL_COPYOUT_FROM) {
8062 		prot = VM_PROT_READ;
8063 	} else {
8064 		prot = VM_PROT_READ | VM_PROT_WRITE;
8065 	}
8066 
8067 	if ((!object->internal) && (object->paging_offset != 0)) {
8068 		panic("vm_object_iopl_request: external object with non-zero paging offset");
8069 	}
8070 
8071 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8072 
8073 #if CONFIG_IOSCHED || UPL_DEBUG
8074 	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8075 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8076 	}
8077 #endif
8078 
8079 #if CONFIG_IOSCHED
8080 	if (object->io_tracking) {
8081 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8082 		if (!is_kernel_object(object)) {
8083 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8084 		}
8085 	}
8086 #endif
8087 
8088 	if (object->phys_contiguous) {
8089 		psize = PAGE_SIZE;
8090 	} else {
8091 		psize = size;
8092 
8093 		dw_count = 0;
8094 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8095 		dwp_start = vm_page_delayed_work_get_ctx();
8096 		if (dwp_start == NULL) {
8097 			dwp_start = &dw_array;
8098 			dw_limit = 1;
8099 			dwp_finish_ctx = FALSE;
8100 		}
8101 
8102 		dwp = dwp_start;
8103 	}
8104 
8105 	if (cntrl_flags & UPL_SET_INTERNAL) {
8106 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8107 		user_page_list = size ? upl->page_list : NULL;
8108 	} else {
8109 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8110 	}
8111 	if (user_page_list) {
8112 		user_page_list[0].device = FALSE;
8113 	}
8114 	*upl_ptr = upl;
8115 
8116 	if (cntrl_flags & UPL_NOZEROFILLIO) {
8117 		DTRACE_VM4(upl_nozerofillio,
8118 		    vm_object_t, object,
8119 		    vm_object_offset_t, offset,
8120 		    upl_size_t, size,
8121 		    upl_t, upl);
8122 	}
8123 
8124 	upl->map_object = object;
8125 	upl->u_offset = original_offset;
8126 	upl->u_size = original_size;
8127 
8128 	size_in_pages = size / PAGE_SIZE;
8129 
8130 	if (is_kernel_object(object) &&
8131 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8132 		upl->flags |= UPL_KERNEL_OBJECT;
8133 #if UPL_DEBUG
8134 		vm_object_lock(object);
8135 #else
8136 		vm_object_lock_shared(object);
8137 #endif
8138 	} else {
8139 		vm_object_lock(object);
8140 		vm_object_activity_begin(object);
8141 	}
8142 	/*
8143 	 * paging in progress also protects the paging_offset
8144 	 */
8145 	upl->u_offset = original_offset + object->paging_offset;
8146 
8147 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8148 		/*
8149 		 * The user requested that access to the pages in this UPL
8150 		 * be blocked until the UPL is committed or aborted.
8151 		 */
8152 		upl->flags |= UPL_ACCESS_BLOCKED;
8153 	}
8154 
8155 #if CONFIG_IOSCHED || UPL_DEBUG
8156 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8157 		vm_object_activity_begin(object);
8158 		queue_enter(&object->uplq, upl, upl_t, uplq);
8159 	}
8160 #endif
8161 
8162 	if (object->phys_contiguous) {
8163 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8164 			assert(!object->blocked_access);
8165 			object->blocked_access = TRUE;
8166 		}
8167 
8168 		vm_object_unlock(object);
8169 
8170 		/*
8171 		 * don't need any shadow mappings for this one
8172 		 * since it is already I/O memory
8173 		 */
8174 		upl->flags |= UPL_DEVICE_MEMORY;
8175 
8176 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8177 
8178 		if (user_page_list) {
8179 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8180 			user_page_list[0].device = TRUE;
8181 		}
8182 		if (page_list_count != NULL) {
8183 			if (upl->flags & UPL_INTERNAL) {
8184 				*page_list_count = 0;
8185 			} else {
8186 				*page_list_count = 1;
8187 			}
8188 		}
8189 
8190 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8191 		if (task != NULL) {
8192 			counter_add(&task->pages_grabbed_iopl, page_grab_count);
8193 		}
8194 		return KERN_SUCCESS;
8195 	}
8196 	if (!is_kernel_object(object) && object != compressor_object) {
8197 		/*
8198 		 * Protect user space from future COW operations
8199 		 */
8200 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8201 		if (!object->true_share &&
8202 		    vm_object_tracking_btlog) {
8203 			btlog_record(vm_object_tracking_btlog, object,
8204 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
8205 			    btref_get(__builtin_frame_address(0), 0));
8206 		}
8207 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8208 
8209 		vm_object_lock_assert_exclusive(object);
8210 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8211 
8212 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8213 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8214 		}
8215 	}
8216 
8217 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8218 	    object->vo_copy != VM_OBJECT_NULL) {
8219 		/*
8220 		 * Honor copy-on-write obligations
8221 		 *
8222 		 * The caller is gathering these pages and
8223 		 * might modify their contents.  We need to
8224 		 * make sure that the copy object has its own
8225 		 * private copies of these pages before we let
8226 		 * the caller modify them.
8227 		 *
8228 		 * NOTE: someone else could map the original object
8229 		 * after we've done this copy-on-write here, and they
8230 		 * could then see an inconsistent picture of the memory
8231 		 * while it's being modified via the UPL.  To prevent this,
8232 		 * we would have to block access to these pages until the
8233 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
8234 		 * code path for that...
8235 		 */
8236 		vm_object_update(object,
8237 		    offset,
8238 		    size,
8239 		    NULL,
8240 		    NULL,
8241 		    FALSE,              /* should_return */
8242 		    MEMORY_OBJECT_COPY_SYNC,
8243 		    VM_PROT_NO_CHANGE);
8244 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
8245 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8246 	}
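	/*
	 * Fast-path eligibility: the request must cover an entire,
	 * unshadowed, pager-less, non-volatile object with no copy
	 * object, and must not need 32-bit addresses or blocked access.
	 * A fully resident object can be wired in place; a completely
	 * empty one can be populated with freshly grabbed pages.
	 */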
8247 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8248 	    object->purgable != VM_PURGABLE_VOLATILE &&
8249 	    object->purgable != VM_PURGABLE_EMPTY &&
8250 	    object->vo_copy == NULL &&
8251 	    size == object->vo_size &&
8252 	    offset == 0 &&
8253 	    object->shadow == NULL &&
8254 	    object->pager == NULL) {
8255 		if (object->resident_page_count == size_in_pages) {
8256 			assert(object != compressor_object);
8257 			assert(!is_kernel_object(object));
8258 			fast_path_full_req = TRUE;
8259 		} else if (object->resident_page_count == 0) {
8260 			assert(object != compressor_object);
8261 			assert(!is_kernel_object(object));
8262 			fast_path_empty_req = TRUE;
8263 			set_cache_attr_needed = TRUE;
8264 		}
8265 	}
8266 
8267 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8268 		interruptible = THREAD_ABORTSAFE;
8269 	} else {
8270 		interruptible = THREAD_UNINT;
8271 	}
8272 
8273 	entry = 0;
8274 
8275 	xfer_size = size;
8276 	dst_offset = offset;
8277 
8278 	if (fast_path_full_req) {
8279 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8280 			goto finish;
8281 		}
8282 		/*
8283 		 * we couldn't complete the processing of this request on the fast path
8284 		 * so fall through to the slow path and finish up
8285 		 */
8286 	} else if (fast_path_empty_req) {
8287 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8288 			ret = KERN_MEMORY_ERROR;
8289 			goto return_err;
8290 		}
8291 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8292 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8293 
8294 		if (ret) {
8295 			free_wired_pages = TRUE;
8296 			goto return_err;
8297 		}
8298 		goto finish;
8299 	}
8300 
8301 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8302 	fault_info.lo_offset = offset;
8303 	fault_info.hi_offset = offset + xfer_size;
8304 	fault_info.mark_zf_absent = TRUE;
8305 	fault_info.interruptible = interruptible;
8306 	fault_info.batch_pmap_op = TRUE;
8307 
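	/*
	 * Slow path: walk the request one page at a time, faulting in
	 * or substituting pages as needed and queueing the wire and
	 * reference updates as delayed work.
	 */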
8308 	while (xfer_size) {
8309 		vm_fault_return_t       result;
8310 
8311 		dwp->dw_mask = 0;
8312 
8313 		if (fast_path_full_req) {
8314 			/*
8315 			 * if we get here, it means that we ran into a page
8316 			 * state we couldn't handle in the fast path and
8317 			 * bailed out to the slow path... since the order
8318 			 * we look at pages is different between the 2 paths,
8319 			 * the following check is needed to determine whether
8320 			 * this page was already processed in the fast path
8321 			 */
8322 			if (bitmap_test(upl->lite_list, entry)) {
8323 				goto skip_page;
8324 			}
8325 		}
8326 		dst_page = vm_page_lookup(object, dst_offset);
8327 
8328 		if (dst_page == VM_PAGE_NULL ||
8329 		    dst_page->vmp_busy ||
8330 		    VMP_ERROR_GET(dst_page) ||
8331 		    dst_page->vmp_restart ||
8332 		    dst_page->vmp_absent ||
8333 		    vm_page_is_fictitious(dst_page)) {
8334 			if (is_kernel_object(object)) {
8335 				panic("vm_object_iopl_request: missing/bad page in kernel object");
8336 			}
8337 			if (object == compressor_object) {
8338 				panic("vm_object_iopl_request: missing/bad page in compressor object");
8339 			}
8340 
8341 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8342 				ret = KERN_MEMORY_ERROR;
8343 				goto return_err;
8344 			}
8345 
8346 			if (dst_page != VM_PAGE_NULL &&
8347 			    dst_page->vmp_busy) {
8348 				wait_result_t wait_result;
8349 				vm_object_lock_assert_exclusive(object);
8350 				wait_result = vm_page_sleep(object, dst_page,
8351 				    interruptible, LCK_SLEEP_DEFAULT);
8352 				if (wait_result == THREAD_AWAKENED ||
8353 				    wait_result == THREAD_RESTART) {
8354 					continue;
8355 				}
8356 				ret = MACH_SEND_INTERRUPTED;
8357 				goto return_err;
8358 			}
8359 
8360 			set_cache_attr_needed = TRUE;
8361 
8362 			/*
8363 			 * We just looked up the page and the result remains valid
8364 			 * until the object lock is released, so send it to
8365 			 * vm_fault_page() (as "dst_page"), to avoid having to
8366 			 * look it up again there.
8367 			 */
8368 			caller_lookup = TRUE;
8369 
8370 			do {
8371 				vm_page_t       top_page;
8372 				kern_return_t   error_code;
8373 
8374 				fault_info.cluster_size = xfer_size;
8375 				vm_object_paging_begin(object);
8376 
8377 				result = vm_fault_page(object, dst_offset,
8378 				    prot | VM_PROT_WRITE, FALSE,
8379 				    caller_lookup,
8380 				    &prot, &dst_page, &top_page,
8381 				    (int *)0,
8382 				    &error_code, no_zero_fill,
8383 				    &fault_info);
8384 
8385 				/* our lookup is no longer valid at this point */
8386 				caller_lookup = FALSE;
8387 
8388 				switch (result) {
8389 				case VM_FAULT_SUCCESS:
8390 					page_grab_count++;
8391 
8392 					if (!dst_page->vmp_absent) {
8393 						vm_page_wakeup_done(object, dst_page);
8394 					} else {
8395 						/*
8396 						 * we only get back an absent page if we
8397 						 * requested that it not be zero-filled
8398 						 * because we are about to fill it via I/O
8399 						 *
8400 						 * absent pages should be left BUSY
8401 						 * to prevent them from being faulted
8402 						 * into an address space before we've
8403 						 * had a chance to complete the I/O on
8404 						 * them since they may contain info that
8405 						 * shouldn't be seen by the faulting task
8406 						 */
8407 					}
8408 					/*
8409 					 *	Release paging references and
8410 					 *	top-level placeholder page, if any.
8411 					 */
8412 					if (top_page != VM_PAGE_NULL) {
8413 						vm_object_t local_object;
8414 
8415 						local_object = VM_PAGE_OBJECT(top_page);
8416 
8417 						/*
8418 						 * comparing 2 packed pointers
8419 						 */
8420 						if (top_page->vmp_object != dst_page->vmp_object) {
8421 							vm_object_lock(local_object);
8422 							VM_PAGE_FREE(top_page);
8423 							vm_object_paging_end(local_object);
8424 							vm_object_unlock(local_object);
8425 						} else {
8426 							VM_PAGE_FREE(top_page);
8427 							vm_object_paging_end(local_object);
8428 						}
8429 					}
8430 					vm_object_paging_end(object);
8431 					break;
8432 
8433 				case VM_FAULT_RETRY:
8434 					vm_object_lock(object);
8435 					break;
8436 
8437 				case VM_FAULT_MEMORY_SHORTAGE:
8438 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8439 
8440 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8441 
8442 					if (vm_page_wait(interruptible)) {
8443 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8444 
8445 						VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8446 						vm_object_lock(object);
8447 
8448 						break;
8449 					}
8450 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8451 
8452 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8453 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8454 					OS_FALLTHROUGH;
8455 
8456 				case VM_FAULT_INTERRUPTED:
8457 					error_code = MACH_SEND_INTERRUPTED;
8458 					OS_FALLTHROUGH;
8459 				case VM_FAULT_MEMORY_ERROR:
8460 memory_error:
8461 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8462 
8463 					vm_object_lock(object);
8464 					goto return_err;
8465 
8466 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
8467 					/* success but no page: fail */
8468 					vm_object_paging_end(object);
8469 					vm_object_unlock(object);
8470 					goto memory_error;
8471 
8472 				default:
8473 					panic("vm_object_iopl_request: unexpected error"
8474 					    " 0x%x from vm_fault_page()\n", result);
8475 				}
8476 			} while (result != VM_FAULT_SUCCESS);
8477 		}
8478 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8479 
8480 		if (upl->flags & UPL_KERNEL_OBJECT) {
8481 			goto record_phys_addr;
8482 		}
8483 
8484 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8485 			dst_page->vmp_busy = TRUE;
8486 			goto record_phys_addr;
8487 		}
8488 
8489 		if (dst_page->vmp_cleaning) {
8490 			/*
8491 			 * Someone else is cleaning this page in place.
8492 			 * In theory, we should be able to proceed and use this
8493 			 * page, but they'll probably end up clearing the "busy"
8494 			 * bit on it in upl_commit_range() even though they didn't
8495 			 * set it, so they would clear our "busy" bit and open
8496 			 * us to race conditions.
8497 			 * We'd better wait for the cleaning to complete and
8498 			 * then try again.
8499 			 */
8500 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8501 			vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8502 			continue;
8503 		}
8504 		if (dst_page->vmp_laundry) {
8505 			vm_pageout_steal_laundry(dst_page, FALSE);
8506 		}
8507 
8508 		if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8509 		    phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
8510 			vm_page_t       new_page;
8511 			int             refmod;
8512 
8513 			/*
8514 			 * support devices that can't DMA above 32 bits
8515 			 * by substituting pages from a pool of low-address
8516 			 * memory for any pages we find above the 4G mark.
8517 			 * We can't substitute if the page is already wired, because
8518 			 * we don't know whether that physical address has been
8519 			 * handed out to some other 64-bit-capable DMA device to use.
8520 			 */
8521 			if (VM_PAGE_WIRED(dst_page)) {
8522 				ret = KERN_PROTECTION_FAILURE;
8523 				goto return_err;
8524 			}
8525 
8526 			new_page = vm_page_grablo(VM_PAGE_GRAB_OPTIONS_NONE);
8527 
8528 			if (new_page == VM_PAGE_NULL) {
8529 				ret = KERN_RESOURCE_SHORTAGE;
8530 				goto return_err;
8531 			}
8532 			/*
8533 			 * from here until the vm_page_replace completes
8534 			 * we mustn't drop the object lock... we don't
8535 			 * want anyone refaulting this page in and using
8536 			 * it after we disconnect it... we want the fault
8537 			 * to find the new page being substituted.
8538 			 */
8539 			if (dst_page->vmp_pmapped) {
8540 				refmod = pmap_disconnect(phys_page);
8541 			} else {
8542 				refmod = 0;
8543 			}
8544 
8545 			if (!dst_page->vmp_absent) {
8546 				vm_page_copy(dst_page, new_page);
8547 			}
8548 
8549 			new_page->vmp_reference = dst_page->vmp_reference;
8550 			new_page->vmp_dirty     = dst_page->vmp_dirty;
8551 			new_page->vmp_absent    = dst_page->vmp_absent;
8552 
8553 			if (refmod & VM_MEM_REFERENCED) {
8554 				new_page->vmp_reference = TRUE;
8555 			}
8556 			if (refmod & VM_MEM_MODIFIED) {
8557 				SET_PAGE_DIRTY(new_page, FALSE);
8558 			}
8559 
8560 			vm_page_replace(new_page, object, dst_offset);
8561 
8562 			dst_page = new_page;
8563 			/*
8564 			 * vm_page_grablo returned the page marked
8565 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
8566 			 * here, because we've never dropped the object lock
8567 			 */
8568 			if (!dst_page->vmp_absent) {
8569 				dst_page->vmp_busy = FALSE;
8570 			}
8571 
8572 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8573 		}
8574 		if (!dst_page->vmp_busy) {
8575 			dwp->dw_mask |= DW_vm_page_wire;
8576 		}
8577 
8578 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
8579 			/*
8580 			 * Mark the page "busy" to block any future page fault
8581 			 * on this page in addition to wiring it.
8582 			 * We'll also remove the mapping
8583 			 * of all these pages before leaving this routine.
8584 			 */
8585 			assert(!vm_page_is_fictitious(dst_page));
8586 			dst_page->vmp_busy = TRUE;
8587 		}
8588 		/*
8589 		 * expect the page to be used
8590 		 * page queues lock must be held to set 'reference'
8591 		 */
8592 		dwp->dw_mask |= DW_set_reference;
8593 
8594 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8595 			SET_PAGE_DIRTY(dst_page, TRUE);
8596 			/*
8597 			 * Page belonging to a code-signed object is about to
8598 			 * be written. Mark it tainted and disconnect it from
8599 			 * all pmaps so processes have to fault it back in and
8600 			 * deal with the tainted bit.
8601 			 */
8602 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8603 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8604 				vm_page_iopl_tainted++;
8605 				if (dst_page->vmp_pmapped) {
8606 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8607 					if (refmod & VM_MEM_REFERENCED) {
8608 						dst_page->vmp_reference = TRUE;
8609 					}
8610 				}
8611 			}
8612 		}
8613 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8614 			pmap_sync_page_attributes_phys(phys_page);
8615 			dst_page->vmp_written_by_kernel = FALSE;
8616 		}
8617 
8618 record_phys_addr:
8619 		if (dst_page->vmp_busy) {
8620 			upl->flags |= UPL_HAS_BUSY;
8621 		}
8622 
8623 		bitmap_set(upl->lite_list, entry);
8624 
8625 		if (phys_page > upl->highest_page) {
8626 			upl->highest_page = phys_page;
8627 		}
8628 
8629 		if (user_page_list) {
8630 			user_page_list[entry].phys_addr = phys_page;
8631 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
8632 			user_page_list[entry].absent    = dst_page->vmp_absent;
8633 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8634 			user_page_list[entry].precious  = dst_page->vmp_precious;
8635 			user_page_list[entry].device    = FALSE;
8636 			user_page_list[entry].needed    = FALSE;
8637 			if (dst_page->vmp_clustered == TRUE) {
8638 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8639 			} else {
8640 				user_page_list[entry].speculative = FALSE;
8641 			}
8642 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8643 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8644 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8645 			user_page_list[entry].mark      = FALSE;
8646 		}
8647 		if (!is_kernel_object(object) && object != compressor_object) {
8648 			/*
8649 			 * someone is explicitly grabbing this page...
8650 			 * update clustered and speculative state
8651 			 *
8652 			 */
8653 			if (dst_page->vmp_clustered) {
8654 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
8655 			}
8656 		}
8657 skip_page:
8658 		entry++;
8659 		dst_offset += PAGE_SIZE_64;
8660 		xfer_size -= PAGE_SIZE;
8661 
8662 		if (dwp->dw_mask) {
8663 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8664 
8665 			if (dw_count >= dw_limit) {
8666 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8667 
8668 				dwp = dwp_start;
8669 				dw_count = 0;
8670 			}
8671 		}
8672 	}
8673 	assert(entry == size_in_pages);
8674 
8675 	if (dw_count) {
8676 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8677 		dwp = dwp_start;
8678 		dw_count = 0;
8679 	}
8680 finish:
8681 	if (user_page_list && set_cache_attr_needed == TRUE) {
8682 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8683 	}
8684 
8685 	if (page_list_count != NULL) {
8686 		if (upl->flags & UPL_INTERNAL) {
8687 			*page_list_count = 0;
8688 		} else if (*page_list_count > size_in_pages) {
8689 			*page_list_count = size_in_pages;
8690 		}
8691 	}
8692 	vm_object_unlock(object);
8693 
8694 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8695 		/*
8696 		 * We've marked all the pages "busy" so that future
8697 		 * page faults will block.
8698 		 * Now remove the mapping for these pages, so that they
8699 		 * can't be accessed without causing a page fault.
8700 		 */
8701 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8702 		    PMAP_NULL,
8703 		    PAGE_SIZE,
8704 		    0, VM_PROT_NONE);
8705 		assert(!object->blocked_access);
8706 		object->blocked_access = TRUE;
8707 	}
8708 
8709 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8710 	if (task != NULL) {
8711 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8712 	}
8713 
8714 	if (dwp_start && dwp_finish_ctx) {
8715 		vm_page_delayed_work_finish_ctx(dwp_start);
8716 		dwp_start = dwp = NULL;
8717 	}
8718 
8719 	return KERN_SUCCESS;
8720 
8721 return_err:
8722 	dw_index = 0;
8723 
8724 	for (; offset < dst_offset; offset += PAGE_SIZE) {
8725 		boolean_t need_unwire;
8726 		bool need_wakeup;
8727 
8728 		dst_page = vm_page_lookup(object, offset);
8729 
8730 		if (dst_page == VM_PAGE_NULL) {
8731 			panic("vm_object_iopl_request: Wired page missing.");
8732 		}
8733 
8734 		/*
8735 		 * if we've already processed this page in an earlier
8736 		 * dw_do_work, we need to undo the wiring... we will
8737 		 * leave the dirty and reference bits on if they
8738 		 * were set, since we don't have a good way of knowing
8739 		 * what the previous state was and we won't get here
8740 		 * under any normal circumstances...  we will always
8741 		 * clear BUSY and wakeup any waiters via vm_page_free
8742 		 * or PAGE_WAKEUP_DONE
8743 		 */
8744 		need_unwire = TRUE;
8745 
8746 		need_wakeup = false;
8747 		if (dw_count) {
8748 			if ((dwp_start)[dw_index].dw_m == dst_page) {
8749 				/*
8750 				 * still in the deferred work list
8751 				 * which means we haven't yet called
8752 				 * vm_page_wire on this page
8753 				 */
8754 				need_unwire = FALSE;
8755 
8756 				if (dst_page->vmp_busy &&
8757 				    ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8758 					/*
8759 					 * It's our own "busy" bit, so we need to clear it
8760 					 * now and wake up waiters below.
8761 					 */
8762 					dst_page->vmp_busy = false;
8763 					need_wakeup = true;
8764 				}
8765 
8766 				dw_index++;
8767 				dw_count--;
8768 			}
8769 		}
8770 		vm_page_lock_queues();
8771 
8772 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8773 			vm_page_free(dst_page);
8774 
8775 			need_unwire = FALSE;
8776 		} else {
8777 			if (need_unwire == TRUE) {
8778 				vm_page_unwire(dst_page, TRUE);
8779 			}
8780 			if (dst_page->vmp_busy) {
8781 				/* not our "busy" or we would have cleared it above */
8782 				assert(!need_wakeup);
8783 			}
8784 			if (need_wakeup) {
8785 				assert(!dst_page->vmp_busy);
8786 				vm_page_wakeup(object, dst_page);
8787 			}
8788 		}
8789 		vm_page_unlock_queues();
8790 
8791 		if (need_unwire == TRUE) {
8792 			counter_inc(&vm_statistics_reactivations);
8793 		}
8794 	}
8795 #if UPL_DEBUG
8796 	upl->upl_state = 2;
8797 #endif
8798 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8799 		vm_object_activity_end(object);
8800 		vm_object_collapse(object, 0, TRUE);
8801 	}
8802 	vm_object_unlock(object);
8803 	upl_destroy(upl);
8804 
8805 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8806 	if (task != NULL) {
8807 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8808 	}
8809 
8810 	if (dwp_start && dwp_finish_ctx) {
8811 		vm_page_delayed_work_finish_ctx(dwp_start);
8812 		dwp_start = dwp = NULL;
8813 	}
8814 	return ret;
8815 }
8816 
8817 kern_return_t
8818 upl_transpose(
8819 	upl_t           upl1,
8820 	upl_t           upl2)
8821 {
8822 	kern_return_t           retval;
8823 	boolean_t               upls_locked;
8824 	vm_object_t             object1, object2;
8825 
8826 	/* LD: Should mapped UPLs be eligible for a transpose? */
8827 	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
8828 		return KERN_INVALID_ARGUMENT;
8829 	}
8830 
8831 	upls_locked = FALSE;
8832 
8833 	/*
8834 	 * Since we need to lock both UPLs at the same time,
8835 	 * avoid deadlocks by always taking locks in the same order.
8836 	 */
8837 	if (upl1 < upl2) {
8838 		upl_lock(upl1);
8839 		upl_lock(upl2);
8840 	} else {
8841 		upl_lock(upl2);
8842 		upl_lock(upl1);
8843 	}
8844 	upls_locked = TRUE;     /* the UPLs will need to be unlocked */
8845 
8846 	object1 = upl1->map_object;
8847 	object2 = upl2->map_object;
8848 
8849 	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
8850 	    upl1->u_size != upl2->u_size) {
8851 		/*
8852 		 * We deal only with full objects, not subsets.
8853 		 * That's because we exchange the entire backing store info
8854 		 * for the objects: pager, resident pages, etc...  We can't do
8855 		 * only part of it.
8856 		 */
8857 		retval = KERN_INVALID_VALUE;
8858 		goto done;
8859 	}
8860 
8861 	/*
8862 	 * Transpose the VM objects' backing store.
8863 	 */
8864 	retval = vm_object_transpose(object1, object2,
8865 	    upl_adjusted_size(upl1, PAGE_MASK));
8866 
8867 	if (retval == KERN_SUCCESS) {
8868 		/*
8869 		 * Make each UPL point to the correct VM object, i.e. the
8870 		 * object holding the pages that the UPL refers to...
8871 		 */
8872 #if CONFIG_IOSCHED || UPL_DEBUG
8873 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8874 			vm_object_lock(object1);
8875 			vm_object_lock(object2);
8876 		}
8877 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8878 			queue_remove(&object1->uplq, upl1, upl_t, uplq);
8879 		}
8880 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8881 			queue_remove(&object2->uplq, upl2, upl_t, uplq);
8882 		}
8883 #endif
8884 		upl1->map_object = object2;
8885 		upl2->map_object = object1;
8886 
8887 #if CONFIG_IOSCHED || UPL_DEBUG
8888 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8889 			queue_enter(&object2->uplq, upl1, upl_t, uplq);
8890 		}
8891 		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8892 			queue_enter(&object1->uplq, upl2, upl_t, uplq);
8893 		}
8894 		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8895 			vm_object_unlock(object2);
8896 			vm_object_unlock(object1);
8897 		}
8898 #endif
8899 	}
8900 
8901 done:
8902 	/*
8903 	 * Cleanup.
8904 	 */
8905 	if (upls_locked) {
8906 		upl_unlock(upl1);
8907 		upl_unlock(upl2);
8908 		upls_locked = FALSE;
8909 	}
8910 
8911 	return retval;
8912 }
8913 
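/*
 * Mark `count` consecutive page-list entries, starting at `index`, as
 * needed by the caller.  Only meaningful for internal UPLs, which carry
 * a page list.
 */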
8914 void
8915 upl_range_needed(
8916 	upl_t           upl,
8917 	int             index,
8918 	int             count)
8919 {
8920 	int             size_in_pages;
8921 
8922 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8923 		return;
8924 	}
8925 
8926 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8927 
8928 	while (count-- && index < size_in_pages) {
8929 		upl->page_list[index++].needed = TRUE;
8930 	}
8931 }
8932 
8933 
8934 /*
8935  * Reserve of virtual addresses in the kernel address space.
8936  * We need to map the physical pages in the kernel, so that we
8937  * can call the code-signing or slide routines with a kernel
8938  * virtual address.  We keep this pool of pre-allocated kernel
8939  * virtual addresses so that we don't have to scan the kernel's
8940  * virtual address space each time we need to work with
8941  * a physical page.
8942  */
8943 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
8944 #define VM_PAGING_NUM_PAGES     64
8945 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
8946 bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
8947 int             vm_paging_max_index = 0;
8948 int             vm_paging_page_waiter = 0;
8949 int             vm_paging_page_waiter_total = 0;
8950 
8951 unsigned long   vm_paging_no_kernel_page = 0;
8952 unsigned long   vm_paging_objects_mapped = 0;
8953 unsigned long   vm_paging_pages_mapped = 0;
8954 unsigned long   vm_paging_objects_mapped_slow = 0;
8955 unsigned long   vm_paging_pages_mapped_slow = 0;
8956 
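/*
 * Reserve VM_PAGING_NUM_PAGES pages worth of pageable kernel VA at
 * startup; vm_paging_map_object() enters physical pages into this
 * range on demand.
 */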
8957 __startup_func
8958 static void
8959 vm_paging_map_init(void)
8960 {
8961 	kmem_alloc(kernel_map, &vm_paging_base_address,
8962 	    ptoa(VM_PAGING_NUM_PAGES),
8963 	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
8964 	    VM_KERN_MEMORY_NONE);
8965 }
8966 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
8967 
8968 /*
8969  * vm_paging_map_object:
8970  *	Maps part of a VM object's pages in the kernel
8971  *      virtual address space, using the pre-allocated
8972  *	kernel virtual addresses, if possible.
8973  * Context:
8974  *      The VM object is locked.  This lock will get
8975  *      dropped and re-acquired though, so the caller
8976  *      must make sure the VM object is kept alive
8977  *	(by holding a VM map that has a reference
8978  *      on it, for example, or taking an extra reference).
8979  *      The page should also be kept busy to prevent
8980  *	it from being reclaimed.
8981  */
8982 kern_return_t
8983 vm_paging_map_object(
8984 	vm_page_t               page,
8985 	vm_object_t             object,
8986 	vm_object_offset_t      offset,
8987 	vm_prot_t               protection,
8988 	boolean_t               can_unlock_object,
8989 	vm_map_size_t           *size,          /* IN/OUT */
8990 	vm_map_offset_t         *address,       /* OUT */
8991 	boolean_t               *need_unmap)    /* OUT */
8992 {
8993 	kern_return_t           kr;
8994 	vm_map_offset_t         page_map_offset;
8995 	vm_map_size_t           map_size;
8996 	vm_object_offset_t      object_offset;
8997 	int                     i;
8998 
8999 	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9000 		/* use permanent 1-to-1 kernel mapping of physical memory ? */
9001 		*address = (vm_map_offset_t)
9002 		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9003 		*need_unmap = FALSE;
9004 		return KERN_SUCCESS;
9005 
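		/*
		 * NOTE: the unconditional return above makes the
		 * pre-allocated kernel-VA pool path below unreachable;
		 * it is retained for reference.
		 */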
9006 		assert(page->vmp_busy);
9007 		/*
9008 		 * Use one of the pre-allocated kernel virtual addresses
9009 		 * and just enter the VM page in the kernel address space
9010 		 * at that virtual address.
9011 		 */
9012 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9013 
9014 		/*
9015 		 * Try and find an available kernel virtual address
9016 		 * from our pre-allocated pool.
9017 		 */
9018 		page_map_offset = 0;
9019 		for (;;) {
9020 			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9021 				if (vm_paging_page_inuse[i] == FALSE) {
9022 					page_map_offset =
9023 					    vm_paging_base_address +
9024 					    (i * PAGE_SIZE);
9025 					break;
9026 				}
9027 			}
9028 			if (page_map_offset != 0) {
9029 				/* found a space to map our page ! */
9030 				break;
9031 			}
9032 
9033 			if (can_unlock_object) {
9034 				/*
9035 				 * If we can afford to unlock the VM object,
9036 				 * let's take the slow path now...
9037 				 */
9038 				break;
9039 			}
9040 			/*
9041 			 * We can't afford to unlock the VM object, so
9042 			 * let's wait for a space to become available...
9043 			 */
9044 			vm_paging_page_waiter_total++;
9045 			vm_paging_page_waiter++;
9046 			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9047 			if (kr == THREAD_WAITING) {
9048 				simple_unlock(&vm_paging_lock);
9049 				kr = thread_block(THREAD_CONTINUE_NULL);
9050 				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9051 			}
9052 			vm_paging_page_waiter--;
9053 			/* ... and try again */
9054 		}
9055 
9056 		if (page_map_offset != 0) {
9057 			/*
9058 			 * We found a kernel virtual address;
9059 			 * map the physical page to that virtual address.
9060 			 */
9061 			if (i > vm_paging_max_index) {
9062 				vm_paging_max_index = i;
9063 			}
9064 			vm_paging_page_inuse[i] = TRUE;
9065 			simple_unlock(&vm_paging_lock);
9066 
9067 			page->vmp_pmapped = TRUE;
9068 
9069 			/*
9070 			 * Keep the VM object locked over the PMAP_ENTER
9071 			 * and the actual use of the page by the kernel,
9072 			 * or this pmap mapping might get undone by a
9073 			 * vm_object_pmap_protect() call...
9074 			 */
9075 			kr = pmap_enter_check(kernel_pmap,
9076 			    page_map_offset,
9077 			    page,
9078 			    protection,
9079 			    VM_PROT_NONE,
9080 			    TRUE);
9081 			assert(kr == KERN_SUCCESS);
9082 			vm_paging_objects_mapped++;
9083 			vm_paging_pages_mapped++;
9084 			*address = page_map_offset;
9085 			*need_unmap = TRUE;
9086 
9087 #if KASAN
9088 			kasan_notify_address(page_map_offset, PAGE_SIZE);
9089 #endif
9090 
9091 			/* all done and mapped, ready to use ! */
9092 			return KERN_SUCCESS;
9093 		}
9094 
9095 		/*
9096 		 * We ran out of pre-allocated kernel virtual
9097 		 * addresses.  Just map the page in the kernel
9098 		 * the slow and regular way.
9099 		 */
9100 		vm_paging_no_kernel_page++;
9101 		simple_unlock(&vm_paging_lock);
9102 	}
9103 
9104 	if (!can_unlock_object) {
9105 		*address = 0;
9106 		*size = 0;
9107 		*need_unmap = FALSE;
9108 		return KERN_NOT_SUPPORTED;
9109 	}
9110 
9111 	object_offset = vm_object_trunc_page(offset);
9112 	map_size = vm_map_round_page(*size,
9113 	    VM_MAP_PAGE_MASK(kernel_map));
9114 
9115 	/*
9116 	 * Try and map the required range of the object
9117 	 * in the kernel_map. Given that allocation is
9118 	 * for pageable memory, it shouldn't contain
9119 	 * pointers and is mapped into the data range.
9120 	 */
9121 
9122 	vm_object_reference_locked(object);     /* for the map entry */
9123 	vm_object_unlock(object);
9124 
9125 	kr = vm_map_enter(kernel_map,
9126 	    address,
9127 	    map_size,
9128 	    0,
9129 	    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
9130 	    object,
9131 	    object_offset,
9132 	    FALSE,
9133 	    protection,
9134 	    VM_PROT_ALL,
9135 	    VM_INHERIT_NONE);
9136 	if (kr != KERN_SUCCESS) {
9137 		*address = 0;
9138 		*size = 0;
9139 		*need_unmap = FALSE;
9140 		vm_object_deallocate(object);   /* for the map entry */
9141 		vm_object_lock(object);
9142 		return kr;
9143 	}
9144 
9145 	*size = map_size;
9146 
9147 	/*
9148 	 * Enter the mapped pages in the page table now.
9149 	 */
9150 	vm_object_lock(object);
9151 	/*
9152 	 * VM object must be kept locked from before PMAP_ENTER()
9153 	 * until after the kernel is done accessing the page(s).
9154 	 * Otherwise, the pmap mappings in the kernel could be
9155 	 * undone by a call to vm_object_pmap_protect().
9156 	 */
9157 
9158 	for (page_map_offset = 0;
9159 	    map_size != 0;
9160 	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9161 		page = vm_page_lookup(object, offset + page_map_offset);
9162 		if (page == VM_PAGE_NULL) {
9163 			printf("vm_paging_map_object: no page !?");
9164 			vm_object_unlock(object);
9165 			vm_map_remove(kernel_map, *address, *size);
9166 			*address = 0;
9167 			*size = 0;
9168 			*need_unmap = FALSE;
9169 			vm_object_lock(object);
9170 			return KERN_MEMORY_ERROR;
9171 		}
9172 		page->vmp_pmapped = TRUE;
9173 
9174 		kr = pmap_enter_check(kernel_pmap,
9175 		    *address + page_map_offset,
9176 		    page,
9177 		    protection,
9178 		    VM_PROT_NONE,
9179 		    TRUE);
9180 		assert(kr == KERN_SUCCESS);
9181 #if KASAN
9182 		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9183 #endif
9184 	}
9185 
9186 	vm_paging_objects_mapped_slow++;
9187 	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9188 
9189 	*need_unmap = TRUE;
9190 
9191 	return KERN_SUCCESS;
9192 }
9193 
9194 /*
9195  * vm_paging_unmap_object:
9196  *	Unmaps part of a VM object's pages from the kernel
9197  *      virtual address space.
9198  * Context:
9199  *      The VM object is locked.  This lock will get
9200  *      dropped and re-acquired though.
9201  */
9202 void
9203 vm_paging_unmap_object(
9204 	vm_object_t     object,
9205 	vm_map_offset_t start,
9206 	vm_map_offset_t end)
9207 {
9208 	int             i;
9209 
9210 	if ((vm_paging_base_address == 0) ||
9211 	    (start < vm_paging_base_address) ||
9212 	    (end > (vm_paging_base_address
9213 	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9214 		/*
9215 		 * We didn't use our pre-allocated pool of
9216 		 * kernel virtual addresses.  Deallocate the
9217 		 * virtual memory.
9218 		 */
9219 		if (object != VM_OBJECT_NULL) {
9220 			vm_object_unlock(object);
9221 		}
9222 		vm_map_remove(kernel_map, start, end);
9223 		if (object != VM_OBJECT_NULL) {
9224 			vm_object_lock(object);
9225 		}
9226 	} else {
9227 		/*
9228 		 * We used a kernel virtual address from our
9229 		 * pre-allocated pool.  Put it back in the pool
9230 		 * for next time.
9231 		 */
9232 		assert(end - start == PAGE_SIZE);
9233 		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9234 		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9235 
9236 		/* undo the pmap mapping */
9237 		pmap_remove(kernel_pmap, start, end);
9238 
9239 		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9240 		vm_paging_page_inuse[i] = FALSE;
9241 		if (vm_paging_page_waiter) {
9242 			thread_wakeup(&vm_paging_page_waiter);
9243 		}
9244 		simple_unlock(&vm_paging_lock);
9245 	}
9246 }
9247 
9248 
9249 /*
9250  * page->vmp_object must be locked
9251  */
9252 void
9253 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9254 {
9255 	if (!queues_locked) {
9256 		vm_page_lockspin_queues();
9257 	}
9258 
9259 	page->vmp_free_when_done = FALSE;
9260 	/*
9261 	 * need to drop the laundry count...
9262 	 * we may also need to remove it
9263 	 * from the I/O paging queue...
9264 	 * vm_pageout_throttle_up handles both cases
9265 	 *
9266 	 * the laundry and pageout_queue flags are cleared...
9267 	 */
9268 	vm_pageout_throttle_up(page);
9269 
9270 	if (!queues_locked) {
9271 		vm_page_unlock_queues();
9272 	}
9273 }
9274 
9275 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9276 
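/*
 * Create an empty vector UPL that can aggregate up to max_upls sub-UPLs
 * (clamped to VECTOR_UPL_ELEMENTS_UPPER_LIMIT) behind a single UPL handle.
 */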
9277 upl_t
9278 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9279 {
9280 	int i = 0;
9281 	upl_t   upl;
9282 
9283 	assert(max_upls > 0);
9284 	if (max_upls == 0) {
9285 		return NULL;
9286 	}
9287 
9288 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9289 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9290 	}
9291 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9292 
9293 	upl = upl_create(0, UPL_VECTOR, 0);
9294 	upl->vector_upl = vector_upl;
9295 	upl->u_offset = upl_offset;
9296 	vector_upl->size = 0;
9297 	vector_upl->offset = upl_offset;
9298 	vector_upl->invalid_upls = 0;
9299 	vector_upl->num_upls = 0;
9300 	vector_upl->pagelist = NULL;
9301 	vector_upl->max_upls = max_upls;
9302 
9303 	for (i = 0; i < max_upls; i++) {
9304 		vector_upl->upls[i].iostate.size = 0;
9305 		vector_upl->upls[i].iostate.offset = 0;
9306 	}
9307 	return upl;
9308 }
9309 
9310 upl_size_t
9311 vector_upl_get_size(const upl_t upl)
9312 {
9313 	if (!vector_upl_is_valid(upl)) {
9314 		return upl_get_size(upl);
9315 	} else {
9316 		return round_page_32(upl->vector_upl->size);
9317 	}
9318 }
9319 
9320 uint32_t
9321 vector_upl_max_upls(const upl_t upl)
9322 {
9323 	if (!vector_upl_is_valid(upl)) {
9324 		return 0;
9325 	}
9326 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
9327 }
9328 
9329 void
9330 vector_upl_deallocate(upl_t upl)
9331 {
9332 	vector_upl_t vector_upl = upl->vector_upl;
9333 
9334 	assert(vector_upl_is_valid(upl));
9335 
9336 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
9337 		panic("Deallocating non-empty Vectored UPL");
9338 	}
9339 	uint32_t max_upls = vector_upl->max_upls;
9340 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9341 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9342 	upl->vector_upl = NULL;
9343 }
9344 
9345 boolean_t
9346 vector_upl_is_valid(upl_t upl)
9347 {
9348 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9349 }
9350 
9351 boolean_t
9352 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9353 {
9354 	if (vector_upl_is_valid(upl)) {
9355 		vector_upl_t vector_upl = upl->vector_upl;
9356 
9357 		if (vector_upl) {
9358 			if (subupl) {
9359 				if (io_size) {
9360 					if (io_size < PAGE_SIZE) {
9361 						io_size = PAGE_SIZE;
9362 					}
9363 					subupl->vector_upl = (void*)vector_upl;
9364 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9365 					vector_upl->size += io_size;
9366 					upl->u_size += io_size;
9367 				} else {
9368 					uint32_t i = 0, invalid_upls = 0;
9369 					for (i = 0; i < vector_upl->num_upls; i++) {
9370 						if (vector_upl->upls[i].elem == subupl) {
9371 							break;
9372 						}
9373 					}
9374 					if (i == vector_upl->num_upls) {
9375 						panic("Trying to remove sub-upl when none exists");
9376 					}
9377 
9378 					vector_upl->upls[i].elem = NULL;
9379 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9380 					    relaxed);
9381 					if (invalid_upls == vector_upl->num_upls) {
9382 						return TRUE;
9383 					} else {
9384 						return FALSE;
9385 					}
9386 				}
9387 			} else {
9388 				panic("vector_upl_set_subupl was passed a NULL upl element");
9389 			}
9390 		} else {
9391 			panic("vector_upl_set_subupl was passed a non-vectored upl");
9392 		}
9393 	} else {
9394 		panic("vector_upl_set_subupl was passed a NULL upl");
9395 	}
9396 
9397 	return FALSE;
9398 }
9399 
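/*
 * Build the vector UPL's aggregate page list by concatenating the page
 * lists of its sub-UPLs, tracking the highest physical page seen.
 */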
9400 void
9401 vector_upl_set_pagelist(upl_t upl)
9402 {
9403 	if (vector_upl_is_valid(upl)) {
9404 		uint32_t i = 0;
9405 		vector_upl_t vector_upl = upl->vector_upl;
9406 
9407 		if (vector_upl) {
9408 			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
9409 
9410 			vector_upl->pagelist = kalloc_type(struct upl_page_info,
9411 			    atop(vector_upl->size), Z_WAITOK);
9412 
9413 			for (i = 0; i < vector_upl->num_upls; i++) {
9414 				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
9415 				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9416 				pagelist_size += cur_upl_pagelist_size;
9417 				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
9418 					upl->highest_page = vector_upl->upls[i].elem->highest_page;
9419 				}
9420 			}
9421 			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
9422 		} else {
9423 			panic("vector_upl_set_pagelist was passed a non-vectored upl");
9424 		}
9425 	} else {
9426 		panic("vector_upl_set_pagelist was passed a NULL upl");
9427 	}
9428 }
9429 
9430 upl_t
9431 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9432 {
9433 	if (vector_upl_is_valid(upl)) {
9434 		vector_upl_t vector_upl = upl->vector_upl;
9435 		if (vector_upl) {
9436 			if (index < vector_upl->num_upls) {
9437 				return vector_upl->upls[index].elem;
9438 			}
9439 		} else {
9440 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9441 		}
9442 	}
9443 	return NULL;
9444 }
9445 
9446 upl_t
9447 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9448 {
9449 	if (vector_upl_is_valid(upl)) {
9450 		uint32_t i = 0;
9451 		vector_upl_t vector_upl = upl->vector_upl;
9452 
9453 		if (vector_upl) {
9454 			upl_t subupl = NULL;
9455 			vector_upl_iostates_t subupl_state;
9456 
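			/*
			 * Find the first sub-UPL whose iostate range covers
			 * *upl_offset, clip *upl_size to that range and make
			 * *upl_offset relative to the sub-UPL's own offset.
			 */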
9457 			for (i = 0; i < vector_upl->num_upls; i++) {
9458 				subupl = vector_upl->upls[i].elem;
9459 				subupl_state = vector_upl->upls[i].iostate;
9460 				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9461 					/* We could have been passed an offset/size pair that belongs
9462 					 * to a UPL element that has already been committed/aborted.
9463 					 * If so, return NULL.
9464 					 */
9465 					if (subupl == NULL) {
9466 						return NULL;
9467 					}
9468 					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9469 						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9470 						if (*upl_size > subupl_state.size) {
9471 							*upl_size = subupl_state.size;
9472 						}
9473 					}
9474 					if (*upl_offset >= subupl_state.offset) {
9475 						*upl_offset -= subupl_state.offset;
9476 					} else if (i) {
9477 						panic("Vector UPL offset miscalculation");
9478 					}
9479 					return subupl;
9480 				}
9481 			}
9482 		} else {
9483 			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
9484 		}
9485 	}
9486 	return NULL;
9487 }
9488 
9489 void
9490 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9491 {
9492 	*v_upl_submap = NULL;
9493 
9494 	if (vector_upl_is_valid(upl)) {
9495 		vector_upl_t vector_upl = upl->vector_upl;
9496 		if (vector_upl) {
9497 			*v_upl_submap = vector_upl->submap;
9498 			*submap_dst_addr = vector_upl->submap_dst_addr;
9499 		} else {
9500 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9501 		}
9502 	} else {
9503 		panic("vector_upl_get_submap was passed a null UPL");
9504 	}
9505 }
9506 
9507 void
9508 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9509 {
9510 	if (vector_upl_is_valid(upl)) {
9511 		vector_upl_t vector_upl = upl->vector_upl;
9512 		if (vector_upl) {
9513 			vector_upl->submap = submap;
9514 			vector_upl->submap_dst_addr = submap_dst_addr;
9515 		} else {
9516 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9517 		}
9518 	} else {
9519 		panic("vector_upl_get_submap was passed a NULL UPL");
9520 	}
9521 }
9522 
9523 void
9524 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9525 {
9526 	if (vector_upl_is_valid(upl)) {
9527 		uint32_t i = 0;
9528 		vector_upl_t vector_upl = upl->vector_upl;
9529 
9530 		if (vector_upl) {
9531 			for (i = 0; i < vector_upl->num_upls; i++) {
9532 				if (vector_upl->upls[i].elem == subupl) {
9533 					break;
9534 				}
9535 			}
9536 
9537 			if (i == vector_upl->num_upls) {
9538 				panic("setting sub-upl iostate when none exists");
9539 			}
9540 
9541 			vector_upl->upls[i].iostate.offset = offset;
9542 			if (size < PAGE_SIZE) {
9543 				size = PAGE_SIZE;
9544 			}
9545 			vector_upl->upls[i].iostate.size = size;
9546 		} else {
9547 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
9548 		}
9549 	} else {
9550 		panic("vector_upl_set_iostate was passed a NULL UPL");
9551 	}
9552 }
9553 
9554 void
9555 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9556 {
9557 	if (vector_upl_is_valid(upl)) {
9558 		uint32_t i = 0;
9559 		vector_upl_t vector_upl = upl->vector_upl;
9560 
9561 		if (vector_upl) {
9562 			for (i = 0; i < vector_upl->num_upls; i++) {
9563 				if (vector_upl->upls[i].elem == subupl) {
9564 					break;
9565 				}
9566 			}
9567 
9568 			if (i == vector_upl->num_upls) {
9569 				panic("getting sub-upl iostate when none exists");
9570 			}
9571 
9572 			*offset = vector_upl->upls[i].iostate.offset;
9573 			*size = vector_upl->upls[i].iostate.size;
9574 		} else {
9575 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
9576 		}
9577 	} else {
9578 		panic("vector_upl_get_iostate was passed a NULL UPL");
9579 	}
9580 }
9581 
9582 void
9583 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9584 {
9585 	if (vector_upl_is_valid(upl)) {
9586 		vector_upl_t vector_upl = upl->vector_upl;
9587 		if (vector_upl) {
9588 			if (index < vector_upl->num_upls) {
9589 				*offset = vector_upl->upls[index].iostate.offset;
9590 				*size = vector_upl->upls[index].iostate.size;
9591 			} else {
9592 				*offset = *size = 0;
9593 			}
9594 		} else {
9595 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9596 		}
9597 	} else {
9598 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9599 	}
9600 }
9601 
9602 void *
9603 upl_get_internal_vectorupl(upl_t upl)
9604 {
9605 	return upl->vector_upl;
9606 }
9607 
9608 upl_page_info_t *
9609 upl_get_internal_vectorupl_pagelist(upl_t upl)
9610 {
9611 	return upl->vector_upl->pagelist;
9612 }
9613 
9614 upl_page_info_t *
9615 upl_get_internal_page_list(upl_t upl)
9616 {
9617 	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
9618 }
9619 
9620 void
9621 upl_clear_dirty(
9622 	upl_t           upl,
9623 	boolean_t       value)
9624 {
9625 	if (value) {
9626 		upl->flags |= UPL_CLEAR_DIRTY;
9627 	} else {
9628 		upl->flags &= ~UPL_CLEAR_DIRTY;
9629 	}
9630 }
9631 
9632 void
9633 upl_set_referenced(
9634 	upl_t           upl,
9635 	boolean_t       value)
9636 {
9637 	upl_lock(upl);
9638 	if (value) {
9639 		upl->ext_ref_count++;
9640 	} else {
9641 		if (!upl->ext_ref_count) {
9642 			panic("upl_set_referenced not %p", upl);
9643 		}
9644 		upl->ext_ref_count--;
9645 	}
9646 	upl_unlock(upl);
9647 }
9648 
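/*
 * Take exclusive ownership of the UPL's mapping address: wait for any
 * current owner to drop it, then record the calling thread's compact
 * thread ID as the new owner.
 */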
9649 void
9650 upl_set_map_exclusive(upl_t upl)
9651 {
9652 	upl_lock(upl);
9653 	while (upl->map_addr_owner) {
9654 		upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
9655 		upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
9656 	}
9657 	upl->map_addr_owner = thread_get_ctid(current_thread());
9658 	upl_unlock(upl);
9659 }
9660 
9661 void
9662 upl_clear_map_exclusive(upl_t upl)
9663 {
9664 	assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
9665 	upl_lock(upl);
9666 	if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
9667 		upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
9668 		upl_wakeup(&upl->map_addr_owner);
9669 	}
9670 	upl->map_addr_owner = 0;
9671 	upl_unlock(upl);
9672 }
9673 
9674 #if CONFIG_IOSCHED
9675 void
9676 upl_set_blkno(
9677 	upl_t           upl,
9678 	vm_offset_t     upl_offset,
9679 	int             io_size,
9680 	int64_t         blkno)
9681 {
9682 	int i, j;
9683 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9684 		return;
9685 	}
9686 
9687 	assert(upl->upl_reprio_info != 0);
9688 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9689 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9690 	}
9691 }
9692 #endif
9693 
9694 void inline
9695 memoryshot(unsigned int event, unsigned int control)
9696 {
9697 	if (vm_debug_events) {
9698 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9699 		    vm_page_active_count, vm_page_inactive_count,
9700 		    vm_page_free_count, vm_page_speculative_count,
9701 		    vm_page_throttled_count);
9702 	} else {
9703 		(void) event;
9704 		(void) control;
9705 	}
9706 }
9707 
9708 #ifdef MACH_BSD
9709 
9710 boolean_t
9711 upl_device_page(upl_page_info_t *upl)
9712 {
9713 	return UPL_DEVICE_PAGE(upl);
9714 }
9715 boolean_t
9716 upl_page_present(upl_page_info_t *upl, int index)
9717 {
9718 	return UPL_PAGE_PRESENT(upl, index);
9719 }
9720 boolean_t
9721 upl_speculative_page(upl_page_info_t *upl, int index)
9722 {
9723 	return UPL_SPECULATIVE_PAGE(upl, index);
9724 }
9725 boolean_t
9726 upl_dirty_page(upl_page_info_t *upl, int index)
9727 {
9728 	return UPL_DIRTY_PAGE(upl, index);
9729 }
9730 boolean_t
9731 upl_valid_page(upl_page_info_t *upl, int index)
9732 {
9733 	return UPL_VALID_PAGE(upl, index);
9734 }
9735 ppnum_t
9736 upl_phys_page(upl_page_info_t *upl, int index)
9737 {
9738 	return UPL_PHYS_PAGE(upl, index);
9739 }
9740 
9741 void
9742 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9743 {
9744 	upl[index].mark = v;
9745 }
9746 
9747 boolean_t
9748 upl_page_get_mark(upl_page_info_t *upl, int index)
9749 {
9750 	return upl[index].mark;
9751 }
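/*
 * Debug helper: count dirty, free-when-done and precious pages on the
 * inactive, throttled and anonymous queues ("IN Q") and on the active
 * queue ("AC Q"), and print the totals.
 */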
9752 
9753 void
9754 vm_countdirtypages(void)
9755 {
9756 	vm_page_t m;
9757 	int dpages;
9758 	int pgopages;
9759 	int precpages;
9760 
9761 
9762 	dpages = 0;
9763 	pgopages = 0;
9764 	precpages = 0;
9765 
9766 	vm_page_lock_queues();
9767 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9768 	do {
9769 		if (m == (vm_page_t)0) {
9770 			break;
9771 		}
9772 
9773 		if (m->vmp_dirty) {
9774 			dpages++;
9775 		}
9776 		if (m->vmp_free_when_done) {
9777 			pgopages++;
9778 		}
9779 		if (m->vmp_precious) {
9780 			precpages++;
9781 		}
9782 
9783 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9784 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9785 		if (m == (vm_page_t)0) {
9786 			break;
9787 		}
9788 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9789 	vm_page_unlock_queues();
9790 
9791 	vm_page_lock_queues();
9792 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9793 	do {
9794 		if (m == (vm_page_t)0) {
9795 			break;
9796 		}
9797 
9798 		dpages++;
9799 		assert(m->vmp_dirty);
9800 		assert(!m->vmp_free_when_done);
9801 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9802 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9803 		if (m == (vm_page_t)0) {
9804 			break;
9805 		}
9806 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9807 	vm_page_unlock_queues();
9808 
9809 	vm_page_lock_queues();
9810 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9811 	do {
9812 		if (m == (vm_page_t)0) {
9813 			break;
9814 		}
9815 
9816 		if (m->vmp_dirty) {
9817 			dpages++;
9818 		}
9819 		if (m->vmp_free_when_done) {
9820 			pgopages++;
9821 		}
9822 		if (m->vmp_precious) {
9823 			precpages++;
9824 		}
9825 
9826 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9827 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9828 		if (m == (vm_page_t)0) {
9829 			break;
9830 		}
9831 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9832 	vm_page_unlock_queues();
9833 
9834 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9835 
9836 	dpages = 0;
9837 	pgopages = 0;
9838 	precpages = 0;
9839 
9840 	vm_page_lock_queues();
9841 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9842 
9843 	do {
9844 		if (m == (vm_page_t)0) {
9845 			break;
9846 		}
9847 		if (m->vmp_dirty) {
9848 			dpages++;
9849 		}
9850 		if (m->vmp_free_when_done) {
9851 			pgopages++;
9852 		}
9853 		if (m->vmp_precious) {
9854 			precpages++;
9855 		}
9856 
9857 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9858 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9859 		if (m == (vm_page_t)0) {
9860 			break;
9861 		}
9862 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9863 	vm_page_unlock_queues();
9864 
9865 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9866 }
9867 #endif /* MACH_BSD */
9868 
9869 
9870 #if CONFIG_IOSCHED
9871 int
9872 upl_get_cached_tier(upl_t  upl)
9873 {
9874 	assert(upl);
9875 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9876 		return upl->upl_priority;
9877 	}
9878 	return -1;
9879 }
9880 #endif /* CONFIG_IOSCHED */
9881 
9882 
9883 void
9884 upl_callout_iodone(upl_t upl)
9885 {
9886 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
9887 
9888 	if (upl_ctx) {
9889 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
9890 
9891 		assert(upl_ctx->io_done);
9892 
9893 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9894 	}
9895 }
9896 
9897 void
9898 upl_set_iodone(upl_t upl, void *upl_iodone)
9899 {
9900 	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
9901 }
9902 
9903 void
9904 upl_set_iodone_error(upl_t upl, int error)
9905 {
9906 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
9907 
9908 	if (upl_ctx) {
9909 		upl_ctx->io_error = error;
9910 	}
9911 }
9912 
9913 
9914 ppnum_t
9915 upl_get_highest_page(
9916 	upl_t                      upl)
9917 {
9918 	return upl->highest_page;
9919 }
9920 
9921 upl_size_t
9922 upl_get_size(
9923 	upl_t                      upl)
9924 {
9925 	return upl_adjusted_size(upl, PAGE_MASK);
9926 }
9927 
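/*
 * Size of the UPL's data range rounded outward to the given page mask:
 * the start offset is truncated and the end offset rounded up, so the
 * result covers every page touched by [u_offset, u_offset + u_size).
 * e.g. u_offset 0x1800, u_size 0x1000, pgmask 0xFFF -> 0x1000..0x3000,
 * i.e. 0x2000 bytes.
 */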
9928 upl_size_t
9929 upl_adjusted_size(
9930 	upl_t upl,
9931 	vm_map_offset_t pgmask)
9932 {
9933 	vm_object_offset_t start_offset, end_offset;
9934 
9935 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
9936 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
9937 
9938 	return (upl_size_t)(end_offset - start_offset);
9939 }
9940 
9941 vm_object_offset_t
9942 upl_adjusted_offset(
9943 	upl_t upl,
9944 	vm_map_offset_t pgmask)
9945 {
9946 	return trunc_page_mask_64(upl->u_offset, pgmask);
9947 }
9948 
9949 vm_object_offset_t
9950 upl_get_data_offset(
9951 	upl_t upl)
9952 {
9953 	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
9954 }
9955 
9956 upl_t
9957 upl_associated_upl(upl_t upl)
9958 {
9959 	return upl->associated_upl;
9960 }
9961 
9962 void
9963 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
9964 {
9965 	upl->associated_upl = associated_upl;
9966 }
9967 
9968 struct vnode *
9969 upl_lookup_vnode(upl_t upl)
9970 {
9971 	if (!upl->map_object->internal) {
9972 		return vnode_pager_lookup_vnode(upl->map_object->pager);
9973 	} else {
9974 		return NULL;
9975 	}
9976 }
9977 
9978 boolean_t
9979 upl_has_wired_pages(upl_t upl)
9980 {
9981 	return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
9982 }
9983 
9984 #if UPL_DEBUG
9985 kern_return_t
9986 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
9987 {
9988 	upl->ubc_alias1 = alias1;
9989 	upl->ubc_alias2 = alias2;
9990 	return KERN_SUCCESS;
9991 }
9992 int
9993 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
9994 {
9995 	if (al) {
9996 		*al = upl->ubc_alias1;
9997 	}
9998 	if (al2) {
9999 		*al2 = upl->ubc_alias2;
10000 	}
10001 	return KERN_SUCCESS;
10002 }
10003 #endif /* UPL_DEBUG */
10004 
10005 #if VM_PRESSURE_EVENTS
10006 /*
10007  * Upward trajectory.
10008  */
10009 
10010 boolean_t
10011 VM_PRESSURE_NORMAL_TO_WARNING(void)
10012 {
10013 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10014 		/* Available pages below our threshold */
10015 		uint32_t available_pages = memorystatus_get_available_page_count();
10016 		if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
10017 #if CONFIG_FREEZE
10018 			/* No frozen processes to kill */
10019 			if (memorystatus_frozen_count == 0) {
10020 				/* Not enough suspended processes available. */
10021 				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10022 					return TRUE;
10023 				}
10024 			}
10025 #else /* CONFIG_FREEZE */
10026 			return TRUE;
10027 #endif /* CONFIG_FREEZE */
10028 		}
10029 		return FALSE;
10030 	} else {
10031 		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10032 	}
10033 }
10034 
10035 boolean_t
10036 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10037 {
10038 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10039 		/* Available pages below our threshold */
10040 		uint32_t available_pages = memorystatus_get_available_page_count();
10041 		return available_pages < memorystatus_get_critical_page_shortage_threshold();
10042 	} else {
10043 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10044 	}
10045 }
10046 
10047 /*
10048  * Downward trajectory: the thresholds carry extra headroom so the pressure level doesn't flap.
10049  */
10050 boolean_t
10051 VM_PRESSURE_WARNING_TO_NORMAL(void)
10052 {
10053 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10054 		/* Available pages above our threshold */
10055 		uint32_t available_pages = memorystatus_get_available_page_count();
10056 		uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
10057 		return available_pages > target_threshold;
10058 	} else {
10059 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10060 	}
10061 }
10062 
10063 boolean_t
10064 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10065 {
10066 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10067 		uint32_t available_pages = memorystatus_get_available_page_count();
10068 		uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
10069 		return available_pages > target_threshold;
10070 	} else {
10071 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10072 	}
10073 }
10074 #endif /* VM_PRESSURE_EVENTS */
10075 
10076 #if DEVELOPMENT || DEBUG
10077 bool compressor_running_perf_test;
10078 uint64_t compressor_perf_test_pages_processed;
10079 
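/*
 * Walk [start_addr, start_addr + buffer_size) in the given map, pull each
 * resident page off whatever paging queue it is on, clear its reference bit,
 * and collect it on the supplied local queue.  Only top-level, unwired,
 * anonymous memory in a map using the kernel page size is accepted; anything
 * else fails with KERN_INVALID_ARGUMENT.  *pages_moved reports how many pages
 * were collected, even on a partial failure.
 */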
10080 static kern_return_t
10081 move_pages_to_queue(
10082 	vm_map_t map,
10083 	user_addr_t start_addr,
10084 	size_t buffer_size,
10085 	vm_page_queue_head_t *queue,
10086 	size_t *pages_moved)
10087 {
10088 	kern_return_t err = KERN_SUCCESS;
10089 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10090 	boolean_t addr_in_map = FALSE;
10091 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10092 	vm_object_t curr_object = VM_OBJECT_NULL;
10093 	*pages_moved = 0;
10094 
10095 
10096 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10097 		/*
10098 		 * We don't currently support benchmarking maps with a different page size
10099 		 * than the kernel.
10100 		 */
10101 		return KERN_INVALID_ARGUMENT;
10102 	}
10103 
10104 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10105 		return KERN_INVALID_ARGUMENT;
10106 	}
10107 
10108 	vm_map_lock_read(map);
10109 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10110 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10111 
10112 
10113 	while (curr_addr < end_addr) {
10114 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10115 		if (!addr_in_map) {
10116 			err = KERN_INVALID_ARGUMENT;
10117 			break;
10118 		}
10119 		curr_object = VME_OBJECT(curr_entry);
10120 		if (curr_object) {
10121 			vm_object_lock(curr_object);
10122 			/* We only want anonymous memory that lives in the top-level map and object. */
10123 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10124 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10125 				err = KERN_INVALID_ARGUMENT;
10126 				vm_object_unlock(curr_object);
10127 				break;
10128 			}
10129 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10130 			vm_map_offset_t end_offset = (MIN(curr_entry->vme_end, end_addr) - curr_entry->vme_start) +
10131 			    VME_OFFSET(curr_entry);
10132 			vm_map_offset_t curr_offset = start_offset;
10133 			vm_page_t curr_page;
10134 			while (curr_offset < end_offset) {
10135 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10136 				if (curr_page != VM_PAGE_NULL) {
10137 					vm_page_lock_queues();
10138 					if (curr_page->vmp_laundry) {
10139 						vm_pageout_steal_laundry(curr_page, TRUE);
10140 					}
10141 					/*
10142 					 * If this page was in the laundry it was stolen above,
10143 					 * so it can't be on the pageout queue and it's safe to
10144 					 * do the vm_page_queues_remove.
10145 					 */
10146 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10147 					vm_page_queues_remove(curr_page, TRUE);
10148 					if (donate) {
10149 						/*
10150 						 * The compressor needs to see this bit to know
10151 						 * where this page needs to land. Also if stolen,
10152 						 * this bit helps put the page back in the right
10153 						 * special queue where it belongs.
10154 						 */
10155 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10156 					}
10157 					// Clear the referenced bit so we ensure this gets paged out
10158 					curr_page->vmp_reference = false;
10159 					if (curr_page->vmp_pmapped) {
10160 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10161 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10162 					}
10163 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
10164 					vm_page_unlock_queues();
10165 					*pages_moved += 1;
10166 				}
10167 				curr_offset += PAGE_SIZE_64;
10168 				curr_addr += PAGE_SIZE_64;
10169 			}
			vm_object_unlock(curr_object);
		} else {
			/* No backing object: nothing to move, and we must not unlock an object we never locked. */
			err = KERN_INVALID_ARGUMENT;
			break;
		}
10172 	}
10173 	vm_map_unlock_read(map);
10174 	return err;
10175 }
10176 
10177 /*
10178  * Local queue for processing benchmark pages.
10179  * Can't be allocated on the stack because the pointer has to
10180  * be packable.
10181  */
10182 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
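
/*
 * Compressor micro-benchmark (DEVELOPMENT || DEBUG only): move the caller's
 * buffer pages onto a private queue, feed them to the compressor thread, and
 * wait for them to be processed.  On success *time is the elapsed time in
 * nanoseconds, *bytes_compressed the number of uncompressed bytes handed to
 * the compressor, and *compressor_growth the growth in compressed bytes.
 */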
10183 kern_return_t
10184 run_compressor_perf_test(
10185 	user_addr_t buf,
10186 	size_t buffer_size,
10187 	uint64_t *time,
10188 	uint64_t *bytes_compressed,
10189 	uint64_t *compressor_growth)
10190 {
10191 	kern_return_t err = KERN_SUCCESS;
10192 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10193 		return KERN_NOT_SUPPORTED;
10194 	}
10195 	if (current_task() == kernel_task) {
10196 		return KERN_INVALID_ARGUMENT;
10197 	}
10198 	vm_page_lock_queues();
10199 	if (compressor_running_perf_test) {
10200 		/* Only run one instance of the benchmark at a time. */
10201 		vm_page_unlock_queues();
10202 		return KERN_RESOURCE_SHORTAGE;
10203 	}
10204 	vm_page_unlock_queues();
10205 	size_t page_count = 0;
10206 	vm_map_t map;
10207 	vm_page_t p, next;
10208 	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
10209 	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
10210 	*bytes_compressed = *compressor_growth = 0;
10211 
10212 	vm_page_queue_init(&compressor_perf_test_queue);
10213 	map = current_task()->map;
10214 	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
10215 	if (err != KERN_SUCCESS) {
10216 		goto out;
10217 	}
10218 
10219 	vm_page_lock_queues();
10220 	compressor_running_perf_test = true;
10221 	compressor_perf_test_pages_processed = 0;
10222 	/*
10223 	 * At this point the compressor threads should only process the benchmark queue
10224 	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
10225 	 * to determine how many compressed bytes we ended up using.
10226 	 */
10227 	compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10228 	vm_page_unlock_queues();
10229 
10230 	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
10231 
10232 	vm_page_lock_queues();
10233 	compressor_perf_test_start = mach_absolute_time();
10234 
10235 	// Wake up the compressor thread(s)
10236 	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
10237 	    pgo_iothread_internal_state[0].pgo_iothread);
10238 
10239 	/*
10240 	 * The compressor may end up processing a few more pages than we queued, so wait
10241 	 * until the processed count reaches page_count rather than testing for exact equality.
10242 	 */
10243 	while (compressor_perf_test_pages_processed < page_count) {
10244 		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
10245 		vm_page_unlock_queues();
10246 		thread_block(THREAD_CONTINUE_NULL);
10247 		vm_page_lock_queues();
10248 	}
10249 	compressor_perf_test_end = mach_absolute_time();
10250 	compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
10251 	vm_page_unlock_queues();
10252 
10253 
10254 out:
10255 	/*
10256 	 * If we errored out above, then we could still have some pages
10257 	 * on the local queue. Make sure to put them back on the active queue before
10258 	 * returning so they're not orphaned.
10259 	 */
10260 	vm_page_lock_queues();
10261 	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
10262 	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
10263 	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
10264 		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
10265 
10266 		vm_page_enqueue_active(p, FALSE);
10267 		p = next;
10268 	}
10269 
10270 	compressor_running_perf_test = false;
10271 	vm_page_unlock_queues();
10272 	if (err == KERN_SUCCESS) {
10273 		*bytes_compressed = page_count * PAGE_SIZE_64;
10274 		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
10275 	}
10276 
10277 	/*
10278 	 * pageout_scan will consider waking the compactor swapper
10279 	 * before it blocks. Do the same thing here before we return
10280 	 * to ensure that back to back benchmark runs can't overly fragment the
10281 	 * compressor pool.
10282 	 */
10283 	vm_consider_waking_compactor_swapper();
10284 	return err;
10285 }
10286 #endif /* DEVELOPMENT || DEBUG */
10287