xref: /xnu-11417.121.6/osfmk/vm/vm_pageout.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_pageout.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	The proverbial page-out daemon.
64  */
65 
66 #include "mach/kern_return.h"
67 #include <stdint.h>
68 #include <ptrauth.h>
69 
70 #include <debug.h>
71 
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/mach_host_server.h>
75 #include <mach/upl.h>
76 #include <mach/vm_map.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/sdt.h>
80 
81 #include <kern/kern_types.h>
82 #include <kern/counter.h>
83 #include <kern/host_statistics.h>
84 #include <kern/machine.h>
85 #include <kern/misc_protos.h>
86 #include <kern/sched.h>
87 #include <kern/thread.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread_group.h>
92 
93 #include <os/log.h>
94 
95 #include <sys/kdebug_triage.h>
96 
97 #include <machine/vm_tuning.h>
98 #include <machine/commpage.h>
99 
100 #include <vm/pmap.h>
101 #include <vm/vm_compressor_pager_internal.h>
102 #include <vm/vm_fault_internal.h>
103 #include <vm/vm_map_internal.h>
104 #include <vm/vm_object_internal.h>
105 #include <vm/vm_page_internal.h>
106 #include <vm/vm_pageout_internal.h>
107 #include <vm/vm_protos_internal.h> /* must be last */
108 #include <vm/memory_object.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_shared_region.h>
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_kern_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_ubc.h>
115 #include <vm/vm_reclaim_xnu.h>
116 
117 #include <san/kasan.h>
118 #include <sys/kern_memorystatus_xnu.h>
119 
120 #if CONFIG_PHANTOM_CACHE
121 #include <vm/vm_phantom_cache_internal.h>
122 #endif
123 
124 
125 #if UPL_DEBUG
126 #include <libkern/OSDebug.h>
127 #endif
128 
129 extern int cs_debug;
130 
131 #if CONFIG_MBUF_MCACHE
132 extern void mbuf_drain(boolean_t);
133 #endif /* CONFIG_MBUF_MCACHE */
134 
135 #if CONFIG_FREEZE
136 extern unsigned int memorystatus_frozen_count;
137 extern unsigned int memorystatus_suspended_count;
138 #endif /* CONFIG_FREEZE */
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 
141 extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
143 extern uint32_t memorystatus_jetsam_bg_band_waiters;
144 
145 void vm_pressure_response(void);
146 extern void consider_vm_pressure_events(void);
147 
148 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
149 
150 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
151 SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
152 sched_cond_atomic_t vm_pageout_gc_cond;
153 #if CONFIG_VPS_DYNAMIC_PRIO
154 TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
155 #else
156 const bool vps_dynamic_priority_enabled = false;
157 #endif
158 boolean_t vps_yield_for_pgqlockwaiters = TRUE;
159 
160 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
161 #if !XNU_TARGET_OS_OSX
162 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
163 #else /* !XNU_TARGET_OS_OSX */
164 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
165 #endif /* !XNU_TARGET_OS_OSX */
166 #endif
167 
168 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
169 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
170 #endif
171 
172 #ifndef VM_PAGE_LAUNDRY_MAX
173 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
#endif  /* VM_PAGE_LAUNDRY_MAX */
175 
176 #ifndef VM_PAGEOUT_BURST_WAIT
177 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
178 #endif  /* VM_PAGEOUT_BURST_WAIT */
179 
180 #ifndef VM_PAGEOUT_EMPTY_WAIT
181 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
182 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
183 
184 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
185 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
186 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
187 
188 #ifndef VM_PAGEOUT_IDLE_WAIT
189 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
190 #endif  /* VM_PAGEOUT_IDLE_WAIT */
191 
192 #ifndef VM_PAGEOUT_SWAP_WAIT
193 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
194 #endif  /* VM_PAGEOUT_SWAP_WAIT */
195 
196 /*
197  * vm_page_max_speculative_age_q should be less than or equal to
198  * VM_PAGE_RESERVED_SPECULATIVE_AGE_Q which is number of allocated
199  * vm_page_queue_speculative entries.
200  */
201 
202 TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
203 #ifndef VM_PAGE_SPECULATIVE_TARGET
204 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
205 #endif /* VM_PAGE_SPECULATIVE_TARGET */
206 
207 
208 /*
209  *	To obtain a reasonable LRU approximation, the inactive queue
210  *	needs to be large enough to give pages on it a chance to be
211  *	referenced a second time.  This macro defines the fraction
212  *	of active+inactive pages that should be inactive.
213  *	The pageout daemon uses it to update vm_page_inactive_target.
214  *
215  *	If vm_page_free_count falls below vm_page_free_target and
216  *	vm_page_inactive_count is below vm_page_inactive_target,
217  *	then the pageout daemon starts running.
218  */
219 
220 #ifndef VM_PAGE_INACTIVE_TARGET
221 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
222 #endif  /* VM_PAGE_INACTIVE_TARGET */
223 
224 /*
225  *	Once the pageout daemon starts running, it keeps going
226  *	until vm_page_free_count meets or exceeds vm_page_free_target.
227  */
228 
229 #ifndef VM_PAGE_FREE_TARGET
230 #if !XNU_TARGET_OS_OSX
231 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
232 #else /* !XNU_TARGET_OS_OSX */
233 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
234 #endif /* !XNU_TARGET_OS_OSX */
235 #endif  /* VM_PAGE_FREE_TARGET */
236 
237 
238 /*
239  *	The pageout daemon always starts running once vm_page_free_count
240  *	falls below vm_page_free_min.
241  */
242 
243 #ifndef VM_PAGE_FREE_MIN
244 #if !XNU_TARGET_OS_OSX
245 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
246 #else /* !XNU_TARGET_OS_OSX */
247 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
248 #endif /* !XNU_TARGET_OS_OSX */
249 #endif  /* VM_PAGE_FREE_MIN */
250 
251 #if !XNU_TARGET_OS_OSX
252 #define VM_PAGE_FREE_RESERVED_LIMIT     100
253 #define VM_PAGE_FREE_MIN_LIMIT          1500
254 #define VM_PAGE_FREE_TARGET_LIMIT       2000
255 #else /* !XNU_TARGET_OS_OSX */
256 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
257 #define VM_PAGE_FREE_MIN_LIMIT          3500
258 #define VM_PAGE_FREE_TARGET_LIMIT       4000
259 #endif /* !XNU_TARGET_OS_OSX */
260 
261 /*
262  *	When vm_page_free_count falls below vm_page_free_reserved,
263  *	only vm-privileged threads can allocate pages.  vm-privilege
264  *	allows the pageout daemon and default pager (and any other
265  *	associated threads needed for default pageout) to continue
266  *	operation by dipping into the reserved pool of pages.
267  */
268 
269 #ifndef VM_PAGE_FREE_RESERVED
270 #define VM_PAGE_FREE_RESERVED(n)        \
271 	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
272 #endif  /* VM_PAGE_FREE_RESERVED */
273 
274 /*
275  *	When we dequeue pages from the inactive list, they are
276  *	reactivated (ie, put back on the active queue) if referenced.
277  *	However, it is possible to starve the free list if other
278  *	processors are referencing pages faster than we can turn off
279  *	the referenced bit.  So we limit the number of reactivations
280  *	we will make per call of vm_pageout_scan().
281  */
282 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
283 
284 #ifndef VM_PAGE_REACTIVATE_LIMIT
285 #if !XNU_TARGET_OS_OSX
286 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
287 #else /* !XNU_TARGET_OS_OSX */
288 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
289 #endif /* !XNU_TARGET_OS_OSX */
290 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
291 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
292 
293 int vm_pageout_protect_realtime = true;
294 
295 extern boolean_t hibernate_cleaning_in_progress;
296 
297 struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
298 struct pgo_iothread_state pgo_iothread_external_state;
299 
300 #if VM_PRESSURE_EVENTS
301 void vm_pressure_thread(void);
302 
303 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
304 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
305 
306 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
307 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
308 #endif
309 
310 static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
311 static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
312 static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
313 
314 extern void vm_pageout_continue(void);
315 extern void vm_pageout_scan(void);
316 
317 boolean_t vm_pageout_running = FALSE;
318 
319 uint32_t vm_page_upl_tainted = 0;
320 uint32_t vm_page_iopl_tainted = 0;
321 
322 #if XNU_TARGET_OS_OSX
323 static boolean_t vm_pageout_waiter  = FALSE;
324 #endif /* XNU_TARGET_OS_OSX */
325 
326 
327 #if DEVELOPMENT || DEBUG
328 struct vm_pageout_debug vm_pageout_debug;
329 #endif
330 struct vm_pageout_vminfo vm_pageout_vminfo;
331 struct vm_pageout_state  vm_pageout_state;
332 struct vm_config         vm_config;
333 
334 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
335 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
336 #if DEVELOPMENT || DEBUG
337 struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
338 #endif /* DEVELOPMENT || DEBUG */
339 
340 int         vm_upl_wait_for_pages = 0;
341 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
342 
343 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
344 
345 int     vm_debug_events = 0;
346 
347 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
348 
349 #if CONFIG_MEMORYSTATUS
350 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
351 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
352 #endif
353 
354 #if __AMP__
355 
356 
357 /*
358  * Bind compressor threads to e-cores unless there are multiple non-e clusters
359  */
360 #if (MAX_CPU_CLUSTERS > 2)
361 #define VM_COMPRESSOR_EBOUND_DEFAULT false
362 #elif defined(XNU_TARGET_OS_XR)
363 #define VM_COMPRESSOR_EBOUND_DEFAULT false
364 #else
365 #define VM_COMPRESSOR_EBOUND_DEFAULT true
366 #endif
367 
368 TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
369 int vm_pgo_pbound = 0;
370 extern void thread_soft_bind_cluster_type(thread_t, char);
371 
372 #endif /* __AMP__ */
373 
374 
375 /*
376  *	Routine:	vm_pageout_object_terminate
377  *	Purpose:
378  *		Destroy the pageout_object, and perform all of the
379  *		required cleanup actions.
380  *
381  *	In/Out conditions:
382  *		The object must be locked, and will be returned locked.
383  */
void
vm_pageout_object_terminate(
	vm_object_t     object)
{
	vm_object_t     shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!vm_page_queue_empty(&object->memq)) {
		vm_page_t               p, m;
		vm_object_offset_t      offset;

		/* "p" is the private placeholder page in the pageout object */
		p = (vm_page_t) vm_page_queue_first(&object->memq);

		assert(vm_page_is_private(p));
		assert(p->vmp_free_when_done);
		p->vmp_free_when_done = FALSE;
		assert(!p->vmp_cleaning);
		assert(!p->vmp_laundry);

		offset = p->vmp_offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		/* "m" is the real page being cleaned, at the shadowed offset */
		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);

		if (m == VM_PAGE_NULL) {
			continue;
		}

		assert((m->vmp_dirty) || (m->vmp_precious) ||
		    (m->vmp_busy && m->vmp_cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->vmp_free_when_done) {
			assert(m->vmp_busy);
			assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
			assert(m->vmp_wire_count == 1);
			m->vmp_cleaning = FALSE;
			m->vmp_free_when_done = FALSE;
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			} else {
				m->vmp_dirty = FALSE;
			}

			if (m->vmp_dirty) {
				/* redirtied during pageout: keep it resident rather than freeing it */
				vm_page_unwire(m, TRUE);        /* reactivates */
				counter_inc(&vm_statistics_reactivations);
				vm_page_wakeup_done(object, m);
			} else {
				vm_page_free(m);  /* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !vm_page_is_private(m)) {
			if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
			}
		}
		if (m->vmp_overwriting) {
			/*
			 * the (COPY_OUT_FROM == FALSE) request_page_list case
			 */
			if (m->vmp_busy) {
				/*
				 * We do not re-set m->vmp_dirty !
				 * The page was busy so no extraneous activity
				 * could have occurred. COPY_INTO is a read into the
				 * new pages. CLEAN_IN_PLACE does actually write
				 * out the pages but handling outside of this code
				 * will take care of resetting dirty. We clear the
				 * modify however for the Programmed I/O case.
				 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

				m->vmp_busy = FALSE;
				m->vmp_absent = FALSE;
			} else {
				/*
				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
				 * Occurs when the original page was wired
				 * at the time of the list request
				 */
				assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE);        /* reactivates */
			}
			m->vmp_overwriting = FALSE;
		} else {
			m->vmp_dirty = FALSE;
		}
		m->vmp_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		vm_page_wakeup(object, m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	assert(os_ref_get_count_raw(&object->ref_count) == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}
539 
540 /*
541  * Routine:	vm_pageclean_setup
542  *
543  * Purpose:	setup a page to be cleaned (made non-dirty), but not
544  *		necessarily flushed from the VM page cache.
545  *		This is accomplished by cleaning in place.
546  *
547  *		The page must not be busy, and new_object
548  *		must be locked.
549  *
550  */
/*
 * Clean page "m" in place: "new_m" (a fictitious page) is converted into
 * a private shadow of m's physical page, wired, and inserted into
 * "new_object" at "new_offset" so it can stand in for "m" during the
 * pageout I/O.  "m" itself stays resident, marked as cleaning.
 */
static void
vm_pageclean_setup(
	vm_page_t               m,
	vm_page_t               new_m,
	vm_object_t             new_object,
	vm_object_offset_t      new_offset)
{
	assert(!m->vmp_busy);
#if 0
	assert(!m->vmp_cleaning);
#endif

	/* reset the pmap modify state so later redirtying can be detected */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));

	/*
	 * Mark original page as cleaning in place.
	 */
	m->vmp_cleaning = TRUE;
	SET_PAGE_DIRTY(m, FALSE);
	m->vmp_precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	new_m->vmp_free_when_done = TRUE;

	/* page-queue manipulation requires the queues lock */
	vm_page_lockspin_queues();
	vm_page_make_private(new_m, VM_PAGE_GET_PHYS_PAGE(m));
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
	vm_page_unlock_queues();

	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
	assert(!new_m->vmp_wanted);
	new_m->vmp_busy = FALSE;
}
587 
588 /*
589  *	Routine:	vm_pageout_initialize_page
590  *	Purpose:
591  *		Causes the specified page to be initialized in
592  *		the appropriate memory object. This routine is used to push
593  *		pages into a copy-object when they are modified in the
594  *		permanent object.
595  *
596  *		The page is moved to a temporary object and paged out.
597  *
598  *	In/out conditions:
599  *		The page in question must not be on any pageout queues.
600  *		The object to which it belongs must be locked.
601  *		The page must be busy, but not hold a paging reference.
602  *
603  *	Implementation:
604  *		Move this page to a completely new object.
605  */
void
vm_pageout_initialize_page(
	vm_page_t       m)
{
	vm_object_t             object;
	vm_object_offset_t      paging_offset;
	memory_object_t         pager;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	object = VM_PAGE_OBJECT(m);

	assert(m->vmp_busy);
	assert(object->internal);

	/*
	 *	Verify that we really want to clean this page
	 */
	assert(!m->vmp_absent);
	assert(m->vmp_dirty);

	/*
	 *	Create a paging reference to let us play with the object.
	 */
	paging_offset = m->vmp_offset + object->paging_offset;

	if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
		panic("reservation without pageout?"); /* alan */

		/*
		 * NOTE(review): panic() does not return, so the cleanup
		 * below appears to be unreachable defensive code.
		 */
		VM_PAGE_FREE(m);
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page.  This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		panic("missing pager for copy object");

		/* NOTE(review): unreachable after panic(), as above */
		VM_PAGE_FREE(m);
		return;
	}

	/*
	 * set the page for future call to vm_fault_list_request
	 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
	SET_PAGE_DIRTY(m, FALSE);

	/*
	 * keep the object from collapsing or terminating
	 */
	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 *	Write the data to its pager.
	 *	Note that the data is passed by naming the new object,
	 *	not a virtual address; the pager interface has been
	 *	manipulated to use the "internal memory" data type.
	 *	[The object reference from its allocation is donated
	 *	to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}
681 
682 
683 /*
684  * vm_pageout_cluster:
685  *
686  * Given a page, queue it to the appropriate I/O thread,
687  * which will page it out and attempt to clean adjacent pages
688  * in the same operation.
689  *
690  * The object and queues must be locked. We will take a
691  * paging reference to prevent deallocation or collapse when we
692  * release the object lock back at the call site.  The I/O thread
693  * is responsible for consuming this reference
694  *
695  * The page must not be on any pageout queue.
696  */
697 #if DEVELOPMENT || DEBUG
698 vmct_stats_t vmct_stats;
699 
700 int32_t vmct_active = 0;
701 uint64_t vm_compressor_epoch_start = 0;
702 uint64_t vm_compressor_epoch_stop = 0;
703 
704 typedef enum vmct_state_t {
705 	VMCT_IDLE,
706 	VMCT_AWAKENED,
707 	VMCT_ACTIVE,
708 } vmct_state_t;
709 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710 #endif
711 
712 
713 
/*
 * Queue page "m" on pageout queue "q" and wake the matching I/O thread.
 * Object and page queues must be locked (asserted below); an activity
 * reference is taken on the object, to be dropped by the I/O thread
 * (or by vm_pageout_throttle_up() if the page is stolen back).
 */
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
	vm_object_t object = VM_PAGE_OBJECT(m);

	VM_PAGE_CHECK(m);
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Make sure it's OK to page this out.
	 */
	assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->vmp_cleaning && !m->vmp_laundry);
	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);

	/*
	 * protect the object from collapse or termination
	 */
	vm_object_activity_begin(object);


	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->vmp_laundry = TRUE;
	q->pgo_laundry++;

	m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
	vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);

	if (object->internal == TRUE) {
		/* internal pages go to the compressor */
		assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
		m->vmp_busy = TRUE;
		/*
		 * Note: the two #if arms below share the closing brace of
		 * this "if"; exactly one of the two conditions is compiled in.
		 */
#if DEVELOPMENT || DEBUG
		/*
		 * The benchmark queue will be woken up independently by the benchmark
		 * itself.
		 */
		if (q != &vm_pageout_queue_benchmark) {
#else /* DEVELOPMENT || DEBUG */
		if (true) {
#endif /* DEVELOPMENT || DEBUG */
			/*
			 * Wake up the first compressor thread. It will wake subsequent
			 * threads if necessary.
			 */
			sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
			    pgo_iothread_internal_state[0].pgo_iothread);
		}
	} else {
		/* external (file-backed) pages are handled by the single external I/O thread */
		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
	}
	VM_PAGE_CHECK(m);
}
769 
770 void
771 vm_pageout_cluster(vm_page_t m)
772 {
773 	struct          vm_pageout_queue *q;
774 	vm_object_t     object = VM_PAGE_OBJECT(m);
775 	if (object->internal) {
776 		q = &vm_pageout_queue_internal;
777 	} else {
778 		q = &vm_pageout_queue_external;
779 	}
780 	vm_pageout_cluster_to_queue(m, q);
781 }
782 
783 
784 /*
785  * A page is back from laundry or we are stealing it back from
786  * the laundering state.  See if there are some pages waiting to
787  * go to laundry and if we can let some of them go now.
788  *
789  * Object and page queues must be locked.
790  */
void
vm_pageout_throttle_up(
	vm_page_t       m)
{
	struct vm_pageout_queue *q;
	vm_object_t      m_object;

	m_object = VM_PAGE_OBJECT(m);

	assert(m_object != VM_OBJECT_NULL);
	assert(!is_kernel_object(m_object));

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m_object);

	/* internal pages are laundered via the compressor queue, external via the pager queue */
	if (m_object->internal == TRUE) {
		q = &vm_pageout_queue_internal;
	} else {
		q = &vm_pageout_queue_external;
	}

	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
		/*
		 * The page is being stolen back before an I/O thread picked
		 * it up: remove it from the pending queue and drop the
		 * activity reference taken when it was queued.
		 */
		vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
		m->vmp_q_state = VM_PAGE_NOT_ON_Q;

		VM_PAGE_ZERO_PAGEQ_ENTRY(m);

		vm_object_activity_end(m_object);

		VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
	}
	if (m->vmp_laundry == TRUE) {
		/* pgo_laundry count is tied to the laundry bit */
		m->vmp_laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			/* a waiter throttled on this queue sleeps on &pgo_laundry */
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			/* drain waiters use the distinct event (&pgo_laundry + 1) */
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry + 1));
		}
		VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
	}
}
837 
838 
/*
 * Batched form of the laundry accounting in vm_pageout_throttle_up():
 * release "batch_cnt" pages worth of laundry from queue "q" in one shot
 * and wake any throttled or draining waiters.  Page queues must be
 * locked (asserted below).
 */
static void
vm_pageout_throttle_up_batch(
	struct vm_pageout_queue *q,
	int             batch_cnt)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);

	q->pgo_laundry -= batch_cnt;

	if (q->pgo_throttled == TRUE) {
		/* throttled waiters sleep on &pgo_laundry */
		q->pgo_throttled = FALSE;
		thread_wakeup((event_t) &q->pgo_laundry);
	}
	if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
		/* drain waiters use the distinct event (&pgo_laundry + 1) */
		q->pgo_draining = FALSE;
		thread_wakeup((event_t) (&q->pgo_laundry + 1));
	}
}
859 
860 
861 
862 /*
863  * VM memory pressure monitoring.
864  *
865  * vm_pageout_scan() keeps track of the number of pages it considers and
866  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
867  *
868  * compute_memory_pressure() is called every second from compute_averages()
869  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
871  *
872  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
873  * The caller provides the number of seconds ("nsecs") worth of statistics
874  * it wants, up to 30 seconds.
875  * It computes the number of pages reclaimed in the past "nsecs" seconds and
876  * also returns the number of pages the system still needs to reclaim at this
877  * moment in time.
878  */
/*
 * Number of vm_pageout_stats[] buckets: statistics are sampled 8 times a
 * second (see units_of_monitor in mach_vm_pressure_monitor()), keeping
 * 30 seconds of history on DEVELOPMENT/DEBUG kernels versus 1 second on
 * RELEASE kernels, plus one bucket for the sample being accumulated.
 *
 * The expansion is fully parenthesized (CERT C PRE01-C) so the macro
 * remains a single term in any expression, e.g. under "%" or "*".
 */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE    ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE    ((1 * 8) + 1)
#endif
/*
 * One bucket of pageout statistics; vm_pageout_stats[] is a ring of
 * these, indexed by vm_pageout_stat_now (see record_memory_pressure()).
 */
struct vm_pageout_stat {
	/* snapshot of the page-queue sizes for this bucket */
	unsigned long vm_page_active_count;
	unsigned long vm_page_speculative_count;
	unsigned long vm_page_inactive_count;
	unsigned long vm_page_anonymous_count;

	unsigned long vm_page_free_count;
	unsigned long vm_page_wire_count;
	unsigned long vm_page_compressor_count;

	unsigned long vm_page_pages_compressed;
	unsigned long vm_page_pageable_internal_count;
	unsigned long vm_page_pageable_external_count;
	unsigned long vm_page_xpmapped_external_count;

	/* page allocation / free activity during this bucket */
	unsigned int pages_grabbed;
	unsigned int pages_freed;

	/* compressor activity */
	unsigned int pages_compressed;
	unsigned int pages_grabbed_by_compressor;
	unsigned int failed_compressions;

	unsigned int pages_evicted;
	unsigned int pages_purged;

	/* pages examined by the pageout scan */
	unsigned int considered;
	unsigned int considered_bq_internal;
	unsigned int considered_bq_external;

	unsigned int skipped_external;
	unsigned int skipped_internal;
	unsigned int filecache_min_reactivations;

	/* pages freed, broken down by origin; summed into vm_memory_pressure */
	unsigned int freed_speculative;
	unsigned int freed_cleaned;
	unsigned int freed_internal;
	unsigned int freed_external;

	unsigned int cleaned_dirty_external;
	unsigned int cleaned_dirty_internal;

	unsigned int inactive_referenced;
	unsigned int inactive_nolock;
	unsigned int reactivation_limit_exceeded;
	unsigned int forced_inactive_reclaim;

	unsigned int throttled_internal_q;
	unsigned int throttled_external_q;

	/* phantom-cache counters */
	unsigned int phantom_ghosts_found;
	unsigned int phantom_ghosts_added;

	unsigned int vm_page_realtime_count;
	unsigned int forcereclaimed_sharedcache;
	unsigned int forcereclaimed_realtime;
	unsigned int protected_sharedcache;
	unsigned int protected_realtime;

} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
943 
944 unsigned int vm_pageout_stat_now = 0;
945 
946 #define VM_PAGEOUT_STAT_BEFORE(i) \
947 	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
948 #define VM_PAGEOUT_STAT_AFTER(i) \
949 	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
950 
951 #if VM_PAGE_BUCKETS_CHECK
952 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
953 #endif /* VM_PAGE_BUCKETS_CHECK */
954 
955 
void
record_memory_pressure(void);
/*
 * Close out the current vm_pageout_stats[] bucket: publish the number of
 * pages freed during the just-completed bucket as "memory pressure",
 * then advance the ring index and zero the new bucket.
 */
void
record_memory_pressure(void)
{
	unsigned int vm_pageout_next;

#if VM_PAGE_BUCKETS_CHECK
	/* check the consistency of VM page buckets at regular interval */
	static int counter = 0;
	if ((++counter % vm_page_buckets_check_interval) == 0) {
		vm_page_buckets_check();
	}
#endif /* VM_PAGE_BUCKETS_CHECK */

	/* pressure == total pages freed (all origins) in the previous bucket */
	vm_pageout_state.vm_memory_pressure =
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
	    vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;

	/* publish to the commpage so user space can read it cheaply */
	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );

	/* move "now" forward */
	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);

	/* start accumulating the new bucket from zero */
	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));

	vm_pageout_stat_now = vm_pageout_next;
}
986 
987 
988 /*
989  * IMPORTANT
990  * mach_vm_ctl_page_free_wanted() is called indirectly, via
991  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
992  * it must be safe in the restricted stackshot context. Locks and/or
993  * blocking are not allowable.
994  */
995 unsigned int
996 mach_vm_ctl_page_free_wanted(void)
997 {
998 	unsigned int page_free_target, page_free_count, page_free_wanted;
999 
1000 	page_free_target = vm_page_free_target;
1001 	page_free_count = vm_page_free_count;
1002 	if (page_free_target > page_free_count) {
1003 		page_free_wanted = page_free_target - page_free_count;
1004 	} else {
1005 		page_free_wanted = 0;
1006 	}
1007 
1008 	return page_free_wanted;
1009 }
1010 
1011 
1012 /*
1013  * IMPORTANT:
1014  * mach_vm_pressure_monitor() is called when taking a stackshot, with
1015  * wait_for_pressure FALSE, so that code path must remain safe in the
1016  * restricted stackshot context. No blocking or locks are allowable.
1017  * on that code path.
1018  */
1019 
/*
 * mach_vm_pressure_monitor:
 *
 * Report recent page-reclaim activity and the current page shortfall.
 *
 * wait_for_pressure:  if TRUE, block (interruptibly) until the free page
 *	count drops below the free target before reporting.
 * nsecs_monitored:    how far back to look in the vm_pageout_stats ring;
 *	scaled by 8 below since the stats slots appear to advance 8 times
 *	per second (see vm_page_buckets_check_interval's "eighths of a
 *	second" unit) -- TODO confirm the sampling rate.
 * pages_reclaimed_p:  if non-NULL, filled with the number of pages freed
 *	over the monitored window.
 * pages_wanted_p:     if non-NULL, filled with the current shortfall
 *	relative to the free target.
 *
 * Returns KERN_ABORTED if an interruptible wait was interrupted,
 * KERN_SUCCESS otherwise.
 */
kern_return_t
mach_vm_pressure_monitor(
	boolean_t       wait_for_pressure,
	unsigned int    nsecs_monitored,
	unsigned int    *pages_reclaimed_p,
	unsigned int    *pages_wanted_p)
{
	wait_result_t   wr;
	unsigned int    vm_pageout_then, vm_pageout_now;
	unsigned int    pages_reclaimed;
	unsigned int    units_of_monitor;

	units_of_monitor = 8 * nsecs_monitored;
	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory.  We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
			    THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	vm_pageout_now = vm_pageout_stat_now;
	pages_reclaimed = 0;
	/*
	 * Walk backwards through the stats ring, from the slot before "now",
	 * for at most units_of_monitor slots, summing all freed-page counts.
	 * Stops early if we wrap all the way around to "now".
	 */
	for (vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
	    vm_pageout_then != vm_pageout_now &&
	    units_of_monitor-- != 0;
	    vm_pageout_then =
	    VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
		pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
	}
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}
1090 
1091 
1092 
1093 #if DEVELOPMENT || DEBUG
1094 
1095 static void
1096 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1097 
1098 /*
1099  * condition variable used to make sure there is
1100  * only a single sweep going on at a time
1101  */
1102 bool vm_pageout_disconnect_all_pages_active = false;
1103 
1104 void
1105 vm_pageout_disconnect_all_pages()
1106 {
1107 	vm_page_lock_queues();
1108 
1109 	if (vm_pageout_disconnect_all_pages_active) {
1110 		vm_page_unlock_queues();
1111 		return;
1112 	}
1113 	vm_pageout_disconnect_all_pages_active = true;
1114 
1115 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1116 	    vm_page_throttled_count);
1117 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1118 	    vm_page_anonymous_count);
1119 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1120 	    (vm_page_inactive_count - vm_page_anonymous_count));
1121 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1122 	    vm_page_active_count);
1123 #ifdef CONFIG_SECLUDED_MEMORY
1124 	vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1125 	    vm_page_secluded_count);
1126 #endif /* CONFIG_SECLUDED_MEMORY */
1127 	vm_page_unlock_queues();
1128 
1129 	vm_pageout_disconnect_all_pages_active = false;
1130 }
1131 
/*
 * Walk up to "qcount" pages on queue "q" and pmap-disconnect every page
 * that is mapped and in a steady state (not busy/cleaning/laundry/...).
 * Each examined page is rotated to the other end of its queue so the
 * walk makes forward progress.
 *
 * NB: assumes the page_queues lock is held on entry, returns with page
 * queue lock held (it may be dropped and retaken internally).
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose trylock failed */
	vm_object_t     l_object = NULL;        /* object currently locked, cached across iterations */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;     /* pages processed since last lock yield */
	int             try_failed_count = 0;
	int             disconnected_count = 0; /* stats for the trace point below */
	int             paused_count = 0;
	int             object_locked_count = 0;

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_START),
	    q, qcount);

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		if (m_object == VM_OBJECT_NULL) {
			/*
			 * Bumped into a free page. This should only happen on the
			 * secluded queue
			 */
#if CONFIG_SECLUDED_MEMORY
			assert(q == &vm_page_queue_secluded);
#endif /* CONFIG_SECLUDED_MEMORY */
			goto reenter_pg_on_q;
		}

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: reset the trylock back-off counter */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					/* give up on this object for now, move on */
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				paused_count++;

				t_object = m_object;
				continue;
			}
			object_locked_count++;

			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
		    m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
		    m->vmp_free_when_done) {
			/*
			 * page is in a transient state, skip it:
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			/* remove all pmap mappings of this page */
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

			disconnected_count++;
		}
reenter_pg_on_q:
		/* rotate the page to the other end of the queue */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);

		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically let other threads at the page-queues lock */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}

	KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
	    DBG_FUNC_END),
	    q, disconnected_count, object_locked_count, paused_count);
}
1249 
1250 extern const char *proc_best_name(struct proc* proc);
1251 
1252 int
1253 vm_toggle_task_selfdonate_pages(task_t task)
1254 {
1255 	int state = 0;
1256 	if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1257 		printf("VM Donation mode is OFF on the system\n");
1258 		return state;
1259 	}
1260 	if (task != kernel_task) {
1261 		task_lock(task);
1262 		if (!task->donates_own_pages) {
1263 			printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1264 			task->donates_own_pages = true;
1265 			state = 1;
1266 		} else if (task->donates_own_pages) {
1267 			printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1268 			task->donates_own_pages = false;
1269 			state = 0;
1270 		}
1271 		task_unlock(task);
1272 	}
1273 	return state;
1274 }
1275 #endif /* DEVELOPMENT || DEBUG */
1276 
/*
 * Set or clear the "donates own pages" attribute on a task.
 * Donation must be enabled system-wide and the task must not be the
 * kernel task; the task lock serializes updates to the flag.
 */
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
	assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
	assert(task != kernel_task);

	task_lock(task);
	task->donates_own_pages = donate;
	task_unlock(task);
}
1287 
1288 
1289 
1290 static size_t
1291 vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1292 
1293 /*
1294  * condition variable used to make sure there is
1295  * only a single sweep going on at a time
1296  */
1297 boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1298 
1299 
1300 kern_return_t
1301 vm_pageout_anonymous_pages()
1302 {
1303 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1304 		size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
1305 		vm_page_lock_queues();
1306 
1307 		if (vm_pageout_anonymous_pages_active == TRUE) {
1308 			vm_page_unlock_queues();
1309 			return KERN_RESOURCE_SHORTAGE;
1310 		}
1311 		vm_pageout_anonymous_pages_active = TRUE;
1312 		vm_page_unlock_queues();
1313 
1314 		throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1315 		anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1316 		active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1317 
1318 		os_log(OS_LOG_DEFAULT,
1319 		    "%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
1320 		    __func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
1321 
1322 		if (VM_CONFIG_SWAP_IS_PRESENT) {
1323 			vm_consider_swapping();
1324 		}
1325 
1326 		vm_page_lock_queues();
1327 		vm_pageout_anonymous_pages_active = FALSE;
1328 		vm_page_unlock_queues();
1329 		return KERN_SUCCESS;
1330 	} else {
1331 		return KERN_NOT_SUPPORTED;
1332 	}
1333 }
1334 
1335 
/*
 * vm_pageout_page_queue:
 *
 * Examine up to "qcount" pages from the head of queue "q": dirty or
 * precious internal pages are handed to the pageout queue "iq" (for
 * compression), clean non-precious pages are freed outright, and pages
 * that cannot be processed are re-queued.
 *
 * q:         page queue to sweep
 * qcount:    maximum number of pages to examine
 * perf_test: when true (DEVELOPMENT || DEBUG only), feed the benchmark
 *            pageout queue instead of the internal one
 *
 * Returns the number of pages moved to the pageout queue.
 * Called with the page queues unlocked; returns with them unlocked.
 */
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
	vm_page_t       m;
	vm_object_t     t_object = NULL;        /* last object whose trylock failed */
	vm_object_t     l_object = NULL;        /* object currently locked, cached across iterations */
	vm_object_t     m_object = NULL;        /* object of the candidate page */
	int             delayed_unlock = 0;     /* pages processed since last lock yield */
	int             try_failed_count = 0;
	int             refmod_state;
	int             pmap_options;
	struct          vm_pageout_queue *iq;
	ppnum_t         phys_page;
	size_t          pages_moved = 0;


	iq = &vm_pageout_queue_internal;

	vm_page_lock_queues();

#if DEVELOPMENT || DEBUG
	if (perf_test) {
		iq = &vm_pageout_queue_benchmark;
		// ensure the benchmark queue isn't throttled
		iq->pgo_maxlaundry = (unsigned int) qcount;
	}
#endif /* DEVELOPMENT || DEBUG */

	while (qcount && !vm_page_queue_empty(q)) {
		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		if (VM_PAGE_Q_THROTTLED(iq)) {
			/*
			 * the pageout queue is full: ask it to drain and
			 * block until woken, then restart the loop
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			iq->pgo_draining = TRUE;

			/*
			 * NOTE(review): (&pgo_laundry + 1) is presumably the
			 * "drained" wakeup event posted by the pageout thread
			 * -- confirm against the pageout-queue code.
			 */
			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
			vm_page_unlock_queues();

			thread_block(THREAD_CONTINUE_NULL);

			vm_page_lock_queues();
			delayed_unlock = 0;
			continue;
		}
		m = (vm_page_t) vm_page_queue_first(q);
		m_object = VM_PAGE_OBJECT(m);

		/*
		 * check to see if we currently are working
		 * with the same object... if so, we've
		 * already got the lock
		 */
		if (m_object != l_object) {
			if (!m_object->internal) {
				/* only anonymous (internal) pages are compressed */
				goto reenter_pg_on_q;
			}

			/*
			 * the object associated with candidate page is
			 * different from the one we were just working
			 * with... dump the lock if we still own it
			 */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			if (m_object != t_object) {
				/* new object: reset the trylock back-off counter */
				try_failed_count = 0;
			}

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run...
			 */
			if (!vm_object_lock_try_scan(m_object)) {
				if (try_failed_count > 20) {
					goto reenter_pg_on_q;
				}
				vm_page_unlock_queues();
				mutex_pause(try_failed_count++);
				vm_page_lock_queues();
				delayed_unlock = 0;

				t_object = m_object;
				continue;
			}
			l_object = m_object;
		}
		if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
			/*
			 * page is not to be cleaned
			 * put it back on the head of its queue
			 */
			goto reenter_pg_on_q;
		}
		phys_page = VM_PAGE_GET_PHYS_PAGE(m);

		/* pull any hardware ref/mod state into the page's software bits */
		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);

			if (refmod_state & VM_MEM_REFERENCED) {
				m->vmp_reference = TRUE;
			}
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}
		if (m->vmp_reference == TRUE) {
			/* recently referenced: clear the bit and give it another pass */
			m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
			goto reenter_pg_on_q;
		}
		if (m->vmp_pmapped == TRUE) {
			if (m->vmp_dirty || m->vmp_precious) {
				pmap_options = PMAP_OPTIONS_COMPRESSOR;
			} else {
				pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			}
			/* disconnect all mappings, picking up any last-minute modifications */
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
			if (refmod_state & VM_MEM_MODIFIED) {
				SET_PAGE_DIRTY(m, FALSE);
			}
		}

		if (!m->vmp_dirty && !m->vmp_precious) {
			/* clean and not precious: just free it */
			vm_page_unlock_queues();
			VM_PAGE_FREE(m);
			vm_page_lock_queues();
			delayed_unlock = 0;

			goto next_pg;
		}
		if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
			/* the object needs a compressor pager before it can be paged out */
			if (!m_object->pager_initialized) {
				vm_page_unlock_queues();

				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

				if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
				}

				vm_page_lock_queues();
				delayed_unlock = 0;
			}
			if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
				/*
				 * We dropped the page queues lock above, so
				 * "m" might no longer be on this queue...
				 */
				if (m != (vm_page_t) vm_page_queue_first(q)) {
					continue;
				}
				goto reenter_pg_on_q;
			}
			/*
			 * vm_object_compressor_pager_create will drop the object lock
			 * which means 'm' may no longer be valid to use
			 */
			continue;
		}

		if (!perf_test) {
			/*
			 * we've already factored out pages in the laundry which
			 * means this page can't be on the pageout queue so it's
			 * safe to do the vm_page_queues_remove
			 */
			bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
			if (donate) {
				/*
				 * The compressor needs to see this bit to know
				 * where this page needs to land. Also if stolen,
				 * this bit helps put the page back in the right
				 * special queue where it belongs.
				 */
				m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
			}
		} else {
			vm_page_queue_remove(q, m, vmp_pageq);
		}

		LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

		/* hand the page to the pageout (compressor/benchmark) queue */
		vm_pageout_cluster_to_queue(m, iq);

		pages_moved++;
		goto next_pg;

reenter_pg_on_q:
		/* rotate the page to the other end of the queue */
		vm_page_queue_remove(q, m, vmp_pageq);
		vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
		qcount--;
		try_failed_count = 0;

		if (delayed_unlock++ > 128) {
			/* periodically let other threads at the page-queues lock */
			if (l_object != NULL) {
				vm_object_unlock(l_object);
				l_object = NULL;
			}
			lck_mtx_yield(&vm_page_queue_lock);
			delayed_unlock = 0;
		}
	}
	if (l_object != NULL) {
		vm_object_unlock(l_object);
		l_object = NULL;
	}
	vm_page_unlock_queues();
	return pages_moved;
}
1555 
1556 
1557 
1558 /*
1559  * function in BSD to apply I/O throttle to the pageout thread
1560  */
1561 extern void vm_pageout_io_throttle(void);
1562 
/*
 * Undo the "reusable" accounting for page (m) of object (obj): if the
 * page was marked reusable (or the whole object is "all_reusable"),
 * tell the VM that this page's range has been re-used.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
	MACRO_BEGIN                                                     \
	/* \
	 * If a "reusable" page somehow made it back into \
	 * the active queue, it's been re-used and is not \
	 * quite re-usable. \
	 * If the VM object was "all_reusable", consider it \
	 * as "all re-used" instead of converting it to \
	 * "partially re-used", which could be expensive. \
	 */                                                             \
	assert(VM_PAGE_OBJECT((m)) == (obj));                           \
	if ((m)->vmp_reusable ||                                        \
	    (obj)->all_reusable) {                                      \
	        vm_object_reuse_pages((obj),                            \
	                              (m)->vmp_offset,                  \
	                              (m)->vmp_offset + PAGE_SIZE_64,   \
	                              FALSE);                           \
	}                                                               \
	MACRO_END
1582 
1583 
/* batching limits for releasing the page-queues lock during scans */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

/* states for the pageout-scan flow-control machine */
#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

struct flow_control {
	int             state;  /* one of the FCS_* values above */
	mach_timespec_t ts;     /* NOTE(review): presumably the deadline used while delayed -- confirm in vm_pageout_scan() */
};


/* counters for pages rejected/skipped by the "bq" pageout path -- TODO confirm what "bq" abbreviates */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;

#define ANONS_GRABBED_LIMIT     2
1603 
1604 
#if 0
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/* "action" codes for vm_pageout_prepare_to_block() */
#define VM_PAGEOUT_PB_NO_ACTION                         0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD                      2
1613 
1614 
#if 0
/*
 * Currently compiled out.  Flush the locally batched free pages (if any)
 * back to the global free list, or just yield the page-queues lock, and
 * reset the delayed-unlock counter.  vm_pageout_prepare_to_block() below
 * appears to cover this path now.
 */
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
	if (*local_freeq) {
		vm_page_unlock_queues();

		VM_DEBUG_CONSTANT_EVENT(
			vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
			vm_page_free_count, 0, 0, 1);

		vm_page_free_list(*local_freeq, TRUE);

		VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
		    vm_page_free_count, *local_freed, 0, 1);

		*local_freeq = NULL;
		*local_freed = 0;

		vm_page_lock_queues();
	} else {
		lck_mtx_yield(&vm_page_queue_lock);
	}
	*delayed_unlock = 1;
}
#endif
1641 
1642 
1643 static void
1644 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1645     vm_page_t *local_freeq, int *local_freed, int action)
1646 {
1647 	vm_page_unlock_queues();
1648 
1649 	if (*object != NULL) {
1650 		vm_object_unlock(*object);
1651 		*object = NULL;
1652 	}
1653 	if (*local_freeq) {
1654 		vm_page_free_list(*local_freeq, TRUE);
1655 
1656 		*local_freeq = NULL;
1657 		*local_freed = 0;
1658 	}
1659 	*delayed_unlock = 1;
1660 
1661 	switch (action) {
1662 	case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1663 		vm_consider_waking_compactor_swapper();
1664 		break;
1665 	case VM_PAGEOUT_PB_THREAD_YIELD:
1666 		thread_yield_internal(1);
1667 		break;
1668 	case VM_PAGEOUT_PB_NO_ACTION:
1669 	default:
1670 		break;
1671 	}
1672 	vm_page_lock_queues();
1673 }
1674 
1675 
/* snapshot of the cumulative vm_pageout_vminfo counters at the previous sample (see update_vm_info()) */
static struct vm_pageout_vminfo last;

/* previous sample of the cumulative vm_page_grab_count counter */
uint64_t last_vm_page_pages_grabbed = 0;

extern  uint32_t c_segment_pages_compressed;

extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;
1684 
1685 void
1686 update_vm_info(void)
1687 {
1688 	unsigned long tmp;
1689 	uint64_t tmp64;
1690 
1691 	vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1692 	vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1693 	vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1694 	vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1695 
1696 	vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1697 	vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1698 	vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1699 
1700 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1701 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1702 	vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1703 	vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1704 	vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1705 
1706 	tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1707 	vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1708 	last.vm_pageout_considered_page = tmp;
1709 
1710 	tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1711 	vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1712 	last.vm_pageout_compressions = tmp64;
1713 
1714 	tmp = vm_pageout_vminfo.vm_compressor_failed;
1715 	vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1716 	last.vm_compressor_failed = tmp;
1717 
1718 	tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1719 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1720 	last.vm_compressor_pages_grabbed = tmp64;
1721 
1722 	tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1723 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1724 	last.vm_phantom_cache_found_ghost = tmp;
1725 
1726 	tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1727 	vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1728 	last.vm_phantom_cache_added_ghost = tmp;
1729 
1730 	tmp64 = counter_load(&vm_page_grab_count);
1731 	vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1732 	last_vm_page_pages_grabbed = tmp64;
1733 
1734 	tmp = vm_pageout_vminfo.vm_page_pages_freed;
1735 	vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1736 	last.vm_page_pages_freed = tmp;
1737 
1738 
1739 	if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1740 		tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1741 		vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1742 		last.vm_pageout_pages_evicted = tmp;
1743 
1744 		tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1745 		vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1746 		last.vm_pageout_pages_purged = tmp;
1747 
1748 		tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1749 		vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1750 		last.vm_pageout_freed_speculative = tmp;
1751 
1752 		tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1753 		vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1754 		last.vm_pageout_freed_external = tmp;
1755 
1756 		tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1757 		vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1758 		last.vm_pageout_inactive_referenced = tmp;
1759 
1760 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1761 		vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1762 		last.vm_pageout_scan_inactive_throttled_external = tmp;
1763 
1764 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1765 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1766 		last.vm_pageout_inactive_dirty_external = tmp;
1767 
1768 		tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1769 		vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1770 		last.vm_pageout_freed_cleaned = tmp;
1771 
1772 		tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1773 		vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1774 		last.vm_pageout_inactive_nolock = tmp;
1775 
1776 		tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1777 		vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1778 		last.vm_pageout_scan_inactive_throttled_internal = tmp;
1779 
1780 		tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1781 		vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1782 		last.vm_pageout_skipped_external = tmp;
1783 
1784 		tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1785 		vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1786 		last.vm_pageout_skipped_internal = tmp;
1787 
1788 		tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1789 		vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1790 		last.vm_pageout_reactivation_limit_exceeded = tmp;
1791 
1792 		tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1793 		vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1794 		last.vm_pageout_inactive_force_reclaim = tmp;
1795 
1796 		tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1797 		vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1798 		last.vm_pageout_freed_internal = tmp;
1799 
1800 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1801 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1802 		last.vm_pageout_considered_bq_internal = tmp;
1803 
1804 		tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1805 		vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1806 		last.vm_pageout_considered_bq_external = tmp;
1807 
1808 		tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1809 		vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1810 		last.vm_pageout_filecache_min_reactivated = tmp;
1811 
1812 		tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1813 		vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1814 		last.vm_pageout_inactive_dirty_internal = tmp;
1815 
1816 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1817 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1818 		last.vm_pageout_forcereclaimed_sharedcache = tmp;
1819 
1820 		tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1821 		vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1822 		last.vm_pageout_forcereclaimed_realtime = tmp;
1823 
1824 		tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1825 		vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1826 		last.vm_pageout_protected_sharedcache = tmp;
1827 
1828 		tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1829 		vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1830 		last.vm_pageout_protected_realtime = tmp;
1831 	}
1832 
1833 	KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
1834 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1835 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1836 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1837 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
1838 
1839 	KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
1840 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1841 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1842 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
1843 
1844 	KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
1845 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1846 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1847 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1848 	    vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
1849 
1850 	if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1851 	    vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1852 	    vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1853 		KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
1854 		    vm_pageout_stats[vm_pageout_stat_now].considered,
1855 		    vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1856 		    vm_pageout_stats[vm_pageout_stat_now].freed_external,
1857 		    vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
1858 
1859 		KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
1860 		    vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1861 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1862 		    vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1863 		    vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
1864 
1865 		KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
1866 		    vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1867 		    vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1868 		    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1869 		    vm_pageout_stats[vm_pageout_stat_now].skipped_external);
1870 
1871 		KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
1872 		    vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1873 		    vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1874 		    vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1875 		    vm_pageout_stats[vm_pageout_stat_now].freed_internal);
1876 
1877 		KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
1878 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1879 		    vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1880 		    vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1881 		    vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
1882 
1883 		KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
1884 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1885 		    vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1886 		    vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1887 		    vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
1888 	}
1889 	KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
1890 	    vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1891 	    vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1892 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1893 	    vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
1894 
1895 	record_memory_pressure();
1896 }
1897 
1898 extern boolean_t hibernation_vmqueues_inspection;
1899 
1900 /*
1901  * Return values for functions called by vm_pageout_scan
1902  * that control its flow.
1903  *
1904  * PROCEED -- vm_pageout_scan will keep making forward progress.
1905  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1906  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1907  */
1908 
1909 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1910 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1911 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1912 
1913 /*
1914  * This function is called only from vm_pageout_scan and
1915  * it moves overflow secluded pages (one-at-a-time) to the
1916  * batched 'local' free Q or active Q.
1917  */
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
	/*
	 * Deal with secluded_q overflow.
	 */
	if (vm_page_secluded_count > vm_page_secluded_target) {
		vm_page_t secluded_page;

		/*
		 * SECLUDED_AGING_BEFORE_ACTIVE:
		 * Excess secluded pages go to the active queue and
		 * will later go to the inactive queue.
		 */
		/* sanity: free + inuse must account for every secluded page */
		assert((vm_page_secluded_count_free +
		    vm_page_secluded_count_inuse) ==
		    vm_page_secluded_count);
		/* take the oldest page (head) off the secluded queue */
		secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
		assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);

		vm_page_queues_remove(secluded_page, FALSE);
		assert(!vm_page_is_fictitious(secluded_page));
		assert(!VM_PAGE_WIRED(secluded_page));

		if (secluded_page->vmp_object == 0) {
			/* transfer to free queue */
			assert(secluded_page->vmp_busy);
			/* push onto the caller's batched local free list */
			secluded_page->vmp_snext = *local_freeq;
			*local_freeq = secluded_page;
			*local_freed += 1;
		} else {
			/* transfer to head of active queue */
			vm_page_enqueue_active(secluded_page, FALSE);
			secluded_page = VM_PAGE_NULL;
		}
	}
#else /* CONFIG_SECLUDED_MEMORY */

	/* no secluded memory support: parameters intentionally unused */
#pragma unused(local_freeq)
#pragma unused(local_freed)

	return;

#endif /* CONFIG_SECLUDED_MEMORY */
}
1964 
1965 
1966 /*
1967  * This function is called only from vm_pageout_scan and
1968  * it initializes the loop targets for vm_pageout_scan().
1969  */
1970 static void
1971 vps_init_page_targets(void)
1972 {
1973 	/*
1974 	 * LD TODO: Other page targets should be calculated here too.
1975 	 */
1976 	vm_page_anonymous_min = vm_page_inactive_target / 20;
1977 
1978 	if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1979 		vm_pageout_state.vm_page_speculative_percentage = 50;
1980 	} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1981 		vm_pageout_state.vm_page_speculative_percentage = 1;
1982 	}
1983 
1984 	vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1985 	    vm_page_inactive_count);
1986 }
1987 
1988 /*
1989  * This function is called only from vm_pageout_scan and
1990  * it purges a single VM object at-a-time and will either
 * make vm_pageout_scan() restart the loop or keep moving forward.
1992  */
1993 static int
1994 vps_purge_object()
1995 {
1996 	int             force_purge;
1997 
1998 	assert(available_for_purge >= 0);
1999 	force_purge = 0; /* no force-purging */
2000 
2001 #if VM_PRESSURE_EVENTS
2002 	vm_pressure_level_t pressure_level;
2003 
2004 	pressure_level = memorystatus_vm_pressure_level;
2005 
2006 	if (pressure_level > kVMPressureNormal) {
2007 		if (pressure_level >= kVMPressureCritical) {
2008 			force_purge = vm_pageout_state.memorystatus_purge_on_critical;
2009 		} else if (pressure_level >= kVMPressureUrgent) {
2010 			force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
2011 		} else if (pressure_level >= kVMPressureWarning) {
2012 			force_purge = vm_pageout_state.memorystatus_purge_on_warning;
2013 		}
2014 	}
2015 #endif /* VM_PRESSURE_EVENTS */
2016 
2017 	if (available_for_purge || force_purge) {
2018 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2019 
2020 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2021 		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2022 			VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
2023 			VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2024 			memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2025 
2026 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2027 		}
2028 		VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2029 		memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2030 	}
2031 
2032 	return VM_PAGEOUT_SCAN_PROCEED;
2033 }
2034 
2035 /*
2036  * This function is called only from vm_pageout_scan and
2037  * it will try to age the next speculative Q if the oldest
2038  * one is empty.
2039  */
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
#define DELAY_SPECULATIVE_AGE   1000

	/*
	 * try to pull pages from the aging bins...
	 * see vm_page_internal.h for an explanation of how
	 * this mechanism works
	 */
	boolean_t                       can_steal = FALSE;
	int                             num_scanned_queues;
	static int                      delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop.*/
	mach_timespec_t                 ts;
	struct vm_speculative_age_q     *aq;
	struct vm_speculative_age_q     *sq;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	aq = &vm_page_queue_speculative[speculative_steal_index];

	/*
	 * Advance speculative_steal_index (with wrap-around to
	 * VM_PAGE_MIN_SPECULATIVE_AGE_Q) until a non-empty aging bin
	 * is found, or every bin has been examined.
	 */
	num_scanned_queues = 0;
	while (vm_page_queue_empty(&aq->age_q) &&
	    num_scanned_queues++ != vm_page_max_speculative_age_q) {
		speculative_steal_index++;

		if (speculative_steal_index > vm_page_max_speculative_age_q) {
			speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
		}

		aq = &vm_page_queue_speculative[speculative_steal_index];
	}

	if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
		/*
		 * XXX We've scanned all the speculative
		 * queues but still haven't found one
		 * that is not empty, even though
		 * vm_page_speculative_count is not 0.
		 */
		/* if the already-aged queue has pages, let the main loop use those */
		if (!vm_page_queue_empty(&sq->age_q)) {
			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
		}
#if DEVELOPMENT || DEBUG
		panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
		/* readjust... */
		vm_page_speculative_count = 0;
		/* ... and continue */
		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
	}

	/* steal right away when over target, or when the caller forces aging */
	if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
		can_steal = TRUE;
	} else {
		if (!delay_speculative_age) {
			mach_timespec_t ts_fully_aged;

			/*
			 * Compute the moment the bin becomes "fully aged":
			 * its birth timestamp (aq->age_ts) plus the total
			 * aging interval (# of bins * per-bin age in msecs).
			 */
			ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
			ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
			    * 1000 * NSEC_PER_USEC;

			ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);

			clock_sec_t sec;
			clock_nsec_t nsec;
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			/* steal if "now" is at or past the fully-aged deadline */
			if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
				can_steal = TRUE;
			} else {
				/* not aged yet: arm the backoff counter below */
				delay_speculative_age++;
			}
		} else {
			/*
			 * Backoff: skip the clock check for the next
			 * DELAY_SPECULATIVE_AGE passes through the loop,
			 * then re-enable it by resetting the counter.
			 */
			delay_speculative_age++;
			if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
				delay_speculative_age = 0;
			}
		}
	}
	if (can_steal == TRUE) {
		/* move the chosen bin's pages onto the aged queue (sq) */
		vm_page_speculate_ageit(aq);
	}

	return VM_PAGEOUT_SCAN_PROCEED;
}
2128 
2129 /*
2130  * This function is called only from vm_pageout_scan and
2131  * it evicts a single VM object from the cache.
2132  */
2133 static int inline
2134 vps_object_cache_evict(vm_object_t *object_to_unlock)
2135 {
2136 	static int                      cache_evict_throttle = 0;
2137 	struct vm_speculative_age_q     *sq;
2138 
2139 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2140 
2141 	if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2142 		int     pages_evicted;
2143 
2144 		if (*object_to_unlock != NULL) {
2145 			vm_object_unlock(*object_to_unlock);
2146 			*object_to_unlock = NULL;
2147 		}
2148 		KDBG(0x13001ec | DBG_FUNC_START);
2149 
2150 		pages_evicted = vm_object_cache_evict(100, 10);
2151 
2152 		KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
2153 
2154 		if (pages_evicted) {
2155 			vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2156 
2157 			VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2158 			    vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2159 			memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2160 
2161 			/*
2162 			 * we just freed up to 100 pages,
2163 			 * so go back to the top of the main loop
2164 			 * and re-evaulate the memory situation
2165 			 */
2166 			return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2167 		} else {
2168 			cache_evict_throttle = 1000;
2169 		}
2170 	}
2171 	if (cache_evict_throttle) {
2172 		cache_evict_throttle--;
2173 	}
2174 
2175 	return VM_PAGEOUT_SCAN_PROCEED;
2176 }
2177 
2178 
2179 /*
2180  * This function is called only from vm_pageout_scan and
2181  * it calculates the filecache min. that needs to be maintained
2182  * as we start to steal pages.
2183  */
2184 static void
2185 vps_calculate_filecache_min(void)
2186 {
2187 	int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2188 
2189 #if CONFIG_JETSAM
2190 	/*
2191 	 * don't let the filecache_min fall below 15% of available memory
2192 	 * on systems with an active compressor that isn't nearing its
2193 	 * limits w/r to accepting new data
2194 	 *
2195 	 * on systems w/o the compressor/swapper, the filecache is always
2196 	 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2197 	 * since most (if not all) of the anonymous pages are in the
2198 	 * throttled queue (which isn't counted as available) which
2199 	 * effectively disables this filter
2200 	 */
2201 	if (vm_compressor_low_on_space() || divisor == 0) {
2202 		vm_pageout_state.vm_page_filecache_min = 0;
2203 	} else {
2204 		vm_pageout_state.vm_page_filecache_min =
2205 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2206 	}
2207 #else
2208 	if (vm_compressor_out_of_space() || divisor == 0) {
2209 		vm_pageout_state.vm_page_filecache_min = 0;
2210 	} else {
2211 		/*
2212 		 * don't let the filecache_min fall below the specified critical level
2213 		 */
2214 		vm_pageout_state.vm_page_filecache_min =
2215 		    ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2216 	}
2217 #endif
2218 	if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2219 		vm_pageout_state.vm_page_filecache_min = 0;
2220 	}
2221 }
2222 
2223 /*
2224  * This function is called only from vm_pageout_scan and
2225  * it updates the flow control time to detect if VM pageoutscan
2226  * isn't making progress.
2227  */
2228 static void
2229 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2230 {
2231 	mach_timespec_t ts;
2232 	clock_sec_t sec;
2233 	clock_nsec_t nsec;
2234 
2235 	ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2236 	ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2237 	clock_get_system_nanotime(&sec, &nsec);
2238 	flow_control->ts.tv_sec = (unsigned int) sec;
2239 	flow_control->ts.tv_nsec = nsec;
2240 	ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2241 
2242 	flow_control->state = FCS_DELAYED;
2243 
2244 	vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2245 }
2246 
2247 /*
2248  * This function is called only from vm_pageout_scan and
2249  * it is the flow control logic of VM pageout scan which
2250  * controls if it should block and for how long.
2251  * Any blocking of vm_pageout_scan happens ONLY in this function.
2252  */
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
    vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
	boolean_t       exceeded_burst_throttle = FALSE;
	unsigned int    msecs = 0;
	uint32_t        inactive_external_count;
	mach_timespec_t ts;
	struct  vm_pageout_queue *iq;
	struct  vm_pageout_queue *eq;
	struct  vm_speculative_age_q *sq;

	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external; /* NOTE(review): eq is assigned but not read below */
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];

	/*
	 * Sometimes we have to pause:
	 *	1) No inactive pages - nothing to do.
	 *	2) Loop control - no acceptable pages found on the inactive queue
	 *         within the last vm_pageout_burst_inactive_throttle iterations
	 *	3) Flow control - default pageout queue is full
	 */
	if (vm_page_queue_empty(&vm_page_queue_inactive) &&
	    vm_page_queue_empty(&vm_page_queue_anonymous) &&
	    vm_page_queue_empty(&vm_page_queue_cleaned) &&
	    vm_page_queue_empty(&sq->age_q)) {
		/* case 1: no reclaim candidates at all */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_empty_wait;
	} else if (inactive_burst_count >=
	    MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
	    (vm_page_inactive_count +
	    vm_page_speculative_count))) {
		/* case 2: too many consecutive iterations without finding an acceptable page */
		VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
		msecs = vm_pageout_state.vm_pageout_burst_wait;

		exceeded_burst_throttle = TRUE;
	} else if (VM_PAGE_Q_THROTTLED(iq) &&
	    VM_DYNAMIC_PAGING_ENABLED()) {
		/* case 3: the internal (compressor) pageout queue is full */
		clock_sec_t sec;
		clock_nsec_t nsec;

		switch (flow_control->state) {
		case FCS_IDLE:
			if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
			    vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
				/*
				 * since the compressor is running independently of vm_pageout_scan
				 * let's not wait for it just yet... as long as we have a healthy supply
				 * of filecache pages to work with, let's keep stealing those.
				 */
				inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

				if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
				    (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
					/* force the caller onto file-backed pages instead of blocking */
					*anons_grabbed = ANONS_GRABBED_LIMIT;
					VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
					return VM_PAGEOUT_SCAN_PROCEED;
				}
			}

			/* start the deadlock-detection timer and transition to FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;

		case FCS_DELAYED:
			clock_get_system_nanotime(&sec, &nsec);
			ts.tv_sec = (unsigned int) sec;
			ts.tv_nsec = nsec;

			/* has the deadlock-detection deadline passed? */
			if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
				/*
				 * the pageout thread for the default pager is potentially
				 * deadlocked since the
				 * default pager queue has been throttled for more than the
				 * allowable time... we need to move some clean pages or dirty
				 * pages belonging to the external pagers if they aren't throttled
				 * vm_page_free_wanted represents the number of threads currently
				 * blocked waiting for pages... we'll move one page for each of
				 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FSC_DELAYED state
				 * with a new timeout target since we have no way of knowing
				 * whether we've broken the deadlock except through observation
				 * of the queue associated with the default pager... we need to
				 * stop moving pages and allow the system to run to see what
				 * state it settles into.
				 */

				*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
				    vm_page_free_wanted + vm_page_free_wanted_privileged;
				VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
				flow_control->state = FCS_DEADLOCK_DETECTED;
				/* wake the GC thread to help relieve the logjam */
				sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
				return VM_PAGEOUT_SCAN_PROCEED;
			}
			/*
			 * just resniff instead of trying
			 * to compute a new delay time... we're going to be
			 * awakened immediately upon a laundry completion,
			 * so we won't wait any longer than necessary
			 */
			msecs = vm_pageout_state.vm_pageout_idle_wait;
			break;

		case FCS_DEADLOCK_DETECTED:
			/* keep going until the deadlock-relief page quota is exhausted */
			if (*vm_pageout_deadlock_target) {
				return VM_PAGEOUT_SCAN_PROCEED;
			}

			/* quota done: re-arm the timer and go back to FCS_DELAYED */
			vps_flow_control_reset_deadlock_timer(flow_control);
			msecs = vm_pageout_state.vm_pageout_deadlock_wait;

			break;
		}
	} else {
		/*
		 * No need to pause...
		 */
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	vm_pageout_scan_wants_object = VM_OBJECT_NULL;

	/* flush the local free batch and drop the object lock before blocking */
	vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
	    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);

	if (vm_page_free_count >= vm_page_free_target) {
		/*
		 * we're here because
		 *  1) someone else freed up some pages while we had
		 *     the queues unlocked above
		 * and we've hit one of the 3 conditions that
		 * cause us to pause the pageout scan thread
		 *
		 * since we already have enough free pages,
		 * let's avoid stalling and return normally
		 *
		 * before we return, make sure the pageout I/O threads
		 * are running throttled in case there are still requests
		 * in the laundry... since we have enough free pages
		 * we don't need the laundry to be cleaned in a timely
		 * fashion... so let's avoid interfering with foreground
		 * activity
		 *
		 * we don't want to hold vm_page_queue_free_lock when
		 * calling vm_pageout_adjust_eq_iothrottle (since it
		 * may cause other locks to be taken), we do the intitial
		 * check outside of the lock.  Once we take the lock,
		 * we recheck the condition since it may have changed.
		 * if it has, no problem, we will make the threads
		 * non-throttled before actually blocking
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
	}
	vm_free_page_lock();

	if (vm_page_free_count >= vm_page_free_target &&
	    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
		/*
		 * NOTE: this path returns with vm_free_page_lock still held;
		 * the DONE_RETURN path in the caller is responsible for it.
		 */
		return VM_PAGEOUT_SCAN_DONE_RETURN;
	}
	vm_free_page_unlock();

	if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
		/*
		 * we're most likely about to block due to one of
		 * the 3 conditions that cause vm_pageout_scan to
		 * not be able to make forward progress w/r
		 * to providing new pages to the free queue,
		 * so unthrottle the I/O threads in case we
		 * have laundry to be cleaned... it needs
		 * to be completed ASAP.
		 *
		 * even if we don't block, we want the io threads
		 * running unthrottled since the sum of free +
		 * clean pages is still under our free target
		 */
		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
	}
	if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
		/*
		 * if we get here we're below our free target and
		 * we're stalling due to a full laundry queue or
		 * we don't have any inactive pages other then
		 * those in the clean queue...
		 * however, we have pages on the clean queue that
		 * can be moved to the free queue, so let's not
		 * stall the pageout scan
		 */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}
	if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
		/* the internal queue drained while we were getting ready to block */
		flow_control->state = FCS_IDLE;
		return VM_PAGEOUT_SCAN_PROCEED;
	}

	VM_CHECK_MEMORYSTATUS;

	if (flow_control->state != FCS_IDLE) {
		VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
	}

	/* block (interruptibly) for up to 'msecs', woken early by laundry completion */
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);

	vm_page_unlock_queues();

	assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);

	thread_block(THREAD_CONTINUE_NULL);

	VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
	    iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
	memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);

	vm_page_lock_queues();

	iq->pgo_throttled = FALSE;

	/* memory conditions may have changed while blocked: recompute targets */
	vps_init_page_targets();

	return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
2481 
2482 extern boolean_t vm_darkwake_mode;
2483 /*
2484  * This function is called only from vm_pageout_scan and
2485  * it will find and return the most appropriate page to be
2486  * reclaimed.
2487  */
static int
vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
    boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
{
	vm_page_t                       m = NULL;
	vm_object_t                     m_object = VM_OBJECT_NULL;
	uint32_t                        inactive_external_count;
	struct vm_speculative_age_q     *sq;
	struct vm_pageout_queue         *iq;
	int                             retval = VM_PAGEOUT_SCAN_PROCEED;

	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
	iq = &vm_pageout_queue_internal;

	*is_page_from_bg_q = FALSE;

	m = NULL;
	m_object = VM_OBJECT_NULL;

	if (VM_DYNAMIC_PAGING_ENABLED()) {
		assert(vm_page_throttled_count == 0);
		assert(vm_page_queue_empty(&vm_page_queue_throttled));
	}

	/*
	 * Try for a clean-queue inactive page.
	 * These are pages that vm_pageout_scan tried to steal earlier, but
	 * were dirty and had to be cleaned.  Pick them up now that they are clean.
	 */
	if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);

		goto found_page;
	}

	/*
	 * The next most eligible pages are ones we paged in speculatively,
	 * but which have not yet been touched and have been aged out.
	 */
	if (!vm_page_queue_empty(&sq->age_q)) {
		m = (vm_page_t) vm_page_queue_first(&sq->age_q);

		assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);

		/* skip a dirty speculative page when anonymous pages are being forced */
		if (!m->vmp_dirty || force_anonymous == FALSE) {
			goto found_page;
		} else {
			m = NULL;
		}
	}

#if !CONFIG_JETSAM
	/* next preference: ripe pages on the donate queue (non-jetsam systems) */
	if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
		if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
			assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			goto found_page;
		}
	}
#endif /* !CONFIG_JETSAM */

	/* next preference: background-queue pages, once over the bg target */
	if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
		vm_object_t     bg_m_object = NULL;

		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);

		bg_m_object = VM_PAGE_OBJECT(m);

		if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
			/*
			 * This page is on the background queue
			 * but not on a pageable queue OR is busy during
			 * darkwake mode when the target is artificially lowered.
			 * If it is busy during darkwake mode, and we don't skip it,
			 * we will just swing back around and try again with the same
			 * queue and might hit the same page or its neighbor in a
			 * similar state. Both of these are transient states and will
			 * get resolved, but, at this point let's ignore this page.
			 */
			if (vm_darkwake_mode && m->vmp_busy) {
				if (bg_m_object->internal) {
					vm_pageout_skipped_bq_internal++;
				} else {
					vm_pageout_skipped_bq_external++;
				}
			}
		} else if (force_anonymous == FALSE || bg_m_object->internal) {
			if (bg_m_object->internal &&
			    (VM_PAGE_Q_THROTTLED(iq) ||
			    vm_compressor_out_of_space() == TRUE ||
			    vm_page_free_count < (vm_page_free_reserved / 4))) {
				/* compressor can't take it right now: skip */
				vm_pageout_skipped_bq_internal++;
			} else {
				*is_page_from_bg_q = TRUE;

				if (bg_m_object->internal) {
					vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
				} else {
					vm_pageout_vminfo.vm_pageout_considered_bq_external++;
				}
				goto found_page;
			}
		}
	}

	/*
	 * Decide whether to steal anonymous or file-backed pages based on
	 * how healthy the filecache currently is.
	 */
	inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;

	if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
	    (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
		*grab_anonymous = TRUE;
		*anons_grabbed = 0;

		if (VM_CONFIG_SWAP_IS_ACTIVE) {
			vm_pageout_vminfo.vm_pageout_skipped_external++;
		} else {
			if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
				/*
				 * No swap and we are in dangerously low levels of free memory.
				 * If we keep going ahead with anonymous pages, we are going to run into a situation
				 * where the compressor will be stuck waiting for free pages (if it isn't already).
				 *
				 * So, pick a file backed page...
				 */
				*grab_anonymous = FALSE;
				*anons_grabbed = ANONS_GRABBED_LIMIT;
				vm_pageout_vminfo.vm_pageout_skipped_internal++;
			}
		}
		goto want_anonymous;
	}
	*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);

#if CONFIG_JETSAM
	/* If the file-backed pool has accumulated
	 * significantly more pages than the jetsam
	 * threshold, prefer to reclaim those
	 * inline to minimise compute overhead of reclaiming
	 * anonymous pages.
	 * This calculation does not account for the CPU local
	 * external page queues, as those are expected to be
	 * much smaller relative to the global pools.
	 */

	struct vm_pageout_queue *eq = &vm_pageout_queue_external;

	if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
		if (vm_page_pageable_external_count >
		    vm_pageout_state.vm_page_filecache_min) {
			if ((vm_page_pageable_external_count *
			    vm_pageout_memorystatus_fb_factor_dr) >
			    (memorystatus_get_critical_page_shortage_threshold() *
			    vm_pageout_memorystatus_fb_factor_nr)) {
				*grab_anonymous = FALSE;

				VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
			}
		}
		if (*grab_anonymous) {
			VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
		}
	}
#endif /* CONFIG_JETSAM */

want_anonymous:
	if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
		if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
			m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);

			assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
			*anons_grabbed = 0;

			if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
				if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
					/* reactivate 99 of every 100 file pages taken below the filecache min */
					if ((++(*reactivated_this_call) % 100)) {
						vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;

						vm_page_activate(m);
						counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
						/*
						 * NOTE(review): this branch appears unreachable with
						 * *is_page_from_bg_q == TRUE (the bg-q path above jumps
						 * straight to found_page), and m_object is still
						 * VM_OBJECT_NULL here — confirm before relying on it.
						 */
						if (*is_page_from_bg_q == TRUE) {
							if (m_object->internal) {
								vm_pageout_rejected_bq_internal++;
							} else {
								vm_pageout_rejected_bq_external++;
							}
						}
#endif /* DEVELOPMENT || DEBUG */
						vm_pageout_state.vm_pageout_inactive_used++;

						m = NULL;
						retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;

						goto found_page;
					}

					/*
					 * steal 1 of the file backed pages even if
					 * we are under the limit that has been set
					 * for a healthy filecache
					 */
				}
			}
			goto found_page;
		}
	}
	/* fall back to (or continue with) anonymous pages */
	if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);

		assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
		*anons_grabbed += 1;

		goto found_page;
	}

	/* nothing eligible on any queue */
	m = NULL;

found_page:
	*victim_page = m;

	return retval;
}
2711 
2712 /*
2713  * This function is called only from vm_pageout_scan and
2714  * it will put a page back on the active/inactive queue
2715  * if we can't reclaim it for some reason.
2716  */
2717 static void
2718 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2719 {
2720 	if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2721 		vm_page_enqueue_inactive(m, FALSE);
2722 	} else {
2723 		vm_page_activate(m);
2724 	}
2725 
2726 #if DEVELOPMENT || DEBUG
2727 	vm_object_t m_object = VM_PAGE_OBJECT(m);
2728 
2729 	if (page_from_bg_q == TRUE) {
2730 		if (m_object->internal) {
2731 			vm_pageout_rejected_bq_internal++;
2732 		} else {
2733 			vm_pageout_rejected_bq_external++;
2734 		}
2735 	}
2736 #endif /* DEVELOPMENT || DEBUG */
2737 }
2738 
2739 /*
2740  * This function is called only from vm_pageout_scan and
2741  * it will try to grab the victim page's VM object (m_object)
2742  * which differs from the previous victim page's object (object).
2743  */
2744 static int
2745 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2746 {
2747 	struct vm_speculative_age_q *sq;
2748 
2749 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2750 
2751 	/*
2752 	 * the object associated with candidate page is
2753 	 * different from the one we were just working
2754 	 * with... dump the lock if we still own it
2755 	 */
2756 	if (*object != NULL) {
2757 		vm_object_unlock(*object);
2758 		*object = NULL;
2759 	}
2760 	/*
2761 	 * Try to lock object; since we've alread got the
2762 	 * page queues lock, we can only 'try' for this one.
2763 	 * if the 'try' fails, we need to do a mutex_pause
2764 	 * to allow the owner of the object lock a chance to
2765 	 * run... otherwise, we're likely to trip over this
2766 	 * object in the same state as we work our way through
2767 	 * the queue... clumps of pages associated with the same
2768 	 * object are fairly typical on the inactive and active queues
2769 	 */
2770 	if (!vm_object_lock_try_scan(m_object)) {
2771 		vm_page_t m_want = NULL;
2772 
2773 		vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2774 
2775 		if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2776 			VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2777 		}
2778 
2779 		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2780 
2781 		m->vmp_reference = FALSE;
2782 
2783 		if (!m_object->object_is_shared_cache) {
2784 			/*
2785 			 * don't apply this optimization if this is the shared cache
2786 			 * object, it's too easy to get rid of very hot and important
2787 			 * pages...
2788 			 * m->vmp_object must be stable since we hold the page queues lock...
2789 			 * we can update the scan_collisions field sans the object lock
2790 			 * since it is a separate field and this is the only spot that does
2791 			 * a read-modify-write operation and it is never executed concurrently...
2792 			 * we can asynchronously set this field to 0 when creating a UPL, so it
2793 			 * is possible for the value to be a bit non-determistic, but that's ok
2794 			 * since it's only used as a hint
2795 			 */
2796 			m_object->scan_collisions = 1;
2797 		}
2798 		if (page_from_bg_q) {
2799 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2800 		} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2801 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2802 		} else if (!vm_page_queue_empty(&sq->age_q)) {
2803 			m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2804 		} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2805 		    !vm_page_queue_empty(&vm_page_queue_inactive)) {
2806 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2807 		} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2808 			m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2809 		}
2810 
2811 		/*
2812 		 * this is the next object we're going to be interested in
2813 		 * try to make sure its available after the mutex_pause
2814 		 * returns control
2815 		 */
2816 		if (m_want) {
2817 			vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2818 		}
2819 
2820 		vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2821 
2822 		return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2823 	} else {
2824 		*object = m_object;
2825 		vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2826 	}
2827 
2828 	return VM_PAGEOUT_SCAN_PROCEED;
2829 }
2830 
2831 /*
2832  * This function is called only from vm_pageout_scan and
2833  * it notices that pageout scan may be rendered ineffective
2834  * due to a FS deadlock and will jetsam a process if possible.
2835  * If jetsam isn't supported, it'll move the page to the active
2836  * queue to try and get some different pages pushed onwards so
2837  * we can try to get out of this scenario.
2838  */
2839 static void
2840 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2841     boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2842 {
2843 	struct  vm_pageout_queue *eq;
2844 	vm_object_t cur_object = VM_OBJECT_NULL;
2845 
2846 	cur_object = *object;
2847 
2848 	eq = &vm_pageout_queue_external;
2849 
2850 	if (cur_object->internal == FALSE) {
2851 		/*
2852 		 * we need to break up the following potential deadlock case...
2853 		 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2854 		 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2855 		 *  c) Most of the pages in the inactive queue belong to this file.
2856 		 *
2857 		 * we are potentially in this deadlock because...
2858 		 *  a) the external pageout queue is throttled
2859 		 *  b) we're done with the active queue and moved on to the inactive queue
2860 		 *  c) we've got a dirty external page
2861 		 *
2862 		 * since we don't know the reason for the external pageout queue being throttled we
2863 		 * must suspect that we are deadlocked, so move the current page onto the active queue
2864 		 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2865 		 *
2866 		 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2867 		 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2868 		 * pool the next time we select a victim page... if we can make enough new free pages,
2869 		 * the deadlock will break, the external pageout queue will empty and it will no longer
2870 		 * be throttled
2871 		 *
2872 		 * if we have jetsam configured, keep a count of the pages reactivated this way so
2873 		 * that we can try to find clean pages in the active/inactive queues before
2874 		 * deciding to jetsam a process
2875 		 */
2876 		vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2877 
2878 		vm_page_check_pageable_safe(m);
2879 		assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2880 		vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2881 		m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2882 		vm_page_active_count++;
2883 		vm_page_pageable_external_count++;
2884 
2885 		vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2886 
2887 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2888 
2889 #pragma unused(force_anonymous)
2890 
2891 		*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2892 
2893 		if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2894 			*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2895 			/*
2896 			 * Possible deadlock scenario so request jetsam action
2897 			 */
2898 			memorystatus_kill_on_vps_starvation();
2899 			VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2900 			    vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2901 		}
2902 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2903 
2904 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2905 
2906 		*force_anonymous = TRUE;
2907 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2908 	} else {
2909 		vm_page_activate(m);
2910 		counter_inc(&vm_statistics_reactivations);
2911 
2912 #if DEVELOPMENT || DEBUG
2913 		if (is_page_from_bg_q == TRUE) {
2914 			if (cur_object->internal) {
2915 				vm_pageout_rejected_bq_internal++;
2916 			} else {
2917 				vm_pageout_rejected_bq_external++;
2918 			}
2919 		}
2920 #endif /* DEVELOPMENT || DEBUG */
2921 
2922 		vm_pageout_state.vm_pageout_inactive_used++;
2923 	}
2924 }
2925 
2926 
/*
 * Deactivate up to 'max_to_move' pages from the head of the active
 * queue until (inactive + speculative) reaches the inactive target.
 * Must be called with the page queues lock held.
 */
void
vm_page_balance_inactive(int max_to_move)
{
	vm_page_t m;

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
		/*
		 * It is likely that the hibernation code path is
		 * dealing with these very queues as we are about
		 * to move pages around in/from them and completely
		 * change the linkage of the pages.
		 *
		 * And so we skip the rebalancing of these queues.
		 */
		return;
	}
	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
	    vm_page_inactive_count +
	    vm_page_speculative_count);

	while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
		VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);

		/* oldest active page is at the head of the queue */
		m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);

		assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
		assert(!m->vmp_laundry);
		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
		assert(!vm_page_is_guard(m));

		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

		/*
		 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
		 *
		 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
		 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
		 * new reference happens. If no further references happen on the page after that remote TLB flushes
		 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
		 * by pageout_scan, which is just fine since the last reference would have happened quite far
		 * in the past (TLB caches don't hang around for very long), and of course could just as easily
		 * have happened before we moved the page
		 */
		if (m->vmp_pmapped == TRUE) {
			/*
			 * We might be holding the page queue lock as a
			 * spin lock and clearing the "referenced" bit could
			 * take a while if there are lots of mappings of
			 * that page, so make sure we acquire the lock as
			 * a mutex to avoid a spinlock timeout.
			 */
			vm_page_lockconvert_queues();
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
		}

		/*
		 * The page might be absent or busy,
		 * but vm_page_deactivate can handle that.
		 * FALSE indicates that we don't want a H/W clear reference
		 */
		vm_page_deactivate_internal(m, FALSE);
	}
}
2992 
2993 /*
2994  *	vm_pageout_scan does the dirty work for the pageout daemon.
2995  *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2996  *	held and vm_page_free_wanted == 0.
2997  */
2998 void
2999 vm_pageout_scan(void)
3000 {
3001 	unsigned int loop_count = 0;
3002 	unsigned int inactive_burst_count = 0;
3003 	unsigned int reactivated_this_call;
3004 	unsigned int reactivate_limit;
3005 	vm_page_t   local_freeq = NULL;
3006 	int         local_freed = 0;
3007 	int         delayed_unlock;
3008 	int         delayed_unlock_limit = 0;
3009 	int         refmod_state = 0;
3010 	int     vm_pageout_deadlock_target = 0;
3011 	struct  vm_pageout_queue *iq;
3012 	struct  vm_pageout_queue *eq;
3013 	struct  vm_speculative_age_q *sq;
3014 	struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
3015 	boolean_t inactive_throttled = FALSE;
3016 	vm_object_t     object = NULL;
3017 	uint32_t        inactive_reclaim_run;
3018 	boolean_t       grab_anonymous = FALSE;
3019 	boolean_t       force_anonymous = FALSE;
3020 	boolean_t       force_speculative_aging = FALSE;
3021 	int             anons_grabbed = 0;
3022 	int             page_prev_q_state = 0;
3023 	boolean_t       page_from_bg_q = FALSE;
3024 	uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
3025 	vm_object_t     m_object = VM_OBJECT_NULL;
3026 	int             retval = 0;
3027 	boolean_t       lock_yield_check = FALSE;
3028 
3029 
3030 	VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
3031 	    vm_pageout_vminfo.vm_pageout_freed_speculative,
3032 	    vm_pageout_state.vm_pageout_inactive_clean,
3033 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3034 	    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3035 
3036 	flow_control.state = FCS_IDLE;
3037 	iq = &vm_pageout_queue_internal;
3038 	eq = &vm_pageout_queue_external;
3039 	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3040 
3041 	/* Ask the pmap layer to return any pages it no longer needs. */
3042 	pmap_release_pages_fast();
3043 
3044 	vm_page_lock_queues();
3045 
3046 	delayed_unlock = 1;
3047 
3048 	/*
3049 	 *	Calculate the max number of referenced pages on the inactive
3050 	 *	queue that we will reactivate.
3051 	 */
3052 	reactivated_this_call = 0;
3053 	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3054 	    vm_page_inactive_count);
3055 	inactive_reclaim_run = 0;
3056 
3057 	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3058 
3059 	/*
3060 	 *	We must limit the rate at which we send pages to the pagers
3061 	 *	so that we don't tie up too many pages in the I/O queues.
3062 	 *	We implement a throttling mechanism using the laundry count
3063 	 *      to limit the number of pages outstanding to the default
3064 	 *	and external pagers.  We can bypass the throttles and look
3065 	 *	for clean pages if the pageout queues don't drain in a timely
3066 	 *	fashion since this may indicate that the pageout paths are
3067 	 *	stalled waiting for memory, which only we can provide.
3068 	 */
3069 
3070 	vps_init_page_targets();
3071 	assert(object == NULL);
3072 	assert(delayed_unlock != 0);
3073 
3074 	for (;;) {
3075 		vm_page_t m;
3076 
3077 		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3078 
3079 		if (lock_yield_check) {
3080 			lock_yield_check = FALSE;
3081 
3082 			if (delayed_unlock++ > delayed_unlock_limit) {
3083 				vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3084 				    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3085 			} else if (vm_pageout_scan_wants_object) {
3086 				vm_page_unlock_queues();
3087 				mutex_pause(0);
3088 				vm_page_lock_queues();
3089 			} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3090 				VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3091 			}
3092 		}
3093 
3094 		if (vm_upl_wait_for_pages < 0) {
3095 			vm_upl_wait_for_pages = 0;
3096 		}
3097 
3098 		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3099 
3100 		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3101 			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3102 		}
3103 
3104 		vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3105 
3106 		assert(delayed_unlock);
3107 
3108 		/*
3109 		 * maintain our balance
3110 		 */
3111 		vm_page_balance_inactive(1);
3112 
3113 
3114 		/**********************************************************************
3115 		* above this point we're playing with the active and secluded queues
3116 		* below this point we're playing with the throttling mechanisms
3117 		* and the inactive queue
3118 		**********************************************************************/
3119 
3120 		if (vm_page_free_count + local_freed >= vm_page_free_target) {
3121 			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3122 
3123 			vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3124 			    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3125 			/*
3126 			 * make sure the pageout I/O threads are running
3127 			 * throttled in case there are still requests
3128 			 * in the laundry... since we have met our targets
3129 			 * we don't need the laundry to be cleaned in a timely
3130 			 * fashion... so let's avoid interfering with foreground
3131 			 * activity
3132 			 */
3133 			vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3134 
3135 			vm_free_page_lock();
3136 
3137 			if ((vm_page_free_count >= vm_page_free_target) &&
3138 			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3139 				/*
3140 				 * done - we have met our target *and*
3141 				 * there is no one waiting for a page.
3142 				 */
3143 return_from_scan:
3144 				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3145 
3146 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3147 				    vm_pageout_state.vm_pageout_inactive,
3148 				    vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3149 				VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
3150 				    vm_pageout_vminfo.vm_pageout_freed_speculative,
3151 				    vm_pageout_state.vm_pageout_inactive_clean,
3152 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3153 				    vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3154 
3155 				return;
3156 			}
3157 			vm_free_page_unlock();
3158 		}
3159 
3160 		/*
3161 		 * Before anything, we check if we have any ripe volatile
3162 		 * objects around. If so, try to purge the first object.
3163 		 * If the purge fails, fall through to reclaim a page instead.
3164 		 * If the purge succeeds, go back to the top and reevalute
3165 		 * the new memory situation.
3166 		 */
3167 		retval = vps_purge_object();
3168 
3169 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3170 			/*
3171 			 * Success
3172 			 */
3173 			if (object != NULL) {
3174 				vm_object_unlock(object);
3175 				object = NULL;
3176 			}
3177 
3178 			lock_yield_check = FALSE;
3179 			continue;
3180 		}
3181 
3182 
3183 		/*
3184 		 * If our 'aged' queue is empty and we have some speculative pages
3185 		 * in the other queues, let's go through and see if we need to age
3186 		 * them.
3187 		 *
3188 		 * If we succeeded in aging a speculative Q or just that everything
3189 		 * looks normal w.r.t queue age and queue counts, we keep going onward.
3190 		 *
3191 		 * If, for some reason, we seem to have a mismatch between the spec.
3192 		 * page count and the page queues, we reset those variables and
3193 		 * restart the loop (LD TODO: Track this better?).
3194 		 */
3195 		if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3196 			retval = vps_age_speculative_queue(force_speculative_aging);
3197 
3198 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3199 				lock_yield_check = FALSE;
3200 				continue;
3201 			}
3202 		}
3203 		force_speculative_aging = FALSE;
3204 
3205 		/*
3206 		 * Check to see if we need to evict objects from the cache.
3207 		 *
3208 		 * Note: 'object' here doesn't have anything to do with
3209 		 * the eviction part. We just need to make sure we have dropped
3210 		 * any object lock we might be holding if we need to go down
3211 		 * into the eviction logic.
3212 		 */
3213 		retval = vps_object_cache_evict(&object);
3214 
3215 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3216 			lock_yield_check = FALSE;
3217 			continue;
3218 		}
3219 
3220 
3221 		/*
3222 		 * Calculate our filecache_min that will affect the loop
3223 		 * going forward.
3224 		 */
3225 		vps_calculate_filecache_min();
3226 
3227 		/*
3228 		 * LD TODO: Use a structure to hold all state variables for a single
3229 		 * vm_pageout_scan iteration and pass that structure to this function instead.
3230 		 */
3231 		retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3232 		    &delayed_unlock, &local_freeq, &local_freed,
3233 		    &vm_pageout_deadlock_target, inactive_burst_count);
3234 
3235 		if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3236 			if (loop_count >= vm_page_inactive_count) {
3237 				loop_count = 0;
3238 			}
3239 
3240 			inactive_burst_count = 0;
3241 
3242 			assert(object == NULL);
3243 			assert(delayed_unlock != 0);
3244 
3245 			lock_yield_check = FALSE;
3246 			continue;
3247 		} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3248 			goto return_from_scan;
3249 		}
3250 
3251 		flow_control.state = FCS_IDLE;
3252 
3253 		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3254 		    vm_pageout_inactive_external_forced_reactivate_limit);
3255 		loop_count++;
3256 		inactive_burst_count++;
3257 		vm_pageout_state.vm_pageout_inactive++;
3258 
3259 		/*
3260 		 * Choose a victim.
3261 		 */
3262 
3263 		m = NULL;
3264 		retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3265 
3266 		if (m == NULL) {
3267 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3268 				inactive_burst_count = 0;
3269 
3270 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3271 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3272 				}
3273 
3274 				lock_yield_check = TRUE;
3275 				continue;
3276 			}
3277 
3278 			/*
3279 			 * if we've gotten here, we have no victim page.
3280 			 * check to see if we've not finished balancing the queues
3281 			 * or we have a page on the aged speculative queue that we
3282 			 * skipped due to force_anonymous == TRUE.. or we have
3283 			 * speculative  pages that we can prematurely age... if
3284 			 * one of these cases we'll keep going, else panic
3285 			 */
3286 			force_anonymous = FALSE;
3287 			VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3288 
3289 			if (!vm_page_queue_empty(&sq->age_q)) {
3290 				lock_yield_check = TRUE;
3291 				continue;
3292 			}
3293 
3294 			if (vm_page_speculative_count) {
3295 				force_speculative_aging = TRUE;
3296 				lock_yield_check = TRUE;
3297 				continue;
3298 			}
3299 			panic("vm_pageout: no victim");
3300 
3301 			/* NOTREACHED */
3302 		}
3303 
3304 		assert(VM_PAGE_PAGEABLE(m));
3305 		m_object = VM_PAGE_OBJECT(m);
3306 		force_anonymous = FALSE;
3307 
3308 		page_prev_q_state = m->vmp_q_state;
3309 		/*
3310 		 * we just found this page on one of our queues...
3311 		 * it can't also be on the pageout queue, so safe
3312 		 * to call vm_page_queues_remove
3313 		 */
3314 		bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3315 		vm_page_queues_remove(m, TRUE);
3316 		if (donate) {
3317 			/*
3318 			 * The compressor needs to see this bit to know
3319 			 * where this page needs to land. Also if stolen,
3320 			 * this bit helps put the page back in the right
3321 			 * special queue where it belongs.
3322 			 */
3323 			m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3324 		}
3325 
3326 		assert(!m->vmp_laundry);
3327 		assert(vm_page_is_canonical(m));
3328 		assert(!is_kernel_object(m_object));
3329 
3330 		vm_pageout_vminfo.vm_pageout_considered_page++;
3331 
3332 		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3333 
3334 		/*
3335 		 * check to see if we currently are working
3336 		 * with the same object... if so, we've
3337 		 * already got the lock
3338 		 */
3339 		if (m_object != object) {
3340 			boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3341 
3342 			/*
3343 			 * vps_switch_object() will always drop the 'object' lock first
3344 			 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3345 			 * either 'm_object' or NULL.
3346 			 */
3347 			retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3348 
3349 			if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3350 				lock_yield_check = TRUE;
3351 				continue;
3352 			}
3353 		}
3354 		assert(m_object == object);
3355 		assert(VM_PAGE_OBJECT(m) == m_object);
3356 
3357 		if (m->vmp_busy) {
3358 			/*
3359 			 *	Somebody is already playing with this page.
3360 			 *	Put it back on the appropriate queue
3361 			 *
3362 			 */
3363 			VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3364 
3365 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3366 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3367 			}
3368 
3369 			vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3370 
3371 			lock_yield_check = TRUE;
3372 			continue;
3373 		}
3374 
3375 		/*
3376 		 *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3377 		 *	If already cleaning this page in place
3378 		 *	just leave if off the paging queues.
3379 		 *	We can leave the page mapped, and upl_commit_range
3380 		 *	will put it on the clean queue.
3381 		 *
3382 		 *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3383 		 *	an msync INVALIDATE is in progress...
3384 		 *	this page has been marked for destruction
3385 		 *      after it has been cleaned,
3386 		 *      but not yet gathered into a UPL
3387 		 *	where 'cleaning' will be set...
3388 		 *	just leave it off the paging queues
3389 		 *
3390 		 *   if (m->vmp_free_when_done && m->vmp_clenaing)
3391 		 *	an msync INVALIDATE is in progress
3392 		 *	and the UPL has already gathered this page...
3393 		 *	just leave it off the paging queues
3394 		 */
3395 		if (m->vmp_free_when_done || m->vmp_cleaning) {
3396 			lock_yield_check = TRUE;
3397 			continue;
3398 		}
3399 
3400 
3401 		/*
3402 		 *	If it's absent, in error or the object is no longer alive,
3403 		 *	we can reclaim the page... in the no longer alive case,
3404 		 *	there are 2 states the page can be in that preclude us
3405 		 *	from reclaiming it - busy or cleaning - that we've already
3406 		 *	dealt with
3407 		 */
3408 		if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3409 		    (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3410 			if (m->vmp_absent) {
3411 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3412 			} else if (!object->alive ||
3413 			    (!object->internal &&
3414 			    object->pager == MEMORY_OBJECT_NULL)) {
3415 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3416 			} else {
3417 				VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3418 			}
3419 			if (m->vmp_pmapped) {
3420 				int refmod;
3421 
3422 				/*
3423 				 * If this page was file-backed and wired while its pager
3424 				 * was lost (during a forced unmount, for example), there
3425 				 * could still be some pmap mappings that need to be
3426 				 * cleaned up before we can free the page.
3427 				 */
3428 				refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3429 				if ((refmod & VM_MEM_MODIFIED) &&
3430 				    !m->vmp_dirty) {
3431 					SET_PAGE_DIRTY(m, FALSE);
3432 				}
3433 			}
3434 reclaim_page:
3435 			if (vm_pageout_deadlock_target) {
3436 				VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3437 				vm_pageout_deadlock_target--;
3438 			}
3439 
3440 			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3441 
3442 			if (object->internal) {
3443 				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3444 			} else {
3445 				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3446 			}
3447 			assert(!m->vmp_cleaning);
3448 			assert(!m->vmp_laundry);
3449 
3450 			if (!object->internal &&
3451 			    object->pager != NULL &&
3452 			    object->pager->mo_pager_ops == &shared_region_pager_ops) {
3453 				shared_region_pager_reclaimed++;
3454 			}
3455 
3456 			m->vmp_busy = TRUE;
3457 
3458 			/*
3459 			 * remove page from object here since we're already
3460 			 * behind the object lock... defer the rest of the work
3461 			 * we'd normally do in vm_page_free_prepare_object
3462 			 * until 'vm_page_free_list' is called
3463 			 */
3464 			if (m->vmp_tabled) {
3465 				vm_page_remove(m, TRUE);
3466 			}
3467 
3468 			assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3469 			m->vmp_snext = local_freeq;
3470 			local_freeq = m;
3471 			local_freed++;
3472 
3473 			if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3474 				vm_pageout_vminfo.vm_pageout_freed_speculative++;
3475 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3476 				vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3477 			} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3478 				vm_pageout_vminfo.vm_pageout_freed_internal++;
3479 			} else {
3480 				vm_pageout_vminfo.vm_pageout_freed_external++;
3481 			}
3482 
3483 			inactive_burst_count = 0;
3484 
3485 			lock_yield_check = TRUE;
3486 			continue;
3487 		}
3488 		if (object->vo_copy == VM_OBJECT_NULL) {
3489 			/*
3490 			 * No one else can have any interest in this page.
3491 			 * If this is an empty purgable object, the page can be
3492 			 * reclaimed even if dirty.
3493 			 * If the page belongs to a volatile purgable object, we
3494 			 * reactivate it if the compressor isn't active.
3495 			 */
3496 			if (object->purgable == VM_PURGABLE_EMPTY) {
3497 				if (m->vmp_pmapped == TRUE) {
3498 					/* unmap the page */
3499 					refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3500 					if (refmod_state & VM_MEM_MODIFIED) {
3501 						SET_PAGE_DIRTY(m, FALSE);
3502 					}
3503 				}
3504 				if (m->vmp_dirty || m->vmp_precious) {
3505 					/* we saved the cost of cleaning this page ! */
3506 					vm_page_purged_count++;
3507 				}
3508 				goto reclaim_page;
3509 			}
3510 
3511 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3512 				/*
3513 				 * With the VM compressor, the cost of
3514 				 * reclaiming a page is much lower (no I/O),
3515 				 * so if we find a "volatile" page, it's better
3516 				 * to let it get compressed rather than letting
3517 				 * it occupy a full page until it gets purged.
3518 				 * So no need to check for "volatile" here.
3519 				 */
3520 			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
3521 				/*
3522 				 * Avoid cleaning a "volatile" page which might
3523 				 * be purged soon.
3524 				 */
3525 
3526 				/* if it's wired, we can't put it on our queue */
3527 				assert(!VM_PAGE_WIRED(m));
3528 
3529 				/* just stick it back on! */
3530 				reactivated_this_call++;
3531 
3532 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3533 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3534 				}
3535 
3536 				goto reactivate_page;
3537 			}
3538 		} /* vo_copy NULL */
3539 		/*
3540 		 *	If it's being used, reactivate.
3541 		 *	(Fictitious pages are either busy or absent.)
3542 		 *	First, update the reference and dirty bits
3543 		 *	to make sure the page is unreferenced.
3544 		 */
3545 		refmod_state = -1;
3546 
3547 		if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3548 			refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3549 
3550 			if (refmod_state & VM_MEM_REFERENCED) {
3551 				m->vmp_reference = TRUE;
3552 			}
3553 			if (refmod_state & VM_MEM_MODIFIED) {
3554 				SET_PAGE_DIRTY(m, FALSE);
3555 			}
3556 		}
3557 
3558 		if (m->vmp_reference || m->vmp_dirty) {
3559 			/* deal with a rogue "reusable" page */
3560 			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3561 		}
3562 
3563 		if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3564 			vm_pageout_state.vm_page_xpmapped_min = 0;
3565 		} else {
3566 			vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
3567 			    vm_pageout_state.vm_page_xpmapped_min_divisor;
3568 		}
3569 
3570 		if (!m->vmp_no_cache &&
3571 		    page_from_bg_q == FALSE &&
3572 		    (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3573 		    (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3574 			/*
3575 			 * The page we pulled off the inactive list has
3576 			 * been referenced.  It is possible for other
3577 			 * processors to be touching pages faster than we
3578 			 * can clear the referenced bit and traverse the
3579 			 * inactive queue, so we limit the number of
3580 			 * reactivations.
3581 			 */
3582 			if (++reactivated_this_call >= reactivate_limit &&
3583 			    !object->object_is_shared_cache &&
3584 			    !((m->vmp_realtime ||
3585 			    object->for_realtime) &&
3586 			    vm_pageout_protect_realtime)) {
3587 				vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3588 			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3589 				vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3590 				if (object->object_is_shared_cache) {
3591 					vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3592 				} else if (m->vmp_realtime ||
3593 				    object->for_realtime) {
3594 					vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3595 				}
3596 			} else {
3597 				uint32_t isinuse;
3598 
3599 				if (reactivated_this_call >= reactivate_limit) {
3600 					if (object->object_is_shared_cache) {
3601 						vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3602 					} else if ((m->vmp_realtime ||
3603 					    object->for_realtime) &&
3604 					    vm_pageout_protect_realtime) {
3605 						vm_pageout_vminfo.vm_pageout_protected_realtime++;
3606 					}
3607 				}
3608 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3609 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3610 				}
3611 
3612 				vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3613 reactivate_page:
3614 				if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3615 				    vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3616 					/*
3617 					 * no explict mappings of this object exist
3618 					 * and it's not open via the filesystem
3619 					 */
3620 					vm_page_deactivate(m);
3621 					VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3622 				} else {
3623 					/*
3624 					 * The page was/is being used, so put back on active list.
3625 					 */
3626 					vm_page_activate(m);
3627 					counter_inc(&vm_statistics_reactivations);
3628 					inactive_burst_count = 0;
3629 				}
3630 #if DEVELOPMENT || DEBUG
3631 				if (page_from_bg_q == TRUE) {
3632 					if (m_object->internal) {
3633 						vm_pageout_rejected_bq_internal++;
3634 					} else {
3635 						vm_pageout_rejected_bq_external++;
3636 					}
3637 				}
3638 #endif /* DEVELOPMENT || DEBUG */
3639 
3640 				if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3641 					VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3642 				}
3643 				vm_pageout_state.vm_pageout_inactive_used++;
3644 
3645 				lock_yield_check = TRUE;
3646 				continue;
3647 			}
3648 			/*
3649 			 * Make sure we call pmap_get_refmod() if it
3650 			 * wasn't already called just above, to update
3651 			 * the dirty bit.
3652 			 */
3653 			if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3654 				refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3655 				if (refmod_state & VM_MEM_MODIFIED) {
3656 					SET_PAGE_DIRTY(m, FALSE);
3657 				}
3658 			}
3659 		}
3660 
3661 		/*
3662 		 * we've got a candidate page to steal...
3663 		 *
3664 		 * m->vmp_dirty is up to date courtesy of the
3665 		 * preceding check for m->vmp_reference... if
3666 		 * we get here, then m->vmp_reference had to be
3667 		 * FALSE (or possibly "reactivate_limit" was
3668 		 * exceeded), but in either case we called
3669 		 * pmap_get_refmod() and updated both
3670 		 * m->vmp_reference and m->vmp_dirty
3671 		 *
3672 		 * if it's dirty or precious we need to
3673 		 * see if the target queue is throtttled
3674 		 * it if is, we need to skip over it by moving it back
3675 		 * to the end of the inactive queue
3676 		 */
3677 
3678 		inactive_throttled = FALSE;
3679 
3680 		if (m->vmp_dirty || m->vmp_precious) {
3681 			if (object->internal) {
3682 				if (VM_PAGE_Q_THROTTLED(iq)) {
3683 					inactive_throttled = TRUE;
3684 				}
3685 			} else if (VM_PAGE_Q_THROTTLED(eq)) {
3686 				inactive_throttled = TRUE;
3687 			}
3688 		}
3689 throttle_inactive:
3690 		if (!VM_DYNAMIC_PAGING_ENABLED() &&
3691 		    object->internal && m->vmp_dirty &&
3692 		    (object->purgable == VM_PURGABLE_DENY ||
3693 		    object->purgable == VM_PURGABLE_NONVOLATILE ||
3694 		    object->purgable == VM_PURGABLE_VOLATILE)) {
3695 			vm_page_check_pageable_safe(m);
3696 			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3697 			vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3698 			m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3699 			vm_page_throttled_count++;
3700 
3701 			VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3702 
3703 			inactive_burst_count = 0;
3704 
3705 			lock_yield_check = TRUE;
3706 			continue;
3707 		}
3708 		if (inactive_throttled == TRUE) {
3709 			vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3710 			    &force_anonymous, page_from_bg_q);
3711 
3712 			inactive_burst_count = 0;
3713 
3714 			if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3715 				VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3716 			}
3717 
3718 			lock_yield_check = TRUE;
3719 			continue;
3720 		}
3721 
3722 		/*
3723 		 * we've got a page that we can steal...
3724 		 * eliminate all mappings and make sure
3725 		 * we have the up-to-date modified state
3726 		 *
3727 		 * if we need to do a pmap_disconnect then we
3728 		 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3729 		 * provides the true state atomically... the
3730 		 * page was still mapped up to the pmap_disconnect
3731 		 * and may have been dirtied at the last microsecond
3732 		 *
3733 		 * Note that if 'pmapped' is FALSE then the page is not
3734 		 * and has not been in any map, so there is no point calling
3735 		 * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3736 		 * of likely usage of the page.
3737 		 */
3738 		if (m->vmp_pmapped == TRUE) {
3739 			int pmap_options;
3740 
3741 			/*
3742 			 * Don't count this page as going into the compressor
3743 			 * if any of these are true:
3744 			 * 1) compressed pager isn't enabled
3745 			 * 2) Freezer enabled device with compressed pager
3746 			 *    backend (exclusive use) i.e. most of the VM system
3747 			 *    (including vm_pageout_scan) has no knowledge of
3748 			 *    the compressor
3749 			 * 3) This page belongs to a file and hence will not be
3750 			 *    sent into the compressor
3751 			 */
3752 			if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3753 			    object->internal == FALSE) {
3754 				pmap_options = 0;
3755 			} else if (m->vmp_dirty || m->vmp_precious) {
3756 				/*
3757 				 * VM knows that this page is dirty (or
3758 				 * precious) and needs to be compressed
3759 				 * rather than freed.
3760 				 * Tell the pmap layer to count this page
3761 				 * as "compressed".
3762 				 */
3763 				pmap_options = PMAP_OPTIONS_COMPRESSOR;
3764 			} else {
3765 				/*
3766 				 * VM does not know if the page needs to
3767 				 * be preserved but the pmap layer might tell
3768 				 * us if any mapping has "modified" it.
3769 				 * Let's the pmap layer to count this page
3770 				 * as compressed if and only if it has been
3771 				 * modified.
3772 				 */
3773 				pmap_options =
3774 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3775 			}
3776 			refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3777 			    pmap_options,
3778 			    NULL);
3779 			if (refmod_state & VM_MEM_MODIFIED) {
3780 				SET_PAGE_DIRTY(m, FALSE);
3781 			}
3782 		}
3783 
3784 		/*
3785 		 * reset our count of pages that have been reclaimed
3786 		 * since the last page was 'stolen'
3787 		 */
3788 		inactive_reclaim_run = 0;
3789 
3790 		/*
3791 		 *	If it's clean and not precious, we can free the page.
3792 		 */
3793 		if (!m->vmp_dirty && !m->vmp_precious) {
3794 			vm_pageout_state.vm_pageout_inactive_clean++;
3795 
3796 			/*
3797 			 * OK, at this point we have found a page we are going to free.
3798 			 */
3799 #if CONFIG_PHANTOM_CACHE
3800 			if (!object->internal) {
3801 				vm_phantom_cache_add_ghost(m);
3802 			}
3803 #endif
3804 			goto reclaim_page;
3805 		}
3806 
3807 		/*
3808 		 * The page may have been dirtied since the last check
3809 		 * for a throttled target queue (which may have been skipped
3810 		 * if the page was clean then).  With the dirty page
3811 		 * disconnected here, we can make one final check.
3812 		 */
3813 		if (object->internal) {
3814 			if (VM_PAGE_Q_THROTTLED(iq)) {
3815 				inactive_throttled = TRUE;
3816 			}
3817 		} else if (VM_PAGE_Q_THROTTLED(eq)) {
3818 			inactive_throttled = TRUE;
3819 		}
3820 
3821 		if (inactive_throttled == TRUE) {
3822 			goto throttle_inactive;
3823 		}
3824 #if !CONFIG_JETSAM
3825 		memorystatus_update_available_page_count(AVAILABLE_NON_COMPRESSED_MEMORY);
3826 #endif /* !CONFIG_JETSAM */
3827 
3828 		if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3829 			VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3830 		}
3831 
3832 		if (object->internal) {
3833 			vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3834 		} else {
3835 			vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3836 		}
3837 
3838 		/*
3839 		 * internal pages will go to the compressor...
3840 		 * external pages will go to the appropriate pager to be cleaned
3841 		 * and upon completion will end up on 'vm_page_queue_cleaned' which
3842 		 * is a preferred queue to steal from
3843 		 */
3844 		vm_pageout_cluster(m);
3845 		inactive_burst_count = 0;
3846 
3847 		/*
3848 		 * back to top of pageout scan loop
3849 		 */
3850 	}
3851 }
3852 
3853 
3854 void
3855 vm_page_free_reserve(
3856 	int pages)
3857 {
3858 	int             free_after_reserve;
3859 
3860 	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3861 		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3862 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3863 		} else {
3864 			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3865 		}
3866 	} else {
3867 		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3868 			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3869 		} else {
3870 			vm_page_free_reserved += pages;
3871 		}
3872 	}
3873 	free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3874 
3875 	vm_page_free_min = vm_page_free_reserved +
3876 	    VM_PAGE_FREE_MIN(free_after_reserve);
3877 
3878 	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3879 		vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3880 	}
3881 
3882 	vm_page_free_target = vm_page_free_reserved +
3883 	    VM_PAGE_FREE_TARGET(free_after_reserve);
3884 
3885 	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3886 		vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3887 	}
3888 
3889 	if (vm_page_free_target < vm_page_free_min + 5) {
3890 		vm_page_free_target = vm_page_free_min + 5;
3891 	}
3892 
3893 	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3894 }
3895 
3896 /*
3897  *	vm_pageout is the high level pageout daemon.
3898  */
3899 
/*
 * vm_pageout_continue:
 *
 * Body of the pageout daemon's permanent loop: marks itself running,
 * performs one vm_pageout_scan() pass, wakes any vm_pageout_wait()
 * waiters, then blocks with itself as the continuation so the next
 * wakeup restarts at the top of this function.
 */
void
vm_pageout_continue(void)
{
	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
	VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);

	/* advertise that a scan pass is in progress (see vm_pageout_wait) */
	vm_free_page_lock();
	vm_pageout_running = TRUE;
	vm_free_page_unlock();

	vm_pageout_scan();
	/*
	 * we hold both the vm_page_queue_free_lock
	 * and the vm_page_queues_lock at this point
	 */
	assert(vm_page_free_wanted == 0);
	assert(vm_page_free_wanted_privileged == 0);
	/* arm the wait before dropping the locks so no wakeup is lost */
	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);

	vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
	/* wake anyone blocked in vm_pageout_wait() for this pass to finish */
	if (vm_pageout_waiter) {
		vm_pageout_waiter = FALSE;
		thread_wakeup((event_t)&vm_pageout_waiter);
	}
#endif /* XNU_TARGET_OS_OSX */

	vm_free_page_unlock();
	vm_page_unlock_queues();

	/* block until vm_page_free_wanted is kicked; restart this function */
	thread_block((thread_continue_t)vm_pageout_continue);
	/*NOTREACHED*/
}
3933 
3934 #if XNU_TARGET_OS_OSX
3935 kern_return_t
3936 vm_pageout_wait(uint64_t deadline)
3937 {
3938 	kern_return_t kr;
3939 
3940 	vm_free_page_lock();
3941 	for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3942 		vm_pageout_waiter = TRUE;
3943 		if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3944 			    &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3945 			    (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3946 			kr = KERN_OPERATION_TIMED_OUT;
3947 		}
3948 	}
3949 	vm_free_page_unlock();
3950 
3951 	return kr;
3952 }
3953 #endif /* XNU_TARGET_OS_OSX */
3954 
/*
 * vm_pageout_iothread_external_continue:
 *
 * Continuation for the external pageout I/O thread: drains its pageout
 * queue (q->pgo_pending), handing each page to the owning object's
 * pager via memory_object_data_return().  Never returns; parks itself
 * on ethr->pgo_wakeup whenever the queue is empty.
 */
OS_NORETURN
static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
	vm_page_t       m = NULL;
	vm_object_t     object;
	vm_object_offset_t offset;
	memory_object_t pager;
	struct vm_pageout_queue *q = ethr->q;

	/* On systems with a compressor, the external IO thread clears its
	 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
	 * creation)
	 */
	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		current_thread()->options &= ~TH_OPT_VMPRIV;
	}

	sched_cond_ack(&(ethr->pgo_wakeup));

	while (true) {
		vm_page_lockspin_queues();

		while (!vm_page_queue_empty(&q->pgo_pending)) {
			q->pgo_busy = TRUE;
			vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);

			assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
			VM_PAGE_CHECK(m);
			/*
			 * grab a snapshot of the object and offset this
			 * page is tabled in so that we can relookup this
			 * page after we've taken the object lock - these
			 * fields are stable while we hold the page queues lock
			 * but as soon as we drop it, there is nothing to keep
			 * this page in this object... we hold an activity_in_progress
			 * on this object which will keep it from terminating
			 */
			object = VM_PAGE_OBJECT(m);
			offset = m->vmp_offset;

			m->vmp_q_state = VM_PAGE_NOT_ON_Q;
			VM_PAGE_ZERO_PAGEQ_ENTRY(m);

			vm_page_unlock_queues();

			vm_object_lock(object);

			/* relookup: the page may have changed while the queues lock was dropped */
			m = vm_page_lookup(object, offset);

			if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
			    !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
				/*
				 * it's either the same page that someone else has
				 * started cleaning (or it's finished cleaning or
				 * been put back on the pageout queue), or
				 * the page has been freed or we have found a
				 * new page at this offset... in all of these cases
				 * we merely need to release the activity_in_progress
				 * we took when we put the page on the pageout queue
				 */
				vm_object_activity_end(object);
				vm_object_unlock(object);

				vm_page_lockspin_queues();
				continue;
			}
			pager = object->pager;

			if (pager == MEMORY_OBJECT_NULL) {
				/*
				 * This pager has been destroyed by either
				 * memory_object_destroy or vm_object_destroy, and
				 * so there is nowhere for the page to go.
				 */
				if (m->vmp_free_when_done) {
					/*
					 * Just free the page... VM_PAGE_FREE takes
					 * care of cleaning up all the state...
					 * including doing the vm_pageout_throttle_up
					 */
					VM_PAGE_FREE(m);
				} else {
					vm_page_lockspin_queues();

					vm_pageout_throttle_up(m);
					vm_page_activate(m);

					vm_page_unlock_queues();

					/*
					 *	And we are done with it.
					 */
				}
				vm_object_activity_end(object);
				vm_object_unlock(object);

				vm_page_lockspin_queues();
				continue;
			}
	#if 0
			/*
			 * we don't hold the page queue lock
			 * so this check isn't safe to make
			 */
			VM_PAGE_CHECK(m);
	#endif
			/*
			 * give back the activity_in_progress reference we
			 * took when we queued up this page and replace it
			 * it with a paging_in_progress reference that will
			 * also hold the paging offset from changing and
			 * prevent the object from terminating
			 */
			vm_object_activity_end(object);
			vm_object_paging_begin(object);
			vm_object_unlock(object);

			/*
			 * Send the data to the pager.
			 * any pageout clustering happens there
			 */
			memory_object_data_return(pager,
			    m->vmp_offset + object->paging_offset,
			    PAGE_SIZE,
			    NULL,
			    NULL,
			    FALSE,
			    FALSE,
			    0);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/* pace ourselves between data returns (see vm_pageout_io_throttle) */
			vm_pageout_io_throttle();

			vm_page_lockspin_queues();
		}
		q->pgo_busy = FALSE;

		vm_page_unlock_queues();
		/* queue drained; park until more work arrives */
		sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
	}
	/*NOTREACHED*/
}
4101 
4102 uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
4103 
4104 #if DEVELOPMENT || DEBUG
4105 static void
4106 vm_pageout_record_thread_time(int cqid, int ncomps)
4107 {
4108 	if (__improbable(vm_compressor_time_thread)) {
4109 		vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
4110 		vmct_stats.vmct_pages[cqid] += ncomps;
4111 		vmct_stats.vmct_iterations[cqid]++;
4112 		if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
4113 			vmct_stats.vmct_maxpages[cqid] = ncomps;
4114 		}
4115 		if (ncomps < vmct_stats.vmct_minpages[cqid]) {
4116 			vmct_stats.vmct_minpages[cqid] = ncomps;
4117 		}
4118 	}
4119 }
4120 #endif
4121 
4122 static void *
4123 vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
4124 {
4125 	/*
4126 	 * Technically we need the pageq locks to manipulate the vmp_on_specialq field.
4127 	 * However, this page has been removed from all queues and is only
4128 	 * known to this compressor thread dealing with this local queue.
4129 	 *
4130 	 * TODO: Add a second localq that is the early localq and
4131 	 * put special pages like this one on that queue in the block above
4132 	 * under the pageq lock to avoid this 'works but not clean' logic.
4133 	 */
4134 	void *donate_queue_head;
4135 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4136 	donate_queue_head = &cq->current_early_swapout_chead;
4137 #else /* XNU_TARGET_OS_OSX */
4138 	donate_queue_head = &cq->current_late_swapout_chead;
4139 #endif /* XNU_TARGET_OS_OSX */
4140 	if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4141 		m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4142 		return donate_queue_head;
4143 	} else {
4144 		return &cq->current_regular_swapout_chead;
4145 	}
4146 }
4147 
4148 #define         MAX_FREE_BATCH          32
4149 
/*
 * vm_pageout_iothread_internal_continue:
 *
 * Continuation for a compressor I/O thread: repeatedly pulls batches
 * of up to local_batch_size pages off its pageout queue into a local
 * list, compresses each page via vm_pageout_compress_page(), and frees
 * the pages that compressed successfully in batches of MAX_FREE_BATCH.
 * Never returns; parks on cq->pgo_wakeup when its queue is empty.
 */
OS_NORETURN
static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
{
	struct vm_pageout_queue *q;
	vm_page_t       m = NULL;
	boolean_t       pgo_draining;
	vm_page_t   local_q;
	int         local_cnt;
	vm_page_t   local_freeq = NULL;
	int         local_freed = 0;
	int         local_batch_size;
#if DEVELOPMENT || DEBUG
	int       ncomps = 0;
	boolean_t marked_active = FALSE;
	int       num_pages_processed = 0;
#endif
	void *chead = NULL;

	KDBG_FILTERED(0xe040000c | DBG_FUNC_END);

	sched_cond_ack(&(cq->pgo_wakeup));

	q = cq->q;

	while (true) { /* this top loop is for the compressor_running_perf_test running a full speed without blocking */
#if DEVELOPMENT || DEBUG
		bool benchmark_accounting = false;
		/* If we're running the compressor perf test, only process the benchmark pages.
		 * We'll get back to our regular queue once the benchmark is done */
		if (compressor_running_perf_test) {
			q = cq->benchmark_q;
			if (!vm_page_queue_empty(&q->pgo_pending)) {
				benchmark_accounting = true;
			} else {
				q = cq->q;
				benchmark_accounting = false;
			}
		}
#endif /* DEVELOPMENT || DEBUG */

#if __AMP__
		if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
			local_batch_size = (q->pgo_maxlaundry >> 3);
			local_batch_size = MAX(local_batch_size, 16);
		} else {
			local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
		}
#else
		local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif

#if RECORD_THE_COMPRESSED_DATA
		if (q->pgo_laundry) {
			c_compressed_record_init();
		}
#endif
		while (true) { /* this loop is for working though all the pages in the pending queue */
			int     pages_left_on_q = 0;

			local_cnt = 0;
			local_q = NULL;

			KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);

			vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
			if (marked_active == FALSE) {
				vmct_active++;
				vmct_state[cq->id] = VMCT_ACTIVE;
				marked_active = TRUE;
				if (vmct_active == 1) {
					/* first compressor thread to go active starts the epoch clock */
					vm_compressor_epoch_start = mach_absolute_time();
				}
			}
#endif
			KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);

			KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);

			/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
			while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
				vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
				assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
				VM_PAGE_CHECK(m);

				m->vmp_q_state = VM_PAGE_NOT_ON_Q;
				VM_PAGE_ZERO_PAGEQ_ENTRY(m);
				m->vmp_laundry = FALSE;

				/* push onto the LIFO local list, linked through vmp_snext */
				m->vmp_snext = local_q;
				local_q = m;
				local_cnt++;
			}
			if (local_q == NULL) {
				/* input q empty: exit with the page queues lock still held */
				break;
			}

			q->pgo_busy = TRUE;

			if ((pgo_draining = q->pgo_draining) == FALSE) {
				vm_pageout_throttle_up_batch(q, local_cnt);
				pages_left_on_q = q->pgo_laundry;
			} else {
				pages_left_on_q = q->pgo_laundry - local_cnt;
			}

			vm_page_unlock_queues();

#if !RECORD_THE_COMPRESSED_DATA
			/* if we have lots to compress, wake up the other thread to help.
			 * disabled when recording data since record data is not protected with a mutex so this may cause races */
			if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
				// wake up the next compressor thread
				sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
				    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
			}
#endif
			KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);

			while (local_q) {
				KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);

				/* pop the next page off the local list */
				m = local_q;
				local_q = m->vmp_snext;
				m->vmp_snext = NULL;


				chead = vm_pageout_select_filling_chead(cq, m);

				if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
					ncomps++;
#endif
					KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);

					/* compressed successfully: queue the page for batched freeing */
					m->vmp_snext = local_freeq;
					local_freeq = m;
					local_freed++;

					/* if we gathered enough free pages, free them now */
					if (local_freed >= MAX_FREE_BATCH) {
						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

						vm_page_free_list(local_freeq, TRUE);

						local_freeq = NULL;
						local_freed = 0;
					}
				}
#if DEVELOPMENT || DEBUG
				num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
				while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
					kern_return_t   wait_result;
					int             need_wakeup = 0;

					/* first give back whatever we've accumulated locally */
					if (local_freeq) {
						OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

						vm_page_free_list(local_freeq, TRUE);
						local_freeq = NULL;
						local_freed = 0;

						continue;
					}
					vm_free_page_lock_spin();

					/* recheck under the lock before committing to wait */
					if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
						if (vm_page_free_wanted_privileged++ == 0) {
							need_wakeup = 1;
						}
						wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);

						vm_free_page_unlock();

						if (need_wakeup) {
							thread_wakeup((event_t)&vm_page_free_wanted);
						}

						if (wait_result == THREAD_WAITING) {
							thread_block(THREAD_CONTINUE_NULL);
						}
					} else {
						vm_free_page_unlock();
					}
				}
#endif
			}  /* while (local_q) */
			/* free any leftovers in the freeq */
			if (local_freeq) {
				OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);

				vm_page_free_list(local_freeq, TRUE);
				local_freeq = NULL;
				local_freed = 0;
			}
			if (pgo_draining == TRUE) {
				vm_page_lockspin_queues();
				vm_pageout_throttle_up_batch(q, local_cnt);
				vm_page_unlock_queues();
			}
		}
		KDBG_FILTERED(0xe040000c | DBG_FUNC_START);

		/*
		 * queue lock is held and our q is empty
		 */
		q->pgo_busy = FALSE;
#if DEVELOPMENT || DEBUG
		if (marked_active == TRUE) {
			vmct_active--;
			vmct_state[cq->id] = VMCT_IDLE;

			if (vmct_active == 0) {
				/* last compressor thread going idle closes the epoch */
				vm_compressor_epoch_stop = mach_absolute_time();
				assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
				    "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
				    vm_compressor_epoch_start, vm_compressor_epoch_stop);
				/* This interval includes intervals where one or more
				 * compressor threads were pre-empted
				 */
				vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
			}
		}
		if (compressor_running_perf_test && benchmark_accounting) {
			/*
			 * We could turn ON compressor_running_perf_test while still processing
			 * regular non-benchmark pages. We shouldn't count them here else we
			 * could overshoot. We might also still be populating that benchmark Q
			 * and be under pressure. So we will go back to the regular queues. And
			 * benchmark accounting will be off for that case too.
			 */
			compressor_perf_test_pages_processed += num_pages_processed;
			thread_wakeup(&compressor_perf_test_pages_processed);
		}
#endif
		vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
		vm_pageout_record_thread_time(cq->id, ncomps);
#endif

		KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
#if DEVELOPMENT || DEBUG
		if (compressor_running_perf_test && benchmark_accounting) {
			/*
			 * We've been exclusively compressing pages from the benchmark queue,
			 * do 1 pass over the internal queue before blocking.
			 */
			continue;
		}
#endif

		sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
	}
	/*NOTREACHED*/
}
4408 
/*
 * vm_pageout_compress_page:
 *
 * Hand one page to the compressor pager, resolving (creating) the pager on
 * first use, and maintain the compressed-page statistics in both the pager
 * and the page's vm_object.
 *
 * Parameters:
 *   current_chead  per-thread compressor chead, passed through to
 *                  vm_compressor_pager_put()
 *   scratch_buf    per-thread scratch buffer for the compressor
 *   m              the page to compress; must not be wired, in laundry,
 *                  or marked free_when_done
 *
 * Returns KERN_SUCCESS if the page was handed to the compressor (the page
 * is removed from its object), KERN_FAILURE if no pager could be resolved,
 * or the error from vm_compressor_pager_put() otherwise; on any failure
 * the page is reactivated.
 *
 * NOTE(review): caller appears to hold an activity reference on the object
 * (asserted below via activity_in_progress > 0); this routine ends it.
 */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
	vm_object_t     object;
	memory_object_t pager;
	int             compressed_count_delta;
	kern_return_t   retval;

	object = VM_PAGE_OBJECT(m);

	assert(!m->vmp_free_when_done);
	assert(!m->vmp_laundry);

	pager = object->pager;

	if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
		KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);

		vm_object_lock(object);

		/*
		 * If there is no memory object for the page, create
		 * one and hand it to the compression pager.
		 */

		if (!object->pager_initialized) {
			vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
		}
		if (!object->pager_initialized) {
			vm_object_compressor_pager_create(object);
		}

		pager = object->pager;

		if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
			/*
			 * Still no pager for the object,
			 * or the pager has been destroyed.
			 * Reactivate the page.
			 *
			 * Should only happen if there is no
			 * compression pager
			 */
			vm_page_wakeup_done(object, m);

			vm_page_lockspin_queues();
			vm_page_activate(m);
			VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
			vm_page_unlock_queues();

			/*
			 *	And we are done with it.
			 */
			vm_object_activity_end(object);
			vm_object_unlock(object);

			return KERN_FAILURE;
		}
		vm_object_unlock(object);

		KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
	}
	assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
	assert(object->activity_in_progress > 0);

#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
	/* account a read-only, unmodified page coming back through the compressor */
	if (m->vmp_unmodified_ro == true) {
		os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
	}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */

	vm_compressor_options_t flags = 0;

#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
	if (m->vmp_unmodified_ro) {
		flags |= C_PAGE_UNMODIFIED;
	}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */


	/*
	 * Hand the page's physical contents to the compressor pager.
	 * Called without the object lock held; the object is re-locked
	 * below before the stats are folded back in.
	 */
	retval = vm_compressor_pager_put(
		pager,
		m->vmp_offset + object->paging_offset,
		VM_PAGE_GET_PHYS_PAGE(m),
		current_chead,
		scratch_buf,
		&compressed_count_delta,
		flags);

	vm_object_lock(object);

	assert(object->activity_in_progress > 0);
	assert(VM_PAGE_OBJECT(m) == object);
	assert( !VM_PAGE_WIRED(m));

	/* fold the pager's compressed-page count delta into the object */
	vm_compressor_pager_count(pager,
	    compressed_count_delta,
	    FALSE,                       /* shared_lock */
	    object);

	if (retval == KERN_SUCCESS) {
		/*
		 * If the object is purgeable, its owner's
		 * purgeable ledgers will be updated in
		 * vm_page_remove() but the page still
		 * contributes to the owner's memory footprint,
		 * so account for it as such.
		 */
		if (m->vmp_tabled) {
			vm_page_remove(m, TRUE);
		}
		if ((object->purgable != VM_PURGABLE_DENY ||
		    object->vo_ledger_tag) &&
		    object->vo_owner != NULL) {
			/* one more compressed purgeable/tagged page */
			vm_object_owner_compressed_update(object,
			    compressed_count_delta);
		}
		counter_inc(&vm_statistics_compressions);
	} else {
		/* compression failed: wake any waiters and put the page back on the active queue */
		vm_page_wakeup_done(object, m);

		vm_page_lockspin_queues();

		vm_page_activate(m);
		vm_pageout_vminfo.vm_compressor_failed++;

		vm_page_unlock_queues();
	}
	vm_object_activity_end(object);
	vm_object_unlock(object);

	return retval;
}
4544 
4545 
/*
 * vm_pageout_adjust_eq_iothrottle:
 *
 * Raise or lower the I/O throttle tier of the external pageout I/O thread.
 * Expects the vm_page queues lock to be held on entry (it is dropped around
 * proc_set_thread_policy(), which may block, and re-taken before the cached
 * pgo_lowpriority state is updated).
 *
 * While hibernation cleaning is in progress the thread is forced to remain
 * unthrottled regardless of the requested state.
 */
static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
{
	uint32_t        policy;

	if (hibernate_cleaning_in_progress == TRUE) {
		req_lowpriority = FALSE;
	}

	/* only act once the queue is initialized and the state actually changes */
	if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
		vm_page_unlock_queues();

		if (req_lowpriority == TRUE) {
			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
			DTRACE_VM(laundrythrottle);
		} else {
			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
			DTRACE_VM(laundryunthrottle);
		}
		proc_set_thread_policy(ethr->pgo_iothread,
		    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);

		vm_page_lock_queues();
		ethr->q->pgo_lowpriority = req_lowpriority;
	}
}
4572 
/*
 * vm_pageout_iothread_external:
 *
 * Startup routine for the external (file-backed) pageout I/O thread.
 * Performs one-time configuration -- marks the thread VM-privileged,
 * installs the throttled pageout I/O policy, and publishes the external
 * queue's initial state under the queues lock -- then enters
 * vm_pageout_iothread_external_continue(), which never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	self->options |= TH_OPT_VMPRIV;

	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);

	/* start throttled; vm_pageout_adjust_eq_iothrottle() may unthrottle later */
	proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
	    TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);

	vm_page_lock_queues();

	vm_pageout_queue_external.pgo_lowpriority = TRUE;
	vm_pageout_queue_external.pgo_inited = TRUE;

	vm_page_unlock_queues();

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	vm_pageout_iothread_external_continue(ethr, 0);
	/*NOTREACHED*/
}
4600 
4601 
/*
 * vm_pageout_iothread_internal:
 *
 * Startup routine for an internal (compressor) pageout thread.
 * Publishes the internal queue's initial state under the queues lock,
 * applies single-processor binding / thread-group / cluster-type policy
 * as configured, then enters vm_pageout_iothread_internal_continue(),
 * which never returns.
 */
OS_NORETURN
static void
vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
{
	thread_t        self = current_thread();

	self->options |= TH_OPT_VMPRIV;

	vm_page_lock_queues();

	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
	vm_pageout_queue_internal.pgo_inited = TRUE;

#if DEVELOPMENT || DEBUG
	/* the benchmark queue mirrors the internal queue's initial state */
	vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
	vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
	vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */

	vm_page_unlock_queues();

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		thread_vm_bind_group_add();
	}

#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	if (vm_compressor_ebound) {
		/*
		 * Use the soft bound option for vm_compressor to allow it to run on
		 * P-cores if E-cluster is unavailable.
		 */
		thread_soft_bind_cluster_type(self, 'E');
	}
#endif /* __AMP__ */

	thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
	/* seed the per-thread minimum so the first sample always records */
	vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
#endif
	vm_pageout_iothread_internal_continue(cthr, 0);

	/*NOTREACHED*/
}
4649 
4650 kern_return_t
4651 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4652 {
4653 	if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4654 		return KERN_SUCCESS;
4655 	} else {
4656 		return KERN_FAILURE; /* Already set */
4657 	}
4658 }
4659 
4660 extern boolean_t        memorystatus_manual_testing_on;
4661 extern unsigned int     memorystatus_level;
4662 
4663 
4664 #if VM_PRESSURE_EVENTS
4665 
4666 boolean_t vm_pressure_events_enabled = FALSE;
4667 
4668 extern uint64_t next_warning_notification_sent_at_ts;
4669 extern uint64_t next_critical_notification_sent_at_ts;
4670 
4671 #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS    (30)    /* 30 minutes. */
4672 
4673 /*
4674  * The last time there was change in pressure level OR we forced a check
4675  * because the system is stuck in a non-normal pressure level.
4676  */
4677 uint64_t  vm_pressure_last_level_transition_abs = 0;
4678 
4679 /*
4680  *  This is how the long the system waits 'stuck' in an unchanged non-normal pressure
4681  * level before resending out notifications for that level again.
4682  */
4683 int  vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4684 
/*
 * vm_pressure_response:
 *
 * Recompute memorystatus_level from the available-page count, then run the
 * pressure-level state machine: transition memorystatus_vm_pressure_level
 * between Normal/Warning/Critical based on the VM_PRESSURE_* predicates,
 * and wake the pressure thread (and any level-change waiters) as needed.
 *
 * If the system has been stuck at a non-normal level for longer than
 * vm_pressure_level_transition_threshold minutes, force a re-notification
 * at the current level by resetting the per-level notification timestamp.
 *
 * No-op while pressure events are disabled; leaves the level untouched
 * when manual testing is driving memorystatus.
 */
void
vm_pressure_response(void)
{
	vm_pressure_level_t     old_level = kVMPressureNormal;
	int                     new_level = -1;
	unsigned int            total_pages;
	uint64_t                available_memory = 0;
	uint64_t                curr_ts, abs_time_since_level_transition, time_in_ns;
	bool                    force_check = false;
	int                     time_in_mins;


	if (vm_pressure_events_enabled == FALSE) {
		return;
	}

	available_memory = (uint64_t) memorystatus_get_available_page_count();

	total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
	/* secluded pages are not generally available, so exclude them from the base */
	total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
	/* percentage of memory still available; consumed by memorystatus */
	memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);

	if (memorystatus_manual_testing_on) {
		return;
	}

	/* how long since the last level transition (or forced re-check)? */
	curr_ts = mach_absolute_time();
	abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;

	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
	time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
	force_check = (time_in_mins >= vm_pressure_level_transition_threshold);

	old_level = memorystatus_vm_pressure_level;

	switch (memorystatus_vm_pressure_level) {
	case kVMPressureNormal:
	{
		if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		}
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
			new_level = kVMPressureCritical;
		} else if (force_check) {
			/* stuck at Warning: re-arm the warning notification */
			new_level = kVMPressureWarning;
			next_warning_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	case kVMPressureCritical:
	{
		if (VM_PRESSURE_WARNING_TO_NORMAL()) {
			new_level = kVMPressureNormal;
		} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
			new_level = kVMPressureWarning;
		} else if (force_check) {
			/* stuck at Critical: re-arm the critical notification */
			new_level = kVMPressureCritical;
			next_critical_notification_sent_at_ts = curr_ts;
		}
		break;
	}

	default:
		return;
	}

	if (new_level != -1 || force_check) {
		if (new_level != -1) {
			memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;

			if (new_level != (int) old_level) {
				VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
				    new_level, old_level, 0, 0);
			}
		} else {
			/* force_check with no level change: new_level is -1 in the trace */
			VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
			    new_level, old_level, force_check, 0);
		}

		if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
			/*
			 * We don't want to schedule a wakeup while hibernation is in progress
			 * because that could collide with checks for non-monotonicity in the scheduler.
			 * We do however do all the updates to memorystatus_vm_pressure_level because
			 * we _might_ want to use that for decisions regarding which pages or how
			 * many pages we want to dump in hibernation.
			 */
			return;
		}

		if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
			if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
				thread_wakeup(&vm_pressure_thread);
			}

			if (old_level != memorystatus_vm_pressure_level) {
				thread_wakeup(&vm_pageout_state.vm_pressure_changed);
			}
			vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
		}
	}
}
4800 #endif /* VM_PRESSURE_EVENTS */
4801 
4802 
4803 /**
4804  * Called by a kernel thread to ask if a number of pages may be wired.
4805  */
4806 kern_return_t
4807 mach_vm_wire_level_monitor(int64_t requested_pages)
4808 {
4809 	if (requested_pages <= 0) {
4810 		return KERN_INVALID_ARGUMENT;
4811 	}
4812 
4813 	const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4814 	/**
4815 	 * Available pages can be negative in the case where more system memory is
4816 	 * wired than the threshold, so we must use a signed integer.
4817 	 */
4818 	const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4819 
4820 	if (requested_pages > available_pages) {
4821 		return KERN_RESOURCE_SHORTAGE;
4822 	}
4823 	return KERN_SUCCESS;
4824 }
4825 
4826 /*
4827  * Function called by a kernel thread to either get the current pressure level or
4828  * wait until memory pressure changes from a given level.
4829  */
4830 kern_return_t
4831 mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
4832 {
4833 #if !VM_PRESSURE_EVENTS
4834 	(void)wait_for_pressure;
4835 	(void)pressure_level;
4836 	return KERN_NOT_SUPPORTED;
4837 #else /* VM_PRESSURE_EVENTS */
4838 
4839 	uint32_t *waiters = NULL;
4840 	wait_result_t wr = 0;
4841 	vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4842 
4843 	if (pressure_level == NULL) {
4844 		return KERN_INVALID_ARGUMENT;
4845 	}
4846 	if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
4847 	    *pressure_level == kVMPressureForegroundJetsam)) {
4848 		return KERN_INVALID_ARGUMENT;
4849 	}
4850 
4851 	if (wait_for_pressure) {
4852 		switch (*pressure_level) {
4853 		case kVMPressureForegroundJetsam:
4854 		case kVMPressureBackgroundJetsam:
4855 
4856 			if (*pressure_level == kVMPressureForegroundJetsam) {
4857 				waiters = &memorystatus_jetsam_fg_band_waiters;
4858 			} else {
4859 				/* kVMPressureBackgroundJetsam */
4860 				waiters = &memorystatus_jetsam_bg_band_waiters;
4861 			}
4862 
4863 			lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
4864 			wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
4865 			if (wr == THREAD_WAITING) {
4866 				*waiters += 1;
4867 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4868 				wr = thread_block(THREAD_CONTINUE_NULL);
4869 			} else {
4870 				lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
4871 			}
4872 
4873 			if (wr != THREAD_AWAKENED) {
4874 				return KERN_ABORTED;
4875 			}
4876 
4877 			return KERN_SUCCESS;
4878 		case kVMPressureNormal:
4879 		case kVMPressureWarning:
4880 		case kVMPressureUrgent:
4881 		case kVMPressureCritical:
4882 			while (old_level == *pressure_level) {
4883 				wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4884 				    THREAD_INTERRUPTIBLE);
4885 				if (wr == THREAD_WAITING) {
4886 					wr = thread_block(THREAD_CONTINUE_NULL);
4887 				}
4888 				if (wr == THREAD_INTERRUPTED) {
4889 					return KERN_ABORTED;
4890 				}
4891 
4892 				if (wr == THREAD_AWAKENED) {
4893 					old_level = memorystatus_vm_pressure_level;
4894 				}
4895 			}
4896 			break;
4897 		default:
4898 			return KERN_INVALID_ARGUMENT;
4899 		}
4900 	}
4901 
4902 	*pressure_level = old_level;
4903 	return KERN_SUCCESS;
4904 #endif /* VM_PRESSURE_EVENTS */
4905 }
4906 
4907 #if VM_PRESSURE_EVENTS
/*
 * vm_pressure_thread:
 *
 * Continuation body of the VM pressure thread.  The very first pass
 * (thread_initialized still FALSE) skips the work and only performs setup;
 * every subsequent wakeup runs consider_vm_pressure_events() with
 * vm_pressure_thread_running set, so vm_pressure_response() can avoid
 * waking a thread that is already working.  The thread then blocks on its
 * own event and restarts here on the next thread_wakeup(&vm_pressure_thread).
 */
void
vm_pressure_thread(void)
{
	static boolean_t thread_initialized = FALSE;

	if (thread_initialized == TRUE) {
		vm_pageout_state.vm_pressure_thread_running = TRUE;
		consider_vm_pressure_events();
		vm_pageout_state.vm_pressure_thread_running = FALSE;
	}

	/* setup calls below also run on later passes; harmless to repeat */
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
}
4928 #endif /* VM_PRESSURE_EVENTS */
4929 
4930 
4931 /*
4932  * called once per-second via "compute_averages"
4933  */
4934 void
4935 compute_pageout_gc_throttle(__unused void *arg)
4936 {
4937 	if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4938 		vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4939 		sched_cond_signal(&vm_pageout_gc_cond, vm_pageout_gc_thread);
4940 	}
4941 }
4942 
4943 /*
4944  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4945  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4946  * jetsams. We need to check if the zone map size is above its jetsam limit to
4947  * decide if this was indeed the case.
4948  *
4949  * We need to do this on a different thread because of the following reasons:
4950  *
4951  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4952  * itself causing the system to hang. We perform synchronous jetsams if we're
4953  * leaking in the VM map entries zone, so the leaking process could be doing a
4954  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4955  * jetsam itself. We also need the vm_map lock on the process termination path,
4956  * which would now lead the dying process to deadlock against itself.
4957  *
4958  * 2. The jetsam path might need to allocate zone memory itself. We could try
4959  * using the non-blocking variant of zalloc for this path, but we can still
4960  * end up trying to do a kmem_alloc when the zone maps are almost full.
4961  */
/*
 * vm_pageout_garbage_collect:
 *
 * Body of the "VM_pageout_garbage_collect" thread.  Entered once with
 * step == VM_PAGEOUT_GC_INIT for one-time setup, then loops forever:
 * each wakeup either performs a quick zone_gc(ZONE_GC_JETSAM) when the
 * zone map is nearing exhaustion, or a full cache-trimming pass
 * (stack/machine/reclaim/mbuf/buffer-cache/zone) otherwise.  See the
 * block comment above for why this runs on its own thread.
 */
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
	assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);

	if (step != VM_PAGEOUT_GC_INIT) {
		sched_cond_ack(&vm_pageout_gc_cond);
	}

	while (true) {
		if (step == VM_PAGEOUT_GC_INIT) {
			/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
			thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
			step = VM_PAGEOUT_GC_COLLECT;
		} else if (zone_map_nearing_exhaustion()) {
			/*
			 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
			 *
			 * Bail out after calling zone_gc (which triggers the
			 * zone-map-exhaustion jetsams). If we fall through, the subsequent
			 * operations that clear out a bunch of caches might allocate zone
			 * memory themselves (for eg. vm_map operations would need VM map
			 * entries). Since the zone map is almost full at this point, we
			 * could end up with a panic. We just need to quickly jetsam a
			 * process and exit here.
			 *
			 * It could so happen that we were woken up to relieve memory
			 * pressure and the zone map also happened to be near its limit at
			 * the time, in which case we'll skip out early. But that should be
			 * ok; if memory pressure persists, the thread will simply be woken
			 * up again.
			 */

			zone_gc(ZONE_GC_JETSAM);
		} else {
			/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
			boolean_t buf_large_zfree = FALSE;
			boolean_t first_try = TRUE;

			stack_collect();

			consider_machine_collect();
#if CONFIG_DEFERRED_RECLAIM
			vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE);
#endif /* CONFIG_DEFERRED_RECLAIM */
#if CONFIG_MBUF_MCACHE
			mbuf_drain(FALSE);
#endif /* CONFIG_MBUF_MCACHE */

			/*
			 * Keep trimming while the buffer cache keeps returning
			 * large zone elements and we are still short of the
			 * free-page target.
			 */
			do {
				if (consider_buffer_cache_collect != NULL) {
					buf_large_zfree = (*consider_buffer_cache_collect)(0);
				}
				if (first_try == TRUE || buf_large_zfree == TRUE) {
					/*
					 * zone_gc should be last, because the other operations
					 * might return memory to zones.
					 */
					zone_gc(ZONE_GC_TRIM);
				}
				first_try = FALSE;
			} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);

			consider_machine_adjust();
		}

		sched_cond_wait_parameter(&vm_pageout_gc_cond, THREAD_UNINT, vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
	}
	__builtin_unreachable();
}
5035 
5036 
5037 #if VM_PAGE_BUCKETS_CHECK
5038 #if VM_PAGE_FAKE_BUCKETS
5039 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
5040 #endif /* VM_PAGE_FAKE_BUCKETS */
5041 #endif /* VM_PAGE_BUCKETS_CHECK */
5042 
5043 
5044 
5045 void
5046 vm_set_restrictions(unsigned int num_cpus)
5047 {
5048 	int vm_restricted_to_single_processor = 0;
5049 
5050 	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
5051 		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
5052 		vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
5053 	} else {
5054 		assert(num_cpus > 0);
5055 
5056 		if (num_cpus <= 3) {
5057 			/*
5058 			 * on systems with a limited number of CPUS, bind the
5059 			 * 4 major threads that can free memory and that tend to use
5060 			 * a fair bit of CPU under pressured conditions to a single processor.
5061 			 * This insures that these threads don't hog all of the available CPUs
5062 			 * (important for camera launch), while allowing them to run independently
5063 			 * w/r to locks... the 4 threads are
5064 			 * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
5065 			 * vm_compressor_swap_trigger_thread (minor and major compactions),
5066 			 * memorystatus_thread (jetsams).
5067 			 *
5068 			 * the first time the thread is run, it is responsible for checking the
5069 			 * state of vm_restricted_to_single_processor, and if TRUE it calls
5070 			 * thread_bind_master...  someday this should be replaced with a group
5071 			 * scheduling mechanism and KPI.
5072 			 */
5073 			vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5074 		} else {
5075 			vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5076 		}
5077 	}
5078 }
5079 
5080 /*
5081  * Set up vm_config based on the vm_compressor_mode.
5082  * Must run BEFORE the pageout thread starts up.
5083  */
5084 __startup_func
5085 void
5086 vm_config_init(void)
5087 {
5088 	bzero(&vm_config, sizeof(vm_config));
5089 
5090 	switch (vm_compressor_mode) {
5091 	case VM_PAGER_DEFAULT:
5092 		printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5093 		OS_FALLTHROUGH;
5094 
5095 	case VM_PAGER_COMPRESSOR_WITH_SWAP:
5096 		vm_config.compressor_is_present = TRUE;
5097 		vm_config.swap_is_present = TRUE;
5098 		vm_config.compressor_is_active = TRUE;
5099 		vm_config.swap_is_active = TRUE;
5100 		break;
5101 
5102 	case VM_PAGER_COMPRESSOR_NO_SWAP:
5103 		vm_config.compressor_is_present = TRUE;
5104 		vm_config.swap_is_present = TRUE;
5105 		vm_config.compressor_is_active = TRUE;
5106 		break;
5107 
5108 	case VM_PAGER_FREEZER_DEFAULT:
5109 		printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5110 		OS_FALLTHROUGH;
5111 
5112 	case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5113 		vm_config.compressor_is_present = TRUE;
5114 		vm_config.swap_is_present = TRUE;
5115 		break;
5116 
5117 	case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5118 		vm_config.compressor_is_present = TRUE;
5119 		vm_config.swap_is_present = TRUE;
5120 		vm_config.compressor_is_active = TRUE;
5121 		vm_config.freezer_swap_is_active = TRUE;
5122 		break;
5123 
5124 	case VM_PAGER_NOT_CONFIGURED:
5125 		break;
5126 
5127 	default:
5128 		printf("unknown compressor mode - %x\n", vm_compressor_mode);
5129 		break;
5130 	}
5131 }
5132 
5133 __startup_func
5134 static void
5135 vm_pageout_create_gc_thread(void)
5136 {
5137 	thread_t thread;
5138 
5139 	sched_cond_init(&vm_pageout_gc_cond);
5140 	if (kernel_thread_create(vm_pageout_garbage_collect,
5141 	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
5142 		panic("vm_pageout_garbage_collect: create failed");
5143 	}
5144 	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5145 	if (thread->reserved_stack == 0) {
5146 		assert(thread->kernel_stack);
5147 		thread->reserved_stack = thread->kernel_stack;
5148 	}
5149 
5150 	/* thread is started in vm_pageout() */
5151 	vm_pageout_gc_thread = thread;
5152 }
5153 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5154 
/*
 * vm_pageout:
 *
 * Body of the main pageout daemon thread ("VM_pageout_scan").  Performs
 * one-time setup -- thread priority/privilege/binding, default paging
 * parameters, pageout queue initialization, and creation of the helper
 * threads (external pageout I/O thread, GC thread start, pressure
 * thread) -- then enters vm_pageout_continue(), which never returns.
 */
void
vm_pageout(void)
{
	thread_t        self = current_thread();
	thread_t        thread;
	kern_return_t   result;
	spl_t           s;

	/*
	 * Set thread privileges.
	 */
	s = splsched();

#if CONFIG_VPS_DYNAMIC_PRIO
	if (vps_dynamic_priority_enabled) {
		sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
		thread_set_eager_preempt(self);
	} else {
		sched_set_kernel_thread_priority(self, BASEPRI_VM);
	}
#else /* CONFIG_VPS_DYNAMIC_PRIO */
	sched_set_kernel_thread_priority(self, BASEPRI_VM);
#endif /* CONFIG_VPS_DYNAMIC_PRIO */

	thread_lock(self);
	self->options |= TH_OPT_VMPRIV;
	thread_unlock(self);

	/* pin this thread's kernel stack so pageout can always make progress */
	if (!self->reserved_stack) {
		self->reserved_stack = self->kernel_stack;
	}

	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
	    !vps_dynamic_priority_enabled) {
		thread_vm_bind_group_add();
	}


#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
	PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
	if (vm_pgo_pbound) {
		/*
		 * Use the soft bound option for vm pageout to allow it to run on
		 * E-cores if P-cluster is unavailable.
		 */
		thread_soft_bind_cluster_type(self, 'P');
	}
#endif /* __AMP__ */

	PE_parse_boot_argn("vmpgo_protect_realtime",
	    &vm_pageout_protect_realtime,
	    sizeof(vm_pageout_protect_realtime));
	splx(s);

	thread_set_thread_name(current_thread(), "VM_pageout_scan");

	/*
	 *	Initialize some paging parameters.
	 */

	vm_pageout_state.vm_pressure_thread_running = FALSE;
	vm_pageout_state.vm_pressure_changed = FALSE;
	vm_pageout_state.memorystatus_purge_on_warning = 2;
	vm_pageout_state.memorystatus_purge_on_urgent = 5;
	vm_pageout_state.memorystatus_purge_on_critical = 8;
	vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
	vm_pageout_state.vm_page_speculative_percentage = 5;
	vm_pageout_state.vm_page_speculative_target = 0;

	vm_pageout_state.vm_pageout_swap_wait = 0;
	vm_pageout_state.vm_pageout_idle_wait = 0;
	vm_pageout_state.vm_pageout_empty_wait = 0;
	vm_pageout_state.vm_pageout_burst_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_wait = 0;
	vm_pageout_state.vm_pageout_deadlock_relief = 0;
	vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

	vm_pageout_state.vm_pageout_inactive = 0;
	vm_pageout_state.vm_pageout_inactive_used = 0;
	vm_pageout_state.vm_pageout_inactive_clean = 0;

	vm_pageout_state.vm_memory_pressure = 0;
	vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
	vm_pageout_state.vm_page_filecache_min_divisor = 70;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
	vm_pageout_state.vm_page_filecache_min_divisor = 27;
	vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
	vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

	vm_pageout_state.vm_pageout_considered_page_last = 0;

	/* apply compiled-in defaults for any wait values still at zero */
	if (vm_pageout_state.vm_pageout_swap_wait == 0) {
		vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
	}

	if (vm_pageout_state.vm_pageout_idle_wait == 0) {
		vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
	}

	if (vm_pageout_state.vm_pageout_burst_wait == 0) {
		vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
	}

	if (vm_pageout_state.vm_pageout_empty_wait == 0) {
		vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
		vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
	}

	if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
		vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
	}

	if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
		vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
	}
	/*
	 * even if we've already called vm_page_free_reserve
	 * call it again here to insure that the targets are
	 * accurately calculated (it uses vm_page_free_count_init)
	 * calling it with an arg of 0 will not change the reserve
	 * but will re-calculate free_min and free_target
	 */
	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
	}

	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));

	vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;

	vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);

#if DEVELOPMENT || DEBUG
	bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
	vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */


	/* internal pageout thread started when default pager registered first time */
	/* external pageout and garbage collection threads started here */
	struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
	ethr->id = 0;
	ethr->q = &vm_pageout_queue_external;
	/* in external_state these cheads are never used, they are used only in internal_state for the compressor */
	ethr->current_early_swapout_chead = NULL;
	ethr->current_regular_swapout_chead = NULL;
	ethr->current_late_swapout_chead = NULL;
	ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
	ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
	sched_cond_init(&(ethr->pgo_wakeup));

	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	/* start the GC thread that vm_pageout_create_gc_thread() created at boot */
	thread_mtx_lock(vm_pageout_gc_thread );
	thread_start(vm_pageout_gc_thread );
	thread_mtx_unlock(vm_pageout_gc_thread);

#if VM_PRESSURE_EVENTS
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
	}

	thread_deallocate(thread);
#endif

	vm_object_reaper_init();


	if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
		vm_compressor_init();
	}

#if VM_PRESSURE_EVENTS
	vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
	vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
	    (uint64_t) vm_page_fake_buckets_start,
	    (uint64_t) vm_page_fake_buckets_end);
	pmap_protect(kernel_pmap,
	    vm_page_fake_buckets_start,
	    vm_page_fake_buckets_end,
	    VM_PROT_READ);
//	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
	vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

#if __arm64__
//	vm_tests();
#endif /* __arm64__ */

	vm_pageout_continue();

	/*
	 * Unreached code!
	 *
	 * The vm_pageout_continue() call above never returns, so the code below is never
	 * executed.  We take advantage of this to declare several DTrace VM related probe
	 * points that our kernel doesn't have an analog for.  These are probe points that
	 * exist in Solaris and are in the DTrace documentation, so people may have written
	 * scripts that use them.  Declaring the probe points here means their scripts will
	 * compile and execute which we want for portability of the scripts, but since this
	 * section of code is never reached, the probe points will simply never fire.  Yes,
	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
	 * Solaris specific VM events in mind, not portability to different VM implementations.
	 */

	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
	/*NOTREACHED*/
}
5406 
5407 
5408 
/*
 * vm_pageout_internal_start:
 *
 * Size and launch the compressor I/O threads that drain the internal
 * pageout queue.  Called once at startup when a compressor is
 * configured (asserted below).  Returns the result of the last
 * kernel_thread_start_priority() call; creation failure panics, so
 * in practice this returns KERN_SUCCESS.
 */
kern_return_t
vm_pageout_internal_start(void)
{
	kern_return_t   result = KERN_SUCCESS;
	host_basic_info_data_t hinfo;
	vm_offset_t     buf, bufsize;

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

	assert(hinfo.max_cpus > 0);

#if !XNU_TARGET_OS_OSX
	/* non-macOS targets always run a single compressor thread */
	vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
	/* macOS: use two threads on machines with more than 4 CPUs */
	if (hinfo.max_cpus > 4) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	} else {
		vm_pageout_state.vm_compressor_thread_count = 1;
	}
#endif /* !XNU_TARGET_OS_OSX */
#if     __AMP__
	/* NOTE(review): vm_compressor_ebound presumably means the compressor
	 * is bound to one cluster type on asymmetric systems — confirm */
	if (vm_compressor_ebound) {
		vm_pageout_state.vm_compressor_thread_count = 2;
	}
#endif
	/* allow an override of the thread count from boot-args */
	PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
	    sizeof(vm_pageout_state.vm_compressor_thread_count));

	/* did we get from the bootargs an unreasonable number? */
	if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
		vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
	}
	/* clamp to [1, MAX_COMPRESSOR_THREAD_COUNT] */
	if (vm_pageout_state.vm_compressor_thread_count <= 0) {
		vm_pageout_state.vm_compressor_thread_count = 1;
	} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
		vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
	}

	/* scale the internal queue's laundry limit with the thread count */
	vm_pageout_queue_internal.pgo_maxlaundry =
	    (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;

	PE_parse_boot_argn("vmpgoi_maxlaundry",
	    &vm_pageout_queue_internal.pgo_maxlaundry,
	    sizeof(vm_pageout_queue_internal.pgo_maxlaundry));

#if DEVELOPMENT || DEBUG
	// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
	vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */

	bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;

	/*
	 * One contiguous allocation holding a scratch buffer per compressor
	 * thread; KMA_NOFAIL means this cannot return an error, hence no
	 * result check.
	 */
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

	/* initialize per-thread state and start each compressor thread */
	for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
		struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
		iq->id = i;
		iq->q = &vm_pageout_queue_internal;
		iq->current_early_swapout_chead = NULL;
		iq->current_regular_swapout_chead = NULL;
		iq->current_late_swapout_chead = NULL;
		iq->scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
		iq->benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */
		sched_cond_init(&(iq->pgo_wakeup));
		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)iq, BASEPRI_VM,
		    &(iq->pgo_iothread));

		if (result != KERN_SUCCESS) {
			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
		}
	}
	return result;
}
5492 
5493 #if CONFIG_IOSCHED
5494 /*
5495  * To support I/O Expedite for compressed files we mark the upls with special flags.
5496  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5497  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5498  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5499  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5500  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5501  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5502  * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5503  * unless the real I/O upl is being destroyed).
5504  */
5505 
5506 
/*
 * upl_set_decmp_info:
 *
 * Link a real-I/O UPL (`upl`) to the decmpfs request UPL (`src_upl`)
 * that it services, so expedites on the request can reach the
 * outstanding real I/O.  Takes an extra reference on src_upl which is
 * dropped when the real-I/O UPL is destroyed (see upl_destroy()).
 * The forward link (src_upl->decmp_io_upl) is protected by src_upl's
 * lock; the back link is only read at destroy time.
 */
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
	assert((src_upl->flags & UPL_DECMP_REQ) != 0);

	upl_lock(src_upl);
	if (src_upl->decmp_io_upl) {
		/*
		 * If there is already an alive real I/O UPL, ignore this new UPL.
		 * This case should rarely happen and even if it does, it just means
		 * that we might issue a spurious expedite which the driver is expected
		 * to handle.
		 */
		upl_unlock(src_upl);
		return;
	}
	src_upl->decmp_io_upl = (void *)upl;
	src_upl->ref_count++;	/* dropped via upl_deallocate() in upl_destroy() */

	upl->flags |= UPL_DECMP_REAL_IO;
	upl->decmp_io_upl = (void *)src_upl;
	upl_unlock(src_upl);
}
5530 #endif /* CONFIG_IOSCHED */
5531 
/*
 * When non-zero, every UPL is tracked on its object's uplq and tagged
 * with its creator (see upl_create()/upl_destroy()).
 */
#if UPL_DEBUG
int     upl_debug_enabled = 1;
#else
int     upl_debug_enabled = 0;
#endif
5537 
/*
 * upl_create:
 *
 * Allocate and initialize a UPL describing `size` bytes.
 *
 *	type:	combination of UPL_CREATE_INTERNAL (co-allocate the
 *		page info list), UPL_CREATE_LITE (use a page bitmap),
 *		UPL_CREATE_IO_TRACKING / UPL_CREATE_EXPEDITE_SUP
 *		(I/O scheduling support).
 *	flags:	initial upl->flags bits (more are set here).
 *	size:	size in bytes; must be page aligned (asserted).
 *
 * Returns the new UPL holding a single reference.
 */
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
	uint32_t pages = (uint32_t)atop(round_page_32(size));
	upl_t    upl;

	assert(page_aligned(size));

	/*
	 * FIXME: this code assumes the allocation always succeeds,
	 *        however `pages` can be up to MAX_UPL_SIZE.
	 *
	 *        The allocation size is above 32k (resp. 128k)
	 *        on 16k pages (resp. 4k), which kalloc might fail
	 *        to allocate.
	 */
	upl = kalloc_type(struct upl, struct upl_page_info,
	    (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
	if (type & UPL_CREATE_INTERNAL) {
		flags |= UPL_INTERNAL;
	}

	if (type & UPL_CREATE_LITE) {
		flags |= UPL_LITE;
		/* lite UPLs track their pages with a bitmap (freed in upl_destroy) */
		if (pages) {
			upl->lite_list = bitmap_alloc(pages);
		}
	}

	upl->flags = flags;
	upl->ref_count = 1;
	upl_lock_init(upl);
#if CONFIG_IOSCHED
	if (type & UPL_CREATE_IO_TRACKING) {
		/* record the creator's I/O policy for priority-based scheduling */
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
	}

	if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
		/* Only support expedite on internal UPLs */
		thread_t        curthread = current_thread();
		upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
		    Z_WAITOK | Z_ZERO);
		upl->flags |= UPL_EXPEDITE_SUPPORTED;
		/*
		 * If the creating thread is servicing a decmpfs request,
		 * link this UPL to it (see upl_set_decmp_info()).
		 */
		if (curthread->decmp_upl != NULL) {
			upl_set_decmp_info(upl, curthread->decmp_upl);
		}
	}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
	if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
		upl->upl_creator = current_thread();
		upl->flags |= UPL_TRACKED_BY_OBJECT;
	}
#endif

#if UPL_DEBUG
	/* capture the creation backtrace for debugging */
	upl->upl_create_btref = btref_get(__builtin_frame_address(0), 0);
#endif /* UPL_DEBUG */

	return upl;
}
5599 
/*
 * upl_destroy:
 *
 * Final teardown of a UPL once its last reference has been dropped:
 * sever any decmpfs request link, remove the UPL from its object's
 * tracking queue, drop the shadow object reference, and free the
 * associated allocations.  Called only from upl_deallocate().
 */
static void
upl_destroy(upl_t upl)
{
	uint32_t pages;

//	DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);

	if (upl->ext_ref_count) {
		panic("upl(%p) ext_ref_count", upl);
	}

#if CONFIG_IOSCHED
	if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
		/*
		 * This is a real-I/O UPL linked to a decmpfs request UPL:
		 * break the link and drop the reference that was taken in
		 * upl_set_decmp_info().
		 */
		upl_t src_upl;
		src_upl = upl->decmp_io_upl;
		assert((src_upl->flags & UPL_DECMP_REQ) != 0);
		upl_lock(src_upl);
		src_upl->decmp_io_upl = NULL;
		upl_unlock(src_upl);
		upl_deallocate(src_upl);
	}
#endif /* CONFIG_IOSCHED */

#if CONFIG_IOSCHED || UPL_DEBUG
	if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
	    !(upl->flags & UPL_VECTOR)) {
		vm_object_t     object;

		if (upl->flags & UPL_SHADOWED) {
			object = upl->map_object->shadow;
		} else {
			object = upl->map_object;
		}

		/* unlink the UPL from the object's tracking queue */
		vm_object_lock(object);
		queue_remove(&object->uplq, upl, upl_t, uplq);
		vm_object_activity_end(object);
		vm_object_collapse(object, 0, TRUE);
		vm_object_unlock(object);
	}
#endif
	/*
	 * drop a reference on the map_object whether or
	 * not a pageout object is inserted
	 */
	if (upl->flags & UPL_SHADOWED) {
		vm_object_deallocate(upl->map_object);
	}

	/* device-memory UPLs are accounted as covering a single page */
	if (upl->flags & UPL_DEVICE_MEMORY) {
		pages = 1;
	} else {
		pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
	}

	upl_lock_destroy(upl);

#if CONFIG_IOSCHED
	if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
		kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
	}
#endif

#if UPL_DEBUG
	/* release the backtrace refs recorded at create/commit time */
	for (int i = 0; i < upl->upl_commit_index; i++) {
		btref_put(upl->upl_commit_records[i].c_btref);
	}
	btref_put(upl->upl_create_btref);
#endif /* UPL_DEBUG */

	/* lite UPLs own a page bitmap allocated in upl_create() */
	if ((upl->flags & UPL_LITE) && pages) {
		bitmap_free(upl->lite_list, pages);
	}
	kfree_type(struct upl, struct upl_page_info,
	    (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
}
5676 
/*
 * upl_deallocate:
 *
 * Drop one reference on the UPL.  On the last reference, tear down
 * vector state, run the iodone callout if one is registered, and
 * destroy the UPL.
 */
void
upl_deallocate(upl_t upl)
{
	upl_lock(upl);

	if (--upl->ref_count == 0) {
		if (vector_upl_is_valid(upl)) {
			vector_upl_deallocate(upl);
		}
		/*
		 * ref_count reached zero under the lock, so we hold the
		 * only remaining reference; safe to keep using the UPL
		 * after dropping the lock.
		 */
		upl_unlock(upl);

		if (upl->upl_iodone) {
			upl_callout_iodone(upl);
		}

		upl_destroy(upl);
	} else {
		upl_unlock(upl);
	}
}
5697 
5698 #if CONFIG_IOSCHED
5699 void
5700 upl_mark_decmp(upl_t upl)
5701 {
5702 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5703 		upl->flags |= UPL_DECMP_REQ;
5704 		upl->upl_creator->decmp_upl = (void *)upl;
5705 	}
5706 }
5707 
5708 void
5709 upl_unmark_decmp(upl_t upl)
5710 {
5711 	if (upl && (upl->flags & UPL_DECMP_REQ)) {
5712 		upl->upl_creator->decmp_upl = NULL;
5713 	}
5714 }
5715 
5716 #endif /* CONFIG_IOSCHED */
5717 
/*
 * A pageout queue is considered to be "backing up" once its laundry
 * count reaches 80% of its configured maximum.
 */
#define VM_PAGE_Q_BACKING_UP(q)         \
	((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))

boolean_t must_throttle_writes(void);
5722 
5723 boolean_t
5724 must_throttle_writes()
5725 {
5726 	if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5727 	    vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5728 		return TRUE;
5729 	}
5730 
5731 	return FALSE;
5732 }
5733 
/* count of delayed-work context allocation failures (see vm_page_delayed_work_get_ctx) */
int vm_page_delayed_work_ctx_needed = 0;
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5736 
5737 __startup_func
5738 static void
5739 vm_page_delayed_work_init_ctx(void)
5740 {
5741 	uint16_t min_delayed_work_ctx_allocated = 16;
5742 
5743 	/*
5744 	 * try really hard to always keep NCPU elements around in the zone
5745 	 * in order for the UPL code to almost always get an element.
5746 	 */
5747 	if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5748 		min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5749 	}
5750 
5751 	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5752 }
5753 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5754 
5755 struct vm_page_delayed_work*
5756 vm_page_delayed_work_get_ctx(void)
5757 {
5758 	struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5759 
5760 	dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5761 
5762 	if (__probable(dw_ctx)) {
5763 		dw_ctx->delayed_owner = current_thread();
5764 	} else {
5765 		vm_page_delayed_work_ctx_needed++;
5766 	}
5767 	return dw_ctx ? dw_ctx->dwp : NULL;
5768 }
5769 
5770 void
5771 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5772 {
5773 	struct  vm_page_delayed_work_ctx *ldw_ctx;
5774 
5775 	ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5776 	ldw_ctx->delayed_owner = NULL;
5777 
5778 	zfree(dw_ctx_zone, ldw_ctx);
5779 }
5780 
5781 /*
5782  *	Routine:	vm_object_upl_request
5783  *	Purpose:
5784  *		Cause the population of a portion of a vm_object.
5785  *		Depending on the nature of the request, the pages
 *		returned may contain valid data or be uninitialized.
5787  *		A page list structure, listing the physical pages
5788  *		will be returned upon request.
5789  *		This function is called by the file system or any other
5790  *		supplier of backing store to a pager.
5791  *		IMPORTANT NOTE: The caller must still respect the relationship
5792  *		between the vm_object and its backing memory object.  The
5793  *		caller MUST NOT substitute changes in the backing file
5794  *		without first doing a memory_object_lock_request on the
 *		target range unless it is known that the pages are not
5796  *		shared with another entity at the pager level.
5797  *		Copy_in_to:
5798  *			if a page list structure is present
5799  *			return the mapped physical pages, where a
5800  *			page is not present, return a non-initialized
5801  *			one.  If the no_sync bit is turned on, don't
5802  *			call the pager unlock to synchronize with other
5803  *			possible copies of the page. Leave pages busy
5804  *			in the original object, if a page list structure
5805  *			was specified.  When a commit of the page list
5806  *			pages is done, the dirty bit will be set for each one.
5807  *		Copy_out_from:
5808  *			If a page list structure is present, return
5809  *			all mapped pages.  Where a page does not exist
5810  *			map a zero filled one. Leave pages busy in
5811  *			the original object.  If a page list structure
5812  *			is not specified, this call is a no-op.
5813  *
5814  *		Note:  access of default pager objects has a rather interesting
5815  *		twist.  The caller of this routine, presumably the file system
5816  *		page cache handling code, will never actually make a request
5817  *		against a default pager backed object.  Only the default
5818  *		pager will make requests on backing store related vm_objects
5819  *		In this way the default pager can maintain the relationship
5820  *		between backing store files (abstract memory objects) and
5821  *		the vm_objects (cache objects), they support.
5822  *
5823  */
5824 
5825 __private_extern__ kern_return_t
5826 vm_object_upl_request(
5827 	vm_object_t             object,
5828 	vm_object_offset_t      offset,
5829 	upl_size_t              size,
5830 	upl_t                   *upl_ptr,
5831 	upl_page_info_array_t   user_page_list,
5832 	unsigned int            *page_list_count,
5833 	upl_control_flags_t     cntrl_flags,
5834 	vm_tag_t                tag)
5835 {
5836 	vm_page_t               dst_page = VM_PAGE_NULL;
5837 	vm_object_offset_t      dst_offset;
5838 	upl_size_t              xfer_size;
5839 	unsigned int            size_in_pages;
5840 	boolean_t               dirty;
5841 	boolean_t               hw_dirty;
5842 	upl_t                   upl = NULL;
5843 	unsigned int            entry;
5844 	vm_page_t               alias_page = NULL;
5845 	int                     refmod_state = 0;
5846 	vm_object_t             last_copy_object;
5847 	uint32_t                last_copy_version;
5848 	struct  vm_page_delayed_work    dw_array;
5849 	struct  vm_page_delayed_work    *dwp, *dwp_start;
5850 	bool                    dwp_finish_ctx = TRUE;
5851 	int                     dw_count;
5852 	int                     dw_limit;
5853 	int                     io_tracking_flag = 0;
5854 	int                     grab_options;
5855 	int                     page_grab_count = 0;
5856 	ppnum_t                 phys_page;
5857 	pmap_flush_context      pmap_flush_context_storage;
5858 	boolean_t               pmap_flushes_delayed = FALSE;
5859 	task_t                  task = current_task();
5860 
5861 	dwp_start = dwp = NULL;
5862 
5863 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5864 		/*
5865 		 * For forward compatibility's sake,
5866 		 * reject any unknown flag.
5867 		 */
5868 		return KERN_INVALID_VALUE;
5869 	}
5870 	if ((!object->internal) && (object->paging_offset != 0)) {
5871 		panic("vm_object_upl_request: external object with non-zero paging offset");
5872 	}
5873 	if (object->phys_contiguous) {
5874 		panic("vm_object_upl_request: contiguous object specified");
5875 	}
5876 
5877 	assertf(page_aligned(offset) && page_aligned(size),
5878 	    "offset 0x%llx size 0x%x",
5879 	    offset, size);
5880 
5881 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5882 
5883 	dw_count = 0;
5884 	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5885 	dwp_start = vm_page_delayed_work_get_ctx();
5886 	if (dwp_start == NULL) {
5887 		dwp_start = &dw_array;
5888 		dw_limit = 1;
5889 		dwp_finish_ctx = FALSE;
5890 	}
5891 
5892 	dwp = dwp_start;
5893 
5894 	if (size > MAX_UPL_SIZE_BYTES) {
5895 		size = MAX_UPL_SIZE_BYTES;
5896 	}
5897 
5898 	if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5899 		*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5900 	}
5901 
5902 #if CONFIG_IOSCHED || UPL_DEBUG
5903 	if (object->io_tracking || upl_debug_enabled) {
5904 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5905 	}
5906 #endif
5907 #if CONFIG_IOSCHED
5908 	if (object->io_tracking) {
5909 		io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5910 	}
5911 #endif
5912 
5913 	if (cntrl_flags & UPL_SET_INTERNAL) {
5914 		if (cntrl_flags & UPL_SET_LITE) {
5915 			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5916 		} else {
5917 			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5918 		}
5919 		user_page_list = size ? upl->page_list : NULL;
5920 	} else {
5921 		if (cntrl_flags & UPL_SET_LITE) {
5922 			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5923 		} else {
5924 			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5925 		}
5926 	}
5927 	*upl_ptr = upl;
5928 
5929 	if (user_page_list) {
5930 		user_page_list[0].device = FALSE;
5931 	}
5932 
5933 	if (cntrl_flags & UPL_SET_LITE) {
5934 		upl->map_object = object;
5935 	} else {
5936 		upl->map_object = vm_object_allocate(size, object->vmo_provenance);
5937 		vm_object_lock(upl->map_object);
5938 		/*
		 * No need to lock the new object: nobody else knows
5940 		 * about it yet, so it's all ours so far.
5941 		 */
5942 		upl->map_object->shadow = object;
5943 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5944 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5945 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5946 		upl->map_object->vo_shadow_offset = offset;
5947 		upl->map_object->wimg_bits = object->wimg_bits;
5948 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
5949 		    "object %p shadow_offset 0x%llx",
5950 		    upl->map_object, upl->map_object->vo_shadow_offset);
5951 		vm_object_unlock(upl->map_object);
5952 
5953 		alias_page = vm_page_create_fictitious();
5954 
5955 		upl->flags |= UPL_SHADOWED;
5956 	}
5957 	if (cntrl_flags & UPL_FOR_PAGEOUT) {
5958 		upl->flags |= UPL_PAGEOUT;
5959 	}
5960 
5961 	vm_object_lock(object);
5962 	vm_object_activity_begin(object);
5963 
5964 	grab_options = 0;
5965 #if CONFIG_SECLUDED_MEMORY
5966 	if (object->can_grab_secluded) {
5967 		grab_options |= VM_PAGE_GRAB_SECLUDED;
5968 	}
5969 #endif /* CONFIG_SECLUDED_MEMORY */
5970 
5971 	/*
5972 	 * we can lock in the paging_offset once paging_in_progress is set
5973 	 */
5974 	upl->u_size = size;
5975 	upl->u_offset = offset + object->paging_offset;
5976 
5977 #if CONFIG_IOSCHED || UPL_DEBUG
5978 	if (object->io_tracking || upl_debug_enabled) {
5979 		vm_object_activity_begin(object);
5980 		queue_enter(&object->uplq, upl, upl_t, uplq);
5981 	}
5982 #endif
5983 	if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5984 		/*
5985 		 * Honor copy-on-write obligations
5986 		 *
5987 		 * The caller is gathering these pages and
5988 		 * might modify their contents.  We need to
5989 		 * make sure that the copy object has its own
5990 		 * private copies of these pages before we let
5991 		 * the caller modify them.
5992 		 */
5993 		vm_object_update(object,
5994 		    offset,
5995 		    size,
5996 		    NULL,
5997 		    NULL,
5998 		    FALSE,              /* should_return */
5999 		    MEMORY_OBJECT_COPY_SYNC,
6000 		    VM_PROT_NO_CHANGE);
6001 
6002 		VM_PAGEOUT_DEBUG(upl_cow, 1);
6003 		VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
6004 	}
6005 	/*
6006 	 * remember which copy object we synchronized with
6007 	 */
6008 	last_copy_object = object->vo_copy;
6009 	last_copy_version = object->vo_copy_version;
6010 	entry = 0;
6011 
6012 	xfer_size = size;
6013 	dst_offset = offset;
6014 	size_in_pages = size / PAGE_SIZE;
6015 
6016 	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
6017 	    object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
6018 		object->scan_collisions = 0;
6019 	}
6020 
6021 	if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
6022 		boolean_t       isSSD = FALSE;
6023 
6024 #if !XNU_TARGET_OS_OSX
6025 		isSSD = TRUE;
6026 #else /* !XNU_TARGET_OS_OSX */
6027 		vnode_pager_get_isSSD(object->pager, &isSSD);
6028 #endif /* !XNU_TARGET_OS_OSX */
6029 		vm_object_unlock(object);
6030 
6031 		OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6032 
6033 		if (isSSD == TRUE) {
6034 			delay(1000 * size_in_pages);
6035 		} else {
6036 			delay(5000 * size_in_pages);
6037 		}
6038 		OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6039 
6040 		vm_object_lock(object);
6041 	}
6042 
6043 	while (xfer_size) {
6044 		dwp->dw_mask = 0;
6045 
6046 		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
6047 			vm_object_unlock(object);
6048 			alias_page = vm_page_create_fictitious();
6049 			vm_object_lock(object);
6050 		}
6051 		if (cntrl_flags & UPL_COPYOUT_FROM) {
6052 			upl->flags |= UPL_PAGE_SYNC_DONE;
6053 
6054 			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
6055 			    vm_page_is_fictitious(dst_page) ||
6056 			    dst_page->vmp_absent ||
6057 			    VMP_ERROR_GET(dst_page) ||
6058 			    dst_page->vmp_cleaning ||
6059 			    (VM_PAGE_WIRED(dst_page))) {
6060 				if (user_page_list) {
6061 					user_page_list[entry].phys_addr = 0;
6062 				}
6063 
6064 				goto try_next_page;
6065 			}
6066 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6067 
6068 			/*
6069 			 * grab this up front...
			 * a high percentage of the time we're going to
6071 			 * need the hardware modification state a bit later
6072 			 * anyway... so we can eliminate an extra call into
6073 			 * the pmap layer by grabbing it here and recording it
6074 			 */
6075 			if (dst_page->vmp_pmapped) {
6076 				refmod_state = pmap_get_refmod(phys_page);
6077 			} else {
6078 				refmod_state = 0;
6079 			}
6080 
6081 			if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6082 				/*
6083 				 * page is on inactive list and referenced...
6084 				 * reactivate it now... this gets it out of the
6085 				 * way of vm_pageout_scan which would have to
6086 				 * reactivate it upon tripping over it
6087 				 */
6088 				dwp->dw_mask |= DW_vm_page_activate;
6089 			}
6090 			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6091 				/*
6092 				 * we're only asking for DIRTY pages to be returned
6093 				 */
6094 				if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6095 					/*
6096 					 * if we were the page stolen by vm_pageout_scan to be
6097 					 * cleaned (as opposed to a buddy being clustered in
6098 					 * or this request is not being driven by a PAGEOUT cluster
6099 					 * then we only need to check for the page being dirty or
6100 					 * precious to decide whether to return it
6101 					 */
6102 					if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6103 						goto check_busy;
6104 					}
6105 					goto dont_return;
6106 				}
6107 				/*
6108 				 * this is a request for a PAGEOUT cluster and this page
6109 				 * is merely along for the ride as a 'buddy'... not only
6110 				 * does it have to be dirty to be returned, but it also
6111 				 * can't have been referenced recently...
6112 				 */
6113 				if ((hibernate_cleaning_in_progress == TRUE ||
6114 				    (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6115 				    (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6116 				    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6117 					goto check_busy;
6118 				}
6119 dont_return:
6120 				/*
6121 				 * if we reach here, we're not to return
6122 				 * the page... go on to the next one
6123 				 */
6124 				if (dst_page->vmp_laundry == TRUE) {
6125 					/*
6126 					 * if we get here, the page is not 'cleaning' (filtered out above).
6127 					 * since it has been referenced, remove it from the laundry
6128 					 * so we don't pay the cost of an I/O to clean a page
6129 					 * we're just going to take back
6130 					 */
6131 					vm_page_lockspin_queues();
6132 
6133 					vm_pageout_steal_laundry(dst_page, TRUE);
6134 					vm_page_activate(dst_page);
6135 
6136 					vm_page_unlock_queues();
6137 				}
6138 				if (user_page_list) {
6139 					user_page_list[entry].phys_addr = 0;
6140 				}
6141 
6142 				goto try_next_page;
6143 			}
6144 check_busy:
6145 			if (dst_page->vmp_busy) {
6146 				if (cntrl_flags & UPL_NOBLOCK) {
6147 					if (user_page_list) {
6148 						user_page_list[entry].phys_addr = 0;
6149 					}
6150 					dwp->dw_mask = 0;
6151 
6152 					goto try_next_page;
6153 				}
6154 				/*
6155 				 * someone else is playing with the
6156 				 * page.  We will have to wait.
6157 				 */
6158 				vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6159 
6160 				continue;
6161 			}
6162 			if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6163 				vm_page_lockspin_queues();
6164 
6165 				if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6166 					/*
6167 					 * we've buddied up a page for a clustered pageout
6168 					 * that has already been moved to the pageout
6169 					 * queue by pageout_scan... we need to remove
6170 					 * it from the queue and drop the laundry count
6171 					 * on that queue
6172 					 */
6173 					vm_pageout_throttle_up(dst_page);
6174 				}
6175 				vm_page_unlock_queues();
6176 			}
6177 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6178 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6179 
6180 			if (phys_page > upl->highest_page) {
6181 				upl->highest_page = phys_page;
6182 			}
6183 
6184 			assert(!pmap_is_noencrypt(phys_page));
6185 
6186 			if (cntrl_flags & UPL_SET_LITE) {
6187 				unsigned int    pg_num;
6188 
6189 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6190 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6191 				bitmap_set(upl->lite_list, pg_num);
6192 
6193 				if (hw_dirty) {
6194 					if (pmap_flushes_delayed == FALSE) {
6195 						pmap_flush_context_init(&pmap_flush_context_storage);
6196 						pmap_flushes_delayed = TRUE;
6197 					}
6198 					pmap_clear_refmod_options(phys_page,
6199 					    VM_MEM_MODIFIED,
6200 					    PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6201 					    &pmap_flush_context_storage);
6202 				}
6203 
6204 				/*
6205 				 * Mark original page as cleaning
6206 				 * in place.
6207 				 */
6208 				dst_page->vmp_cleaning = TRUE;
6209 				dst_page->vmp_precious = FALSE;
6210 			} else {
6211 				/*
6212 				 * use pageclean setup, it is more
6213 				 * convenient even for the pageout
6214 				 * cases here
6215 				 */
6216 				vm_object_lock(upl->map_object);
6217 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6218 				vm_object_unlock(upl->map_object);
6219 
6220 				alias_page->vmp_absent = FALSE;
6221 				alias_page = NULL;
6222 			}
6223 			if (dirty) {
6224 				SET_PAGE_DIRTY(dst_page, FALSE);
6225 			} else {
6226 				dst_page->vmp_dirty = FALSE;
6227 			}
6228 
6229 			if (!dirty) {
6230 				dst_page->vmp_precious = TRUE;
6231 			}
6232 
6233 			if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6234 				if (!VM_PAGE_WIRED(dst_page)) {
6235 					dst_page->vmp_free_when_done = TRUE;
6236 				}
6237 			}
6238 		} else {
6239 			if ((cntrl_flags & UPL_WILL_MODIFY) &&
6240 			    (object->vo_copy != last_copy_object ||
6241 			    object->vo_copy_version != last_copy_version)) {
6242 				/*
6243 				 * Honor copy-on-write obligations
6244 				 *
6245 				 * The copy object has changed since we
6246 				 * last synchronized for copy-on-write.
6247 				 * Another copy object might have been
6248 				 * inserted while we released the object's
6249 				 * lock.  Since someone could have seen the
6250 				 * original contents of the remaining pages
6251 				 * through that new object, we have to
6252 				 * synchronize with it again for the remaining
6253 				 * pages only.  The previous pages are "busy"
6254 				 * so they can not be seen through the new
6255 				 * mapping.  The new mapping will see our
6256 				 * upcoming changes for those previous pages,
6257 				 * but that's OK since they couldn't see what
6258 				 * was there before.  It's just a race anyway
6259 				 * and there's no guarantee of consistency or
6260 				 * atomicity.  We just don't want new mappings
6261 				 * to see both the *before* and *after* pages.
6262 				 */
6263 				if (object->vo_copy != VM_OBJECT_NULL) {
6264 					vm_object_update(
6265 						object,
6266 						dst_offset,/* current offset */
6267 						xfer_size, /* remaining size */
6268 						NULL,
6269 						NULL,
6270 						FALSE,     /* should_return */
6271 						MEMORY_OBJECT_COPY_SYNC,
6272 						VM_PROT_NO_CHANGE);
6273 
6274 					VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6275 					VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6276 				}
6277 				/*
6278 				 * remember the copy object we synced with
6279 				 */
6280 				last_copy_object = object->vo_copy;
6281 				last_copy_version = object->vo_copy_version;
6282 			}
6283 			dst_page = vm_page_lookup(object, dst_offset);
6284 
6285 			if (dst_page != VM_PAGE_NULL) {
6286 				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6287 					/*
6288 					 * skip over pages already present in the cache
6289 					 */
6290 					if (user_page_list) {
6291 						user_page_list[entry].phys_addr = 0;
6292 					}
6293 
6294 					goto try_next_page;
6295 				}
6296 				if (vm_page_is_fictitious(dst_page)) {
6297 					panic("need corner case for fictitious page");
6298 				}
6299 
6300 				if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6301 					/*
6302 					 * someone else is playing with the
6303 					 * page.  We will have to wait.
6304 					 */
6305 					vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
6306 
6307 					continue;
6308 				}
6309 				if (dst_page->vmp_laundry) {
6310 					vm_pageout_steal_laundry(dst_page, FALSE);
6311 				}
6312 			} else {
6313 				if (object->private) {
6314 					/*
6315 					 * This is a nasty wrinkle for users
6316 					 * of upl who encounter device or
6317 					 * private memory however, it is
6318 					 * unavoidable, only a fault can
6319 					 * resolve the actual backing
6320 					 * physical page by asking the
6321 					 * backing device.
6322 					 */
6323 					if (user_page_list) {
6324 						user_page_list[entry].phys_addr = 0;
6325 					}
6326 
6327 					goto try_next_page;
6328 				}
6329 				if (object->scan_collisions) {
6330 					/*
6331 					 * the pageout_scan thread is trying to steal
6332 					 * pages from this object, but has run into our
6333 					 * lock... grab 2 pages from the head of the object...
6334 					 * the first is freed on behalf of pageout_scan, the
6335 					 * 2nd is for our own use... we use vm_object_page_grab
6336 					 * in both cases to avoid taking pages from the free
6337 					 * list since we are under memory pressure and our
6338 					 * lock on this object is getting in the way of
6339 					 * relieving it
6340 					 */
6341 					dst_page = vm_object_page_grab(object);
6342 
6343 					if (dst_page != VM_PAGE_NULL) {
6344 						vm_page_release(dst_page,
6345 						    FALSE);
6346 					}
6347 
6348 					dst_page = vm_object_page_grab(object);
6349 				}
6350 				if (dst_page == VM_PAGE_NULL) {
6351 					/*
6352 					 * need to allocate a page
6353 					 */
6354 					dst_page = vm_page_grab_options(grab_options);
6355 					if (dst_page != VM_PAGE_NULL) {
6356 						page_grab_count++;
6357 					}
6358 				}
6359 				if (dst_page == VM_PAGE_NULL) {
6360 					if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6361 						/*
6362 						 * we don't want to stall waiting for pages to come onto the free list
6363 						 * while we're already holding absent pages in this UPL
6364 						 * the caller will deal with the empty slots
6365 						 */
6366 						if (user_page_list) {
6367 							user_page_list[entry].phys_addr = 0;
6368 						}
6369 
6370 						goto try_next_page;
6371 					}
6372 					/*
6373 					 * no pages available... wait
6374 					 * then try again for the same
6375 					 * offset...
6376 					 */
6377 					vm_object_unlock(object);
6378 
6379 					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6380 
6381 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6382 
6383 					VM_PAGE_WAIT();
6384 					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6385 
6386 					VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6387 
6388 					vm_object_lock(object);
6389 
6390 					continue;
6391 				}
6392 				vm_page_insert(dst_page, object, dst_offset);
6393 
6394 				dst_page->vmp_absent = TRUE;
6395 				dst_page->vmp_busy = FALSE;
6396 
6397 				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6398 					/*
6399 					 * if UPL_RET_ONLY_ABSENT was specified,
6400 					 * than we're definitely setting up a
6401 					 * upl for a clustered read/pagein
6402 					 * operation... mark the pages as clustered
6403 					 * so upl_commit_range can put them on the
6404 					 * speculative list
6405 					 */
6406 					dst_page->vmp_clustered = TRUE;
6407 
6408 					if (!(cntrl_flags & UPL_FILE_IO)) {
6409 						counter_inc(&vm_statistics_pageins);
6410 					}
6411 				}
6412 			}
6413 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6414 
6415 			dst_page->vmp_overwriting = TRUE;
6416 
6417 			if (dst_page->vmp_pmapped) {
6418 				if (!(cntrl_flags & UPL_FILE_IO)) {
6419 					/*
6420 					 * eliminate all mappings from the
6421 					 * original object and its prodigy
6422 					 */
6423 					refmod_state = pmap_disconnect(phys_page);
6424 				} else {
6425 					refmod_state = pmap_get_refmod(phys_page);
6426 				}
6427 			} else {
6428 				refmod_state = 0;
6429 			}
6430 
6431 			hw_dirty = refmod_state & VM_MEM_MODIFIED;
6432 			dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6433 
6434 			if (cntrl_flags & UPL_SET_LITE) {
6435 				unsigned int    pg_num;
6436 
6437 				pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6438 				assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6439 				bitmap_set(upl->lite_list, pg_num);
6440 
6441 				if (hw_dirty) {
6442 					pmap_clear_modify(phys_page);
6443 				}
6444 
6445 				/*
6446 				 * Mark original page as cleaning
6447 				 * in place.
6448 				 */
6449 				dst_page->vmp_cleaning = TRUE;
6450 				dst_page->vmp_precious = FALSE;
6451 			} else {
6452 				/*
6453 				 * use pageclean setup, it is more
6454 				 * convenient even for the pageout
6455 				 * cases here
6456 				 */
6457 				vm_object_lock(upl->map_object);
6458 				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6459 				vm_object_unlock(upl->map_object);
6460 
6461 				alias_page->vmp_absent = FALSE;
6462 				alias_page = NULL;
6463 			}
6464 
6465 			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6466 				upl->flags &= ~UPL_CLEAR_DIRTY;
6467 				upl->flags |= UPL_SET_DIRTY;
6468 				dirty = TRUE;
6469 				/*
6470 				 * Page belonging to a code-signed object is about to
6471 				 * be written. Mark it tainted and disconnect it from
6472 				 * all pmaps so processes have to fault it back in and
6473 				 * deal with the tainted bit.
6474 				 */
6475 				if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6476 					dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6477 					vm_page_upl_tainted++;
6478 					if (dst_page->vmp_pmapped) {
6479 						refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6480 						if (refmod_state & VM_MEM_REFERENCED) {
6481 							dst_page->vmp_reference = TRUE;
6482 						}
6483 					}
6484 				}
6485 			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6486 				/*
6487 				 * clean in place for read implies
6488 				 * that a write will be done on all
6489 				 * the pages that are dirty before
6490 				 * a upl commit is done.  The caller
6491 				 * is obligated to preserve the
6492 				 * contents of all pages marked dirty
6493 				 */
6494 				upl->flags |= UPL_CLEAR_DIRTY;
6495 			}
6496 			dst_page->vmp_dirty = dirty;
6497 
6498 			if (!dirty) {
6499 				dst_page->vmp_precious = TRUE;
6500 			}
6501 
6502 			if (!VM_PAGE_WIRED(dst_page)) {
6503 				/*
6504 				 * deny access to the target page while
6505 				 * it is being worked on
6506 				 */
6507 				dst_page->vmp_busy = TRUE;
6508 			} else {
6509 				dwp->dw_mask |= DW_vm_page_wire;
6510 			}
6511 
6512 			/*
6513 			 * We might be about to satisfy a fault which has been
6514 			 * requested. So no need for the "restart" bit.
6515 			 */
6516 			dst_page->vmp_restart = FALSE;
6517 			if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6518 				/*
6519 				 * expect the page to be used
6520 				 */
6521 				dwp->dw_mask |= DW_set_reference;
6522 			}
6523 			if (cntrl_flags & UPL_PRECIOUS) {
6524 				if (object->internal) {
6525 					SET_PAGE_DIRTY(dst_page, FALSE);
6526 					dst_page->vmp_precious = FALSE;
6527 				} else {
6528 					dst_page->vmp_precious = TRUE;
6529 				}
6530 			} else {
6531 				dst_page->vmp_precious = FALSE;
6532 			}
6533 		}
6534 		if (dst_page->vmp_busy) {
6535 			upl->flags |= UPL_HAS_BUSY;
6536 		}
6537 		if (VM_PAGE_WIRED(dst_page)) {
6538 			upl->flags |= UPL_HAS_WIRED;
6539 		}
6540 
6541 		if (phys_page > upl->highest_page) {
6542 			upl->highest_page = phys_page;
6543 		}
6544 		assert(!pmap_is_noencrypt(phys_page));
6545 		if (user_page_list) {
6546 			user_page_list[entry].phys_addr = phys_page;
6547 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6548 			user_page_list[entry].absent    = dst_page->vmp_absent;
6549 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
6550 			user_page_list[entry].precious  = dst_page->vmp_precious;
6551 			user_page_list[entry].device    = FALSE;
6552 			user_page_list[entry].needed    = FALSE;
6553 			if (dst_page->vmp_clustered == TRUE) {
6554 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6555 			} else {
6556 				user_page_list[entry].speculative = FALSE;
6557 			}
6558 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6559 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6560 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6561 			user_page_list[entry].mark      = FALSE;
6562 		}
6563 		/*
6564 		 * if UPL_RET_ONLY_ABSENT is set, then
6565 		 * we are working with a fresh page and we've
6566 		 * just set the clustered flag on it to
6567 		 * indicate that it was drug in as part of a
6568 		 * speculative cluster... so leave it alone
6569 		 */
6570 		if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6571 			/*
6572 			 * someone is explicitly grabbing this page...
6573 			 * update clustered and speculative state
6574 			 *
6575 			 */
6576 			if (dst_page->vmp_clustered) {
6577 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
6578 			}
6579 		}
6580 try_next_page:
6581 		if (dwp->dw_mask) {
6582 			if (dwp->dw_mask & DW_vm_page_activate) {
6583 				counter_inc(&vm_statistics_reactivations);
6584 			}
6585 
6586 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6587 
6588 			if (dw_count >= dw_limit) {
6589 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6590 
6591 				dwp = dwp_start;
6592 				dw_count = 0;
6593 			}
6594 		}
6595 		entry++;
6596 		dst_offset += PAGE_SIZE_64;
6597 		xfer_size -= PAGE_SIZE;
6598 	}
6599 	if (dw_count) {
6600 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6601 		dwp = dwp_start;
6602 		dw_count = 0;
6603 	}
6604 
6605 	if (alias_page != NULL) {
6606 		VM_PAGE_FREE(alias_page);
6607 	}
6608 	if (pmap_flushes_delayed == TRUE) {
6609 		pmap_flush(&pmap_flush_context_storage);
6610 	}
6611 
6612 	if (page_list_count != NULL) {
6613 		if (upl->flags & UPL_INTERNAL) {
6614 			*page_list_count = 0;
6615 		} else if (*page_list_count > entry) {
6616 			*page_list_count = entry;
6617 		}
6618 	}
6619 #if UPL_DEBUG
6620 	upl->upl_state = 1;
6621 #endif
6622 	vm_object_unlock(object);
6623 
6624 	VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6625 	if (task != NULL) {
6626 		counter_add(&task->pages_grabbed_upl, page_grab_count);
6627 	}
6628 
6629 	if (dwp_start && dwp_finish_ctx) {
6630 		vm_page_delayed_work_finish_ctx(dwp_start);
6631 		dwp_start = dwp = NULL;
6632 	}
6633 
6634 	return KERN_SUCCESS;
6635 }
6636 
/*
 * Count of UPLs created over executable mappings; bumped by
 * vm_map_create_upl() when it proceeds despite VM_PROT_EXECUTE
 * on the entry (such UPLs can later trigger code-signing violations).
 */
int cs_executable_create_upl = 0;
/* BSD-layer helpers used only for diagnostic printfs below */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6640 
/*
 * vm_map_create_upl:
 *
 * Create a UPL (universal page list) describing the memory backing the
 * range [offset, offset + *upl_size) of "map".  The routine resolves the
 * map entry covering the range (recursing into submaps, honoring
 * copy-on-write, and optionally syncing shadow/copy objects), then hands
 * off to vm_object_iopl_request() on the resolved VM object.
 *
 * Parameters:
 *	map		target address map (a submap may replace it internally)
 *	offset		start address of the range within "map"
 *	upl_size	in/out: requested size; may be trimmed to the map
 *			entry's extent, capped at MAX_UPL_SIZE_BYTES for
 *			non-phys-contiguous objects, and adjusted for maps
 *			whose page size is smaller than PAGE_SIZE
 *	upl		out: the new UPL (must be non-NULL)
 *	page_list	out: per-page info array filled by the object layer
 *	count		in/out: number of entries available/filled in page_list
 *	flags		in: caller's UPL_* control flags (rejected if any bit
 *			outside UPL_VALID_FLAGS is set); out: UPL_DEV_MEMORY /
 *			UPL_PHYS_CONTIG attributes of the backing object
 *	tag		VM tag used for accounting
 *
 * Returns: KERN_SUCCESS or a specific failure code (KERN_INVALID_VALUE,
 * KERN_INVALID_ARGUMENT, KERN_FAILURE, KERN_PROTECTION_FAILURE, or
 * whatever vm_object_iopl_request() returns).
 */
kern_return_t
vm_map_create_upl(
	vm_map_t                map,
	vm_map_address_t        offset,
	upl_size_t              *upl_size,
	upl_t                   *upl,
	upl_page_info_array_t   page_list,
	unsigned int            *count,
	upl_control_flags_t     *flags,
	vm_tag_t                tag)
{
	vm_map_entry_t          entry;
	upl_control_flags_t     caller_flags;
	int                     force_data_sync;
	int                     sync_cow_data;
	vm_object_t             local_object;
	vm_map_offset_t         local_offset;
	vm_map_offset_t         local_start;
	kern_return_t           ret;
	vm_map_address_t        original_offset;
	vm_map_size_t           original_size, adjusted_size;
	vm_map_offset_t         local_entry_start;
	vm_object_offset_t      local_entry_offset;
	vm_object_offset_t      offset_in_mapped_page;
	boolean_t               release_map = FALSE;   /* TRUE once "map" is a submap we hold a ref on */


start_with_map:
	/*
	 * (Re)entered each time we descend into a submap; "offset" and
	 * "*upl_size" have been re-expressed in the new map by then.
	 */

	original_offset = offset;
	original_size = *upl_size;
	adjusted_size = original_size;

	caller_flags = *flags;

	if (caller_flags & ~UPL_VALID_FLAGS) {
		/*
		 * For forward compatibility's sake,
		 * reject any unknown flag.
		 */
		ret = KERN_INVALID_VALUE;
		goto done;
	}
	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
	/* need to sync COW data unless the caller is only copying out */
	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);

	if (upl == NULL) {
		ret = KERN_INVALID_ARGUMENT;
		goto done;
	}

REDISCOVER_ENTRY:
	/*
	 * (Re)entered whenever we had to drop the map lock (or upgrade it
	 * and lost the race); the entry must be looked up again.
	 */
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, offset, &entry)) {
		vm_map_unlock_read(map);
		ret = KERN_FAILURE;
		goto done;
	}

	/* snapshot entry geometry for use after the map lock is dropped */
	local_entry_start = entry->vme_start;
	local_entry_offset = VME_OFFSET(entry);

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
	}

	/* trim the request so it doesn't extend past this map entry */
	if (entry->vme_end - original_offset < adjusted_size) {
		adjusted_size = entry->vme_end - original_offset;
		assert(adjusted_size > 0);
		*upl_size = (upl_size_t) adjusted_size;
		assert(*upl_size == adjusted_size);
	}

	if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
		/*
		 * Caller only wants to know what kind of memory backs the
		 * range; report device/phys-contiguous attributes and return
		 * without creating a UPL.
		 */
		*flags = 0;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) != VM_OBJECT_NULL) {
			if (VME_OBJECT(entry)->private) {
				*flags = UPL_DEV_MEMORY;
			}

			if (VME_OBJECT(entry)->phys_contiguous) {
				*flags |= UPL_PHYS_CONTIG;
			}
		}
		vm_map_unlock_read(map);
		ret = KERN_SUCCESS;
		goto done;
	}

	offset_in_mapped_page = 0;
	if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
		/*
		 * The map uses pages smaller than the kernel's PAGE_SIZE:
		 * align the request to the map's page boundaries and remember
		 * how far into the first mapped page the caller's offset was,
		 * so it can be re-applied before the final iopl request.
		 */
		offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
		*upl_size = (upl_size_t)
		    (vm_map_round_page(original_offset + adjusted_size,
		    VM_MAP_PAGE_MASK(map))
		    - offset);

		offset_in_mapped_page = original_offset - offset;
		assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));

		DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
	}

	if (!entry->is_sub_map) {
		/* cap UPL size unless the object is physically contiguous */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
		    !VME_OBJECT(entry)->phys_contiguous) {
			if (*upl_size > MAX_UPL_SIZE_BYTES) {
				*upl_size = MAX_UPL_SIZE_BYTES;
			}
		}

		/*
		 *      Create an object if necessary.
		 */
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (entry->max_protection == VM_PROT_NONE) {
				/* don't create an object for a reserved range */
				vm_map_unlock_read(map);
				ret = KERN_PROTECTION_FAILURE;
				goto done;
			}

			/* need the map lock exclusive to install the object */
			if (vm_map_lock_read_to_write(map)) {
				/* lost the lock: entry may have changed, retry */
				goto REDISCOVER_ENTRY;
			}

			VME_OBJECT_SET(entry,
			    vm_object_allocate((vm_size_t)
			    vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id),
			    false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);

			vm_map_lock_write_to_read(map);
		}

		/* writing into the UPL target requires a writable mapping */
		if (!(caller_flags & UPL_COPYOUT_FROM) &&
		    !(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(map);
			ret = KERN_PROTECTION_FAILURE;
			goto done;
		}
	}

#if !XNU_TARGET_OS_OSX
	if (map->pmap != kernel_pmap &&
	    (caller_flags & UPL_COPYOUT_FROM) &&
	    (entry->protection & VM_PROT_EXECUTE) &&
	    !(entry->protection & VM_PROT_WRITE)) {
		vm_offset_t     kaddr;
		vm_size_t       ksize;

		/*
		 * We're about to create a read-only UPL backed by
		 * memory from an executable mapping.
		 * Wiring the pages would result in the pages being copied
		 * (due to the "MAP_PRIVATE" mapping) and no longer
		 * code-signed, so no longer eligible for execution.
		 * Instead, let's copy the data into a kernel buffer and
		 * create the UPL from this kernel buffer.
		 * The kernel buffer is then freed, leaving the UPL holding
		 * the last reference on the VM object, so the memory will
		 * be released when the UPL is committed.
		 */

		vm_map_unlock_read(map);
		entry = VM_MAP_ENTRY_NULL;
		/* allocate kernel buffer */
		ksize = round_page(*upl_size);
		kaddr = 0;
		ret = kmem_alloc(kernel_map, &kaddr, ksize,
		    KMA_PAGEABLE | KMA_DATA, tag);
		if (ret == KERN_SUCCESS) {
			/* copyin the user data */
			ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
		}
		if (ret == KERN_SUCCESS) {
			if (ksize > *upl_size) {
				/* zero out the extra space in kernel buffer */
				memset((void *)(kaddr + *upl_size),
				    0,
				    ksize - *upl_size);
			}
			/* create the UPL from the kernel buffer */
			vm_object_offset_t      offset_in_object;
			vm_object_offset_t      offset_in_object_page;

			offset_in_object = offset - local_entry_start + local_entry_offset;
			offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
			assert(offset_in_object_page < PAGE_SIZE);
			assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
			*upl_size -= offset_in_object_page + offset_in_mapped_page;
			/* recurse on the kernel buffer's mapping */
			ret = vm_map_create_upl(kernel_map,
			    (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
			    upl_size, upl, page_list, count, flags, tag);
		}
		if (kaddr != 0) {
			/* free the kernel buffer */
			kmem_free(kernel_map, kaddr, ksize);
			kaddr = 0;
			ksize = 0;
		}
#if DEVELOPMENT || DEBUG
		DTRACE_VM4(create_upl_from_executable,
		    vm_map_t, map,
		    vm_map_address_t, offset,
		    upl_size_t, *upl_size,
		    kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}
#endif /* !XNU_TARGET_OS_OSX */

	if (!entry->is_sub_map) {
		local_object = VME_OBJECT(entry);
		assert(local_object != VM_OBJECT_NULL);
	}

	if (!entry->is_sub_map &&
	    !entry->needs_copy &&
	    *upl_size != 0 &&
	    local_object->vo_size > *upl_size && /* partial UPL */
	    entry->wired_count == 0 && /* No COW for entries that are wired */
	    (map->pmap != kernel_pmap) && /* alias checks */
	    (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
	    ||
	    ( /* case 2 */
		    local_object->internal &&
		    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
		    os_ref_get_count_raw(&local_object->ref_count) > 1))) {
		vm_prot_t       prot;

		/*
		 * Case 1:
		 * Set up the targeted range for copy-on-write to avoid
		 * applying true_share/copy_delay to the entire object.
		 *
		 * Case 2:
		 * This map entry covers only part of an internal
		 * object.  There could be other map entries covering
		 * other areas of this object and some of these map
		 * entries could be marked as "needs_copy", which
		 * assumes that the object is COPY_SYMMETRIC.
		 * To avoid marking this object as COPY_DELAY and
		 * "true_share", let's shadow it and mark the new
		 * (smaller) object as "true_share" and COPY_DELAY.
		 */

		if (vm_map_lock_read_to_write(map)) {
			/* lost the upgrade race: re-resolve the entry */
			goto REDISCOVER_ENTRY;
		}
		vm_map_lock_assert_exclusive(map);
		assert(VME_OBJECT(entry) == local_object);

		/* restrict the entry to exactly the UPL's range */
		vm_map_clip_start(map,
		    entry,
		    vm_map_trunc_page(offset,
		    VM_MAP_PAGE_MASK(map)));
		vm_map_clip_end(map,
		    entry,
		    vm_map_round_page(offset + *upl_size,
		    VM_MAP_PAGE_MASK(map)));
		if ((entry->vme_end - offset) < *upl_size) {
			*upl_size = (upl_size_t) (entry->vme_end - offset);
			assert(*upl_size == entry->vme_end - offset);
		}

		/* revoke write access so future writes fault and COW */
		prot = entry->protection & ~VM_PROT_WRITE;
		if (override_nx(map, VME_ALIAS(entry)) && prot) {
			prot |= VM_PROT_EXECUTE;
		}
		vm_object_pmap_protect(local_object,
		    VME_OFFSET(entry),
		    entry->vme_end - entry->vme_start,
		    ((entry->is_shared ||
		    map->mapped_in_other_pmaps)
		    ? PMAP_NULL
		    : map->pmap),
		    VM_MAP_PAGE_SIZE(map),
		    entry->vme_start,
		    prot);

		assert(entry->wired_count == 0);

		/*
		 * Lock the VM object and re-check its status: if it's mapped
		 * in another address space, we could still be racing with
		 * another thread holding that other VM map exclusively.
		 */
		vm_object_lock(local_object);
		if (local_object->true_share) {
			/* object is already in proper state: no COW needed */
			assert(local_object->copy_strategy !=
			    MEMORY_OBJECT_COPY_SYMMETRIC);
		} else {
			/* not true_share: ask for copy-on-write below */
			assert(local_object->copy_strategy ==
			    MEMORY_OBJECT_COPY_SYMMETRIC);
			entry->needs_copy = TRUE;
		}
		vm_object_unlock(local_object);

		vm_map_lock_write_to_read(map);
	}

	if (entry->needs_copy) {
		/*
		 * Honor copy-on-write for COPY_SYMMETRIC
		 * strategy.
		 */
		vm_map_t                local_map;
		vm_object_t             object;
		vm_object_offset_t      new_offset;
		vm_prot_t               prot;
		boolean_t               wired;
		vm_map_version_t        version;
		vm_map_t                real_map;
		vm_prot_t               fault_type;

		local_map = map;

		if (caller_flags & UPL_COPYOUT_FROM) {
			/* force the copy now even though we'll only read */
			fault_type = VM_PROT_READ | VM_PROT_COPY;
			vm_counters.create_upl_extra_cow++;
			vm_counters.create_upl_extra_cow_pages +=
			    (entry->vme_end - entry->vme_start) / PAGE_SIZE;
		} else {
			fault_type = VM_PROT_WRITE;
		}
		/*
		 * Fault-style lookup resolves the COW; we only wanted the
		 * side effect, so all locks/references are released and we
		 * re-resolve the entry from scratch.
		 */
		if (vm_map_lookup_and_lock_object(&local_map,
		    offset, fault_type,
		    OBJECT_LOCK_EXCLUSIVE,
		    &version, &object,
		    &new_offset, &prot, &wired,
		    NULL,
		    &real_map, NULL) != KERN_SUCCESS) {
			if (fault_type == VM_PROT_WRITE) {
				vm_counters.create_upl_lookup_failure_write++;
			} else {
				vm_counters.create_upl_lookup_failure_copy++;
			}
			vm_map_unlock_read(local_map);
			ret = KERN_FAILURE;
			goto done;
		}
		if (real_map != local_map) {
			vm_map_unlock(real_map);
		}
		vm_map_unlock_read(local_map);

		vm_object_unlock(object);

		goto REDISCOVER_ENTRY;
	}

	if (entry->is_sub_map) {
		/*
		 * Descend into the submap: take a reference on it, translate
		 * the offset into the submap's address space, and restart.
		 */
		vm_map_t        submap;

		submap = VME_SUBMAP(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_map_reference(submap);
		vm_map_unlock_read(map);

		DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
		offset += offset_in_mapped_page;
		*upl_size -= offset_in_mapped_page;

		/* drop the previous submap's reference, if any */
		if (release_map) {
			vm_map_deallocate(map);
		}
		map = submap;
		release_map = TRUE;
		offset = local_offset + (offset - local_start);
		goto start_with_map;
	}

	if (sync_cow_data &&
	    (VME_OBJECT(entry)->shadow ||
	    VME_OBJECT(entry)->vo_copy)) {
		/*
		 * Object has a shadow/copy chain and the caller may modify
		 * the pages: sync the shadow's data once (sync_cow_data is
		 * cleared below so this happens at most one time), then
		 * re-resolve the entry since the map lock was dropped.
		 */
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		if (local_object->shadow && local_object->vo_copy) {
			vm_object_lock_request(local_object->shadow,
			    ((vm_object_offset_t)
			    ((offset - local_start) +
			    local_offset) +
			    local_object->vo_shadow_offset),
			    *upl_size, FALSE,
			    MEMORY_OBJECT_DATA_SYNC,
			    VM_PROT_NO_CHANGE);
		}
		sync_cow_data = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	if (force_data_sync) {
		/*
		 * Caller passed UPL_FORCE_DATA_SYNC: sync the object itself
		 * (once — flag is cleared below), then re-resolve the entry.
		 */
		local_object = VME_OBJECT(entry);
		local_start = entry->vme_start;
		local_offset = (vm_map_offset_t)VME_OFFSET(entry);

		vm_object_reference(local_object);
		vm_map_unlock_read(map);

		vm_object_lock_request(local_object,
		    ((vm_object_offset_t)
		    ((offset - local_start) +
		    local_offset)),
		    (vm_object_size_t)*upl_size,
		    FALSE,
		    MEMORY_OBJECT_DATA_SYNC,
		    VM_PROT_NO_CHANGE);

		force_data_sync = FALSE;
		vm_object_deallocate(local_object);

		goto REDISCOVER_ENTRY;
	}
	/* report the backing object's attributes to the caller */
	if (VME_OBJECT(entry)->private) {
		*flags = UPL_DEV_MEMORY;
	} else {
		*flags = 0;
	}

	if (VME_OBJECT(entry)->phys_contiguous) {
		*flags |= UPL_PHYS_CONTIG;
	}

	local_object = VME_OBJECT(entry);
	local_offset = (vm_map_offset_t)VME_OFFSET(entry);
	local_start = entry->vme_start;


	/*
	 * Wiring will copy the pages to the shadow object.
	 * The shadow object will not be code-signed so
	 * attempting to execute code from these copied pages
	 * would trigger a code-signing violation.
	 */
	if (entry->protection & VM_PROT_EXECUTE) {
#if MACH_ASSERT
		printf("pid %d[%s] create_upl out of executable range from "
		    "0x%llx to 0x%llx: side effects may include "
		    "code-signing violations later on\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
		DTRACE_VM2(cs_executable_create_upl,
		    uint64_t, (uint64_t)entry->vme_start,
		    uint64_t, (uint64_t)entry->vme_end);
		cs_executable_create_upl++;
	}

	vm_object_lock(local_object);

	/*
	 * Ensure that this object is "true_share" and "copy_delay" now,
	 * while we're still holding the VM map lock.  After we unlock the map,
	 * anything could happen to that mapping, including some copy-on-write
	 * activity.  We need to make sure that the IOPL will point at the
	 * same memory as the mapping.
	 */
	if (local_object->true_share) {
		assert(local_object->copy_strategy !=
		    MEMORY_OBJECT_COPY_SYMMETRIC);
	} else if (!is_kernel_object(local_object) &&
	    local_object != compressor_object &&
	    !local_object->phys_contiguous) {
#if VM_OBJECT_TRACKING_OP_TRUESHARE
		if (!local_object->true_share &&
		    vm_object_tracking_btlog) {
			btlog_record(vm_object_tracking_btlog, local_object,
			    VM_OBJECT_TRACKING_OP_TRUESHARE,
			    btref_get(__builtin_frame_address(0), 0));
		}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
		VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
		if (local_object->copy_strategy ==
		    MEMORY_OBJECT_COPY_SYMMETRIC) {
			local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
	}

	/* keep the object alive across the unlocked iopl request */
	vm_object_reference_locked(local_object);
	vm_object_unlock(local_object);

	vm_map_unlock_read(map);

	/* re-apply the sub-page offset saved for small-page maps */
	offset += offset_in_mapped_page;
	assert(*upl_size > offset_in_mapped_page);
	*upl_size -= offset_in_mapped_page;

	/* finally, build the UPL directly against the VM object */
	ret = vm_object_iopl_request(local_object,
	    ((vm_object_offset_t)
	    ((offset - local_start) + local_offset)),
	    *upl_size,
	    upl,
	    page_list,
	    count,
	    caller_flags,
	    tag);
	vm_object_deallocate(local_object);


done:
	/* drop the submap reference taken during descent, if any */
	if (release_map) {
		vm_map_deallocate(map);
	}

	return ret;
}
7166 
7167 /*
7168  * Internal routine to enter a UPL into a VM map.
7169  *
7170  * JMM - This should just be doable through the standard
7171  * vm_map_enter() API.
7172  */
7173 kern_return_t
7174 vm_map_enter_upl_range(
7175 	vm_map_t                map,
7176 	upl_t                   upl,
7177 	vm_object_offset_t      offset_to_map,
7178 	vm_size_t               size_to_map,
7179 	vm_prot_t               prot_to_map,
7180 	vm_map_offset_t         *dst_addr)
7181 {
7182 	vm_map_size_t           size;
7183 	vm_object_offset_t      offset;
7184 	vm_map_offset_t         addr;
7185 	vm_page_t               m;
7186 	kern_return_t           kr;
7187 	int                     isVectorUPL = 0, curr_upl = 0;
7188 	upl_t                   vector_upl = NULL;
7189 	mach_vm_offset_t        vector_upl_dst_addr = 0;
7190 	vm_map_t                vector_upl_submap = NULL;
7191 	upl_offset_t            subupl_offset = 0;
7192 	upl_size_t              subupl_size = 0;
7193 
7194 	if (upl == UPL_NULL) {
7195 		return KERN_INVALID_ARGUMENT;
7196 	}
7197 
7198 	DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7199 	assert(map == kernel_map);
7200 
7201 	if ((isVectorUPL = vector_upl_is_valid(upl))) {
7202 		int mapped = 0, valid_upls = 0;
7203 		vector_upl = upl;
7204 
7205 		upl_lock(vector_upl);
7206 		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
7207 			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7208 			if (upl == NULL) {
7209 				continue;
7210 			}
7211 			valid_upls++;
7212 			if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7213 				mapped++;
7214 			}
7215 		}
7216 
7217 		if (mapped) {
7218 			if (mapped != valid_upls) {
7219 				panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
7220 			} else {
7221 				upl_unlock(vector_upl);
7222 				return KERN_FAILURE;
7223 			}
7224 		}
7225 
7226 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7227 			panic("TODO4K: vector UPL not implemented");
7228 		}
7229 
7230 		vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
7231 		    vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
7232 		    VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
7233 		    VM_KERN_MEMORY_NONE).kmr_submap;
7234 		map = vector_upl_submap;
7235 		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7236 		curr_upl = 0;
7237 	} else {
7238 		upl_lock(upl);
7239 	}
7240 
7241 process_upl_to_enter:
7242 	if (isVectorUPL) {
7243 		if (curr_upl == vector_upl_max_upls(vector_upl)) {
7244 			*dst_addr = vector_upl_dst_addr;
7245 			upl_unlock(vector_upl);
7246 			return KERN_SUCCESS;
7247 		}
7248 		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7249 		if (upl == NULL) {
7250 			goto process_upl_to_enter;
7251 		}
7252 
7253 		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7254 		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7255 	} else {
7256 		/*
7257 		 * check to see if already mapped
7258 		 */
7259 		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7260 			upl_unlock(upl);
7261 			return KERN_FAILURE;
7262 		}
7263 	}
7264 
7265 	if ((!(upl->flags & UPL_SHADOWED)) &&
7266 	    ((upl->flags & UPL_HAS_BUSY) ||
7267 	    !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7268 		vm_object_t             object;
7269 		vm_page_t               alias_page;
7270 		vm_object_offset_t      new_offset;
7271 		unsigned int            pg_num;
7272 
7273 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7274 		object = upl->map_object;
7275 		upl->map_object = vm_object_allocate(
7276 			vm_object_round_page(size),
7277 			/* Provenance is copied from the object we're shadowing */
7278 			object->vmo_provenance);
7279 
7280 		vm_object_lock(upl->map_object);
7281 
7282 		upl->map_object->shadow = object;
7283 		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
7284 		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7285 		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7286 		upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7287 		assertf(page_aligned(upl->map_object->vo_shadow_offset),
7288 		    "object %p shadow_offset 0x%llx",
7289 		    upl->map_object,
7290 		    (uint64_t)upl->map_object->vo_shadow_offset);
7291 		upl->map_object->wimg_bits = object->wimg_bits;
7292 		offset = upl->map_object->vo_shadow_offset;
7293 		new_offset = 0;
7294 
7295 		upl->flags |= UPL_SHADOWED;
7296 
7297 		while (size) {
7298 			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7299 			assert(pg_num == new_offset / PAGE_SIZE);
7300 
7301 			if (bitmap_test(upl->lite_list, pg_num)) {
7302 				alias_page = vm_page_create_fictitious();
7303 
7304 				vm_object_lock(object);
7305 
7306 				m = vm_page_lookup(object, offset);
7307 				if (m == VM_PAGE_NULL) {
7308 					panic("vm_upl_map: page missing");
7309 				}
7310 
7311 				/*
7312 				 * Convert the fictitious page to a private
7313 				 * shadow of the real page.
7314 				 */
7315 				alias_page->vmp_free_when_done = TRUE;
7316 				/*
7317 				 * since m is a page in the upl it must
7318 				 * already be wired or BUSY, so it's
7319 				 * safe to assign the underlying physical
7320 				 * page to the alias
7321 				 */
7322 
7323 				vm_object_unlock(object);
7324 
7325 				vm_page_lockspin_queues();
7326 				vm_page_make_private(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7327 				vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7328 				vm_page_unlock_queues();
7329 
7330 				vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7331 
7332 				assert(!alias_page->vmp_wanted);
7333 				alias_page->vmp_busy = FALSE;
7334 				alias_page->vmp_absent = FALSE;
7335 			}
7336 			size -= PAGE_SIZE;
7337 			offset += PAGE_SIZE_64;
7338 			new_offset += PAGE_SIZE_64;
7339 		}
7340 		vm_object_unlock(upl->map_object);
7341 	}
7342 	if (upl->flags & UPL_SHADOWED) {
7343 		if (isVectorUPL) {
7344 			offset = 0;
7345 		} else {
7346 			offset = offset_to_map;
7347 		}
7348 	} else {
7349 		offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7350 		if (!isVectorUPL) {
7351 			offset += offset_to_map;
7352 		}
7353 	}
7354 
7355 	if (isVectorUPL) {
7356 		size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7357 	} else {
7358 		size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7359 	}
7360 
7361 	vm_object_reference(upl->map_object);
7362 
7363 	if (!isVectorUPL) {
7364 		*dst_addr = 0;
7365 		/*
7366 		 * NEED A UPL_MAP ALIAS
7367 		 */
7368 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7369 		    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7370 		    upl->map_object, offset, FALSE,
7371 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7372 
7373 		if (kr != KERN_SUCCESS) {
7374 			vm_object_deallocate(upl->map_object);
7375 			upl_unlock(upl);
7376 			return kr;
7377 		}
7378 	} else {
7379 		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7380 		    VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7381 		    upl->map_object, offset, FALSE,
7382 		    prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7383 		if (kr) {
7384 			panic("vm_map_enter failed for a Vector UPL");
7385 		}
7386 	}
7387 	upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7388 	                                        /* this will have to be an increment rather than */
7389 	                                        /* an assignment. */
7390 	vm_object_lock(upl->map_object);
7391 
7392 	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7393 		m = vm_page_lookup(upl->map_object, offset);
7394 
7395 		if (m) {
7396 			m->vmp_pmapped = TRUE;
7397 
7398 			/*
7399 			 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7400 			 * but only in kernel space. If this was on a user map,
7401 			 * we'd have to set the wpmapped bit.
7402 			 */
7403 			/* m->vmp_wpmapped = TRUE; */
7404 			assert(map->pmap == kernel_pmap);
7405 
7406 			kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, TRUE);
7407 
7408 			assert(kr == KERN_SUCCESS);
7409 #if KASAN
7410 			kasan_notify_address(addr, PAGE_SIZE_64);
7411 #endif
7412 		}
7413 		offset += PAGE_SIZE_64;
7414 	}
7415 	vm_object_unlock(upl->map_object);
7416 
7417 	/*
7418 	 * hold a reference for the mapping
7419 	 */
7420 	upl->ref_count++;
7421 	upl->flags |= UPL_PAGE_LIST_MAPPED;
7422 	upl->kaddr = (vm_offset_t) *dst_addr;
7423 	assert(upl->kaddr == *dst_addr);
7424 
7425 	if (isVectorUPL) {
7426 		goto process_upl_to_enter;
7427 	}
7428 
7429 	if (!isVectorUPL) {
7430 		vm_map_offset_t addr_adjustment;
7431 
7432 		addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7433 		if (addr_adjustment) {
7434 			assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7435 			DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7436 			*dst_addr += addr_adjustment;
7437 		}
7438 	}
7439 
7440 	upl_unlock(upl);
7441 
7442 	return KERN_SUCCESS;
7443 }
7444 
7445 kern_return_t
7446 vm_map_enter_upl(
7447 	vm_map_t                map,
7448 	upl_t                   upl,
7449 	vm_map_offset_t         *dst_addr)
7450 {
7451 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7452 	return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7453 }
7454 
7455 /*
7456  * Internal routine to remove a UPL mapping from a VM map.
7457  *
7458  * XXX - This should just be doable through a standard
7459  * vm_map_remove() operation.  Otherwise, implicit clean-up
7460  * of the target map won't be able to correctly remove
7461  * these (and release the reference on the UPL).  Having
7462  * to do this means we can't map these into user-space
7463  * maps yet.
7464  */
/*
 * Remove a UPL's kernel mapping from "map".
 *
 * For a plain UPL: drop the mapping reference, clear the mapped state,
 * and vm_map_remove() the address range that vm_map_enter_upl_range()
 * established (recorded in upl->kaddr / upl->u_mapped_size).
 *
 * For a vector UPL: clear the mapped state of every sub-UPL, then tear
 * down the whole submap in one shot rather than unmapping each element
 * individually.
 *
 * offset_to_unmap/size_to_unmap are currently unused: the entire
 * mapped extent is always removed.
 *
 * Returns KERN_INVALID_ARGUMENT for a NULL upl, KERN_FAILURE if the
 * UPL (or all sub-UPLs) is not mapped, KERN_SUCCESS otherwise.
 */
kern_return_t
vm_map_remove_upl_range(
	vm_map_t        map,
	upl_t           upl,
	__unused vm_object_offset_t    offset_to_unmap,
	__unused vm_size_t             size_to_unmap)
{
	vm_address_t    addr;
	upl_size_t      size;
	int             isVectorUPL = 0, curr_upl = 0;
	upl_t           vector_upl = NULL;

	if (upl == UPL_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if ((isVectorUPL = vector_upl_is_valid(upl))) {
		int     unmapped = 0, valid_upls = 0;
		vector_upl = upl;
		upl_lock(vector_upl);
		/*
		 * First pass: count the valid sub-UPLs and how many of
		 * them are NOT currently mapped.  The vector must be in a
		 * consistent state: either all sub-UPLs mapped or none.
		 */
		for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
			if (upl == NULL) {
				continue;
			}
			valid_upls++;
			if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
				unmapped++;
			}
		}

		if (unmapped) {
			if (unmapped != valid_upls) {
				/* partially mapped vector UPL: internal inconsistency */
				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
			} else {
				/* nothing is mapped; nothing to remove */
				upl_unlock(vector_upl);
				return KERN_FAILURE;
			}
		}
		curr_upl = 0;
	} else {
		upl_lock(upl);
	}

process_upl_to_remove:
	if (isVectorUPL) {
		if (curr_upl == vector_upl_max_upls(vector_upl)) {
			/*
			 * All sub-UPLs processed: free the submap that
			 * vm_map_enter_upl_range() allocated for the vector
			 * and release its reference.
			 */
			vm_map_t v_upl_submap;
			vm_offset_t v_upl_submap_dst_addr;
			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);

			kmem_free_guard(map, v_upl_submap_dst_addr,
			    vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
			vm_map_deallocate(v_upl_submap);
			upl_unlock(vector_upl);
			return KERN_SUCCESS;
		}

		/* advance to the next sub-UPL, skipping empty slots */
		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
		if (upl == NULL) {
			goto process_upl_to_remove;
		}
	}

	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
		addr = upl->kaddr;
		size = upl->u_mapped_size;

		assert(upl->ref_count > 1);
		upl->ref_count--;               /* removing mapping ref */

		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
		upl->kaddr = (vm_offset_t) 0;
		upl->u_mapped_size = 0;

		if (isVectorUPL) {
			/*
			 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyways, so no need to remove individual UPL
			 * element mappings from within the submap
			 */
			goto process_upl_to_remove;
		}

		/* drop the UPL lock before calling into vm_map_remove() */
		upl_unlock(upl);

		vm_map_remove(map,
		    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
		return KERN_SUCCESS;
	}
	/* plain UPL that was never mapped */
	upl_unlock(upl);

	return KERN_FAILURE;
}
7560 
7561 kern_return_t
7562 vm_map_remove_upl(
7563 	vm_map_t        map,
7564 	upl_t           upl)
7565 {
7566 	upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7567 	return vm_map_remove_upl_range(map, upl, 0, upl_size);
7568 }
7569 
/*
 * Finish populating an IOPL whose pages were created busy+absent
 * (i.e. an I/O-wire UPL taken with no-zero-fill): mark each such page
 * valid (absent cleared, dirty set), wire it, and update the object's
 * and the global wired-page accounting under the object lock.
 *
 * Only a plain, external (non-INTERNAL), non-shadowed, non-device,
 * non-access-blocked UPL_IO_WIRE UPL is supported; anything else panics.
 */
void
iopl_valid_data(
	upl_t    upl,
	vm_tag_t tag)
{
	vm_object_t     object;
	vm_offset_t     offset;
	vm_page_t       m, nxt_page = VM_PAGE_NULL;
	upl_size_t      size;
	int             wired_count = 0;

	/* sanity checks: reject UPL kinds this routine cannot handle */
	if (upl == NULL) {
		panic("iopl_valid_data: NULL upl");
	}
	if (vector_upl_is_valid(upl)) {
		panic("iopl_valid_data: vector upl");
	}
	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
		panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
	}

	object = upl->map_object;

	if (is_kernel_object(object) || object == compressor_object) {
		panic("iopl_valid_data: object == kernel or compressor");
	}

	/* volatile/empty purgeable accounting is not handled here */
	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		panic("iopl_valid_data: object %p purgable %d",
		    object, object->purgable);
	}

	size = upl_adjusted_size(upl, PAGE_MASK);

	vm_object_lock(object);
	VM_OBJECT_WIRED_PAGE_UPDATE_START(object);

	bool whole_object;

	/*
	 * Fast path: when the UPL covers the whole object and the object
	 * is fully resident, walk the object's page list directly instead
	 * of doing a hash lookup per page.
	 */
	if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
		nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
		whole_object = true;
	} else {
		offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
		whole_object = false;
	}

	while (size) {
		if (whole_object) {
			/* list walk: fetch current page, pre-advance to the next */
			if (nxt_page != VM_PAGE_NULL) {
				m = nxt_page;
				nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
			}
		} else {
			m = vm_page_lookup(object, offset);
			offset += PAGE_SIZE;

			if (m == VM_PAGE_NULL) {
				panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
			}
		}
		if (m->vmp_busy) {
			/*
			 * Only busy+absent pages (created by the no-zero-fill
			 * IOPL path) are expected; anything else indicates a
			 * page in an inconsistent state.
			 */
			if (!m->vmp_absent) {
				panic("iopl_valid_data: busy page w/o absent");
			}

			if (m->vmp_pageq.next || m->vmp_pageq.prev) {
				panic("iopl_valid_data: busy+absent page on page queue");
			}
			if (m->vmp_reusable) {
				panic("iopl_valid_data: %p is reusable", m);
			}

			/* page now has valid contents supplied by the I/O */
			m->vmp_absent = FALSE;
			m->vmp_dirty = TRUE;
			assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(m->vmp_wire_count == 0);
			m->vmp_wire_count++;
			assert(m->vmp_wire_count);
			if (m->vmp_wire_count == 1) {
				m->vmp_q_state = VM_PAGE_IS_WIRED;
				wired_count++;
			} else {
				panic("iopl_valid_data: %p already wired", m);
			}


			/* clear busy and wake any waiters */
			vm_page_wakeup_done(object, m);
		}
		size -= PAGE_SIZE;
	}
	if (wired_count) {
		/* batch-update per-object and global wired-page counters */
		VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
		assert(object->resident_page_count >= object->wired_page_count);

		/* no need to adjust purgeable accounting for this object: */
		assert(object->purgable != VM_PURGABLE_VOLATILE);
		assert(object->purgable != VM_PURGABLE_EMPTY);

		vm_page_lockspin_queues();
		vm_page_wire_count += wired_count;
		vm_page_unlock_queues();
	}
	VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
	vm_object_unlock(object);
}
7677 
7678 
7679 void
7680 vm_object_set_pmap_cache_attr(
7681 	vm_object_t             object,
7682 	upl_page_info_array_t   user_page_list,
7683 	unsigned int            num_pages,
7684 	boolean_t               batch_pmap_op)
7685 {
7686 	unsigned int    cache_attr = 0;
7687 
7688 	cache_attr = object->wimg_bits & VM_WIMG_MASK;
7689 	assert(user_page_list);
7690 	if (!HAS_DEFAULT_CACHEABILITY(cache_attr)) {
7691 		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7692 	}
7693 }
7694 
7695 
7696 static bool
7697 vm_object_iopl_wire_full(
7698 	vm_object_t             object,
7699 	upl_t                   upl,
7700 	upl_page_info_array_t   user_page_list,
7701 	upl_control_flags_t     cntrl_flags,
7702 	vm_tag_t                tag)
7703 {
7704 	vm_page_t       dst_page;
7705 	unsigned int    entry;
7706 	int             page_count;
7707 	int             delayed_unlock = 0;
7708 	boolean_t       retval = TRUE;
7709 	ppnum_t         phys_page;
7710 
7711 	vm_object_lock_assert_exclusive(object);
7712 	assert(object->purgable != VM_PURGABLE_VOLATILE);
7713 	assert(object->purgable != VM_PURGABLE_EMPTY);
7714 	assert(object->pager == NULL);
7715 	assert(object->vo_copy == NULL);
7716 	assert(object->shadow == NULL);
7717 
7718 	page_count = object->resident_page_count;
7719 	dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7720 
7721 	vm_page_lock_queues();
7722 
7723 	while (page_count--) {
7724 		if (dst_page->vmp_busy ||
7725 		    vm_page_is_fictitious(dst_page) ||
7726 		    dst_page->vmp_absent ||
7727 		    VMP_ERROR_GET(dst_page) ||
7728 		    dst_page->vmp_cleaning ||
7729 		    dst_page->vmp_restart ||
7730 		    dst_page->vmp_laundry) {
7731 			retval = FALSE;
7732 			goto done;
7733 		}
7734 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
7735 			retval = FALSE;
7736 			goto done;
7737 		}
7738 		dst_page->vmp_reference = TRUE;
7739 
7740 		vm_page_wire(dst_page, tag, FALSE);
7741 
7742 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7743 			SET_PAGE_DIRTY(dst_page, FALSE);
7744 		}
7745 		entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
7746 		assert(entry >= 0 && entry < object->resident_page_count);
7747 		bitmap_set(upl->lite_list, entry);
7748 
7749 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7750 
7751 		if (phys_page > upl->highest_page) {
7752 			upl->highest_page = phys_page;
7753 		}
7754 
7755 		if (user_page_list) {
7756 			user_page_list[entry].phys_addr = phys_page;
7757 			user_page_list[entry].absent    = dst_page->vmp_absent;
7758 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
7759 			user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
7760 			user_page_list[entry].precious  = dst_page->vmp_precious;
7761 			user_page_list[entry].device    = FALSE;
7762 			user_page_list[entry].speculative = FALSE;
7763 			user_page_list[entry].cs_validated = FALSE;
7764 			user_page_list[entry].cs_tainted = FALSE;
7765 			user_page_list[entry].cs_nx     = FALSE;
7766 			user_page_list[entry].needed    = FALSE;
7767 			user_page_list[entry].mark      = FALSE;
7768 		}
7769 		if (delayed_unlock++ > 256) {
7770 			delayed_unlock = 0;
7771 			lck_mtx_yield(&vm_page_queue_lock);
7772 
7773 			VM_CHECK_MEMORYSTATUS;
7774 		}
7775 		dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
7776 	}
7777 done:
7778 	vm_page_unlock_queues();
7779 
7780 	VM_CHECK_MEMORYSTATUS;
7781 
7782 	return retval;
7783 }
7784 
7785 
/*
 * Fast path for vm_object_iopl_request(): populate and wire pages for a
 * completely empty object (caller has verified resident_page_count == 0,
 * no pager, no copy, no shadow).
 *
 * Grabs "page_count" free pages, inserts them into the object starting
 * at *dst_offset, wires the ones that are not left absent (no-zero-fill
 * case), records them in the UPL's lite list and user_page_list, and
 * performs the deferred ledger/footprint accounting in one shot at the
 * end.  On return *dst_offset has been advanced past the pages inserted
 * and *page_grab_count holds the number of pages grabbed.
 *
 * Returns KERN_SUCCESS, or MACH_SEND_INTERRUPTED if the wait for free
 * pages was interrupted (only possible with UPL_SET_INTERRUPTIBLE).
 * Called and returns with the object locked exclusive.
 */
static kern_return_t
vm_object_iopl_wire_empty(
	vm_object_t             object,
	upl_t                   upl,
	upl_page_info_array_t   user_page_list,
	upl_control_flags_t     cntrl_flags,
	vm_tag_t                tag,
	vm_object_offset_t     *dst_offset,
	int                     page_count,
	int                    *page_grab_count)
{
	vm_page_t       dst_page;
	boolean_t       no_zero_fill = FALSE;
	int             interruptible;
	int             pages_wired = 0;
	int             pages_inserted = 0;
	int             entry = 0;
	uint64_t        delayed_ledger_update = 0;
	kern_return_t   ret = KERN_SUCCESS;
	int             grab_options;
	ppnum_t         phys_page;

	vm_object_lock_assert_exclusive(object);
	assert(object->purgable != VM_PURGABLE_VOLATILE);
	assert(object->purgable != VM_PURGABLE_EMPTY);
	assert(object->pager == NULL);
	assert(object->vo_copy == NULL);
	assert(object->shadow == NULL);

	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
		interruptible = THREAD_ABORTSAFE;
	} else {
		interruptible = THREAD_UNINT;
	}

	/* caller will supply page contents via I/O; skip zero-filling */
	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
		no_zero_fill = TRUE;
	}

	grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
	if (object->can_grab_secluded) {
		grab_options |= VM_PAGE_GRAB_SECLUDED;
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	while (page_count--) {
		/* block (interruptibly if requested) until a free page is available */
		while ((dst_page = vm_page_grab_options(grab_options))
		    == VM_PAGE_NULL) {
			OSAddAtomic(page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);

			if (vm_page_wait(interruptible) == FALSE) {
				/*
				 * interrupted case
				 */
				OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

				VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);

				ret = MACH_SEND_INTERRUPTED;
				goto done;
			}
			OSAddAtomic(-page_count, &vm_upl_wait_for_pages);

			VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
		}

		/* absent pages will be validated later via iopl_valid_data() */
		dst_page->vmp_absent = no_zero_fill;
		dst_page->vmp_reference = TRUE;

		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
			/* caller may write into these pages */
			SET_PAGE_DIRTY(dst_page, FALSE);
		}
		if (dst_page->vmp_absent == FALSE) {
			/* zero-filled page: wire it immediately */
			assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
			assert(dst_page->vmp_wire_count == 0);
			dst_page->vmp_wire_count++;
			dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
			assert(dst_page->vmp_wire_count);
			pages_wired++;


			vm_page_wakeup_done(object, dst_page);
		}
		pages_inserted++;

		/* defer ledger accounting; settled in one batch below */
		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);

		if (no_zero_fill == FALSE) {
			vm_page_zero_fill(
				dst_page
				);
		}

		bitmap_set(upl->lite_list, entry);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);

		/* track the highest physical page for bounce-buffer decisions */
		if (phys_page > upl->highest_page) {
			upl->highest_page = phys_page;
		}

		if (user_page_list) {
			user_page_list[entry].phys_addr = phys_page;
			user_page_list[entry].absent    = dst_page->vmp_absent;
			user_page_list[entry].dirty     = dst_page->vmp_dirty;
			user_page_list[entry].free_when_done    = FALSE;
			user_page_list[entry].precious  = FALSE;
			user_page_list[entry].device    = FALSE;
			user_page_list[entry].speculative = FALSE;
			user_page_list[entry].cs_validated = FALSE;
			user_page_list[entry].cs_tainted = FALSE;
			user_page_list[entry].cs_nx     = FALSE;
			user_page_list[entry].needed    = FALSE;
			user_page_list[entry].mark      = FALSE;
		}
		entry++;
		*dst_offset += PAGE_SIZE_64;
	}
done:
	if (pages_wired) {
		/* fold the per-page wire counts into the global counter once */
		vm_page_lockspin_queues();
		vm_page_wire_count += pages_wired;
		vm_page_unlock_queues();
	}
	if (pages_inserted) {
		if (object->internal) {
			OSAddAtomic(pages_inserted, &vm_page_internal_count);
		} else {
			OSAddAtomic(pages_inserted, &vm_page_external_count);
		}
	}
	if (delayed_ledger_update) {
		/* settle the ledger credits deferred by vm_page_insert_internal() */
		task_t          owner;
		int             ledger_idx_volatile;
		int             ledger_idx_nonvolatile;
		int             ledger_idx_volatile_compressed;
		int             ledger_idx_nonvolatile_compressed;
		int             ledger_idx_composite;
		int             ledger_idx_external_wired;
		boolean_t       do_footprint;

		owner = VM_OBJECT_OWNER(object);
		assert(owner);

		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &ledger_idx_composite,
		    &ledger_idx_external_wired,
		    &do_footprint);

		if (object->internal) {
			/* more non-volatile bytes */
			ledger_credit(owner->ledger,
			    ledger_idx_nonvolatile,
			    delayed_ledger_update);
			if (do_footprint) {
				/* more footprint */
				ledger_credit(owner->ledger,
				    task_ledgers.phys_footprint,
				    delayed_ledger_update);
			} else if (ledger_idx_composite != -1) {
				ledger_credit(owner->ledger,
				    ledger_idx_composite,
				    delayed_ledger_update);
			}
		} else {
			/* more external wired bytes */
			ledger_credit(owner->ledger,
			    ledger_idx_external_wired,
			    delayed_ledger_update);
			if (do_footprint) {
				/* more footprint */
				ledger_credit(owner->ledger,
				    task_ledgers.phys_footprint,
				    delayed_ledger_update);
			} else if (ledger_idx_composite != -1) {
				ledger_credit(owner->ledger,
				    ledger_idx_composite,
				    delayed_ledger_update);
			}
		}
	}

	assert(page_grab_count);
	*page_grab_count = pages_inserted;

	return ret;
}
7980 
7981 
7982 kern_return_t
7983 vm_object_iopl_request(
7984 	vm_object_t             object,
7985 	vm_object_offset_t      offset,
7986 	upl_size_t              size,
7987 	upl_t                   *upl_ptr,
7988 	upl_page_info_array_t   user_page_list,
7989 	unsigned int            *page_list_count,
7990 	upl_control_flags_t     cntrl_flags,
7991 	vm_tag_t                tag)
7992 {
7993 	vm_page_t               dst_page;
7994 	vm_object_offset_t      dst_offset;
7995 	upl_size_t              xfer_size;
7996 	upl_t                   upl = NULL;
7997 	unsigned int            entry;
7998 	int                     no_zero_fill = FALSE;
7999 	unsigned int            size_in_pages;
8000 	int                     page_grab_count = 0;
8001 	u_int32_t               psize;
8002 	kern_return_t           ret;
8003 	vm_prot_t               prot;
8004 	struct vm_object_fault_info fault_info = {};
8005 	struct  vm_page_delayed_work    dw_array;
8006 	struct  vm_page_delayed_work    *dwp, *dwp_start;
8007 	bool                    dwp_finish_ctx = TRUE;
8008 	int                     dw_count;
8009 	int                     dw_limit;
8010 	int                     dw_index;
8011 	boolean_t               caller_lookup;
8012 	int                     io_tracking_flag = 0;
8013 	int                     interruptible;
8014 	ppnum_t                 phys_page;
8015 
8016 	boolean_t               set_cache_attr_needed = FALSE;
8017 	boolean_t               free_wired_pages = FALSE;
8018 	boolean_t               fast_path_empty_req = FALSE;
8019 	boolean_t               fast_path_full_req = FALSE;
8020 
8021 	task_t                  task = current_task();
8022 
8023 	dwp_start = dwp = NULL;
8024 
8025 	vm_object_offset_t original_offset = offset;
8026 	upl_size_t original_size = size;
8027 
8028 //	DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8029 
8030 	size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8031 	offset = vm_object_trunc_page(offset);
8032 	if (size != original_size || offset != original_offset) {
8033 		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8034 	}
8035 
8036 	if (cntrl_flags & ~UPL_VALID_FLAGS) {
8037 		/*
8038 		 * For forward compatibility's sake,
8039 		 * reject any unknown flag.
8040 		 */
8041 		return KERN_INVALID_VALUE;
8042 	}
8043 	if (vm_lopage_needed == FALSE) {
8044 		cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8045 	}
8046 
8047 	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8048 		if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8049 			return KERN_INVALID_VALUE;
8050 		}
8051 
8052 		if (object->phys_contiguous) {
8053 			if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8054 				return KERN_INVALID_ADDRESS;
8055 			}
8056 
8057 			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8058 				return KERN_INVALID_ADDRESS;
8059 			}
8060 		}
8061 	}
8062 	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8063 		no_zero_fill = TRUE;
8064 	}
8065 
8066 	if (cntrl_flags & UPL_COPYOUT_FROM) {
8067 		prot = VM_PROT_READ;
8068 	} else {
8069 		prot = VM_PROT_READ | VM_PROT_WRITE;
8070 	}
8071 
8072 	if ((!object->internal) && (object->paging_offset != 0)) {
8073 		panic("vm_object_iopl_request: external object with non-zero paging offset");
8074 	}
8075 
8076 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8077 
8078 #if CONFIG_IOSCHED || UPL_DEBUG
8079 	if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
8080 		io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8081 	}
8082 #endif
8083 
8084 #if CONFIG_IOSCHED
8085 	if (object->io_tracking) {
8086 		/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8087 		if (!is_kernel_object(object)) {
8088 			io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8089 		}
8090 	}
8091 #endif
8092 
8093 	if (object->phys_contiguous) {
8094 		psize = PAGE_SIZE;
8095 	} else {
8096 		psize = size;
8097 
8098 		dw_count = 0;
8099 		dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8100 		dwp_start = vm_page_delayed_work_get_ctx();
8101 		if (dwp_start == NULL) {
8102 			dwp_start = &dw_array;
8103 			dw_limit = 1;
8104 			dwp_finish_ctx = FALSE;
8105 		}
8106 
8107 		dwp = dwp_start;
8108 	}
8109 
8110 	if (cntrl_flags & UPL_SET_INTERNAL) {
8111 		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8112 		user_page_list = size ? upl->page_list : NULL;
8113 	} else {
8114 		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8115 	}
8116 	if (user_page_list) {
8117 		user_page_list[0].device = FALSE;
8118 	}
8119 	*upl_ptr = upl;
8120 
8121 	if (cntrl_flags & UPL_NOZEROFILLIO) {
8122 		DTRACE_VM4(upl_nozerofillio,
8123 		    vm_object_t, object,
8124 		    vm_object_offset_t, offset,
8125 		    upl_size_t, size,
8126 		    upl_t, upl);
8127 	}
8128 
8129 	upl->map_object = object;
8130 	upl->u_offset = original_offset;
8131 	upl->u_size = original_size;
8132 
8133 	size_in_pages = size / PAGE_SIZE;
8134 
8135 	if (is_kernel_object(object) &&
8136 	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8137 		upl->flags |= UPL_KERNEL_OBJECT;
8138 #if UPL_DEBUG
8139 		vm_object_lock(object);
8140 #else
8141 		vm_object_lock_shared(object);
8142 #endif
8143 	} else {
8144 		vm_object_lock(object);
8145 		vm_object_activity_begin(object);
8146 	}
8147 	/*
8148 	 * paging in progress also protects the paging_offset
8149 	 */
8150 	upl->u_offset = original_offset + object->paging_offset;
8151 
8152 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8153 		/*
8154 		 * The user requested that access to the pages in this UPL
8155 		 * be blocked until the UPL is commited or aborted.
8156 		 */
8157 		upl->flags |= UPL_ACCESS_BLOCKED;
8158 	}
8159 
8160 #if CONFIG_IOSCHED || UPL_DEBUG
8161 	if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
8162 		vm_object_activity_begin(object);
8163 		queue_enter(&object->uplq, upl, upl_t, uplq);
8164 	}
8165 #endif
8166 
8167 	if (object->phys_contiguous) {
8168 		if (upl->flags & UPL_ACCESS_BLOCKED) {
8169 			assert(!object->blocked_access);
8170 			object->blocked_access = TRUE;
8171 		}
8172 
8173 		vm_object_unlock(object);
8174 
8175 		/*
8176 		 * don't need any shadow mappings for this one
8177 		 * since it is already I/O memory
8178 		 */
8179 		upl->flags |= UPL_DEVICE_MEMORY;
8180 
8181 		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
8182 
8183 		if (user_page_list) {
8184 			user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
8185 			user_page_list[0].device = TRUE;
8186 		}
8187 		if (page_list_count != NULL) {
8188 			if (upl->flags & UPL_INTERNAL) {
8189 				*page_list_count = 0;
8190 			} else {
8191 				*page_list_count = 1;
8192 			}
8193 		}
8194 
8195 		VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8196 		if (task != NULL) {
8197 			counter_add(&task->pages_grabbed_iopl, page_grab_count);
8198 		}
8199 		return KERN_SUCCESS;
8200 	}
8201 	if (!is_kernel_object(object) && object != compressor_object) {
8202 		/*
8203 		 * Protect user space from future COW operations
8204 		 */
8205 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8206 		if (!object->true_share &&
8207 		    vm_object_tracking_btlog) {
8208 			btlog_record(vm_object_tracking_btlog, object,
8209 			    VM_OBJECT_TRACKING_OP_TRUESHARE,
8210 			    btref_get(__builtin_frame_address(0), 0));
8211 		}
8212 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8213 
8214 		vm_object_lock_assert_exclusive(object);
8215 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
8216 
8217 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
8218 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8219 		}
8220 	}
8221 
8222 	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8223 	    object->vo_copy != VM_OBJECT_NULL) {
8224 		/*
8225 		 * Honor copy-on-write obligations
8226 		 *
8227 		 * The caller is gathering these pages and
8228 		 * might modify their contents.  We need to
8229 		 * make sure that the copy object has its own
8230 		 * private copies of these pages before we let
8231 		 * the caller modify them.
8232 		 *
8233 		 * NOTE: someone else could map the original object
8234 		 * after we've done this copy-on-write here, and they
8235 		 * could then see an inconsistent picture of the memory
8236 		 * while it's being modified via the UPL.  To prevent this,
8237 		 * we would have to block access to these pages until the
8238 		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
8239 		 * code path for that...
8240 		 */
8241 		vm_object_update(object,
8242 		    offset,
8243 		    size,
8244 		    NULL,
8245 		    NULL,
8246 		    FALSE,              /* should_return */
8247 		    MEMORY_OBJECT_COPY_SYNC,
8248 		    VM_PROT_NO_CHANGE);
8249 		VM_PAGEOUT_DEBUG(iopl_cow, 1);
8250 		VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
8251 	}
8252 	if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8253 	    object->purgable != VM_PURGABLE_VOLATILE &&
8254 	    object->purgable != VM_PURGABLE_EMPTY &&
8255 	    object->vo_copy == NULL &&
8256 	    size == object->vo_size &&
8257 	    offset == 0 &&
8258 	    object->shadow == NULL &&
8259 	    object->pager == NULL) {
8260 		if (object->resident_page_count == size_in_pages) {
8261 			assert(object != compressor_object);
8262 			assert(!is_kernel_object(object));
8263 			fast_path_full_req = TRUE;
8264 		} else if (object->resident_page_count == 0) {
8265 			assert(object != compressor_object);
8266 			assert(!is_kernel_object(object));
8267 			fast_path_empty_req = TRUE;
8268 			set_cache_attr_needed = TRUE;
8269 		}
8270 	}
8271 
8272 	if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8273 		interruptible = THREAD_ABORTSAFE;
8274 	} else {
8275 		interruptible = THREAD_UNINT;
8276 	}
8277 
8278 	entry = 0;
8279 
8280 	xfer_size = size;
8281 	dst_offset = offset;
8282 
8283 	if (fast_path_full_req) {
8284 		if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
8285 			goto finish;
8286 		}
8287 		/*
8288 		 * we couldn't complete the processing of this request on the fast path
8289 		 * so fall through to the slow path and finish up
8290 		 */
8291 	} else if (fast_path_empty_req) {
8292 		if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8293 			ret = KERN_MEMORY_ERROR;
8294 			goto return_err;
8295 		}
8296 		ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
8297 		    cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
8298 
8299 		if (ret) {
8300 			free_wired_pages = TRUE;
8301 			goto return_err;
8302 		}
8303 		goto finish;
8304 	}
8305 
8306 	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8307 	fault_info.lo_offset = offset;
8308 	fault_info.hi_offset = offset + xfer_size;
8309 	fault_info.mark_zf_absent = TRUE;
8310 	fault_info.interruptible = interruptible;
8311 	fault_info.batch_pmap_op = TRUE;
8312 
8313 	while (xfer_size) {
8314 		vm_fault_return_t       result;
8315 
8316 		dwp->dw_mask = 0;
8317 
8318 		if (fast_path_full_req) {
8319 			/*
8320 			 * if we get here, it means that we ran into a page
8321 			 * state we couldn't handle in the fast path and
8322 			 * bailed out to the slow path... since the order
8323 			 * we look at pages is different between the 2 paths,
8324 			 * the following check is needed to determine whether
8325 			 * this page was already processed in the fast path
8326 			 */
8327 			if (bitmap_test(upl->lite_list, entry)) {
8328 				goto skip_page;
8329 			}
8330 		}
8331 		dst_page = vm_page_lookup(object, dst_offset);
8332 
8333 		if (dst_page == VM_PAGE_NULL ||
8334 		    dst_page->vmp_busy ||
8335 		    VMP_ERROR_GET(dst_page) ||
8336 		    dst_page->vmp_restart ||
8337 		    dst_page->vmp_absent ||
8338 		    vm_page_is_fictitious(dst_page)) {
8339 			if (is_kernel_object(object)) {
8340 				panic("vm_object_iopl_request: missing/bad page in kernel object");
8341 			}
8342 			if (object == compressor_object) {
8343 				panic("vm_object_iopl_request: missing/bad page in compressor object");
8344 			}
8345 
8346 			if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8347 				ret = KERN_MEMORY_ERROR;
8348 				goto return_err;
8349 			}
8350 
8351 			if (dst_page != VM_PAGE_NULL &&
8352 			    dst_page->vmp_busy) {
8353 				wait_result_t wait_result;
8354 				vm_object_lock_assert_exclusive(object);
8355 				wait_result = vm_page_sleep(object, dst_page,
8356 				    interruptible, LCK_SLEEP_DEFAULT);
8357 				if (wait_result == THREAD_AWAKENED ||
8358 				    wait_result == THREAD_RESTART) {
8359 					continue;
8360 				}
8361 				ret = MACH_SEND_INTERRUPTED;
8362 				goto return_err;
8363 			}
8364 
8365 			set_cache_attr_needed = TRUE;
8366 
8367 			/*
8368 			 * We just looked up the page and the result remains valid
8369 			 * until the object lock is release, so send it to
8370 			 * vm_fault_page() (as "dst_page"), to avoid having to
8371 			 * look it up again there.
8372 			 */
8373 			caller_lookup = TRUE;
8374 
8375 			do {
8376 				vm_page_t       top_page;
8377 				kern_return_t   error_code;
8378 
8379 				fault_info.cluster_size = xfer_size;
8380 				vm_object_paging_begin(object);
8381 
8382 				result = vm_fault_page(object, dst_offset,
8383 				    prot | VM_PROT_WRITE, FALSE,
8384 				    caller_lookup,
8385 				    &prot, &dst_page, &top_page,
8386 				    (int *)0,
8387 				    &error_code, no_zero_fill,
8388 				    &fault_info);
8389 
8390 				/* our lookup is no longer valid at this point */
8391 				caller_lookup = FALSE;
8392 
8393 				switch (result) {
8394 				case VM_FAULT_SUCCESS:
8395 					page_grab_count++;
8396 
8397 					if (!dst_page->vmp_absent) {
8398 						vm_page_wakeup_done(object, dst_page);
8399 					} else {
8400 						/*
8401 						 * we only get back an absent page if we
8402 						 * requested that it not be zero-filled
8403 						 * because we are about to fill it via I/O
8404 						 *
8405 						 * absent pages should be left BUSY
8406 						 * to prevent them from being faulted
8407 						 * into an address space before we've
8408 						 * had a chance to complete the I/O on
8409 						 * them since they may contain info that
8410 						 * shouldn't be seen by the faulting task
8411 						 */
8412 					}
8413 					/*
8414 					 *	Release paging references and
8415 					 *	top-level placeholder page, if any.
8416 					 */
8417 					if (top_page != VM_PAGE_NULL) {
8418 						vm_object_t local_object;
8419 
8420 						local_object = VM_PAGE_OBJECT(top_page);
8421 
8422 						/*
8423 						 * comparing 2 packed pointers
8424 						 */
8425 						if (top_page->vmp_object != dst_page->vmp_object) {
8426 							vm_object_lock(local_object);
8427 							VM_PAGE_FREE(top_page);
8428 							vm_object_paging_end(local_object);
8429 							vm_object_unlock(local_object);
8430 						} else {
8431 							VM_PAGE_FREE(top_page);
8432 							vm_object_paging_end(local_object);
8433 						}
8434 					}
8435 					vm_object_paging_end(object);
8436 					break;
8437 
8438 				case VM_FAULT_RETRY:
8439 					vm_object_lock(object);
8440 					break;
8441 
8442 				case VM_FAULT_MEMORY_SHORTAGE:
8443 					OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8444 
8445 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8446 
8447 					if (vm_page_wait(interruptible)) {
8448 						OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8449 
8450 						VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8451 						vm_object_lock(object);
8452 
8453 						break;
8454 					}
8455 					OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8456 
8457 					VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8458 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
8459 					OS_FALLTHROUGH;
8460 
8461 				case VM_FAULT_INTERRUPTED:
8462 					error_code = MACH_SEND_INTERRUPTED;
8463 					OS_FALLTHROUGH;
8464 				case VM_FAULT_MEMORY_ERROR:
8465 memory_error:
8466 					ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8467 
8468 					vm_object_lock(object);
8469 					goto return_err;
8470 
8471 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
8472 					/* success but no page: fail */
8473 					vm_object_paging_end(object);
8474 					vm_object_unlock(object);
8475 					goto memory_error;
8476 
8477 				default:
8478 					panic("vm_object_iopl_request: unexpected error"
8479 					    " 0x%x from vm_fault_page()\n", result);
8480 				}
8481 			} while (result != VM_FAULT_SUCCESS);
8482 		}
8483 		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8484 
8485 		if (upl->flags & UPL_KERNEL_OBJECT) {
8486 			goto record_phys_addr;
8487 		}
8488 
8489 		if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8490 			dst_page->vmp_busy = TRUE;
8491 			goto record_phys_addr;
8492 		}
8493 
8494 		if (dst_page->vmp_cleaning) {
8495 			/*
8496 			 * Someone else is cleaning this page in place.
8497 			 * In theory, we should be able to  proceed and use this
8498 			 * page but they'll probably end up clearing the "busy"
8499 			 * bit on it in upl_commit_range() but they didn't set
8500 			 * it, so they would clear our "busy" bit and open
8501 			 * us to race conditions.
8502 			 * We'd better wait for the cleaning to complete and
8503 			 * then try again.
8504 			 */
8505 			VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
8506 			vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
8507 			continue;
8508 		}
8509 		if (dst_page->vmp_laundry) {
8510 			vm_pageout_steal_laundry(dst_page, FALSE);
8511 		}
8512 
8513 		if (
8514 			((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8515 			phys_page >= (max_valid_dma_address >> PAGE_SHIFT))) {
8516 			vm_page_t       new_page;
8517 			int             refmod;
8518 
8519 			/*
8520 			 * support devices that can't DMA above 32 bits
8521 			 * by substituting pages from a pool of low address
8522 			 * memory for any pages we find above the 4G mark
8523 			 * can't substitute if the page is already wired because
8524 			 * we don't know whether that physical address has been
8525 			 * handed out to some other 64 bit capable DMA device to use
8526 			 */
8527 			if (VM_PAGE_WIRED(dst_page)) {
8528 				ret = KERN_PROTECTION_FAILURE;
8529 				goto return_err;
8530 			}
8531 
8532 			{
8533 				new_page = vm_page_grablo();
8534 			}
8535 
8536 			if (new_page == VM_PAGE_NULL) {
8537 				ret = KERN_RESOURCE_SHORTAGE;
8538 				goto return_err;
8539 			}
8540 			/*
8541 			 * from here until the vm_page_replace completes
8542 			 * we musn't drop the object lock... we don't
8543 			 * want anyone refaulting this page in and using
8544 			 * it after we disconnect it... we want the fault
8545 			 * to find the new page being substituted.
8546 			 */
8547 			if (dst_page->vmp_pmapped) {
8548 				refmod = pmap_disconnect(phys_page);
8549 			} else {
8550 				refmod = 0;
8551 			}
8552 
8553 			if (!dst_page->vmp_absent) {
8554 				vm_page_copy(dst_page, new_page);
8555 			}
8556 
8557 			new_page->vmp_reference = dst_page->vmp_reference;
8558 			new_page->vmp_dirty     = dst_page->vmp_dirty;
8559 			new_page->vmp_absent    = dst_page->vmp_absent;
8560 
8561 			if (refmod & VM_MEM_REFERENCED) {
8562 				new_page->vmp_reference = TRUE;
8563 			}
8564 			if (refmod & VM_MEM_MODIFIED) {
8565 				SET_PAGE_DIRTY(new_page, FALSE);
8566 			}
8567 
8568 			vm_page_replace(new_page, object, dst_offset);
8569 
8570 			dst_page = new_page;
8571 			/*
8572 			 * vm_page_grablo returned the page marked
8573 			 * BUSY... we don't need a PAGE_WAKEUP_DONE
8574 			 * here, because we've never dropped the object lock
8575 			 */
8576 			if (!dst_page->vmp_absent) {
8577 				dst_page->vmp_busy = FALSE;
8578 			}
8579 
8580 			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8581 		}
8582 		if (!dst_page->vmp_busy) {
8583 			dwp->dw_mask |= DW_vm_page_wire;
8584 		}
8585 
8586 		if (cntrl_flags & UPL_BLOCK_ACCESS) {
8587 			/*
8588 			 * Mark the page "busy" to block any future page fault
8589 			 * on this page in addition to wiring it.
8590 			 * We'll also remove the mapping
8591 			 * of all these pages before leaving this routine.
8592 			 */
8593 			assert(!vm_page_is_fictitious(dst_page));
8594 			dst_page->vmp_busy = TRUE;
8595 		}
8596 		/*
8597 		 * expect the page to be used
8598 		 * page queues lock must be held to set 'reference'
8599 		 */
8600 		dwp->dw_mask |= DW_set_reference;
8601 
8602 		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8603 			SET_PAGE_DIRTY(dst_page, TRUE);
8604 			/*
8605 			 * Page belonging to a code-signed object is about to
8606 			 * be written. Mark it tainted and disconnect it from
8607 			 * all pmaps so processes have to fault it back in and
8608 			 * deal with the tainted bit.
8609 			 */
8610 			if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
8611 				dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
8612 				vm_page_iopl_tainted++;
8613 				if (dst_page->vmp_pmapped) {
8614 					int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
8615 					if (refmod & VM_MEM_REFERENCED) {
8616 						dst_page->vmp_reference = TRUE;
8617 					}
8618 				}
8619 			}
8620 		}
8621 		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8622 			pmap_sync_page_attributes_phys(phys_page);
8623 			dst_page->vmp_written_by_kernel = FALSE;
8624 		}
8625 
8626 record_phys_addr:
8627 		if (dst_page->vmp_busy) {
8628 			upl->flags |= UPL_HAS_BUSY;
8629 		}
8630 
8631 		bitmap_set(upl->lite_list, entry);
8632 
8633 		if (phys_page > upl->highest_page) {
8634 			upl->highest_page = phys_page;
8635 		}
8636 
8637 		if (user_page_list) {
8638 			user_page_list[entry].phys_addr = phys_page;
8639 			user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
8640 			user_page_list[entry].absent    = dst_page->vmp_absent;
8641 			user_page_list[entry].dirty     = dst_page->vmp_dirty;
8642 			user_page_list[entry].precious  = dst_page->vmp_precious;
8643 			user_page_list[entry].device    = FALSE;
8644 			user_page_list[entry].needed    = FALSE;
8645 			if (dst_page->vmp_clustered == TRUE) {
8646 				user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8647 			} else {
8648 				user_page_list[entry].speculative = FALSE;
8649 			}
8650 			user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
8651 			user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
8652 			user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
8653 			user_page_list[entry].mark      = FALSE;
8654 		}
8655 		if (!is_kernel_object(object) && object != compressor_object) {
8656 			/*
8657 			 * someone is explicitly grabbing this page...
8658 			 * update clustered and speculative state
8659 			 *
8660 			 */
8661 			if (dst_page->vmp_clustered) {
8662 				VM_PAGE_CONSUME_CLUSTERED(dst_page);
8663 			}
8664 		}
8665 skip_page:
8666 		entry++;
8667 		dst_offset += PAGE_SIZE_64;
8668 		xfer_size -= PAGE_SIZE;
8669 
8670 		if (dwp->dw_mask) {
8671 			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8672 
8673 			if (dw_count >= dw_limit) {
8674 				vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8675 
8676 				dwp = dwp_start;
8677 				dw_count = 0;
8678 			}
8679 		}
8680 	}
8681 	assert(entry == size_in_pages);
8682 
8683 	if (dw_count) {
8684 		vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
8685 		dwp = dwp_start;
8686 		dw_count = 0;
8687 	}
8688 finish:
8689 	if (user_page_list && set_cache_attr_needed == TRUE) {
8690 		vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8691 	}
8692 
8693 	if (page_list_count != NULL) {
8694 		if (upl->flags & UPL_INTERNAL) {
8695 			*page_list_count = 0;
8696 		} else if (*page_list_count > size_in_pages) {
8697 			*page_list_count = size_in_pages;
8698 		}
8699 	}
8700 	vm_object_unlock(object);
8701 
8702 	if (cntrl_flags & UPL_BLOCK_ACCESS) {
8703 		/*
8704 		 * We've marked all the pages "busy" so that future
8705 		 * page faults will block.
8706 		 * Now remove the mapping for these pages, so that they
8707 		 * can't be accessed without causing a page fault.
8708 		 */
8709 		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8710 		    PMAP_NULL,
8711 		    PAGE_SIZE,
8712 		    0, VM_PROT_NONE);
8713 		assert(!object->blocked_access);
8714 		object->blocked_access = TRUE;
8715 	}
8716 
8717 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
8718 	if (task != NULL) {
8719 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8720 	}
8721 
8722 	if (dwp_start && dwp_finish_ctx) {
8723 		vm_page_delayed_work_finish_ctx(dwp_start);
8724 		dwp_start = dwp = NULL;
8725 	}
8726 
8727 	return KERN_SUCCESS;
8728 
8729 return_err:
8730 	dw_index = 0;
8731 
8732 	for (; offset < dst_offset; offset += PAGE_SIZE) {
8733 		boolean_t need_unwire;
8734 		bool need_wakeup;
8735 
8736 		dst_page = vm_page_lookup(object, offset);
8737 
8738 		if (dst_page == VM_PAGE_NULL) {
8739 			panic("vm_object_iopl_request: Wired page missing.");
8740 		}
8741 
8742 		/*
8743 		 * if we've already processed this page in an earlier
8744 		 * dw_do_work, we need to undo the wiring... we will
8745 		 * leave the dirty and reference bits on if they
8746 		 * were set, since we don't have a good way of knowing
8747 		 * what the previous state was and we won't get here
8748 		 * under any normal circumstances...  we will always
8749 		 * clear BUSY and wakeup any waiters via vm_page_free
8750 		 * or PAGE_WAKEUP_DONE
8751 		 */
8752 		need_unwire = TRUE;
8753 
8754 		need_wakeup = false;
8755 		if (dw_count) {
8756 			if ((dwp_start)[dw_index].dw_m == dst_page) {
8757 				/*
8758 				 * still in the deferred work list
8759 				 * which means we haven't yet called
8760 				 * vm_page_wire on this page
8761 				 */
8762 				need_unwire = FALSE;
8763 
8764 				if (dst_page->vmp_busy &&
8765 				    ((dwp_start)[dw_index].dw_mask & DW_clear_busy)) {
8766 					/*
8767 					 * It's our own "busy" bit, so we need to clear it
8768 					 * now and wake up waiters below.
8769 					 */
8770 					dst_page->vmp_busy = false;
8771 					need_wakeup = true;
8772 				}
8773 
8774 				dw_index++;
8775 				dw_count--;
8776 			}
8777 		}
8778 		vm_page_lock_queues();
8779 
8780 		if (dst_page->vmp_absent || free_wired_pages == TRUE) {
8781 			vm_page_free(dst_page);
8782 
8783 			need_unwire = FALSE;
8784 		} else {
8785 			if (need_unwire == TRUE) {
8786 				vm_page_unwire(dst_page, TRUE);
8787 			}
8788 			if (dst_page->vmp_busy) {
8789 				/* not our "busy" or we would have cleared it above */
8790 				assert(!need_wakeup);
8791 			}
8792 			if (need_wakeup) {
8793 				assert(!dst_page->vmp_busy);
8794 				vm_page_wakeup(object, dst_page);
8795 			}
8796 		}
8797 		vm_page_unlock_queues();
8798 
8799 		if (need_unwire == TRUE) {
8800 			counter_inc(&vm_statistics_reactivations);
8801 		}
8802 	}
8803 #if UPL_DEBUG
8804 	upl->upl_state = 2;
8805 #endif
8806 	if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8807 		vm_object_activity_end(object);
8808 		vm_object_collapse(object, 0, TRUE);
8809 	}
8810 	vm_object_unlock(object);
8811 	upl_destroy(upl);
8812 
8813 	VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
8814 	if (task != NULL) {
8815 		counter_add(&task->pages_grabbed_iopl, page_grab_count);
8816 	}
8817 
8818 	if (dwp_start && dwp_finish_ctx) {
8819 		vm_page_delayed_work_finish_ctx(dwp_start);
8820 		dwp_start = dwp = NULL;
8821 	}
8822 	return ret;
8823 }
8824 
/*
 * upl_transpose:
 *	Swap the VM objects backing two UPLs, so that each UPL ends up
 *	pointing at the object that now holds the pages it refers to.
 *	Both UPLs must cover their entire objects (offset 0, equal
 *	sizes) and must not be vector UPLs.
 *
 *	Returns KERN_INVALID_ARGUMENT for NULL/identical/vector UPLs,
 *	KERN_INVALID_VALUE if the UPLs don't cover whole objects, or
 *	the result of vm_object_transpose() otherwise.
 */
kern_return_t
upl_transpose(
	upl_t           upl1,
	upl_t           upl2)
{
	kern_return_t           retval;
	boolean_t               upls_locked;
	vm_object_t             object1, object2;

	/* LD: Should mapped UPLs be eligible for a transpose? */
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
		return KERN_INVALID_ARGUMENT;
	}

	upls_locked = FALSE;

	/*
	 * Since we need to lock both UPLs at the same time,
	 * avoid deadlocks by always taking locks in the same order.
	 */
	if (upl1 < upl2) {
		upl_lock(upl1);
		upl_lock(upl2);
	} else {
		upl_lock(upl2);
		upl_lock(upl1);
	}
	upls_locked = TRUE;     /* the UPLs will need to be unlocked */

	object1 = upl1->map_object;
	object2 = upl2->map_object;

	if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
	    upl1->u_size != upl2->u_size) {
		/*
		 * We deal only with full objects, not subsets.
		 * That's because we exchange the entire backing store info
		 * for the objects: pager, resident pages, etc...  We can't do
		 * only part of it.
		 */
		retval = KERN_INVALID_VALUE;
		goto done;
	}

	/*
	 * Transpose the VM objects' backing store.
	 */
	retval = vm_object_transpose(object1, object2,
	    upl_adjusted_size(upl1, PAGE_MASK));

	if (retval == KERN_SUCCESS) {
		/*
		 * Make each UPL point to the correct VM object, i.e. the
		 * object holding the pages that the UPL refers to...
		 */
#if CONFIG_IOSCHED || UPL_DEBUG
		/*
		 * Move each tracked UPL from its old object's UPL queue to
		 * its new object's queue.
		 * NOTE(review): the object locks are taken only when a UPL
		 * has UPL_TRACKED_BY_OBJECT set, but the queue operations
		 * also run when upl_debug_enabled is set -- confirm that
		 * combination cannot occur without the flag being set.
		 */
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_lock(object1);
			vm_object_lock(object2);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object1->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_remove(&object2->uplq, upl2, upl_t, uplq);
		}
#endif
		upl1->map_object = object2;
		upl2->map_object = object1;

#if CONFIG_IOSCHED || UPL_DEBUG
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object2->uplq, upl1, upl_t, uplq);
		}
		if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
			queue_enter(&object1->uplq, upl2, upl_t, uplq);
		}
		if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
			vm_object_unlock(object2);
			vm_object_unlock(object1);
		}
#endif
	}

done:
	/*
	 * Cleanup.
	 */
	if (upls_locked) {
		upl_unlock(upl1);
		upl_unlock(upl2);
		upls_locked = FALSE;
	}

	return retval;
}
8921 
8922 void
8923 upl_range_needed(
8924 	upl_t           upl,
8925 	int             index,
8926 	int             count)
8927 {
8928 	int             size_in_pages;
8929 
8930 	if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
8931 		return;
8932 	}
8933 
8934 	size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8935 
8936 	while (count-- && index < size_in_pages) {
8937 		upl->page_list[index++].needed = TRUE;
8938 	}
8939 }
8940 
8941 
8942 /*
8943  * Reserve of virtual addresses in the kernel address space.
8944  * We need to map the physical pages in the kernel, so that we
8945  * can call the code-signing or slide routines with a kernel
8946  * virtual address.  We keep this pool of pre-allocated kernel
8947  * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to work with
8949  * a physical page.
8950  */
/* protects the vm_paging_page_inuse[] pool state and waiter counts below */
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
#define VM_PAGING_NUM_PAGES     64
/* base of the pre-allocated kernel virtual address range (set once at startup) */
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
bool            vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; /* per-slot "in use" flags */
int             vm_paging_max_index = 0;        /* highest pool slot handed out so far */
int             vm_paging_page_waiter = 0;      /* threads currently waiting for a free slot */
int             vm_paging_page_waiter_total = 0; /* cumulative wait count, for statistics */

/* statistics: pool usage vs. the slow vm_map_enter() mapping path */
unsigned long   vm_paging_no_kernel_page = 0;
unsigned long   vm_paging_objects_mapped = 0;
unsigned long   vm_paging_pages_mapped = 0;
unsigned long   vm_paging_objects_mapped_slow = 0;
unsigned long   vm_paging_pages_mapped_slow = 0;
8964 
__startup_func
static void
vm_paging_map_init(void)
{
	/*
	 * Reserve VM_PAGING_NUM_PAGES pages' worth of pageable
	 * (KMA_PAGEABLE) kernel virtual address space for the vm_paging
	 * pool.  KMA_NOFAIL means the allocation cannot fail, so no
	 * error check is needed here.
	 */
	kmem_alloc(kernel_map, &vm_paging_base_address,
	    ptoa(VM_PAGING_NUM_PAGES),
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
	    VM_KERN_MEMORY_NONE);
}
8974 STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
8975 
8976 /*
8977  * vm_paging_map_object:
8978  *	Maps part of a VM object's pages in the kernel
8979  *      virtual address space, using the pre-allocated
8980  *	kernel virtual addresses, if possible.
8981  * Context:
8982  *      The VM object is locked.  This lock will get
8983  *      dropped and re-acquired though, so the caller
8984  *      must make sure the VM object is kept alive
8985  *	(by holding a VM map that has a reference
8986  *      on it, for example, or taking an extra reference).
8987  *      The page should also be kept busy to prevent
8988  *	it from being reclaimed.
8989  */
kern_return_t
vm_paging_map_object(
	vm_page_t               page,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection,
	boolean_t               can_unlock_object,
	vm_map_size_t           *size,          /* IN/OUT */
	vm_map_offset_t         *address,       /* OUT */
	boolean_t               *need_unmap)    /* OUT */
{
	kern_return_t           kr;
	vm_map_offset_t         page_map_offset;
	vm_map_size_t           map_size;
	vm_object_offset_t      object_offset;
	int                     i;

	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
		/* use permanent 1-to-1 kernel mapping of physical memory ? */
		*address = (vm_map_offset_t)
		    phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
		*need_unmap = FALSE;
		return KERN_SUCCESS;

		/*
		 * NOTE: the "return" above makes the remainder of this
		 * block unreachable.  It is the legacy path that mapped
		 * the page at one of the pre-allocated kernel virtual
		 * addresses from the vm_paging pool, retained for
		 * reference.
		 */
		assert(page->vmp_busy);
		/*
		 * Use one of the pre-allocated kernel virtual addresses
		 * and just enter the VM page in the kernel address space
		 * at that virtual address.
		 */
		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);

		/*
		 * Try and find an available kernel virtual address
		 * from our pre-allocated pool.
		 */
		page_map_offset = 0;
		for (;;) {
			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
				if (vm_paging_page_inuse[i] == FALSE) {
					page_map_offset =
					    vm_paging_base_address +
					    (i * PAGE_SIZE);
					break;
				}
			}
			if (page_map_offset != 0) {
				/* found a space to map our page ! */
				break;
			}

			if (can_unlock_object) {
				/*
				 * If we can afford to unlock the VM object,
				 * let's take the slow path now...
				 */
				break;
			}
			/*
			 * We can't afford to unlock the VM object, so
			 * let's wait for a space to become available...
			 */
			vm_paging_page_waiter_total++;
			vm_paging_page_waiter++;
			kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
			if (kr == THREAD_WAITING) {
				simple_unlock(&vm_paging_lock);
				kr = thread_block(THREAD_CONTINUE_NULL);
				simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
			}
			vm_paging_page_waiter--;
			/* ... and try again */
		}

		if (page_map_offset != 0) {
			/*
			 * We found a kernel virtual address;
			 * map the physical page to that virtual address.
			 */
			if (i > vm_paging_max_index) {
				vm_paging_max_index = i;
			}
			vm_paging_page_inuse[i] = TRUE;
			simple_unlock(&vm_paging_lock);

			page->vmp_pmapped = TRUE;

			/*
			 * Keep the VM object locked over the PMAP_ENTER
			 * and the actual use of the page by the kernel,
			 * or this pmap mapping might get undone by a
			 * vm_object_pmap_protect() call...
			 */
			kr = pmap_enter_check(kernel_pmap,
			    page_map_offset,
			    page,
			    protection,
			    VM_PROT_NONE,
			    TRUE);
			assert(kr == KERN_SUCCESS);
			vm_paging_objects_mapped++;
			vm_paging_pages_mapped++;
			*address = page_map_offset;
			*need_unmap = TRUE;

#if KASAN
			kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif

			/* all done and mapped, ready to use ! */
			return KERN_SUCCESS;
		}

		/*
		 * We ran out of pre-allocated kernel virtual
		 * addresses.  Just map the page in the kernel
		 * the slow and regular way.
		 */
		vm_paging_no_kernel_page++;
		simple_unlock(&vm_paging_lock);
	}

	/* slow path: requires dropping the object lock around vm_map_enter() */
	if (!can_unlock_object) {
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		return KERN_NOT_SUPPORTED;
	}

	object_offset = vm_object_trunc_page(offset);
	map_size = vm_map_round_page(*size,
	    VM_MAP_PAGE_MASK(kernel_map));

	/*
	 * Try and map the required range of the object
	 * in the kernel_map. Given that allocation is
	 * for pageable memory, it shouldn't contain
	 * pointers and is mapped into the data range.
	 */

	vm_object_reference_locked(object);     /* for the map entry */
	vm_object_unlock(object);

	kr = vm_map_enter(kernel_map,
	    address,
	    map_size,
	    0,
	    VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
	    object,
	    object_offset,
	    FALSE,
	    protection,
	    VM_PROT_ALL,
	    VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		/* report failure with all OUT parameters zeroed */
		*address = 0;
		*size = 0;
		*need_unmap = FALSE;
		vm_object_deallocate(object);   /* for the map entry */
		vm_object_lock(object);
		return kr;
	}

	*size = map_size;

	/*
	 * Enter the mapped pages in the page table now.
	 */
	vm_object_lock(object);
	/*
	 * VM object must be kept locked from before PMAP_ENTER()
	 * until after the kernel is done accessing the page(s).
	 * Otherwise, the pmap mappings in the kernel could be
	 * undone by a call to vm_object_pmap_protect().
	 */

	for (page_map_offset = 0;
	    map_size != 0;
	    map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
		page = vm_page_lookup(object, offset + page_map_offset);
		if (page == VM_PAGE_NULL) {
			/* a page went missing: undo the whole mapping and fail */
			printf("vm_paging_map_object: no page !?");
			vm_object_unlock(object);
			vm_map_remove(kernel_map, *address, *size);
			*address = 0;
			*size = 0;
			*need_unmap = FALSE;
			vm_object_lock(object);
			return KERN_MEMORY_ERROR;
		}
		page->vmp_pmapped = TRUE;

		kr = pmap_enter_check(kernel_pmap,
		    *address + page_map_offset,
		    page,
		    protection,
		    VM_PROT_NONE,
		    TRUE);
		assert(kr == KERN_SUCCESS);
#if KASAN
		kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
	}

	vm_paging_objects_mapped_slow++;
	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);

	*need_unmap = TRUE;

	return KERN_SUCCESS;
}
9201 
9202 /*
9203  * vm_paging_unmap_object:
9204  *	Unmaps part of a VM object's pages from the kernel
9205  *      virtual address space.
9206  * Context:
9207  *      The VM object is locked.  This lock will get
9208  *      dropped and re-acquired though.
9209  */
void
vm_paging_unmap_object(
	vm_object_t     object,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	int             i;

	/*
	 * [start, end) was either carved from the pre-allocated pool of
	 * VM_PAGING_NUM_PAGES kernel VA pages starting at
	 * vm_paging_base_address, or mapped via the slow path.
	 * Distinguish the two cases by address range.
	 */
	if ((vm_paging_base_address == 0) ||
	    (start < vm_paging_base_address) ||
	    (end > (vm_paging_base_address
	    + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
		/*
		 * We didn't use our pre-allocated pool of
		 * kernel virtual address.  Deallocate the
		 * virtual memory.
		 */
		if (object != VM_OBJECT_NULL) {
			/*
			 * Drop the object lock while calling into the map
			 * layer; the function contract (see header comment)
			 * warns callers that the lock is dropped and
			 * re-acquired here.
			 */
			vm_object_unlock(object);
		}
		vm_map_remove(kernel_map, start, end);
		if (object != VM_OBJECT_NULL) {
			vm_object_lock(object);
		}
	} else {
		/*
		 * We used a kernel virtual address from our
		 * pre-allocated pool.  Put it back in the pool
		 * for next time.
		 */
		assert(end - start == PAGE_SIZE);
		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);

		/* undo the pmap mapping */
		pmap_remove(kernel_pmap, start, end);

		simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
		vm_paging_page_inuse[i] = FALSE;
		/* wake anyone stalled waiting for a free pool slot */
		if (vm_paging_page_waiter) {
			thread_wakeup(&vm_paging_page_waiter);
		}
		simple_unlock(&vm_paging_lock);
	}
}
9255 
9256 
9257 /*
9258  * page->vmp_object must be locked
9259  */
9260 void
9261 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9262 {
9263 	if (!queues_locked) {
9264 		vm_page_lockspin_queues();
9265 	}
9266 
9267 	page->vmp_free_when_done = FALSE;
9268 	/*
9269 	 * need to drop the laundry count...
9270 	 * we may also need to remove it
9271 	 * from the I/O paging queue...
9272 	 * vm_pageout_throttle_up handles both cases
9273 	 *
9274 	 * the laundry and pageout_queue flags are cleared...
9275 	 */
9276 	vm_pageout_throttle_up(page);
9277 
9278 	if (!queues_locked) {
9279 		vm_page_unlock_queues();
9280 	}
9281 }
9282 
9283 #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
9284 
9285 upl_t
9286 vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
9287 {
9288 	int i = 0;
9289 	upl_t   upl;
9290 
9291 	assert(max_upls > 0);
9292 	if (max_upls == 0) {
9293 		return NULL;
9294 	}
9295 
9296 	if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
9297 		max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
9298 	}
9299 	vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
9300 
9301 	upl = upl_create(0, UPL_VECTOR, 0);
9302 	upl->vector_upl = vector_upl;
9303 	upl->u_offset = upl_offset;
9304 	vector_upl->size = 0;
9305 	vector_upl->offset = upl_offset;
9306 	vector_upl->invalid_upls = 0;
9307 	vector_upl->num_upls = 0;
9308 	vector_upl->pagelist = NULL;
9309 	vector_upl->max_upls = max_upls;
9310 
9311 	for (i = 0; i < max_upls; i++) {
9312 		vector_upl->upls[i].iostate.size = 0;
9313 		vector_upl->upls[i].iostate.offset = 0;
9314 	}
9315 	return upl;
9316 }
9317 
9318 upl_size_t
9319 vector_upl_get_size(const upl_t upl)
9320 {
9321 	if (!vector_upl_is_valid(upl)) {
9322 		return upl_get_size(upl);
9323 	} else {
9324 		return round_page_32(upl->vector_upl->size);
9325 	}
9326 }
9327 
9328 uint32_t
9329 vector_upl_max_upls(const upl_t upl)
9330 {
9331 	if (!vector_upl_is_valid(upl)) {
9332 		return 0;
9333 	}
9334 	return ((vector_upl_t)(upl->vector_upl))->max_upls;
9335 }
9336 
9337 void
9338 vector_upl_deallocate(upl_t upl)
9339 {
9340 	vector_upl_t vector_upl = upl->vector_upl;
9341 
9342 	assert(vector_upl_is_valid(upl));
9343 
9344 	if (vector_upl->invalid_upls != vector_upl->num_upls) {
9345 		panic("Deallocating non-empty Vectored UPL");
9346 	}
9347 	uint32_t max_upls = vector_upl->max_upls;
9348 	kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
9349 	kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
9350 	upl->vector_upl = NULL;
9351 }
9352 
9353 boolean_t
9354 vector_upl_is_valid(upl_t upl)
9355 {
9356 	return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
9357 }
9358 
9359 boolean_t
9360 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
9361 {
9362 	if (vector_upl_is_valid(upl)) {
9363 		vector_upl_t vector_upl = upl->vector_upl;
9364 
9365 		if (vector_upl) {
9366 			if (subupl) {
9367 				if (io_size) {
9368 					if (io_size < PAGE_SIZE) {
9369 						io_size = PAGE_SIZE;
9370 					}
9371 					subupl->vector_upl = (void*)vector_upl;
9372 					vector_upl->upls[vector_upl->num_upls++].elem = subupl;
9373 					vector_upl->size += io_size;
9374 					upl->u_size += io_size;
9375 				} else {
9376 					uint32_t i = 0, invalid_upls = 0;
9377 					for (i = 0; i < vector_upl->num_upls; i++) {
9378 						if (vector_upl->upls[i].elem == subupl) {
9379 							break;
9380 						}
9381 					}
9382 					if (i == vector_upl->num_upls) {
9383 						panic("Trying to remove sub-upl when none exists");
9384 					}
9385 
9386 					vector_upl->upls[i].elem = NULL;
9387 					invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
9388 					    relaxed);
9389 					if (invalid_upls == vector_upl->num_upls) {
9390 						return TRUE;
9391 					} else {
9392 						return FALSE;
9393 					}
9394 				}
9395 			} else {
9396 				panic("vector_upl_set_subupl was passed a NULL upl element");
9397 			}
9398 		} else {
9399 			panic("vector_upl_set_subupl was passed a non-vectored upl");
9400 		}
9401 	} else {
9402 		panic("vector_upl_set_subupl was passed a NULL upl");
9403 	}
9404 
9405 	return FALSE;
9406 }
9407 
/*
 * vector_upl_set_pagelist:
 *	Build the vectored UPL's merged page list by concatenating the
 *	page lists of all sub-UPLs in insertion order, and propagate the
 *	highest physical page number seen across them to the wrapper UPL.
 */
void
vector_upl_set_pagelist(upl_t upl)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;

			/* one upl_page_info entry per page of the combined size */
			vector_upl->pagelist = kalloc_type(struct upl_page_info,
			    atop(vector_upl->size), Z_WAITOK);

			for (i = 0; i < vector_upl->num_upls; i++) {
				/* NOTE(review): assumes every slot still holds a live sub-UPL (elem != NULL) — verify against callers */
				cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
				bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
				pagelist_size += cur_upl_pagelist_size;
				if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
					upl->highest_page = vector_upl->upls[i].elem->highest_page;
				}
			}
			/* the concatenated lists must exactly fill the allocation */
			assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
		} else {
			panic("vector_upl_set_pagelist was passed a non-vectored upl");
		}
	} else {
		panic("vector_upl_set_pagelist was passed a NULL upl");
	}
}
9437 
9438 upl_t
9439 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9440 {
9441 	if (vector_upl_is_valid(upl)) {
9442 		vector_upl_t vector_upl = upl->vector_upl;
9443 		if (vector_upl) {
9444 			if (index < vector_upl->num_upls) {
9445 				return vector_upl->upls[index].elem;
9446 			}
9447 		} else {
9448 			panic("vector_upl_subupl_byindex was passed a non-vectored upl");
9449 		}
9450 	}
9451 	return NULL;
9452 }
9453 
/*
 * vector_upl_subupl_byoffset:
 *	Find the sub-UPL whose recorded iostate covers *upl_offset, and
 *	translate *upl_offset / *upl_size into that sub-UPL's coordinate
 *	space (clamping *upl_size to the sub-UPL's iostate range).
 *	Returns NULL if no sub-UPL matches or the matching slot has
 *	already been committed/aborted.
 */
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
	if (vector_upl_is_valid(upl)) {
		uint32_t i = 0;
		vector_upl_t vector_upl = upl->vector_upl;

		if (vector_upl) {
			upl_t subupl = NULL;
			vector_upl_iostates_t subupl_state;

			/* slots are ordered by offset; first one whose end covers the offset wins */
			for (i = 0; i < vector_upl->num_upls; i++) {
				subupl = vector_upl->upls[i].elem;
				subupl_state = vector_upl->upls[i].iostate;
				if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to an UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
					if (subupl == NULL) {
						return NULL;
					}
					/* clip the requested size to what this sub-UPL covers */
					if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
						if (*upl_size > subupl_state.size) {
							*upl_size = subupl_state.size;
						}
					}
					/* rebase the offset relative to this sub-UPL's start */
					if (*upl_offset >= subupl_state.offset) {
						*upl_offset -= subupl_state.offset;
					} else if (i) {
						panic("Vector UPL offset miscalculation");
					}
					return subupl;
				}
			}
		} else {
			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
		}
	}
	return NULL;
}
9496 
9497 void
9498 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9499 {
9500 	*v_upl_submap = NULL;
9501 
9502 	if (vector_upl_is_valid(upl)) {
9503 		vector_upl_t vector_upl = upl->vector_upl;
9504 		if (vector_upl) {
9505 			*v_upl_submap = vector_upl->submap;
9506 			*submap_dst_addr = vector_upl->submap_dst_addr;
9507 		} else {
9508 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9509 		}
9510 	} else {
9511 		panic("vector_upl_get_submap was passed a null UPL");
9512 	}
9513 }
9514 
9515 void
9516 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9517 {
9518 	if (vector_upl_is_valid(upl)) {
9519 		vector_upl_t vector_upl = upl->vector_upl;
9520 		if (vector_upl) {
9521 			vector_upl->submap = submap;
9522 			vector_upl->submap_dst_addr = submap_dst_addr;
9523 		} else {
9524 			panic("vector_upl_get_submap was passed a non-vectored UPL");
9525 		}
9526 	} else {
9527 		panic("vector_upl_get_submap was passed a NULL UPL");
9528 	}
9529 }
9530 
9531 void
9532 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9533 {
9534 	if (vector_upl_is_valid(upl)) {
9535 		uint32_t i = 0;
9536 		vector_upl_t vector_upl = upl->vector_upl;
9537 
9538 		if (vector_upl) {
9539 			for (i = 0; i < vector_upl->num_upls; i++) {
9540 				if (vector_upl->upls[i].elem == subupl) {
9541 					break;
9542 				}
9543 			}
9544 
9545 			if (i == vector_upl->num_upls) {
9546 				panic("setting sub-upl iostate when none exists");
9547 			}
9548 
9549 			vector_upl->upls[i].iostate.offset = offset;
9550 			if (size < PAGE_SIZE) {
9551 				size = PAGE_SIZE;
9552 			}
9553 			vector_upl->upls[i].iostate.size = size;
9554 		} else {
9555 			panic("vector_upl_set_iostate was passed a non-vectored UPL");
9556 		}
9557 	} else {
9558 		panic("vector_upl_set_iostate was passed a NULL UPL");
9559 	}
9560 }
9561 
9562 void
9563 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9564 {
9565 	if (vector_upl_is_valid(upl)) {
9566 		uint32_t i = 0;
9567 		vector_upl_t vector_upl = upl->vector_upl;
9568 
9569 		if (vector_upl) {
9570 			for (i = 0; i < vector_upl->num_upls; i++) {
9571 				if (vector_upl->upls[i].elem == subupl) {
9572 					break;
9573 				}
9574 			}
9575 
9576 			if (i == vector_upl->num_upls) {
9577 				panic("getting sub-upl iostate when none exists");
9578 			}
9579 
9580 			*offset = vector_upl->upls[i].iostate.offset;
9581 			*size = vector_upl->upls[i].iostate.size;
9582 		} else {
9583 			panic("vector_upl_get_iostate was passed a non-vectored UPL");
9584 		}
9585 	} else {
9586 		panic("vector_upl_get_iostate was passed a NULL UPL");
9587 	}
9588 }
9589 
9590 void
9591 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9592 {
9593 	if (vector_upl_is_valid(upl)) {
9594 		vector_upl_t vector_upl = upl->vector_upl;
9595 		if (vector_upl) {
9596 			if (index < vector_upl->num_upls) {
9597 				*offset = vector_upl->upls[index].iostate.offset;
9598 				*size = vector_upl->upls[index].iostate.size;
9599 			} else {
9600 				*offset = *size = 0;
9601 			}
9602 		} else {
9603 			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
9604 		}
9605 	} else {
9606 		panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
9607 	}
9608 }
9609 
/* Return the vectored-UPL bookkeeping structure, or NULL for a regular UPL. */
void *
upl_get_internal_vectorupl(upl_t upl)
{
	return upl->vector_upl;
}

/* Return the merged pagelist of a vectored UPL (caller must ensure it is vectored). */
upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
	return upl->vector_upl->pagelist;
}

/* Return the page list: the merged one for vectored UPLs, else the UPL's own. */
upl_page_info_t *
upl_get_internal_page_list(upl_t upl)
{
	return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
}
9627 
9628 void
9629 upl_clear_dirty(
9630 	upl_t           upl,
9631 	boolean_t       value)
9632 {
9633 	if (value) {
9634 		upl->flags |= UPL_CLEAR_DIRTY;
9635 	} else {
9636 		upl->flags &= ~UPL_CLEAR_DIRTY;
9637 	}
9638 }
9639 
9640 void
9641 upl_set_referenced(
9642 	upl_t           upl,
9643 	boolean_t       value)
9644 {
9645 	upl_lock(upl);
9646 	if (value) {
9647 		upl->ext_ref_count++;
9648 	} else {
9649 		if (!upl->ext_ref_count) {
9650 			panic("upl_set_referenced not %p", upl);
9651 		}
9652 		upl->ext_ref_count--;
9653 	}
9654 	upl_unlock(upl);
9655 }
9656 
/*
 * upl_set_map_exclusive:
 *	Claim exclusive map/unmap rights on this UPL, sleeping until any
 *	current owner (tracked as a compact thread id) releases them.
 */
void
upl_set_map_exclusive(upl_t upl)
{
	upl_lock(upl);
	while (upl->map_addr_owner) {
		/* flag that a waiter exists so the owner knows to issue a wakeup */
		upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
		upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
	}
	upl->map_addr_owner = thread_get_ctid(current_thread());
	upl_unlock(upl);
}
9668 
/*
 * upl_clear_map_exclusive:
 *	Release exclusive map/unmap rights previously acquired via
 *	upl_set_map_exclusive(), waking any waiters.  Must be called by
 *	the owning thread.
 */
void
upl_clear_map_exclusive(upl_t upl)
{
	assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
	upl_lock(upl);
	if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
		/* someone is sleeping in upl_set_map_exclusive(); wake them */
		upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
		upl_wakeup(&upl->map_addr_owner);
	}
	upl->map_addr_owner = 0;
	upl_unlock(upl);
}
9681 
9682 #if CONFIG_IOSCHED
9683 void
9684 upl_set_blkno(
9685 	upl_t           upl,
9686 	vm_offset_t     upl_offset,
9687 	int             io_size,
9688 	int64_t         blkno)
9689 {
9690 	int i, j;
9691 	if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
9692 		return;
9693 	}
9694 
9695 	assert(upl->upl_reprio_info != 0);
9696 	for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9697 		UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9698 	}
9699 }
9700 #endif
9701 
9702 void inline
9703 memoryshot(unsigned int event, unsigned int control)
9704 {
9705 	if (vm_debug_events) {
9706 		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9707 		    vm_page_active_count, vm_page_inactive_count,
9708 		    vm_page_free_count, vm_page_speculative_count,
9709 		    vm_page_throttled_count);
9710 	} else {
9711 		(void) event;
9712 		(void) control;
9713 	}
9714 }
9715 
9716 #ifdef MACH_BSD
9717 
/* Thin BSD-visible wrappers around the UPL page-info accessor macros. */

/* TRUE if this page-info entry describes a device page. */
boolean_t
upl_device_page(upl_page_info_t *upl)
{
	return UPL_DEVICE_PAGE(upl);
}
/* TRUE if the page at index is present. */
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
	return UPL_PAGE_PRESENT(upl, index);
}
/* TRUE if the page at index was brought in speculatively. */
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
	return UPL_SPECULATIVE_PAGE(upl, index);
}
/* TRUE if the page at index is dirty. */
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
	return UPL_DIRTY_PAGE(upl, index);
}
/* TRUE if the page at index holds valid data. */
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
	return UPL_VALID_PAGE(upl, index);
}
/* Physical page number of the page at index. */
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
	return UPL_PHYS_PAGE(upl, index);
}

/* Set the caller-private mark bit on the page at index. */
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
	upl[index].mark = v;
}

/* Read the caller-private mark bit of the page at index. */
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
	return upl[index].mark;
}
9760 
9761 void
9762 vm_countdirtypages(void)
9763 {
9764 	vm_page_t m;
9765 	int dpages;
9766 	int pgopages;
9767 	int precpages;
9768 
9769 
9770 	dpages = 0;
9771 	pgopages = 0;
9772 	precpages = 0;
9773 
9774 	vm_page_lock_queues();
9775 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9776 	do {
9777 		if (m == (vm_page_t)0) {
9778 			break;
9779 		}
9780 
9781 		if (m->vmp_dirty) {
9782 			dpages++;
9783 		}
9784 		if (m->vmp_free_when_done) {
9785 			pgopages++;
9786 		}
9787 		if (m->vmp_precious) {
9788 			precpages++;
9789 		}
9790 
9791 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9792 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9793 		if (m == (vm_page_t)0) {
9794 			break;
9795 		}
9796 	} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9797 	vm_page_unlock_queues();
9798 
9799 	vm_page_lock_queues();
9800 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9801 	do {
9802 		if (m == (vm_page_t)0) {
9803 			break;
9804 		}
9805 
9806 		dpages++;
9807 		assert(m->vmp_dirty);
9808 		assert(!m->vmp_free_when_done);
9809 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9810 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9811 		if (m == (vm_page_t)0) {
9812 			break;
9813 		}
9814 	} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9815 	vm_page_unlock_queues();
9816 
9817 	vm_page_lock_queues();
9818 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9819 	do {
9820 		if (m == (vm_page_t)0) {
9821 			break;
9822 		}
9823 
9824 		if (m->vmp_dirty) {
9825 			dpages++;
9826 		}
9827 		if (m->vmp_free_when_done) {
9828 			pgopages++;
9829 		}
9830 		if (m->vmp_precious) {
9831 			precpages++;
9832 		}
9833 
9834 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9835 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9836 		if (m == (vm_page_t)0) {
9837 			break;
9838 		}
9839 	} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9840 	vm_page_unlock_queues();
9841 
9842 	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9843 
9844 	dpages = 0;
9845 	pgopages = 0;
9846 	precpages = 0;
9847 
9848 	vm_page_lock_queues();
9849 	m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9850 
9851 	do {
9852 		if (m == (vm_page_t)0) {
9853 			break;
9854 		}
9855 		if (m->vmp_dirty) {
9856 			dpages++;
9857 		}
9858 		if (m->vmp_free_when_done) {
9859 			pgopages++;
9860 		}
9861 		if (m->vmp_precious) {
9862 			precpages++;
9863 		}
9864 
9865 		assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
9866 		m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
9867 		if (m == (vm_page_t)0) {
9868 			break;
9869 		}
9870 	} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9871 	vm_page_unlock_queues();
9872 
9873 	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9874 }
9875 #endif /* MACH_BSD */
9876 
9877 
9878 #if CONFIG_IOSCHED
9879 int
9880 upl_get_cached_tier(upl_t  upl)
9881 {
9882 	assert(upl);
9883 	if (upl->flags & UPL_TRACKED_BY_OBJECT) {
9884 		return upl->upl_priority;
9885 	}
9886 	return -1;
9887 }
9888 #endif /* CONFIG_IOSCHED */
9889 
9890 
9891 void
9892 upl_callout_iodone(upl_t upl)
9893 {
9894 	struct upl_io_completion *upl_ctx = upl->upl_iodone;
9895 
9896 	if (upl_ctx) {
9897 		void    (*iodone_func)(void *, int) = upl_ctx->io_done;
9898 
9899 		assert(upl_ctx->io_done);
9900 
9901 		(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
9902 	}
9903 }
9904 
/* Attach an I/O-completion context (opaque to callers) to this UPL. */
void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
	upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}

/* Record an I/O error in the UPL's completion context, if one is attached. */
void
upl_set_iodone_error(upl_t upl, int error)
{
	struct upl_io_completion *upl_ctx = upl->upl_iodone;

	if (upl_ctx) {
		upl_ctx->io_error = error;
	}
}
9920 
9921 
/* Highest physical page number covered by this UPL. */
ppnum_t
upl_get_highest_page(
	upl_t                      upl)
{
	return upl->highest_page;
}

/* UPL size expanded to native page boundaries. */
upl_size_t
upl_get_size(
	upl_t                      upl)
{
	return upl_adjusted_size(upl, PAGE_MASK);
}
9935 
9936 upl_size_t
9937 upl_adjusted_size(
9938 	upl_t upl,
9939 	vm_map_offset_t pgmask)
9940 {
9941 	vm_object_offset_t start_offset, end_offset;
9942 
9943 	start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
9944 	end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
9945 
9946 	return (upl_size_t)(end_offset - start_offset);
9947 }
9948 
/* Start of the UPL's range truncated down to a pgmask page boundary. */
vm_object_offset_t
upl_adjusted_offset(
	upl_t upl,
	vm_map_offset_t pgmask)
{
	return trunc_page_mask_64(upl->u_offset, pgmask);
}

/* Byte offset of the requested data within its first (truncated) page. */
vm_object_offset_t
upl_get_data_offset(
	upl_t upl)
{
	return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
}

/* UPL paired with this one, if any. */
upl_t
upl_associated_upl(upl_t upl)
{
	return upl->associated_upl;
}

/* Pair this UPL with another one. */
void
upl_set_associated_upl(upl_t upl, upl_t associated_upl)
{
	upl->associated_upl = associated_upl;
}
9975 
9976 struct vnode *
9977 upl_lookup_vnode(upl_t upl)
9978 {
9979 	if (!upl->map_object->internal) {
9980 		return vnode_pager_lookup_vnode(upl->map_object->pager);
9981 	} else {
9982 		return NULL;
9983 	}
9984 }
9985 
9986 boolean_t
9987 upl_has_wired_pages(upl_t upl)
9988 {
9989 	return (upl->flags & UPL_HAS_WIRED) ? TRUE : FALSE;
9990 }
9991 
9992 #if UPL_DEBUG
9993 kern_return_t
9994 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
9995 {
9996 	upl->ubc_alias1 = alias1;
9997 	upl->ubc_alias2 = alias2;
9998 	return KERN_SUCCESS;
9999 }
10000 int
10001 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10002 {
10003 	if (al) {
10004 		*al = upl->ubc_alias1;
10005 	}
10006 	if (al2) {
10007 		*al2 = upl->ubc_alias2;
10008 	}
10009 	return KERN_SUCCESS;
10010 }
10011 #endif /* UPL_DEBUG */
10012 
10013 #if VM_PRESSURE_EVENTS
10014 /*
10015  * Upward trajectory.
10016  */
10017 
/*
 * VM_PRESSURE_NORMAL_TO_WARNING:
 *	Decide whether memory pressure should escalate from "normal"
 *	to "warning".
 */
boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)
{
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		/* Available pages below our threshold */
		uint32_t available_pages = memorystatus_get_available_page_count();
		if (available_pages < memorystatus_get_soft_memlimit_page_shortage_threshold()) {
#if CONFIG_FREEZE
			/* No frozen processes to kill */
			if (memorystatus_frozen_count == 0) {
				/* Not enough suspended processes available. */
				if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
					return TRUE;
				}
			}
#else /* CONFIG_FREEZE */
			return TRUE;
#endif /* CONFIG_FREEZE */
		}
		return FALSE;
	} else {
		/* compressor active: escalate once non-compressed memory dips below the compact threshold */
		return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
	}
}
10042 
10043 boolean_t
10044 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10045 {
10046 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10047 		/* Available pages below our threshold */
10048 		uint32_t available_pages = memorystatus_get_available_page_count();
10049 		return available_pages < memorystatus_get_critical_page_shortage_threshold();
10050 	} else {
10051 		return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10052 	}
10053 }
10054 
10055 /*
10056  * Downward trajectory.
10057  */
10058 boolean_t
10059 VM_PRESSURE_WARNING_TO_NORMAL(void)
10060 {
10061 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10062 		/* Available pages above our threshold */
10063 		uint32_t available_pages = memorystatus_get_available_page_count();
10064 		uint32_t target_threshold = (((115 * memorystatus_get_soft_memlimit_page_shortage_threshold()) / 100));
10065 		return available_pages > target_threshold;
10066 	} else {
10067 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10068 	}
10069 }
10070 
10071 boolean_t
10072 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10073 {
10074 	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10075 		uint32_t available_pages = memorystatus_get_available_page_count();
10076 		uint32_t target_threshold = (((115 * memorystatus_get_critical_page_shortage_threshold()) / 100));
10077 		return available_pages > target_threshold;
10078 	} else {
10079 		return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10080 	}
10081 }
10082 #endif /* VM_PRESSURE_EVENTS */
10083 
10084 #if DEVELOPMENT || DEBUG
10085 bool compressor_running_perf_test;
10086 uint64_t compressor_perf_test_pages_processed;
10087 
10088 static kern_return_t
10089 move_pages_to_queue(
10090 	vm_map_t map,
10091 	user_addr_t start_addr,
10092 	size_t buffer_size,
10093 	vm_page_queue_head_t *queue,
10094 	size_t *pages_moved)
10095 {
10096 	kern_return_t err = KERN_SUCCESS;
10097 	vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
10098 	boolean_t addr_in_map = FALSE;
10099 	user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
10100 	vm_object_t curr_object = VM_OBJECT_NULL;
10101 	*pages_moved = 0;
10102 
10103 
10104 	if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
10105 		/*
10106 		 * We don't currently support benchmarking maps with a different page size
10107 		 * than the kernel.
10108 		 */
10109 		return KERN_INVALID_ARGUMENT;
10110 	}
10111 
10112 	if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
10113 		return KERN_INVALID_ARGUMENT;
10114 	}
10115 
10116 	vm_map_lock_read(map);
10117 	curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
10118 	end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
10119 
10120 
10121 	while (curr_addr < end_addr) {
10122 		addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
10123 		if (!addr_in_map) {
10124 			err = KERN_INVALID_ARGUMENT;
10125 			break;
10126 		}
10127 		curr_object = VME_OBJECT(curr_entry);
10128 		if (curr_object) {
10129 			vm_object_lock(curr_object);
10130 			/* We really only want anonymous memory that's in the top level map and object here. */
10131 			if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
10132 			    curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
10133 				err = KERN_INVALID_ARGUMENT;
10134 				vm_object_unlock(curr_object);
10135 				break;
10136 			}
10137 			vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
10138 			vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
10139 			    (curr_entry->vme_start + VME_OFFSET(curr_entry));
10140 			vm_map_offset_t curr_offset = start_offset;
10141 			vm_page_t curr_page;
10142 			while (curr_offset < end_offset) {
10143 				curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
10144 				if (curr_page != VM_PAGE_NULL) {
10145 					vm_page_lock_queues();
10146 					if (curr_page->vmp_laundry) {
10147 						vm_pageout_steal_laundry(curr_page, TRUE);
10148 					}
10149 					/*
10150 					 * we've already factored out pages in the laundry which
10151 					 * means this page can't be on the pageout queue so it's
10152 					 * safe to do the vm_page_queues_remove
10153 					 */
10154 					bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
10155 					vm_page_queues_remove(curr_page, TRUE);
10156 					if (donate) {
10157 						/*
10158 						 * The compressor needs to see this bit to know
10159 						 * where this page needs to land. Also if stolen,
10160 						 * this bit helps put the page back in the right
10161 						 * special queue where it belongs.
10162 						 */
10163 						curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
10164 					}
10165 					// Clear the referenced bit so we ensure this gets paged out
10166 					curr_page->vmp_reference = false;
10167 					if (curr_page->vmp_pmapped) {
10168 						pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
10169 						    VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
10170 					}
10171 					vm_page_queue_enter(queue, curr_page, vmp_pageq);
10172 					vm_page_unlock_queues();
10173 					*pages_moved += 1;
10174 				}
10175 				curr_offset += PAGE_SIZE_64;
10176 				curr_addr += PAGE_SIZE_64;
10177 			}
10178 		}
10179 		vm_object_unlock(curr_object);
10180 	}
10181 	vm_map_unlock_read(map);
10182 	return err;
10183 }
10184 
10185 /*
10186  * Local queue for processing benchmark pages.
10187  * Can't be allocated on the stack because the pointer has to
10188  * be packable.
10189  */
10190 vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
/*
 * run_compressor_perf_test:
 *	DEVELOPMENT/DEBUG benchmark: feed the caller's buffer pages
 *	directly to the compressor, then report elapsed time ("time",
 *	in nanoseconds), bytes submitted ("bytes_compressed") and the
 *	growth of the compressor pool ("compressor_growth").
 *	Fails unless the compressor is active and the caller is a user
 *	task; only one instance may run at a time.
 */
kern_return_t
run_compressor_perf_test(
	user_addr_t buf,
	size_t buffer_size,
	uint64_t *time,
	uint64_t *bytes_compressed,
	uint64_t *compressor_growth)
{
	kern_return_t err = KERN_SUCCESS;
	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		return KERN_NOT_SUPPORTED;
	}
	if (current_task() == kernel_task) {
		return KERN_INVALID_ARGUMENT;
	}
	/* NOTE(review): the flag is checked here but only set after move_pages_to_queue() below; two concurrent callers could both pass this check — confirm callers serialize */
	vm_page_lock_queues();
	if (compressor_running_perf_test) {
		/* Only run one instance of the benchmark at a time. */
		vm_page_unlock_queues();
		return KERN_RESOURCE_SHORTAGE;
	}
	vm_page_unlock_queues();
	size_t page_count = 0;
	vm_map_t map;
	vm_page_t p, next;
	uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
	uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
	*bytes_compressed = *compressor_growth = 0;

	vm_page_queue_init(&compressor_perf_test_queue);
	map = current_task()->map;
	err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
	if (err != KERN_SUCCESS) {
		goto out;
	}

	vm_page_lock_queues();
	compressor_running_perf_test = true;
	compressor_perf_test_pages_processed = 0;
	/*
	 * At this point the compressor threads should only process the benchmark queue
	 * so we can look at the difference in c_segment_compressed_bytes while the perf test is running
	 * to determine how many compressed bytes we ended up using.
	 */
	compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
	vm_page_unlock_queues();

	/* hand the queued pages to the pageout path; returns how many it accepted */
	page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);

	vm_page_lock_queues();
	compressor_perf_test_start = mach_absolute_time();

	// Wake up the compressor thread(s)
	sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
	    pgo_iothread_internal_state[0].pgo_iothread);

	/*
	 * Depending on when this test is run we could overshoot or be right on the mark
	 * with our page_count. So the comparison is of the _less than_ variety.
	 */
	while (compressor_perf_test_pages_processed < page_count) {
		assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
		vm_page_unlock_queues();
		thread_block(THREAD_CONTINUE_NULL);
		vm_page_lock_queues();
	}
	compressor_perf_test_end = mach_absolute_time();
	compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
	vm_page_unlock_queues();


out:
	/*
	 * If we errored out above, then we could still have some pages
	 * on the local queue. Make sure to put them back on the active queue before
	 * returning so they're not orphaned.
	 */
	vm_page_lock_queues();
	/* on the error path both timestamps are still 0, so *time reports 0 */
	absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
	p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
	while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
		next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);

		vm_page_enqueue_active(p, FALSE);
		p = next;
	}

	compressor_running_perf_test = false;
	vm_page_unlock_queues();
	if (err == KERN_SUCCESS) {
		*bytes_compressed = page_count * PAGE_SIZE_64;
		*compressor_growth = compressed_bytes_end - compressed_bytes_start;
	}

	/*
	 * pageout_scan will consider waking the compactor swapper
	 * before it blocks. Do the same thing here before we return
	 * to ensure that back to back benchmark runs can't overly fragment the
	 * compressor pool.
	 */
	vm_consider_waking_compactor_swapper();
	return err;
}
10294 #endif /* DEVELOPMENT || DEBUG */
10295